Added split training data

This commit is contained in:
Michael Duong 2023-08-15 23:13:34 +00:00
parent 6d7d03b3f0
commit d28aeddb8b
3 changed files with 15 additions and 3 deletions

View file

@@ -99,18 +99,30 @@ def training(train_filepath: str, test_filepath: str) -> None:
# logger.info('Split data into train and validation')
logger.info('Build Model')
data = TabularDataset(data=train_df)
data = TabularDataset(data=train_filepath)
data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE'])
TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT']
# top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))]
data = data[['RDSAP_CHANGE'] + top_features.to_list()]
# data = TabularDataset(data=train_df)
# data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
subsample_size = round(len(data)/4)
subsample_size = round(len(data)/20)
data = data.sample(subsample_size, random_state=RANDOM_SEED)
# Add custom metric class MAPE
# Have a look at temporal features
target_column = 'RDSAP_CHANGE'
predictor_RDSAP = TabularPredictor(
label=target_column,
path="agModels-predictRDSAP",
problem_type="regression",
eval_metric='mean_absolute_error'
).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN'])
).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])
logger.info('Evaluate matrics')