diff --git a/model_data/simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet b/model_data/simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet new file mode 100644 index 00000000..ac5249ce Binary files /dev/null and b/model_data/simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet differ diff --git a/model_data/simulation_system/model_build_data/change_data/rdsap_full/train_validation_data.parquet b/model_data/simulation_system/model_build_data/change_data/rdsap_full/train_validation_data.parquet new file mode 100644 index 00000000..e7b2eb4a Binary files /dev/null and b/model_data/simulation_system/model_build_data/change_data/rdsap_full/train_validation_data.parquet differ diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index cde310a3..da2c6f4a 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -99,18 +99,30 @@ def training(train_filepath: str, test_filepath: str) -> None: # logger.info('Split data into train and validation') logger.info('Build Model') - data = TabularDataset(data=train_df) + + data = TabularDataset(data=train_filepath) + data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE']) + TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT'] + # top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))] + + data = data[['RDSAP_CHANGE'] + top_features.to_list()] + # data = TabularDataset(data=train_df) # data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float) - subsample_size = round(len(data)/4) + subsample_size = round(len(data)/20) data = data.sample(subsample_size, random_state=RANDOM_SEED) + # Add custom metric class MAPE + # Have a look at temporal features + target_column = 'RDSAP_CHANGE' predictor_RDSAP = TabularPredictor( label=target_column, path="agModels-predictRDSAP", problem_type="regression", eval_metric='mean_absolute_error' - ).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN']) + ).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN']) + + logger.info('Evaluate matrics')