diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index ed7e057..75d784f 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -87,7 +87,8 @@ def prepare_data( if train_proportion == 1: train = data - test = None + # Sample 10% of the data for testing + test = data.sample(round(len(data) * 0.1)) else: train, test = train_test_split( data, train_size=train_proportion, test_size=(1 - train_proportion) diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index 7ca4951..09e5910 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -26,9 +26,12 @@ prepare_data_params = settings.prepare_data build_model_params = settings.build_model feature_process_params = settings.feature_processor generate_metrics_params = settings.generate_metrics +generate_predictions_params = settings.generate_predictions model_type = build_model_params["model_type"] target = feature_process_params["feature_processor_config"]["target"] +fit_predictions_filepath = build_model_params["fit_predictions_filepath"] +predictions_column_name = generate_predictions_params["predictions_column_name"] identifier_columns = feature_process_params["feature_processor_config"][ "identifier_columns" ] @@ -60,6 +63,8 @@ def build_model( identifier_columns: List[str], model_save_location: str, model_hyperparameters: dict, + fit_predictions_filepath: str, + predictions_column_name: str, fit_metrics_filepath: str, train_filepath: Union[str, None] = None, test_filepath: Union[str, None] = None, @@ -93,6 +98,15 @@ def build_model( data=train_data, post_prediction_logic=post_prediction_logic ) + logger.info("--- Saving fit predictions ---") + + predictions_df = pd.DataFrame(fit_predictions) + predictions_df.columns = [predictions_column_name] + + dataclient.save_data( + obj=predictions_df, location=fit_predictions_filepath, save_config=None + ) + logger.info("--- Generating fit metrics ---") metrics_output = metrics.generate_metrics( @@ -128,6 +142,8 @@ if __name__ == "__main__": train_filepath=train_filepath, test_filepath=test_filepath, fit_metrics_filepath=fit_metrics_filepath, + fit_predictions_filepath=fit_predictions_filepath, + predictions_column_name=predictions_column_name, ) logger.info(f"--- {__file__} - Complete! ---") diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 9c97ef0..fcec7f7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -3,6 +3,7 @@ default: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: null @@ -13,8 +14,8 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 600 + time_limit: 4000 presets: medium_quality - excluded_model_types: ['KNN', 'RF'] + excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index 69e9575..d4b1896 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -5,7 +5,7 @@ import pandas as pd def clip_predictions_to_minimum_value( - data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1 + data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0 ) -> pd.Series: series_name = predictions.name @@ -13,7 +13,8 @@ def clip_predictions_to_minimum_value( predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement replace_index = ( - predictions_df["predictions"] > predictions_df["heat_demand_starting"] - 1 + predictions_df["predictions"] + > predictions_df["heat_demand_starting"] - minimum_value ) predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "heat_demand_starting"] - minimum_value diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 09792cf..35816b6 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,8 +21,9 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - train_proportion: 0.9 + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index c1bb042..97c9335 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 896d3d88a4a9f68d174efe71dc089517 - size: 4222 + md5: 11a3b8bfdfe199ab7ecc39ccc5652649 + size: 4298 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -20,29 +20,29 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: heat_demand_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 0.9 + default.prepare_data.train_proportion: 1 outs: - path: data/prepared_data/ hash: md5 - md5: 613ddd198a29002e6e05a2d60275d924.dir - size: 32746979 + md5: dcd41f841c67b474a81a14e683646237.dir + size: 36317761 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: b824822475c222521516493e68eef9c5 - size: 4149 + md5: 7231450b78920b0c5e7c6bada496b24a + size: 4820 - path: data/prepared_data hash: md5 - md5: 613ddd198a29002e6e05a2d60275d924.dir - size: 32746979 + md5: dcd41f841c67b474a81a14e683646237.dir + size: 36317761 nfiles: 2 params: configs/build_model.yaml: @@ -51,6 +51,7 @@ stages: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear @@ -58,23 +59,32 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 600 + time_limit: 4000 presets: medium_quality excluded_model_types: - - KNN - RF + - FASTAI + - CAT + - NN_TORCH + - KNN + - XT infer_limit: 0.05 infer_limit_batch_size: 10000 outs: + - path: data/fit_predictions/ + hash: md5 + md5: 89063bb3b725afe61b6ed5edb724bb06.dir + size: 3090627 + nfiles: 1 - path: data/model/ hash: md5 - md5: 837a42a0655862229620495c645d5fed.dir - size: 342382387 - nfiles: 26 + md5: c90eef03b5a76175506c048e88a401dd.dir + size: 783489255 + nfiles: 32 - path: metrics/fit_metrics.json hash: md5 - md5: f8a394b86c33dc1b3ce97abed803c8f1 - size: 220 + md5: 33f18fa6b7dda535de09733d4792c0fc + size: 217 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +94,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 837a42a0655862229620495c645d5fed.dir - size: 342382387 - nfiles: 26 + md5: c90eef03b5a76175506c048e88a401dd.dir + size: 783489255 + nfiles: 32 - path: data/prepared_data hash: md5 - md5: 613ddd198a29002e6e05a2d60275d924.dir - size: 32746979 + md5: dcd41f841c67b474a81a14e683646237.dir + size: 36317761 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +112,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 75f8326e99eb9e1032728208229ec37b.dir - size: 314002 + md5: 406e2ebe33d6abed9042f137d8c0d2bf.dir + size: 520735 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +124,13 @@ stages: size: 3448 - path: data/predictions hash: md5 - md5: 75f8326e99eb9e1032728208229ec37b.dir - size: 314002 + md5: 406e2ebe33d6abed9042f137d8c0d2bf.dir + size: 520735 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 613ddd198a29002e6e05a2d60275d924.dir - size: 32746979 + md5: dcd41f841c67b474a81a14e683646237.dir + size: 36317761 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +140,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 269e89593f5e7ceb507c31dac2c2dd35 - size: 220 + md5: cc1ad408f2d9d3128df71822a38ea85e + size: 218 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index ccdd779..58889cc 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -38,6 +38,7 @@ stages: - configs/build_model.yaml: outs: - data/model/ + - data/fit_predictions/ - metrics/fit_metrics.json always_changed: true generate_predictions: