diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index ed7e057..75d784f 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -87,7 +87,8 @@ def prepare_data( if train_proportion == 1: train = data - test = None + # Sample 10% of the data for testing + test = data.sample(round(len(data) * 0.1)) else: train, test = train_test_split( data, train_size=train_proportion, test_size=(1 - train_proportion) diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index 7ca4951..09e5910 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -26,9 +26,12 @@ prepare_data_params = settings.prepare_data build_model_params = settings.build_model feature_process_params = settings.feature_processor generate_metrics_params = settings.generate_metrics +generate_predictions_params = settings.generate_predictions model_type = build_model_params["model_type"] target = feature_process_params["feature_processor_config"]["target"] +fit_predictions_filepath = build_model_params["fit_predictions_filepath"] +predictions_column_name = generate_predictions_params["predictions_column_name"] identifier_columns = feature_process_params["feature_processor_config"][ "identifier_columns" ] @@ -60,6 +63,8 @@ def build_model( identifier_columns: List[str], model_save_location: str, model_hyperparameters: dict, + fit_predictions_filepath: str, + predictions_column_name: str, fit_metrics_filepath: str, train_filepath: Union[str, None] = None, test_filepath: Union[str, None] = None, @@ -93,6 +98,15 @@ def build_model( data=train_data, post_prediction_logic=post_prediction_logic ) + logger.info("--- Saving fit predictions ---") + + predictions_df = pd.DataFrame(fit_predictions) + predictions_df.columns = [predictions_column_name] + + dataclient.save_data( + obj=predictions_df, location=fit_predictions_filepath, save_config=None + ) + logger.info("--- Generating fit metrics ---") metrics_output = metrics.generate_metrics( @@ -128,6 +142,8 @@ if __name__ == "__main__": train_filepath=train_filepath, test_filepath=test_filepath, fit_metrics_filepath=fit_metrics_filepath, + fit_predictions_filepath=fit_predictions_filepath, + predictions_column_name=predictions_column_name, ) logger.info(f"--- {__file__} - Complete! ---") diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 9c97ef0..fcec7f7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -3,6 +3,7 @@ default: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: null @@ -13,8 +14,8 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 600 + time_limit: 4000 presets: medium_quality - excluded_model_types: ['KNN', 'RF'] + excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 026191c..103168d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -40,6 +40,11 @@ def keep_flats(df): return df +def keep_non_zero_rdsap(df): + df = df[df["rdsap_change"] != 0] + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # keep_columns = df.columns[ending_column_index].to_list() @@ -49,6 +54,7 @@ def keep_flats(df): # return df business_logic = { + # "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, # "remove_floor_height_ending": remove_floor_height_ending diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index c1b8ebd..643231a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -5,14 +5,16 @@ import pandas as pd def clip_predictions_to_minimum_value( - data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1 + data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0 ) -> pd.Series: series_name = predictions.name predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = predictions_df["sap_starting"] + 1 > predictions_df["predictions"] + replace_index = ( + predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"] + ) predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "sap_starting"] + minimum_value ) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index bcc8802..4327e64 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,8 +22,9 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - train_proportion: 0.9 + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 39314dc..f15978f 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 896d3d88a4a9f68d174efe71dc089517 - size: 4222 + md5: 1793a35e71751d3c84f9affc67ecb9a8 + size: 4296 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -20,29 +20,29 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 0.9 + default.prepare_data.train_proportion: 1 outs: - path: data/prepared_data/ hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: b824822475c222521516493e68eef9c5 - size: 4149 + md5: 7231450b78920b0c5e7c6bada496b24a + size: 4820 - path: data/prepared_data hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 params: configs/build_model.yaml: @@ -51,6 +51,7 @@ stages: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear @@ -58,23 +59,32 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 600 + time_limit: 4000 presets: medium_quality excluded_model_types: - - KNN - RF + - FASTAI + - CAT + - NN_TORCH + - KNN + - XT infer_limit: 0.05 infer_limit_batch_size: 10000 outs: + - path: data/fit_predictions/ + hash: md5 + md5: ede187e9d0bffdef054f573f3c2bd222.dir + size: 3578590 + nfiles: 1 - path: data/model/ hash: md5 - md5: 0ad794c5498acfcc79893a371b29be62.dir - size: 372199625 - nfiles: 24 + md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir + size: 814720415 + nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: 534fa836074bdd9795b5879f0c479681 - size: 225 + md5: c45b84f12971a0156e4f3d85d3e725f5 + size: 218 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +94,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 0ad794c5498acfcc79893a371b29be62.dir - size: 372199625 - nfiles: 24 + md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir + size: 814720415 + nfiles: 31 - path: data/prepared_data hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +112,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 25ac7334855d5eacc5fd9e2879900f33.dir - size: 367393 + md5: 5e60ca251af51de6fef3d0c659f8bb27.dir + size: 627416 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +124,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 25ac7334855d5eacc5fd9e2879900f33.dir - size: 367393 + md5: 5e60ca251af51de6fef3d0c659f8bb27.dir + size: 627416 nfiles: 1 - path: data/prepared_data hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +140,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: a6fa095b4cc44e6dd7828708f8cca18b - size: 222 + md5: 033efa4d4044b6b6fc92dd37194727fa + size: 225 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index ccdd779..58889cc 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -38,6 +38,7 @@ stages: - configs/build_model.yaml: outs: - data/model/ + - data/fit_predictions/ - metrics/fit_metrics.json always_changed: true generate_predictions: