diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index ed7e057..75d784f 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -87,7 +87,8 @@ def prepare_data( if train_proportion == 1: train = data - test = None + # Sample 10% of the data for testing + test = data.sample(round(len(data) * 0.1)) else: train, test = train_test_split( data, train_size=train_proportion, test_size=(1 - train_proportion) diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index 7ca4951..09e5910 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -26,9 +26,12 @@ prepare_data_params = settings.prepare_data build_model_params = settings.build_model feature_process_params = settings.feature_processor generate_metrics_params = settings.generate_metrics +generate_predictions_params = settings.generate_predictions model_type = build_model_params["model_type"] target = feature_process_params["feature_processor_config"]["target"] +fit_predictions_filepath = build_model_params["fit_predictions_filepath"] +predictions_column_name = generate_predictions_params["predictions_column_name"] identifier_columns = feature_process_params["feature_processor_config"][ "identifier_columns" ] @@ -60,6 +63,8 @@ def build_model( identifier_columns: List[str], model_save_location: str, model_hyperparameters: dict, + fit_predictions_filepath: str, + predictions_column_name: str, fit_metrics_filepath: str, train_filepath: Union[str, None] = None, test_filepath: Union[str, None] = None, @@ -93,6 +98,15 @@ def build_model( data=train_data, post_prediction_logic=post_prediction_logic ) + logger.info("--- Saving fit predictions ---") + + predictions_df = pd.DataFrame(fit_predictions) + predictions_df.columns = [predictions_column_name] + + dataclient.save_data( + obj=predictions_df, location=fit_predictions_filepath, save_config=None + ) + logger.info("--- Generating fit metrics ---") metrics_output = metrics.generate_metrics( @@ -128,6 +142,8 @@ if __name__ == "__main__": train_filepath=train_filepath, test_filepath=test_filepath, fit_metrics_filepath=fit_metrics_filepath, + fit_predictions_filepath=fit_predictions_filepath, + predictions_column_name=predictions_column_name, ) logger.info(f"--- {__file__} - Complete! ---") diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 9c97ef0..fcec7f7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -3,6 +3,7 @@ default: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: null @@ -13,8 +14,8 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 600 + time_limit: 4000 presets: medium_quality - excluded_model_types: ['KNN', 'RF'] + excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index e4bd13b..051e34e 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,8 +21,9 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - train_proportion: 0.9 + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index d8da73d..dc996b7 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 896d3d88a4a9f68d174efe71dc089517 - size: 4222 + md5: 11a3b8bfdfe199ab7ecc39ccc5652649 + size: 4298 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -20,29 +20,29 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: carbon_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 0.9 + default.prepare_data.train_proportion: 1 outs: - path: data/prepared_data/ hash: md5 - md5: 70d79ba4a6f0648439dc55031c944d47.dir - size: 32673907 + md5: 44737880f5437e23143479a7818a3136.dir + size: 36064622 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: b824822475c222521516493e68eef9c5 - size: 4149 + md5: 090bfb7dbaff39f45784b7fe332a9b8e + size: 4819 - path: data/prepared_data hash: md5 - md5: 70d79ba4a6f0648439dc55031c944d47.dir - size: 32673907 + md5: 44737880f5437e23143479a7818a3136.dir + size: 36064622 nfiles: 2 params: configs/build_model.yaml: @@ -51,6 +51,7 @@ stages: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear @@ -58,23 +59,32 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 600 + time_limit: 4000 presets: medium_quality excluded_model_types: - - KNN - RF + - FASTAI + - CAT + - NN_TORCH + - KNN + - XT infer_limit: 0.05 infer_limit_batch_size: 10000 outs: + - path: data/fit_predictions/ + hash: md5 + md5: 7b74ae1174ae2c7fab03ee0ce0a8ae71.dir + size: 3877514 + nfiles: 1 - path: data/model/ hash: md5 - md5: 2fc9223da8b72e61d81f06665e75019e.dir - size: 324532985 - nfiles: 27 + md5: d2ebaa73a894387f85083c49e58637bc.dir + size: 798349514 + nfiles: 32 - path: metrics/fit_metrics.json hash: md5 - md5: 7d2f226251ce6f8e92af73d50dadb890 - size: 228 + md5: 51c9c678bbd19bc9f7e16f0bf5df3fef + size: 229 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +94,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 2fc9223da8b72e61d81f06665e75019e.dir - size: 324532985 - nfiles: 27 + md5: d2ebaa73a894387f85083c49e58637bc.dir + size: 798349514 + nfiles: 32 - path: data/prepared_data hash: md5 - md5: 70d79ba4a6f0648439dc55031c944d47.dir - size: 32673907 + md5: 44737880f5437e23143479a7818a3136.dir + size: 36064622 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +112,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 8bfc33c14aba5abf5ac4bdba32ff3c4c.dir - size: 412880 + md5: ac0a698f14fb9002b337b1b163997333.dir + size: 638033 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +124,13 @@ stages: size: 3485 - path: data/predictions hash: md5 - md5: 8bfc33c14aba5abf5ac4bdba32ff3c4c.dir - size: 412880 + md5: ac0a698f14fb9002b337b1b163997333.dir + size: 638033 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 70d79ba4a6f0648439dc55031c944d47.dir - size: 32673907 + md5: 44737880f5437e23143479a7818a3136.dir + size: 36064622 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +140,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 9a0b57244dfdbd6dab0392a4fd618123 - size: 225 + md5: 47aa4601e71a93163d2cc1b85d0eda91 + size: 228 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index ccdd779..58889cc 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -38,6 +38,7 @@ stages: - configs/build_model.yaml: outs: - data/model/ + - data/fit_predictions/ - metrics/fit_metrics.json always_changed: true generate_predictions: