From efb84723bb7bf6e742c408854fdaecf918f8e133 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 23 Jan 2024 19:27:53 +0000 Subject: [PATCH 1/4] test model with 1 percent o change records --- .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/settings.yaml | 3 +- modules/ml-pipeline/src/pipeline/dvc.lock | 44 +++++++++---------- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 9c97ef0..4c72487 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 600 + time_limit: 400 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index bcc8802..4ba4779 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -22,7 +22,8 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-datalake-dev/dataset_with0.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 39314dc..19173d2 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -20,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 1b1f7467e4abc12e6febbf2a84756914.dir + size: 39780684 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 1b1f7467e4abc12e6febbf2a84756914.dir + size: 39780684 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 600 + time_limit: 400 presets: medium_quality excluded_model_types: - KNN @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 0ad794c5498acfcc79893a371b29be62.dir - size: 372199625 + md5: c83b4cf0c51bd433bfb38307e978ed39.dir + size: 344485548 nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 534fa836074bdd9795b5879f0c479681 - size: 225 + md5: 3105f9cf71b69b5b0f5675b2c169273c + size: 223 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 0ad794c5498acfcc79893a371b29be62.dir - size: 372199625 + md5: c83b4cf0c51bd433bfb38307e978ed39.dir + size: 344485548 nfiles: 24 - path: data/prepared_data hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 1b1f7467e4abc12e6febbf2a84756914.dir + size: 39780684 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 25ac7334855d5eacc5fd9e2879900f33.dir - size: 367393 + md5: f914cf31400e228ee6e1386155b68e7c.dir + size: 369783 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 25ac7334855d5eacc5fd9e2879900f33.dir - size: 367393 + md5: f914cf31400e228ee6e1386155b68e7c.dir + size: 369783 nfiles: 1 - path: data/prepared_data hash: md5 - md5: d047420c632d91203199b9a93b6b0134.dir - size: 39476967 + md5: 1b1f7467e4abc12e6febbf2a84756914.dir + size: 39780684 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: a6fa095b4cc44e6dd7828708f8cca18b - size: 222 + md5: c23b7f0628473bf42eef126167e8928e + size: 224 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From ca2a3d362352cb901cd85d00e9850df454ddbd41 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 23 Jan 2024 21:46:24 +0000 Subject: [PATCH 2/4] longer run model --- .../src/pipeline/configs/build_model.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 44 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 4c72487..354b2ca 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 400 + time_limit: 800 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 19173d2..4d669b5 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 1b1f7467e4abc12e6febbf2a84756914.dir - size: 39780684 + md5: 12b7939c38b6a201063b063ed64d521b.dir + size: 39840424 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: 1b1f7467e4abc12e6febbf2a84756914.dir - size: 39780684 + md5: 12b7939c38b6a201063b063ed64d521b.dir + size: 39840424 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 400 + time_limit: 800 presets: medium_quality excluded_model_types: - KNN @@ -68,12 +68,12 @@ stages: outs: - path: data/model/ hash: md5 - md5: c83b4cf0c51bd433bfb38307e978ed39.dir - size: 344485548 - nfiles: 24 + md5: 7d062363a9de5a659df638de1541d9ee.dir + size: 383515358 + nfiles: 26 - path: metrics/fit_metrics.json hash: md5 - md5: 3105f9cf71b69b5b0f5675b2c169273c + md5: 06c50da7ca7fdb631896790b76a5e19d size: 223 generate_predictions: cmd: python 3_generate_predictions.py @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: c83b4cf0c51bd433bfb38307e978ed39.dir - size: 344485548 - nfiles: 24 + md5: 7d062363a9de5a659df638de1541d9ee.dir + size: 383515358 + nfiles: 26 - path: data/prepared_data hash: md5 - md5: 1b1f7467e4abc12e6febbf2a84756914.dir - size: 39780684 + md5: 12b7939c38b6a201063b063ed64d521b.dir + size: 39840424 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: f914cf31400e228ee6e1386155b68e7c.dir - size: 369783 + md5: d6c97ad17146677fe705ccd7bcbb4873.dir + size: 369475 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: f914cf31400e228ee6e1386155b68e7c.dir - size: 369783 + md5: d6c97ad17146677fe705ccd7bcbb4873.dir + size: 369475 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 1b1f7467e4abc12e6febbf2a84756914.dir - size: 39780684 + md5: 12b7939c38b6a201063b063ed64d521b.dir + size: 39840424 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: c23b7f0628473bf42eef126167e8928e - size: 224 + md5: 6bb037ff29c7119576c8818b395d32f6 + size: 225 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From d356fbfed0b22be77da90a9d00d67e240c18d015 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 24 Jan 2024 10:29:56 +0000 Subject: [PATCH 3/4] test model with all permutation and zero records --- .../configs/feature_processor_logic.py | 6 +++ .../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 42 +++++++++---------- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 026191c..2d14dc4 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -40,6 +40,11 @@ def keep_flats(df): return df +def keep_non_zero_rdsap(df): + df = df[df["rdsap_change"] != 0] + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # keep_columns = df.columns[ending_column_index].to_list() @@ -49,6 +54,7 @@ def keep_flats(df): # return df business_logic = { + "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, # "remove_floor_height_ending": remove_floor_height_ending diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 4ba4779..ba05d38 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -23,7 +23,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - data_filepath: s3://retrofit-datalake-dev/dataset_with0.parquet + data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 4d669b5..dde6078 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -20,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0.parquet + default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 12b7939c38b6a201063b063ed64d521b.dir - size: 39840424 + md5: 312d09b682ce0c973eabcec40e2741fe.dir + size: 39832060 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: 12b7939c38b6a201063b063ed64d521b.dir - size: 39840424 + md5: 312d09b682ce0c973eabcec40e2741fe.dir + size: 39832060 nfiles: 2 params: configs/build_model.yaml: @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 7d062363a9de5a659df638de1541d9ee.dir - size: 383515358 + md5: 7708d5705a2db2d621dae73338a641ae.dir + size: 393761847 nfiles: 26 - path: metrics/fit_metrics.json hash: md5 - md5: 06c50da7ca7fdb631896790b76a5e19d - size: 223 + md5: f7c3a5d39644d41cf60872baad7797b2 + size: 222 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 7d062363a9de5a659df638de1541d9ee.dir - size: 383515358 + md5: 7708d5705a2db2d621dae73338a641ae.dir + size: 393761847 nfiles: 26 - path: data/prepared_data hash: md5 - md5: 12b7939c38b6a201063b063ed64d521b.dir - size: 39840424 + md5: 312d09b682ce0c973eabcec40e2741fe.dir + size: 39832060 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: d6c97ad17146677fe705ccd7bcbb4873.dir - size: 369475 + md5: dade2114bb2be2769cf0648b8046f705.dir + size: 369115 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: d6c97ad17146677fe705ccd7bcbb4873.dir - size: 369475 + md5: dade2114bb2be2769cf0648b8046f705.dir + size: 369115 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 12b7939c38b6a201063b063ed64d521b.dir - size: 39840424 + md5: 312d09b682ce0c973eabcec40e2741fe.dir + size: 39832060 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 6bb037ff29c7119576c8818b395d32f6 - size: 225 + md5: 3315792b9f7e6f55d59a39db03ee7093 + size: 222 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From 353b62bc77f507c7b5278a62fd4b4419a788634b Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 29 Jan 2024 09:03:36 +0000 Subject: [PATCH 4/4] test model with all data, using interal cross validation, all dataset with permuation and 0, test data is just a random 10 percent sample of the training data --- .../src/pipeline/1_prepare_data.py | 3 +- .../ml-pipeline/src/pipeline/2_build_model.py | 16 +++++ .../src/pipeline/configs/build_model.yaml | 5 +- .../configs/feature_processor_logic.py | 2 +- .../pipeline/configs/post_prediction_logic.py | 6 +- .../src/pipeline/configs/settings.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 68 +++++++++++-------- modules/ml-pipeline/src/pipeline/dvc.yaml | 1 + 8 files changed, 67 insertions(+), 36 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index ed7e057..75d784f 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -87,7 +87,8 @@ def prepare_data( if train_proportion == 1: train = data - test = None + # Sample 10% of the data for testing + test = data.sample(round(len(data) * 0.1)) else: train, test = train_test_split( data, train_size=train_proportion, test_size=(1 - train_proportion) diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index 7ca4951..09e5910 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -26,9 +26,12 @@ prepare_data_params = settings.prepare_data build_model_params = settings.build_model feature_process_params = settings.feature_processor generate_metrics_params = settings.generate_metrics +generate_predictions_params = settings.generate_predictions model_type = build_model_params["model_type"] target = feature_process_params["feature_processor_config"]["target"] +fit_predictions_filepath = build_model_params["fit_predictions_filepath"] +predictions_column_name = generate_predictions_params["predictions_column_name"] identifier_columns = feature_process_params["feature_processor_config"][ "identifier_columns" ] @@ -60,6 +63,8 @@ def build_model( identifier_columns: List[str], model_save_location: str, model_hyperparameters: dict, + fit_predictions_filepath: str, + predictions_column_name: str, fit_metrics_filepath: str, train_filepath: Union[str, None] = None, test_filepath: Union[str, None] = None, @@ -93,6 +98,15 @@ def build_model( data=train_data, post_prediction_logic=post_prediction_logic ) + logger.info("--- Saving fit predictions ---") + + predictions_df = pd.DataFrame(fit_predictions) + predictions_df.columns = [predictions_column_name] + + dataclient.save_data( + obj=predictions_df, location=fit_predictions_filepath, save_config=None + ) + logger.info("--- Generating fit metrics ---") metrics_output = metrics.generate_metrics( @@ -128,6 +142,8 @@ if __name__ == "__main__": train_filepath=train_filepath, test_filepath=test_filepath, fit_metrics_filepath=fit_metrics_filepath, + fit_predictions_filepath=fit_predictions_filepath, + predictions_column_name=predictions_column_name, ) logger.info(f"--- {__file__} - Complete! ---") diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 354b2ca..fcec7f7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -3,6 +3,7 @@ default: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: null @@ -13,8 +14,8 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 800 + time_limit: 4000 presets: medium_quality - excluded_model_types: ['KNN', 'RF'] + excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 2d14dc4..103168d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -54,7 +54,7 @@ def keep_non_zero_rdsap(df): # return df business_logic = { - "keep_non_zero_rdsap": keep_non_zero_rdsap, + # "keep_non_zero_rdsap": keep_non_zero_rdsap, # "keep_flats": keep_flats, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, # "remove_floor_height_ending": remove_floor_height_ending diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index c1b8ebd..643231a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -5,14 +5,16 @@ import pandas as pd def clip_predictions_to_minimum_value( - data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1 + data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0 ) -> pd.Series: series_name = predictions.name predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = predictions_df["sap_starting"] + 1 > predictions_df["predictions"] + replace_index = ( + predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"] + ) predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "sap_starting"] + minimum_value ) diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index ba05d38..4327e64 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -24,7 +24,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet - train_proportion: 0.9 + train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index dde6078..f15978f 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 896d3d88a4a9f68d174efe71dc089517 - size: 4222 + md5: 1793a35e71751d3c84f9affc67ecb9a8 + size: 4296 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -25,24 +25,24 @@ stages: default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 0.9 + default.prepare_data.train_proportion: 1 outs: - path: data/prepared_data/ hash: md5 - md5: 312d09b682ce0c973eabcec40e2741fe.dir - size: 39832060 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: b824822475c222521516493e68eef9c5 - size: 4149 + md5: 7231450b78920b0c5e7c6bada496b24a + size: 4820 - path: data/prepared_data hash: md5 - md5: 312d09b682ce0c973eabcec40e2741fe.dir - size: 39832060 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 params: configs/build_model.yaml: @@ -51,6 +51,7 @@ stages: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear @@ -58,23 +59,32 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 800 + time_limit: 4000 presets: medium_quality excluded_model_types: - - KNN - RF + - FASTAI + - CAT + - NN_TORCH + - KNN + - XT infer_limit: 0.05 infer_limit_batch_size: 10000 outs: + - path: data/fit_predictions/ + hash: md5 + md5: ede187e9d0bffdef054f573f3c2bd222.dir + size: 3578590 + nfiles: 1 - path: data/model/ hash: md5 - md5: 7708d5705a2db2d621dae73338a641ae.dir - size: 393761847 - nfiles: 26 + md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir + size: 814720415 + nfiles: 31 - path: metrics/fit_metrics.json hash: md5 - md5: f7c3a5d39644d41cf60872baad7797b2 - size: 222 + md5: c45b84f12971a0156e4f3d85d3e725f5 + size: 218 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +94,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 7708d5705a2db2d621dae73338a641ae.dir - size: 393761847 - nfiles: 26 + md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir + size: 814720415 + nfiles: 31 - path: data/prepared_data hash: md5 - md5: 312d09b682ce0c973eabcec40e2741fe.dir - size: 39832060 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +112,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: dade2114bb2be2769cf0648b8046f705.dir - size: 369115 + md5: 5e60ca251af51de6fef3d0c659f8bb27.dir + size: 627416 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +124,13 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: dade2114bb2be2769cf0648b8046f705.dir - size: 369115 + md5: 5e60ca251af51de6fef3d0c659f8bb27.dir + size: 627416 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 312d09b682ce0c973eabcec40e2741fe.dir - size: 39832060 + md5: 84fa631bd02686b052d6a7144eafd38e.dir + size: 43859225 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +140,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 3315792b9f7e6f55d59a39db03ee7093 - size: 222 + md5: 033efa4d4044b6b6fc92dd37194727fa + size: 225 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index ccdd779..58889cc 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -38,6 +38,7 @@ stages: - configs/build_model.yaml: outs: - data/model/ + - data/fit_predictions/ - metrics/fit_metrics.json always_changed: true generate_predictions: