From c7aedcde048acbc8fa8427a9457fc436dafcfacc Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 30 May 2024 21:44:21 +0100 Subject: [PATCH] add new model for new data --- .../src/pipeline/configs/scenarios.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 189 ++++++++++++++++++ 2 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 modules/ml-pipeline/src/pipeline/dvc.lock diff --git a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml index 0d4ee07..dd146eb 100644 --- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml @@ -8,6 +8,6 @@ default: # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet - - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet + # - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet comparison_output_filepath: ./metrics/scenario_table.md metrics_output_filepath: ./metrics/scenario_metrics.md diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock new file mode 100644 index 0000000..0f744d5 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -0,0 +1,189 @@ +schema: '2.0' +stages: + startup_cleanup: + cmd: python 0_startup_cleanup.py + deps: + - path: 0_startup_cleanup.py + hash: md5 + md5: b1b12f6b6393fbf8b83d23684df0a3d4 + size: 1220 + params: + configs/settings.yaml: + default.startup_cleanup.artefacts: ./data + default.startup_cleanup.metrics: ./metrics + prepare_data: + cmd: python 1_prepare_data.py + deps: + - path: 1_prepare_data.py + hash: md5 + md5: 11a3b8bfdfe199ab7ecc39ccc5652649 + size: 4298 + params: + configs/settings.yaml: + default.feature_processor.feature_processor_config.drop_columns: + - heat_demand_change + - carbon_change + - rdsap_change + - heat_demand_ending + - sap_ending + - days_to_starting + - days_to_ending + - number_habitable_rooms_starting + - number_habitable_rooms_ending + - number_heated_rooms_starting + - number_heated_rooms_ending + - number_habitable_rooms + - number_heated_rooms + default.feature_processor.feature_processor_config.retain_features: + default.feature_processor.feature_processor_config.subsample_amount: + default.feature_processor.feature_processor_config.subsample_seed: 0 + default.feature_processor.feature_processor_config.target: carbon_ending + default.feature_processor.feature_processor_type: dataframe + default.prepare_data.data_filepath: + s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet + default.prepare_data.input_dataclient_type: aws-s3 + default.prepare_data.output_dataclient_type: local + default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet + default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet + default.prepare_data.train_proportion: 0.9 + outs: + - path: data/prepared_data/ + hash: md5 + md5: e2efac20634b919381adfb962a42d40a.dir + size: 36961727 + nfiles: 2 + build_model: + cmd: python 2_build_model.py + deps: + - path: 2_build_model.py + hash: md5 + md5: 7231450b78920b0c5e7c6bada496b24a + size: 4820 + - path: data/prepared_data + hash: md5 + md5: e2efac20634b919381adfb962a42d40a.dir + size: 36961727 + nfiles: 2 + params: + configs/build_model.yaml: + default: + build_model: + model_type: AutogluonAutoML + model_save_filepath: ./data/model/optimised/ + fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet + SKLearnLinearRegression: + SKLearnSVMRegression: + kernel: linear + AutogluonAutoML: + output_filepath: ./data/model/allmodels/ + problem_type: regression + eval_metric: mean_squared_error + time_limit: 1800 + presets: medium_quality + excluded_model_types: + - RF + - CAT + - NN_TORCH + - KNN + - XT + infer_limit: 0.05 + infer_limit_batch_size: 10000 + ag_args_ensemble: + num_folds_parallel: 2 + outs: + - path: data/fit_predictions/ + hash: md5 + md5: d2568a3244df4d3444b6190599f74b96.dir + size: 3661106 + nfiles: 1 + - path: data/model/ + hash: md5 + md5: 756100e033e0bd4445a437e43f4c53af.dir + size: 730442848 + nfiles: 36 + - path: metrics/fit_metrics.json + hash: md5 + md5: 3bcb3b9728521cd341eb71af109ca778 + size: 227 + generate_predictions: + cmd: python 3_generate_predictions.py + deps: + - path: 3_generate_predictions.py + hash: md5 + md5: 0a70ad4dfe99414a75d1261c75a177b9 + size: 2464 + - path: data/model + hash: md5 + md5: 756100e033e0bd4445a437e43f4c53af.dir + size: 730442848 + nfiles: 36 + - path: data/prepared_data + hash: md5 + md5: e2efac20634b919381adfb962a42d40a.dir + size: 36961727 + nfiles: 2 + params: + configs/settings.yaml: + default.generate_predictions.input_dataclient_type: local + default.generate_predictions.output_dataclient_type: local + default.generate_predictions.predictions_column_name: predictions + default.generate_predictions.predictions_output_filepath: ./data/predictions/predictions.parquet + default.generate_predictions.test_data_filepath: ./data/prepared_data/test.parquet + outs: + - path: data/predictions/ + hash: md5 + md5: 09f3584d6fbd447dd2714eb2774139d5.dir + size: 499683 + nfiles: 1 + generate_metrics: + cmd: python 4_generate_metrics.py + deps: + - path: 4_generate_metrics.py + hash: md5 + md5: 4fedb86d89d528f0a6597934ba3890a0 + size: 3484 + - path: data/predictions + hash: md5 + md5: 09f3584d6fbd447dd2714eb2774139d5.dir + size: 499683 + nfiles: 1 + - path: data/prepared_data + hash: md5 + md5: e2efac20634b919381adfb962a42d40a.dir + size: 36961727 + nfiles: 2 + params: + configs/settings.yaml: + default.generate_metrics.dataclient_type: local + default.generate_metrics.metrics_output_filepath: ./metrics/metrics.json + default.generate_metrics.metrics_type: Regression + outs: + - path: metrics/metrics.json + hash: md5 + md5: abf8720d06f073f47501aa1172527e9e + size: 225 + generate_scenerio_metrics: + cmd: python 5_generate_scenarios.py + deps: + - path: 5_generate_scenarios.py + hash: md5 + md5: 40506749fefd926d47c60ff5b16db307 + size: 5337 + params: + configs/scenarios.yaml: + default.scenarios: + input_dataclient_type: aws-s3 + output_dataclient_type: local + scenario_data_filepaths: + comparison_output_filepath: ./metrics/scenario_table.md + metrics_output_filepath: ./metrics/scenario_metrics.md + outs: + - path: metrics/scenario_metrics.md + hash: md5 + md5: d41d8cd98f00b204e9800998ecf8427e + size: 0 + - path: metrics/scenario_table.md + hash: md5 + md5: d41d8cd98f00b204e9800998ecf8427e + size: 0