From ad2c2667279abfad2bfdab4852d323424c128b9d Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 9 Oct 2023 17:52:47 +0000 Subject: [PATCH 01/15] initial model for heat-dev --- .../src/pipeline/configs/build_model.yaml | 2 +- .../configs/feature_processor_logic.py | 6 +++ .../pipeline/configs/post_prediction_logic.py | 6 ++- .../src/pipeline/configs/settings.yaml | 4 +- .../src/pipeline/core/MLMetrics.py | 15 +++++- modules/ml-pipeline/src/pipeline/dvc.lock | 46 +++++++++---------- modules/ml-pipeline/src/pipeline/eda.py | 6 ++- 7 files changed, 55 insertions(+), 30 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index d296e6a..fdeb8c5 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,6 +13,6 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + time_limit: 1000 presets: medium_quality excluded_model_types: ['KNN', 'RF'] diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index c32d2fe..cece163 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -18,6 +18,11 @@ def remove_starting_columns(df): return df +def keep_negative_heat_change(df): + df = df[df["HEAT_DEMAND_CHANGE"] < 0] + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # keep_columns = df.columns[ending_column_index].to_list() @@ -27,6 +32,7 @@ def remove_starting_columns(df): # return df business_logic = { + "keep_negative_heat_change": keep_negative_heat_change # "remove_starting_columns": remove_starting_columns # 
"keep_ENDING_COLUMNS": keep_ending_columns } diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index b85d3a4..83389b0 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -12,9 +12,11 @@ def clip_predictions_to_minimum_value( predictions.name = "predictions" predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement - replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"] + replace_index = ( + predictions_df["predictions"] > predictions_df["HEAT_DEMAND_STARTING"] - 1 + ) predictions_df.loc[replace_index, "predictions"] = ( - predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value + predictions_df.loc[replace_index, "HEAT_DEMAND_STARTING"] - minimum_value ) predictions_new = predictions_df["predictions"] diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index a84c095..25789b5 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -31,9 +31,9 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: SAP_ENDING + target: HEAT_DEMAND_ENDING identifier_columns: ["UPRN"] - drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] + drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "SAP_ENDING", "CARBON_ENDING"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py index 4b14386..845b819 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py +++ 
b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py @@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods: - Generate Plot Suite """ +import numpy as np import pandas as pd from typing import Union from sklearn.metrics import ( @@ -14,6 +15,18 @@ from sklearn.metrics import ( ) from core.interface.InterfaceMetrics import MLMetrics +# Define the function to return the SMAPE value +def symmetric_mape(actual, predicted) -> float: + + # Convert actual and predicted to numpy + # array data type if not already + if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]): + actual, predicted = np.array(actual), np.array(predicted) + + return np.mean( + np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual)) / 2) + ) + def metrics_factory(metrics_type: str) -> MLMetrics: metrics = { @@ -34,7 +47,7 @@ class RegressionMetrics: median_absolute_error, mean_squared_error, mean_absolute_percentage_error, - # max_error + symmetric_mape, ] def generate_metrics( diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index c499874..46211fa 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -13,12 +13,12 @@ stages: - HEAT_DEMAND_CHANGE - CARBON_CHANGE - RDSAP_CHANGE - - HEAT_DEMAND_ENDING + - SAP_ENDING - CARBON_ENDING default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: SAP_ENDING + default.feature_processor.feature_processor_config.target: HEAT_DEMAND_ENDING default.feature_processor.feature_processor_type: dataframe default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ 
hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: 71e63a792f7723e2aea0709efde1a92b.dir + size: 31751660 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 5359 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: 71e63a792f7723e2aea0709efde1a92b.dir + size: 31751660 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 + time_limit: 1000 presets: medium_quality excluded_model_types: - KNN @@ -66,13 +66,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 + md5: 0ffc51be7c8381c9e4106309e3e05ca3.dir + size: 345904743 nfiles: 27 - path: metrics/fit_metrics.json hash: md5 - md5: 2bb16ac67de8778fbc08171d562b34d5 - size: 184 + md5: 3d4ff3a3ca3c327e2c1e9aa1338c18ce + size: 220 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -82,13 +82,13 @@ stages: size: 3028 - path: data/model hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 + md5: 0ffc51be7c8381c9e4106309e3e05ca3.dir + size: 345904743 nfiles: 27 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: 71e63a792f7723e2aea0709efde1a92b.dir + size: 31751660 nfiles: 2 params: configs/settings.yaml: @@ -100,8 +100,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: 00ff804016290d56e1490e59c098b060.dir + size: 351811 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -112,13 +112,13 @@ stages: size: 4487 - path: data/predictions hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: 00ff804016290d56e1490e59c098b060.dir + size: 351811 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 
33881619 + md5: 71e63a792f7723e2aea0709efde1a92b.dir + size: 31751660 nfiles: 2 params: configs/settings.yaml: @@ -128,8 +128,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 2e13ae67759a64261d03224f1c0d4bf4 - size: 185 + md5: 63ef63e4fabe929b914a0059ceeddabc + size: 221 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py index 2fdd8be..21f2bc8 100644 --- a/modules/ml-pipeline/src/pipeline/eda.py +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -38,7 +38,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o") train_df[[target, "HEAT_DEMAND_STARTING"]].plot( x=target, y="HEAT_DEMAND_STARTING", style="o" ) - # Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict # Load the autogluon model and check feature importance @@ -176,6 +175,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6}) # # +from core.MLMetrics import metrics_factory + from core.MLModels import model_factory from core.DataClient import dataclient_factory import pandas as pd @@ -206,6 +207,9 @@ mix_df = pd.concat([test_df.copy(), predictions], axis=1) mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target]) mix_df = mix_df.sort_values("residual", ascending=False) +metrics = metrics_factory("Regression") +metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"]) + cosine_similarity_df = mix_df[ mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"]) ] From 9e7d0fa538ab1d01b502d0554d819ba6e3d7b36a Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 12:32:25 +0000 Subject: [PATCH 02/15] add new model --- .../src/pipeline/configs/build_model.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 42 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git 
a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index fdeb8c5..d296e6a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,6 +13,6 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 1000 + time_limit: 4000 presets: medium_quality excluded_model_types: ['KNN', 'RF'] diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 46211fa..13851cf 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 71e63a792f7723e2aea0709efde1a92b.dir - size: 31751660 + md5: e0be70d5025e40dd0d655d9949f72130.dir + size: 31800776 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 5359 - path: data/prepared_data hash: md5 - md5: 71e63a792f7723e2aea0709efde1a92b.dir - size: 31751660 + md5: e0be70d5025e40dd0d655d9949f72130.dir + size: 31800776 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 1000 + time_limit: 4000 presets: medium_quality excluded_model_types: - KNN @@ -66,13 +66,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 0ffc51be7c8381c9e4106309e3e05ca3.dir - size: 345904743 + md5: 14ca33cde5e86770135f768abaf84978.dir + size: 422447808 nfiles: 27 - path: metrics/fit_metrics.json hash: md5 - md5: 3d4ff3a3ca3c327e2c1e9aa1338c18ce - size: 220 + md5: 41bfb8d2da8f06d1864d73ce125cc6aa + size: 221 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -82,13 +82,13 @@ stages: size: 3028 - path: data/model hash: md5 - md5: 0ffc51be7c8381c9e4106309e3e05ca3.dir - size: 345904743 + md5: 
14ca33cde5e86770135f768abaf84978.dir + size: 422447808 nfiles: 27 - path: data/prepared_data hash: md5 - md5: 71e63a792f7723e2aea0709efde1a92b.dir - size: 31751660 + md5: e0be70d5025e40dd0d655d9949f72130.dir + size: 31800776 nfiles: 2 params: configs/settings.yaml: @@ -100,8 +100,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 00ff804016290d56e1490e59c098b060.dir - size: 351811 + md5: 40d0c7a7fd4a15add0615e322cf341a0.dir + size: 352151 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -112,13 +112,13 @@ stages: size: 4487 - path: data/predictions hash: md5 - md5: 00ff804016290d56e1490e59c098b060.dir - size: 351811 + md5: 40d0c7a7fd4a15add0615e322cf341a0.dir + size: 352151 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 71e63a792f7723e2aea0709efde1a92b.dir - size: 31751660 + md5: e0be70d5025e40dd0d655d9949f72130.dir + size: 31800776 nfiles: 2 params: configs/settings.yaml: @@ -128,8 +128,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 63ef63e4fabe929b914a0059ceeddabc - size: 221 + md5: 4e023650240e78d6ad761f1db7aac922 + size: 220 startup_cleanup: cmd: python 0_startup_cleanup.py deps: From 4c6c5330d82bd4ffacea213b0c7b1ee4593ee525 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 12:33:44 +0000 Subject: [PATCH 03/15] add new model, new branch --- modules/ml-pipeline/src/pipeline/4_generate_metrics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py index 7b115a2..1f97d87 100644 --- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py @@ -4,9 +4,7 @@ After the model is built, we can evaluate its performance """ import os -import yaml import pandas as pd -from pathlib import Path from core.interface.InterfaceModels import MLModel from core.interface.InterfaceMetrics import MLMetrics from core.interface.InterfaceDataClient 
import DataClient From 7d685caaf54898152414d55189a6aa92e144c8c4 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 12:46:02 +0000 Subject: [PATCH 04/15] Update Registry --- MODEL_REGISTRY.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 9ab31e5..d765cf4 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,9 +8,17 @@ "active": true }, "sap": { - "version": "v0.0.3", + "version": "v0.0.4", "stage": { - "dev": "v0.0.3" + "dev": "v0.0.4" + }, + "registered": true, + "active": true + }, + "heat": { + "version": "v0.0.1", + "stage": { + "dev": null }, "registered": true, "active": true From 2d331736a4dacc2e1edac368ce59335e024f83ee Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 12:47:01 +0000 Subject: [PATCH 05/15] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index d765cf4..1010de0 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -18,7 +18,7 @@ "heat": { "version": "v0.0.1", "stage": { - "dev": null + "dev": "v0.0.1" }, "registered": true, "active": true From 5a9eb608bd9fc5b2f0cb96ae48c57c94ec1b1e9e Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 27 Nov 2023 22:06:18 +0000 Subject: [PATCH 06/15] commit first heat-model --- .../src/pipeline/configs/build_model.yaml | 2 +- modules/ml-pipeline/src/pipeline/dvc.lock | 70 ++++++++++--------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 1ebb62d..4c72487 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 4000 + 
time_limit: 400 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 13851cf..1b07fdf 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: c9f030df733e318b80d1fa91b7732f79 - size: 5132 + md5: 896d3d88a4a9f68d174efe71dc089517 + size: 4222 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -20,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: HEAT_DEMAND_ENDING default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,20 +29,20 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: e0be70d5025e40dd0d655d9949f72130.dir - size: 31800776 + md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir + size: 34480114 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: 84699d208874c52accaff61c6af9bb0a - size: 5359 + md5: b824822475c222521516493e68eef9c5 + size: 4149 - path: data/prepared_data hash: md5 - md5: e0be70d5025e40dd0d655d9949f72130.dir - size: 31800776 + md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir + size: 34480114 nfiles: 2 params: configs/build_model.yaml: @@ -58,37 +58,39 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 4000 + time_limit: 400 presets: medium_quality excluded_model_types: - 
KNN - RF + infer_limit: 0.05 + infer_limit_batch_size: 10000 outs: - path: data/model/ hash: md5 - md5: 14ca33cde5e86770135f768abaf84978.dir - size: 422447808 - nfiles: 27 + md5: 452eba2d92233e81d321814aacefe5c2.dir + size: 323127043 + nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 41bfb8d2da8f06d1864d73ce125cc6aa - size: 221 + md5: 888124b56e0c5008a6423e290fc5cc71 + size: 222 generate_predictions: cmd: python 3_generate_predictions.py deps: - path: 3_generate_predictions.py hash: md5 - md5: 5ef2856a5a977304f1ec01f9b4205262 - size: 3028 + md5: 0a70ad4dfe99414a75d1261c75a177b9 + size: 2464 - path: data/model hash: md5 - md5: 14ca33cde5e86770135f768abaf84978.dir - size: 422447808 - nfiles: 27 + md5: 452eba2d92233e81d321814aacefe5c2.dir + size: 323127043 + nfiles: 24 - path: data/prepared_data hash: md5 - md5: e0be70d5025e40dd0d655d9949f72130.dir - size: 31800776 + md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir + size: 34480114 nfiles: 2 params: configs/settings.yaml: @@ -100,25 +102,25 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 40d0c7a7fd4a15add0615e322cf341a0.dir - size: 352151 + md5: f852550a0a51f0c2b120b0680c1a9b54.dir + size: 325890 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py deps: - path: 4_generate_metrics.py hash: md5 - md5: 2c9fb78955a8c19cff0a098976f81d1b - size: 4487 + md5: 567b1acb819e2ff432b989cdbdd4a2bf + size: 3448 - path: data/predictions hash: md5 - md5: 40d0c7a7fd4a15add0615e322cf341a0.dir - size: 352151 + md5: f852550a0a51f0c2b120b0680c1a9b54.dir + size: 325890 nfiles: 1 - path: data/prepared_data hash: md5 - md5: e0be70d5025e40dd0d655d9949f72130.dir - size: 31800776 + md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir + size: 34480114 nfiles: 2 params: configs/settings.yaml: @@ -128,15 +130,15 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 4e023650240e78d6ad761f1db7aac922 - size: 220 + md5: ed3012943593fac4ac7ad9a5499ac18f + size: 219 startup_cleanup: cmd: python 0_startup_cleanup.py deps: - path: 
0_startup_cleanup.py hash: md5 - md5: fbb7e3b1b98b517c870f3e1df3e7f695 - size: 1676 + md5: b1b12f6b6393fbf8b83d23684df0a3d4 + size: 1220 params: configs/settings.yaml: default.startup_cleanup.artefacts: ./data From d8d5a66537c3a6ddf7af033b6b06e1c0980cfee8 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Mon, 27 Nov 2023 22:17:29 +0000 Subject: [PATCH 07/15] Update Registry --- MODEL_REGISTRY.md | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index b3ad75a..019e104 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,19 +8,27 @@ "active": true }, "sap": { + "version": "v0.2.6", + "stage": { + "dev": "v0.2.6" + }, + "registered": true, + "active": true + }, + "heat": { + "version": "v0.1.0", + "stage": { + "dev": "v0.0.1" + }, + "registered": true, + "active": true + }, + "carbon": { "version": "v0.1.0", "stage": { "dev": "v0.1.0" }, "registered": true, "active": true - }, - "heat": { - "version": "v0.0.1", - "stage": { - "dev": "v0.0.1" - }, - "registered": true, - "active": true } } From 7f984e6cbf8565d228cff25a6d05a1331f2eb881 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Mon, 27 Nov 2023 22:18:17 +0000 Subject: [PATCH 08/15] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 019e104..7ec1fc7 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -18,7 +18,7 @@ "heat": { "version": "v0.1.0", "stage": { - "dev": "v0.0.1" + "dev": "v0.1.0" }, "registered": true, "active": true From 9271df34e0736dd58c3c8cb68e7160078b93b96c Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 28 Nov 2023 14:51:55 +0000 Subject: [PATCH 09/15] add restriction to datast --- .../configs/feature_processor_logic.py | 37 +++++++++++++++++- modules/ml-pipeline/src/pipeline/dvc.lock | 38 +++++++++---------- 2 files changed, 55 insertions(+), 20 deletions(-) diff --git 
a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index cece163..78c29a9 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -23,6 +23,37 @@ def keep_negative_heat_change(df): return df +def keep_negative_carbon_change(df): + df = df[df["CARBON_CHANGE"] < 0] + return df + + +# TODO: Move to ETL pipeline +def remove_unreasonable_habitable_rooms(df): + """ + Assumption is that proportion of floor area to habitable rooms should be at least 6.5m2 + """ + minimum_room_size_index = ( + df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"] >= 6.5 + ) + df = df[minimum_room_size_index] + return df + + +def remove_top_1_percent_heat_demand(df): + # threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%'] + threshold_value = 860 + df = df[df["HEAT_DEMAND_STARTING"] < threshold_value] + return df + + +def remove_top_1_percent_carbon(df): + # threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%'] + threshold_value = 18 + df = df[df["CARBON_STARTING"] < threshold_value] + return df + + # def keep_ending_columns(df): # ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)] # keep_columns = df.columns[ending_column_index].to_list() @@ -32,7 +63,11 @@ def keep_negative_heat_change(df): # return df business_logic = { - "keep_negative_heat_change": keep_negative_heat_change + "remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms, + "keep_negative_heat_change": keep_negative_heat_change, + "keep_negative_carbon_change": keep_negative_carbon_change, + "remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand, + "remove_top_1_percent_carbon": remove_top_1_percent_carbon, # "remove_starting_columns": remove_starting_columns # "keep_ENDING_COLUMNS": keep_ending_columns } diff --git 
a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 1b07fdf..0011db6 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir - size: 34480114 + md5: f235f38714fefcf6e4927ae95ba912c3.dir + size: 30774760 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir - size: 34480114 + md5: f235f38714fefcf6e4927ae95ba912c3.dir + size: 30774760 nfiles: 2 params: configs/build_model.yaml: @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 452eba2d92233e81d321814aacefe5c2.dir - size: 323127043 + md5: a868845999b46e0272dc27f5cb5bc618.dir + size: 310555147 nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 888124b56e0c5008a6423e290fc5cc71 - size: 222 + md5: 809f27735c77cbcb62866b96018eedea + size: 216 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 452eba2d92233e81d321814aacefe5c2.dir - size: 323127043 + md5: a868845999b46e0272dc27f5cb5bc618.dir + size: 310555147 nfiles: 24 - path: data/prepared_data hash: md5 - md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir - size: 34480114 + md5: f235f38714fefcf6e4927ae95ba912c3.dir + size: 30774760 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: f852550a0a51f0c2b120b0680c1a9b54.dir - size: 325890 + md5: 2098fe82304751025e427f2cc241a2ff.dir + size: 295849 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3448 - path: data/predictions hash: md5 - md5: f852550a0a51f0c2b120b0680c1a9b54.dir - size: 325890 + md5: 2098fe82304751025e427f2cc241a2ff.dir + size: 295849 nfiles: 1 - path: data/prepared_data hash: 
md5 - md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir - size: 34480114 + md5: f235f38714fefcf6e4927ae95ba912c3.dir + size: 30774760 nfiles: 2 params: configs/settings.yaml: @@ -130,7 +130,7 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: ed3012943593fac4ac7ad9a5499ac18f + md5: aa671878e1bd8c6a8d4b5f9788c817c4 size: 219 startup_cleanup: cmd: python 0_startup_cleanup.py From 2cb28616bbc56d3bd8c0401dc747a594fdeef109 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 28 Nov 2023 15:01:27 +0000 Subject: [PATCH 10/15] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 7ec1fc7..34475c7 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -16,7 +16,7 @@ "active": true }, "heat": { - "version": "v0.1.0", + "version": "v0.2.0", "stage": { "dev": "v0.1.0" }, From ba1971498c3d3fcdfdcef915e05bb04d4223e038 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 28 Nov 2023 15:02:13 +0000 Subject: [PATCH 11/15] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 34475c7..35a0760 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -18,7 +18,7 @@ "heat": { "version": "v0.2.0", "stage": { - "dev": "v0.1.0" + "dev": "v0.2.0" }, "registered": true, "active": true From 66f54a92e25251d5dd3bfc73edfce26420130fa0 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 18 Jan 2024 00:14:20 +0000 Subject: [PATCH 12/15] train new 600 second model with new data --- .../src/pipeline/configs/build_model.yaml | 2 +- .../configs/feature_processor_logic.py | 14 ++--- .../pipeline/configs/post_prediction_logic.py | 4 +- .../src/pipeline/configs/settings.yaml | 8 +-- modules/ml-pipeline/src/pipeline/dvc.lock | 60 +++++++++---------- .../version_control/requirements.txt | 8 +-- 6 files changed, 48 insertions(+), 48 deletions(-) diff --git 
a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 4c72487..9c97ef0 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -13,7 +13,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 400 + time_limit: 600 presets: medium_quality excluded_model_types: ['KNN', 'RF'] infer_limit: 0.05 diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 78c29a9..1094862 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -9,22 +9,22 @@ Business Logic dict + functions def remove_starting_columns(df): keep_column_index = [ - False if col_name.endswith("_STARTING") else True + False if col_name.endswith("_starting") else True for col_name in list(df.columns) ] keep_columns = df.columns[keep_column_index].to_list() - keep_columns.append("SAP_STARTING") + keep_columns.append("sap_starting") df = df[keep_columns] return df def keep_negative_heat_change(df): - df = df[df["HEAT_DEMAND_CHANGE"] < 0] + df = df[df["heat_demand_change"] < 0] return df def keep_negative_carbon_change(df): - df = df[df["CARBON_CHANGE"] < 0] + df = df[df["carbon_change"] < 0] return df @@ -34,7 +34,7 @@ def remove_unreasonable_habitable_rooms(df): Assumption is that proportion of floor area to habitable rooms should be at least 6.5m2 """ minimum_room_size_index = ( - df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"] >= 6.5 + df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5 ) df = df[minimum_room_size_index] return df @@ -43,14 +43,14 @@ def remove_unreasonable_habitable_rooms(df): def remove_top_1_percent_heat_demand(df): # 
threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%'] threshold_value = 860 - df = df[df["HEAT_DEMAND_STARTING"] < threshold_value] + df = df[df["heat_demand_starting"] < threshold_value] return df def remove_top_1_percent_carbon(df): # threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%'] threshold_value = 18 - df = df[df["CARBON_STARTING"] < threshold_value] + df = df[df["carbon_starting"] < threshold_value] return df diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index 83389b0..69e9575 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -13,10 +13,10 @@ def clip_predictions_to_minimum_value( predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement replace_index = ( - predictions_df["predictions"] > predictions_df["HEAT_DEMAND_STARTING"] - 1 + predictions_df["predictions"] > predictions_df["heat_demand_starting"] - 1 ) predictions_df.loc[replace_index, "predictions"] = ( - predictions_df.loc[replace_index, "HEAT_DEMAND_STARTING"] - minimum_value + predictions_df.loc[replace_index, "heat_demand_starting"] - minimum_value ) predictions_new = predictions_df["predictions"] diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 6dd7e2b..09792cf 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: 
s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -31,9 +31,9 @@ default: feature_processor_config: subsample_amount: null subsample_seed: 0 - target: HEAT_DEMAND_ENDING - identifier_columns: ["UPRN"] - drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "SAP_ENDING", "CARBON_ENDING"] + target: heat_demand_ending + identifier_columns: ["uprn"] + drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "sap_ending", "carbon_ending"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 0011db6..c1bb042 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -10,17 +10,17 @@ stages: params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - - HEAT_DEMAND_CHANGE - - CARBON_CHANGE - - RDSAP_CHANGE - - SAP_ENDING - - CARBON_ENDING + - heat_demand_change + - carbon_change + - rdsap_change + - sap_ending + - carbon_ending default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_seed: 0 - default.feature_processor.feature_processor_config.target: HEAT_DEMAND_ENDING + default.feature_processor.feature_processor_config.target: heat_demand_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 
default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -29,8 +29,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: f235f38714fefcf6e4927ae95ba912c3.dir - size: 30774760 + md5: 613ddd198a29002e6e05a2d60275d924.dir + size: 32746979 nfiles: 2 build_model: cmd: python 2_build_model.py @@ -41,8 +41,8 @@ stages: size: 4149 - path: data/prepared_data hash: md5 - md5: f235f38714fefcf6e4927ae95ba912c3.dir - size: 30774760 + md5: 613ddd198a29002e6e05a2d60275d924.dir + size: 32746979 nfiles: 2 params: configs/build_model.yaml: @@ -58,7 +58,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 400 + time_limit: 600 presets: medium_quality excluded_model_types: - KNN @@ -68,13 +68,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: a868845999b46e0272dc27f5cb5bc618.dir - size: 310555147 - nfiles: 24 + md5: 837a42a0655862229620495c645d5fed.dir + size: 342382387 + nfiles: 26 - path: metrics/fit_metrics.json hash: md5 - md5: 809f27735c77cbcb62866b96018eedea - size: 216 + md5: f8a394b86c33dc1b3ce97abed803c8f1 + size: 220 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +84,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: a868845999b46e0272dc27f5cb5bc618.dir - size: 310555147 - nfiles: 24 + md5: 837a42a0655862229620495c645d5fed.dir + size: 342382387 + nfiles: 26 - path: data/prepared_data hash: md5 - md5: f235f38714fefcf6e4927ae95ba912c3.dir - size: 30774760 + md5: 613ddd198a29002e6e05a2d60275d924.dir + size: 32746979 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +102,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 2098fe82304751025e427f2cc241a2ff.dir - size: 295849 + md5: 75f8326e99eb9e1032728208229ec37b.dir + size: 314002 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +114,13 @@ stages: size: 3448 - path: data/predictions hash: md5 - 
md5: 2098fe82304751025e427f2cc241a2ff.dir - size: 295849 + md5: 75f8326e99eb9e1032728208229ec37b.dir + size: 314002 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f235f38714fefcf6e4927ae95ba912c3.dir - size: 30774760 + md5: 613ddd198a29002e6e05a2d60275d924.dir + size: 32746979 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +130,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: aa671878e1bd8c6a8d4b5f9788c817c4 - size: 219 + md5: 269e89593f5e7ceb507c31dac2c2dd35 + size: 220 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt index 91cb005..a2b9531 100644 --- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt @@ -1,4 +1,4 @@ -dvc==3.18.0 -dvc-s3==2.23.0 -gto==1.0.4 -pyOpenSSL==23.2.0 +dvc==3.36.0 +dvc-s3==3.0.1 +gto==1.6.1 +pyOpenSSL==23.3.0 From 4b81ce9374f5dac5b8532882cbd9361b5bdda0a0 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 18 Jan 2024 10:37:20 +0000 Subject: [PATCH 13/15] Update Registry --- MODEL_REGISTRY.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 35a0760..a71f0e3 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,15 +8,15 @@ "active": true }, "sap": { - "version": "v0.2.6", + "version": "v0.3.0", "stage": { - "dev": "v0.2.6" + "dev": "v0.3.0" }, "registered": true, "active": true }, "heat": { - "version": "v0.2.0", + "version": "v0.3.0", "stage": { "dev": "v0.2.0" }, @@ -24,9 +24,9 @@ "active": true }, "carbon": { - "version": "v0.1.0", + "version": "v0.2.0", "stage": { - "dev": "v0.1.0" + "dev": "v0.2.0" }, "registered": true, "active": true From 273dcdad315a1220173df2ba1a1d0be779427646 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Thu, 18 Jan 
2024 10:38:15 +0000 Subject: [PATCH 14/15] Update Registry --- MODEL_REGISTRY.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index a71f0e3..1d5da81 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,9 +8,9 @@ "active": true }, "sap": { - "version": "v0.3.0", + "version": "v0.4.0", "stage": { - "dev": "v0.3.0" + "dev": "v0.4.0" }, "registered": true, "active": true @@ -18,13 +18,13 @@ "heat": { "version": "v0.3.0", "stage": { - "dev": "v0.2.0" + "dev": "v0.3.0" }, "registered": true, "active": true }, "carbon": { - "version": "v0.2.0", + "version": "v0.3.0", "stage": { "dev": "v0.2.0" }, From 66ff6e1e22f739dbad34c9e97a9e3cfa979213e9 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 29 Jan 2024 20:37:13 +0000 Subject: [PATCH 15/15] Using all permutation data with all data used in training, internal cross validation --- .../src/pipeline/1_prepare_data.py | 3 +- .../ml-pipeline/src/pipeline/2_build_model.py | 16 +++++ .../src/pipeline/configs/build_model.yaml | 5 +- .../pipeline/configs/post_prediction_logic.py | 5 +- .../src/pipeline/configs/settings.yaml | 5 +- modules/ml-pipeline/src/pipeline/dvc.lock | 70 +++++++++++-------- modules/ml-pipeline/src/pipeline/dvc.yaml | 1 + 7 files changed, 68 insertions(+), 37 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index ed7e057..75d784f 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -87,7 +87,8 @@ def prepare_data( if train_proportion == 1: train = data - test = None + # Sample 10% of the data for testing + test = data.sample(round(len(data) * 0.1)) else: train, test = train_test_split( data, train_size=train_proportion, test_size=(1 - train_proportion) diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index 
7ca4951..09e5910 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -26,9 +26,12 @@ prepare_data_params = settings.prepare_data build_model_params = settings.build_model feature_process_params = settings.feature_processor generate_metrics_params = settings.generate_metrics +generate_predictions_params = settings.generate_predictions model_type = build_model_params["model_type"] target = feature_process_params["feature_processor_config"]["target"] +fit_predictions_filepath = build_model_params["fit_predictions_filepath"] +predictions_column_name = generate_predictions_params["predictions_column_name"] identifier_columns = feature_process_params["feature_processor_config"][ "identifier_columns" ] @@ -60,6 +63,8 @@ def build_model( identifier_columns: List[str], model_save_location: str, model_hyperparameters: dict, + fit_predictions_filepath: str, + predictions_column_name: str, fit_metrics_filepath: str, train_filepath: Union[str, None] = None, test_filepath: Union[str, None] = None, @@ -93,6 +98,15 @@ def build_model( data=train_data, post_prediction_logic=post_prediction_logic ) + logger.info("--- Saving fit predictions ---") + + predictions_df = pd.DataFrame(fit_predictions) + predictions_df.columns = [predictions_column_name] + + dataclient.save_data( + obj=predictions_df, location=fit_predictions_filepath, save_config=None + ) + logger.info("--- Generating fit metrics ---") metrics_output = metrics.generate_metrics( @@ -128,6 +142,8 @@ if __name__ == "__main__": train_filepath=train_filepath, test_filepath=test_filepath, fit_metrics_filepath=fit_metrics_filepath, + fit_predictions_filepath=fit_predictions_filepath, + predictions_column_name=predictions_column_name, ) logger.info(f"--- {__file__} - Complete! 
---") diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 9c97ef0..fcec7f7 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -3,6 +3,7 @@ default: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: null @@ -13,8 +14,8 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 600 + time_limit: 4000 presets: medium_quality - excluded_model_types: ['KNN', 'RF'] + excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.05 infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py index 69e9575..d4b1896 100644 --- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py @@ -5,7 +5,7 @@ import pandas as pd def clip_predictions_to_minimum_value( - data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1 + data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0 ) -> pd.Series: series_name = predictions.name @@ -13,7 +13,8 @@ def clip_predictions_to_minimum_value( predictions_df = pd.concat([data, predictions], axis=1) # We expect all prediction to be atleast one point improvement replace_index = ( - predictions_df["predictions"] > predictions_df["heat_demand_starting"] - 1 + predictions_df["predictions"] + > predictions_df["heat_demand_starting"] - minimum_value ) predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "heat_demand_starting"] - minimum_value diff --git 
a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 09792cf..35816b6 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,8 +21,9 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet - train_proportion: 0.9 + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet + train_proportion: 1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index c1bb042..97c9335 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 896d3d88a4a9f68d174efe71dc089517 - size: 4222 + md5: 11a3b8bfdfe199ab7ecc39ccc5652649 + size: 4298 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -20,29 +20,29 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: heat_demand_ending default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local 
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet - default.prepare_data.train_proportion: 0.9 + default.prepare_data.train_proportion: 1 outs: - path: data/prepared_data/ hash: md5 - md5: 613ddd198a29002e6e05a2d60275d924.dir - size: 32746979 + md5: dcd41f841c67b474a81a14e683646237.dir + size: 36317761 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: b824822475c222521516493e68eef9c5 - size: 4149 + md5: 7231450b78920b0c5e7c6bada496b24a + size: 4820 - path: data/prepared_data hash: md5 - md5: 613ddd198a29002e6e05a2d60275d924.dir - size: 32746979 + md5: dcd41f841c67b474a81a14e683646237.dir + size: 36317761 nfiles: 2 params: configs/build_model.yaml: @@ -51,6 +51,7 @@ stages: model_type: AutogluonAutoML model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json + fit_predictions_filepath: ./data/fit_predictions/predictions.parquet SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear @@ -58,23 +59,32 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 600 + time_limit: 4000 presets: medium_quality excluded_model_types: - - KNN - RF + - FASTAI + - CAT + - NN_TORCH + - KNN + - XT infer_limit: 0.05 infer_limit_batch_size: 10000 outs: + - path: data/fit_predictions/ + hash: md5 + md5: 89063bb3b725afe61b6ed5edb724bb06.dir + size: 3090627 + nfiles: 1 - path: data/model/ hash: md5 - md5: 837a42a0655862229620495c645d5fed.dir - size: 342382387 - nfiles: 26 + md5: c90eef03b5a76175506c048e88a401dd.dir + size: 783489255 + nfiles: 32 - path: metrics/fit_metrics.json hash: md5 - md5: f8a394b86c33dc1b3ce97abed803c8f1 - size: 220 + md5: 33f18fa6b7dda535de09733d4792c0fc + size: 217 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -84,13 +94,13 @@ stages: size: 2464 - path: data/model hash: md5 - md5: 
837a42a0655862229620495c645d5fed.dir - size: 342382387 - nfiles: 26 + md5: c90eef03b5a76175506c048e88a401dd.dir + size: 783489255 + nfiles: 32 - path: data/prepared_data hash: md5 - md5: 613ddd198a29002e6e05a2d60275d924.dir - size: 32746979 + md5: dcd41f841c67b474a81a14e683646237.dir + size: 36317761 nfiles: 2 params: configs/settings.yaml: @@ -102,8 +112,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 75f8326e99eb9e1032728208229ec37b.dir - size: 314002 + md5: 406e2ebe33d6abed9042f137d8c0d2bf.dir + size: 520735 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -114,13 +124,13 @@ stages: size: 3448 - path: data/predictions hash: md5 - md5: 75f8326e99eb9e1032728208229ec37b.dir - size: 314002 + md5: 406e2ebe33d6abed9042f137d8c0d2bf.dir + size: 520735 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 613ddd198a29002e6e05a2d60275d924.dir - size: 32746979 + md5: dcd41f841c67b474a81a14e683646237.dir + size: 36317761 nfiles: 2 params: configs/settings.yaml: @@ -130,8 +140,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 269e89593f5e7ceb507c31dac2c2dd35 - size: 220 + md5: cc1ad408f2d9d3128df71822a38ea85e + size: 218 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index ccdd779..58889cc 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -38,6 +38,7 @@ stages: - configs/build_model.yaml: outs: - data/model/ + - data/fit_predictions/ - metrics/fit_metrics.json always_changed: true generate_predictions: