def keep_negative_heat_change(df, column="HEAT_DEMAND_CHANGE"):
    """Keep only the rows whose change column is strictly negative.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the numeric column to filter on. Defaults to
            ``"HEAT_DEMAND_CHANGE"``, so existing single-argument callers
            behave exactly as before.

    Returns:
        The rows of ``df`` where ``df[column] < 0``, with their original
        index labels. Rows holding zero, a positive value or NaN are
        dropped (NaN never compares as less than zero). ``df`` itself is
        not modified.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    is_negative = df[column] < 0
    return df.loc[is_negative]


# Registry mapping a business-logic step name to the function implementing it.
business_logic = {
    "keep_negative_heat_change": keep_negative_heat_change
    # "remove_starting_columns": remove_starting_columns
    # "keep_ENDING_COLUMNS": keep_ending_columns
}
def symmetric_mape(actual, predicted) -> float:
    """Return the symmetric mean absolute percentage error (SMAPE).

    Each term is ``|predicted - actual| / ((|predicted| + |actual|) / 2)``
    and the result is the mean of those terms, expressed as a fraction
    (not multiplied by 100).

    Args:
        actual: Array-like of observed values.
        predicted: Array-like of predicted values, broadcastable against
            ``actual``.

    Returns:
        The mean of the per-element symmetric percentage errors as a
        Python ``float``.
    """
    # np.asarray drops any pandas index, so values are compared by position
    # rather than by label, and avoids a copy when given a float ndarray.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = (np.abs(predicted) + np.abs(actual)) / 2

    # When both values are zero the prediction is exact but the ratio is 0/0.
    # Count those terms as zero error instead of letting NaN poison the mean.
    ratios = np.divide(
        abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0
    )
    return float(np.mean(ratios))
excluded_model_types: - KNN @@ -66,13 +66,13 @@ stages: outs: - path: data/model/ hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 + md5: 0ffc51be7c8381c9e4106309e3e05ca3.dir + size: 345904743 nfiles: 27 - path: metrics/fit_metrics.json hash: md5 - md5: 2bb16ac67de8778fbc08171d562b34d5 - size: 184 + md5: 3d4ff3a3ca3c327e2c1e9aa1338c18ce + size: 220 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -82,13 +82,13 @@ stages: size: 3028 - path: data/model hash: md5 - md5: 7bb5156243b4db39349e80a01ffecde4.dir - size: 473398662 + md5: 0ffc51be7c8381c9e4106309e3e05ca3.dir + size: 345904743 nfiles: 27 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: 71e63a792f7723e2aea0709efde1a92b.dir + size: 31751660 nfiles: 2 params: configs/settings.yaml: @@ -100,8 +100,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: 00ff804016290d56e1490e59c098b060.dir + size: 351811 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -112,13 +112,13 @@ stages: size: 4487 - path: data/predictions hash: md5 - md5: 0bb3cf991906953def81c8204cdcfaf0.dir - size: 374532 + md5: 00ff804016290d56e1490e59c098b060.dir + size: 351811 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir - size: 33881619 + md5: 71e63a792f7723e2aea0709efde1a92b.dir + size: 31751660 nfiles: 2 params: configs/settings.yaml: @@ -128,8 +128,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 2e13ae67759a64261d03224f1c0d4bf4 - size: 185 + md5: 63ef63e4fabe929b914a0059ceeddabc + size: 221 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py index 2fdd8be..21f2bc8 100644 --- a/modules/ml-pipeline/src/pipeline/eda.py +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -38,7 +38,6 @@ train_df[[target, 
"SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o") train_df[[target, "HEAT_DEMAND_STARTING"]].plot( x=target, y="HEAT_DEMAND_STARTING", style="o" ) - # Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict # Load the autogluon model and check feature importance @@ -176,6 +175,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6}) # # +from core.MLMetrics import metrics_factory + from core.MLModels import model_factory from core.DataClient import dataclient_factory import pandas as pd @@ -206,6 +207,9 @@ mix_df = pd.concat([test_df.copy(), predictions], axis=1) mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target]) mix_df = mix_df.sort_values("residual", ascending=False) +metrics = metrics_factory("Regression") +metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"]) + cosine_similarity_df = mix_df[ mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"]) ]