initial model for heat-dev

2026-06-08 11:17:25 +00:00 · 2023-10-09 17:52:47 +00:00 · 2023-10-09 17:52:47 +00:00 · ad2c266727
commit ad2c266727
parent ba4d1bcc8b
7 changed files with 55 additions and 30 deletions
--- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml
@ -13,6 +13,6 @@ default:
      output_filepath: ./data/model/allmodels/
      problem_type: regression
      eval_metric: mean_squared_error #mean_absolute_error
-      time_limit: 4000
+      time_limit: 1000
      presets: medium_quality
      excluded_model_types: ['KNN', 'RF']
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@ -18,6 +18,11 @@ def remove_starting_columns(df):
    return df


+def keep_negative_heat_change(df):
+    df = df[df["HEAT_DEMAND_CHANGE"] < 0]
+    return df
+
+
 # def keep_ending_columns(df):
 #     ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
 #     keep_columns = df.columns[ending_column_index].to_list()
@ -27,6 +32,7 @@ def remove_starting_columns(df):
 #     return df

 business_logic = {
+    "keep_negative_heat_change": keep_negative_heat_change
    # "remove_starting_columns": remove_starting_columns
    # "keep_ENDING_COLUMNS": keep_ending_columns
 }
--- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
@ -12,9 +12,11 @@ def clip_predictions_to_minimum_value(
    predictions.name = "predictions"
    predictions_df = pd.concat([data, predictions], axis=1)
    # We expect every prediction to be at least a one-point improvement
-    replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
+    replace_index = (
+        predictions_df["predictions"] > predictions_df["HEAT_DEMAND_STARTING"] - 1
+    )
    predictions_df.loc[replace_index, "predictions"] = (
-        predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
+        predictions_df.loc[replace_index, "HEAT_DEMAND_STARTING"] - minimum_value
    )

    predictions_new = predictions_df["predictions"]
--- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml
@ -31,9 +31,9 @@ default:
    feature_processor_config:
      subsample_amount: null
      subsample_seed: 0
-      target: SAP_ENDING
+      target: HEAT_DEMAND_ENDING
      identifier_columns: ["UPRN"]
-      drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
+      drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "SAP_ENDING", "CARBON_ENDING"]
      # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
      retain_features: null

--- a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py
+++ b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py
@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods:
 - Generate Plot Suite
 """

+import numpy as np
 import pandas as pd
 from typing import Union
 from sklearn.metrics import (
@ -14,6 +15,18 @@ from sklearn.metrics import (
 )
 from core.interface.InterfaceMetrics import MLMetrics

+# Symmetric mean absolute percentage error (SMAPE), returned as a fraction rather than a percentage
+def symmetric_mape(actual, predicted) -> float:
+
+    # Convert actual and predicted to NumPy arrays
+    # unless both of them already are
+    if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]):
+        actual, predicted = np.array(actual), np.array(predicted)
+
+    return np.mean(
+        np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual)) / 2)
+    )
+

 def metrics_factory(metrics_type: str) -> MLMetrics:
    metrics = {
@ -34,7 +47,7 @@ class RegressionMetrics:
        median_absolute_error,
        mean_squared_error,
        mean_absolute_percentage_error,
-        # max_error
+        symmetric_mape,
    ]

    def generate_metrics(
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@ -13,12 +13,12 @@ stages:
        - HEAT_DEMAND_CHANGE
        - CARBON_CHANGE
        - RDSAP_CHANGE
-        - HEAT_DEMAND_ENDING
+        - SAP_ENDING
        - CARBON_ENDING
        default.feature_processor.feature_processor_config.retain_features:
        default.feature_processor.feature_processor_config.subsample_amount:
        default.feature_processor.feature_processor_config.subsample_seed: 0
-        default.feature_processor.feature_processor_config.target: SAP_ENDING
+        default.feature_processor.feature_processor_config.target: HEAT_DEMAND_ENDING
        default.feature_processor.feature_processor_type: dataframe
        default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
        default.prepare_data.input_dataclient_type: aws-s3
@ -29,8 +29,8 @@ stages:
    outs:
    - path: data/prepared_data/
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
-      size: 33881619
+      md5: 71e63a792f7723e2aea0709efde1a92b.dir
+      size: 31751660
      nfiles: 2
  build_model:
    cmd: python 2_build_model.py
@ -41,8 +41,8 @@ stages:
      size: 5359
    - path: data/prepared_data
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
-      size: 33881619
+      md5: 71e63a792f7723e2aea0709efde1a92b.dir
+      size: 31751660
      nfiles: 2
    params:
      configs/build_model.yaml:
@ -58,7 +58,7 @@ stages:
              output_filepath: ./data/model/allmodels/
              problem_type: regression
              eval_metric: mean_squared_error
-              time_limit: 4000
+              time_limit: 1000
              presets: medium_quality
              excluded_model_types:
              - KNN
@ -66,13 +66,13 @@ stages:
    outs:
    - path: data/model/
      hash: md5
-      md5: 7bb5156243b4db39349e80a01ffecde4.dir
-      size: 473398662
+      md5: 0ffc51be7c8381c9e4106309e3e05ca3.dir
+      size: 345904743
      nfiles: 27
    - path: metrics/fit_metrics.json
      hash: md5
-      md5: 2bb16ac67de8778fbc08171d562b34d5
-      size: 184
+      md5: 3d4ff3a3ca3c327e2c1e9aa1338c18ce
+      size: 220
  generate_predictions:
    cmd: python 3_generate_predictions.py
    deps:
@ -82,13 +82,13 @@ stages:
      size: 3028
    - path: data/model
      hash: md5
-      md5: 7bb5156243b4db39349e80a01ffecde4.dir
-      size: 473398662
+      md5: 0ffc51be7c8381c9e4106309e3e05ca3.dir
+      size: 345904743
      nfiles: 27
    - path: data/prepared_data
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
-      size: 33881619
+      md5: 71e63a792f7723e2aea0709efde1a92b.dir
+      size: 31751660
      nfiles: 2
    params:
      configs/settings.yaml:
@ -100,8 +100,8 @@ stages:
    outs:
    - path: data/predictions/
      hash: md5
-      md5: 0bb3cf991906953def81c8204cdcfaf0.dir
-      size: 374532
+      md5: 00ff804016290d56e1490e59c098b060.dir
+      size: 351811
      nfiles: 1
  generate_metrics:
    cmd: python 4_generate_metrics.py
@ -112,13 +112,13 @@ stages:
      size: 4487
    - path: data/predictions
      hash: md5
-      md5: 0bb3cf991906953def81c8204cdcfaf0.dir
-      size: 374532
+      md5: 00ff804016290d56e1490e59c098b060.dir
+      size: 351811
      nfiles: 1
    - path: data/prepared_data
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
-      size: 33881619
+      md5: 71e63a792f7723e2aea0709efde1a92b.dir
+      size: 31751660
      nfiles: 2
    params:
      configs/settings.yaml:
@ -128,8 +128,8 @@ stages:
    outs:
    - path: metrics/metrics.json
      hash: md5
-      md5: 2e13ae67759a64261d03224f1c0d4bf4
-      size: 185
+      md5: 63ef63e4fabe929b914a0059ceeddabc
+      size: 221
  startup_cleanup:
    cmd: python 0_startup_cleanup.py
    deps:
--- a/modules/ml-pipeline/src/pipeline/eda.py
+++ b/modules/ml-pipeline/src/pipeline/eda.py
@ -38,7 +38,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
 train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
    x=target, y="HEAT_DEMAND_STARTING", style="o"
 )
-
 # Both make sense, i.e. the higher the SAP, the lower we predict, and the higher the heat demand, the higher we predict

 # Load the autogluon model and check feature importance
@ -176,6 +175,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
 #
 #

+from core.MLMetrics import metrics_factory
+
 from core.MLModels import model_factory
 from core.DataClient import dataclient_factory
 import pandas as pd
@ -206,6 +207,9 @@ mix_df = pd.concat([test_df.copy(), predictions], axis=1)
 mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
 mix_df = mix_df.sort_values("residual", ascending=False)

+metrics = metrics_factory("Regression")
+metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"])
+
 cosine_similarity_df = mix_df[
    mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
 ]