Merge pull request #67 from Hestia-Homes/heat-dev-model

Heat dev model
Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model
2026-06-08 11:17:25 +00:00 · 2023-10-10 13:45:23 +01:00 · 2023-10-10 12:33:51 +00:00 · 2023-10-10 12:33:44 +00:00 · 2023-10-10 12:32:25 +00:00 · 2023-10-09 17:52:47 +00:00
7 changed files with 53 additions and 30 deletions
--- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py
+++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py
@ -4,9 +4,7 @@ After the model is built, we can evaluate its performance
 """
 import os
 import yaml
 import pandas as pd
 from pathlib import Path
 from core.interface.InterfaceModels import MLModel
 from core.interface.InterfaceMetrics import MLMetrics
 from core.interface.InterfaceDataClient import DataClient
--- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py
@ -18,6 +18,11 @@ def remove_starting_columns(df):
    return df
 def keep_negative_heat_change(df):
    """Return only the rows whose HEAT_DEMAND_CHANGE is strictly below zero."""
    is_negative = df["HEAT_DEMAND_CHANGE"] < 0
    return df.loc[is_negative]
 # def keep_ending_columns(df):
 #     ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
 #     keep_columns = df.columns[ending_column_index].to_list()
@ -27,6 +32,7 @@ def remove_starting_columns(df):
 #     return df
 # Registry of feature-processing filters: each key is the name of a filter and
 # each value is a function that takes a dataframe and returns the filtered
 # dataframe.
 # NOTE(review): presumably the keys are looked up by name from the pipeline
 # configuration -- confirm against the caller before renaming any of them.
 business_logic = {
    "keep_negative_heat_change": keep_negative_heat_change
    # "remove_starting_columns": remove_starting_columns
    # "keep_ENDING_COLUMNS": keep_ending_columns
 }
--- a/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
+++ b/modules/ml-pipeline/src/pipeline/configs/post_prediction_logic.py
@ -12,9 +12,11 @@ def clip_predictions_to_minimum_value(
    predictions.name = "predictions"
    predictions_df = pd.concat([data, predictions], axis=1)
    # We expect every prediction to be at least a one-point improvement
-    replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
+    replace_index = (
        predictions_df["predictions"] > predictions_df["HEAT_DEMAND_STARTING"] - 1
    )
    predictions_df.loc[replace_index, "predictions"] = (
-        predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
+        predictions_df.loc[replace_index, "HEAT_DEMAND_STARTING"] - minimum_value
    )
    predictions_new = predictions_df["predictions"]
--- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml
+++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml
@ -31,9 +31,9 @@ default:
    feature_processor_config:
      subsample_amount: null
      subsample_seed: 0
-      target: SAP_ENDING
+      target: HEAT_DEMAND_ENDING
      identifier_columns: ["UPRN"]
-      drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
+      drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "SAP_ENDING", "CARBON_ENDING"]
      # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
      retain_features: null
--- a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py
+++ b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py
@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods:
 - Generate Plot Suite
 """
 import numpy as np
 import pandas as pd
 from typing import Union
 from sklearn.metrics import (
@ -14,6 +15,18 @@ from sklearn.metrics import (
 )
 from core.interface.InterfaceMetrics import MLMetrics
 # Define the function to return the SMAPE value
 def symmetric_mape(actual, predicted) -> float:
    """Return the symmetric mean absolute percentage error (SMAPE).

    Each element contributes ``|predicted - actual|`` divided by the mean of
    ``|predicted|`` and ``|actual|``. The result is the mean of those ratios,
    returned as a fraction (it is not multiplied by 100).

    Args:
        actual: Array-like of observed values.
        predicted: Array-like of predicted values, matched positionally
            with ``actual``.

    Returns:
        The mean of the per-element ratios. Elements where both values are
        zero count as zero error instead of producing NaN.
    """
    # np.asarray leaves ndarrays untouched and converts lists / Series.
    actual, predicted = np.asarray(actual), np.asarray(predicted)
    abs_error = np.abs(predicted - actual)
    scale = (np.abs(predicted) + np.abs(actual)) / 2
    # scale is zero only when both values are zero, i.e. a perfect prediction;
    # dividing there would give 0/0 = NaN and poison the whole mean.
    ratios = np.divide(
        abs_error,
        scale,
        out=np.zeros_like(scale, dtype=float),
        where=scale != 0,
    )
    return np.mean(ratios)
 def metrics_factory(metrics_type: str) -> MLMetrics:
    metrics = {
@ -34,7 +47,7 @@ class RegressionMetrics:
        median_absolute_error,
        mean_squared_error,
        mean_absolute_percentage_error,
-        # max_error
+        symmetric_mape,
    ]
    def generate_metrics(
--- a/modules/ml-pipeline/src/pipeline/dvc.lock
+++ b/modules/ml-pipeline/src/pipeline/dvc.lock
@ -13,12 +13,12 @@ stages:
        - HEAT_DEMAND_CHANGE
        - CARBON_CHANGE
        - RDSAP_CHANGE
-        - HEAT_DEMAND_ENDING
+        - SAP_ENDING
        - CARBON_ENDING
        default.feature_processor.feature_processor_config.retain_features:
        default.feature_processor.feature_processor_config.subsample_amount:
        default.feature_processor.feature_processor_config.subsample_seed: 0
-        default.feature_processor.feature_processor_config.target: SAP_ENDING
+        default.feature_processor.feature_processor_config.target: HEAT_DEMAND_ENDING
        default.feature_processor.feature_processor_type: dataframe
        default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
        default.prepare_data.input_dataclient_type: aws-s3
@ -29,8 +29,8 @@ stages:
    outs:
    - path: data/prepared_data/
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      md5: e0be70d5025e40dd0d655d9949f72130.dir
-      size: 33881619
+      size: 31800776
      nfiles: 2
  build_model:
    cmd: python 2_build_model.py
@ -41,8 +41,8 @@ stages:
      size: 5359
    - path: data/prepared_data
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      md5: e0be70d5025e40dd0d655d9949f72130.dir
-      size: 33881619
+      size: 31800776
      nfiles: 2
    params:
      configs/build_model.yaml:
@ -66,13 +66,13 @@ stages:
    outs:
    - path: data/model/
      hash: md5
-      md5: 7bb5156243b4db39349e80a01ffecde4.dir
+      md5: 14ca33cde5e86770135f768abaf84978.dir
-      size: 473398662
+      size: 422447808
      nfiles: 27
    - path: metrics/fit_metrics.json
      hash: md5
-      md5: 2bb16ac67de8778fbc08171d562b34d5
+      md5: 41bfb8d2da8f06d1864d73ce125cc6aa
-      size: 184
+      size: 221
  generate_predictions:
    cmd: python 3_generate_predictions.py
    deps:
@ -82,13 +82,13 @@ stages:
      size: 3028
    - path: data/model
      hash: md5
-      md5: 7bb5156243b4db39349e80a01ffecde4.dir
+      md5: 14ca33cde5e86770135f768abaf84978.dir
-      size: 473398662
+      size: 422447808
      nfiles: 27
    - path: data/prepared_data
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      md5: e0be70d5025e40dd0d655d9949f72130.dir
-      size: 33881619
+      size: 31800776
      nfiles: 2
    params:
      configs/settings.yaml:
@ -100,8 +100,8 @@ stages:
    outs:
    - path: data/predictions/
      hash: md5
-      md5: 0bb3cf991906953def81c8204cdcfaf0.dir
+      md5: 40d0c7a7fd4a15add0615e322cf341a0.dir
-      size: 374532
+      size: 352151
      nfiles: 1
  generate_metrics:
    cmd: python 4_generate_metrics.py
@ -112,13 +112,13 @@ stages:
      size: 4487
    - path: data/predictions
      hash: md5
-      md5: 0bb3cf991906953def81c8204cdcfaf0.dir
+      md5: 40d0c7a7fd4a15add0615e322cf341a0.dir
-      size: 374532
+      size: 352151
      nfiles: 1
    - path: data/prepared_data
      hash: md5
-      md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
+      md5: e0be70d5025e40dd0d655d9949f72130.dir
-      size: 33881619
+      size: 31800776
      nfiles: 2
    params:
      configs/settings.yaml:
@ -128,8 +128,8 @@ stages:
    outs:
    - path: metrics/metrics.json
      hash: md5
-      md5: 2e13ae67759a64261d03224f1c0d4bf4
+      md5: 4e023650240e78d6ad761f1db7aac922
-      size: 185
+      size: 220
  startup_cleanup:
    cmd: python 0_startup_cleanup.py
    deps:
--- a/modules/ml-pipeline/src/pipeline/eda.py
+++ b/modules/ml-pipeline/src/pipeline/eda.py
@ -38,7 +38,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
 train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
    x=target, y="HEAT_DEMAND_STARTING", style="o"
 )
 # Both make sense, i.e. the higher the SAP, the lower we predict, and the higher the heat demand, the higher we predict.
 # Load the autogluon model and check feature importance
@ -176,6 +175,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
 #
 #
 from core.MLMetrics import metrics_factory
 from core.MLModels import model_factory
 from core.DataClient import dataclient_factory
 import pandas as pd
@ -206,6 +207,9 @@ mix_df = pd.concat([test_df.copy(), predictions], axis=1)
 mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
 mix_df = mix_df.sort_values("residual", ascending=False)
 metrics = metrics_factory("Regression")
 metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"])
 cosine_similarity_df = mix_df[
    mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
 ]
Author	SHA1	Message	Date
quandanrepo	dffb01bf8e	Merge pull request #67 from Hestia-Homes/heat-dev-model Heat dev model	2023-10-10 13:45:23 +01:00
Michael Duong	d2a7615e3b	Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model	2023-10-10 12:33:51 +00:00
Michael Duong	4c6c5330d8	add new model, new branch	2023-10-10 12:33:44 +00:00
Michael Duong	9e7d0fa538	add new model	2023-10-10 12:32:25 +00:00
Michael Duong	ad2c266727	initial model for heat-dev	2023-10-09 17:52:47 +00:00