Compare commits

...

5 commits

Author SHA1 Message Date
quandanrepo
dffb01bf8e
Merge pull request #67 from Hestia-Homes/heat-dev-model
Heat dev model
2023-10-10 13:45:23 +01:00
Michael Duong
d2a7615e3b Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-10-10 12:33:51 +00:00
Michael Duong
4c6c5330d8 add new model, new branch 2023-10-10 12:33:44 +00:00
Michael Duong
9e7d0fa538 add new model 2023-10-10 12:32:25 +00:00
Michael Duong
ad2c266727 initial model for heat-dev 2023-10-09 17:52:47 +00:00
7 changed files with 53 additions and 30 deletions

View file

@ -4,9 +4,7 @@ After the model is built, we can evaluate its performance
"""
import os
import yaml
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceMetrics import MLMetrics
from core.interface.InterfaceDataClient import DataClient

View file

@ -18,6 +18,11 @@ def remove_starting_columns(df):
return df
def keep_negative_heat_change(df):
    """Return only the rows whose ``HEAT_DEMAND_CHANGE`` is strictly negative.

    Rows where the value is zero, positive, or missing are dropped, because
    the ``< 0`` comparison is False for them. The input frame itself is not
    modified.
    """
    is_negative = df["HEAT_DEMAND_CHANGE"].lt(0)
    return df[is_negative]
# def keep_ending_columns(df):
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
# keep_columns = df.columns[ending_column_index].to_list()
@ -27,6 +32,7 @@ def remove_starting_columns(df):
# return df
# Maps a business-logic step name to the function that implements it.
# Steps currently disabled (left unregistered):
#   "remove_starting_columns": remove_starting_columns
#   "keep_ENDING_COLUMNS": keep_ending_columns
business_logic = dict(keep_negative_heat_change=keep_negative_heat_change)

View file

@ -12,9 +12,11 @@ def clip_predictions_to_minimum_value(
predictions.name = "predictions"
predictions_df = pd.concat([data, predictions], axis=1)
# We expect every prediction to be at least a one-point improvement
replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
replace_index = (
predictions_df["predictions"] > predictions_df["HEAT_DEMAND_STARTING"] - 1
)
predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
predictions_df.loc[replace_index, "HEAT_DEMAND_STARTING"] - minimum_value
)
predictions_new = predictions_df["predictions"]

View file

@ -31,9 +31,9 @@ default:
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: SAP_ENDING
target: HEAT_DEMAND_ENDING
identifier_columns: ["UPRN"]
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "SAP_ENDING", "CARBON_ENDING"]
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null

View file

@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods:
- Generate Plot Suite
"""
import numpy as np
import pandas as pd
from typing import Union
from sklearn.metrics import (
@ -14,6 +15,18 @@ from sklearn.metrics import (
)
from core.interface.InterfaceMetrics import MLMetrics
# Define the function to return the SMAPE value
def symmetric_mape(actual, predicted) -> float:
    """Return the symmetric mean absolute percentage error (SMAPE).

    Each term is ``|predicted - actual| / ((|predicted| + |actual|) / 2)``
    and the result is the mean over all terms. The value is a fraction in
    ``[0, 2]``; it is not multiplied by 100.

    Args:
        actual: Array-like of observed values.
        predicted: Array-like of predicted values, broadcastable against
            ``actual``.

    Returns:
        The mean SMAPE as a plain ``float``.
    """
    # asarray is a no-op for float ndarrays and converts lists/Series otherwise.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = (np.abs(predicted) + np.abs(actual)) / 2

    # The scale is zero only when both values are zero, i.e. the prediction is
    # exact. Count that term as zero error instead of letting 0/0 produce a
    # NaN that would poison the whole mean.
    ratio = np.divide(
        abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0
    )
    return float(np.mean(ratio))
def metrics_factory(metrics_type: str) -> MLMetrics:
metrics = {
@ -34,7 +47,7 @@ class RegressionMetrics:
median_absolute_error,
mean_squared_error,
mean_absolute_percentage_error,
# max_error
symmetric_mape,
]
def generate_metrics(

View file

@ -13,12 +13,12 @@ stages:
- HEAT_DEMAND_CHANGE
- CARBON_CHANGE
- RDSAP_CHANGE
- HEAT_DEMAND_ENDING
- SAP_ENDING
- CARBON_ENDING
default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: SAP_ENDING
default.feature_processor.feature_processor_config.target: HEAT_DEMAND_ENDING
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
default.prepare_data.input_dataclient_type: aws-s3
@ -29,8 +29,8 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: e0be70d5025e40dd0d655d9949f72130.dir
size: 31800776
nfiles: 2
build_model:
cmd: python 2_build_model.py
@ -41,8 +41,8 @@ stages:
size: 5359
- path: data/prepared_data
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: e0be70d5025e40dd0d655d9949f72130.dir
size: 31800776
nfiles: 2
params:
configs/build_model.yaml:
@ -66,13 +66,13 @@ stages:
outs:
- path: data/model/
hash: md5
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
md5: 14ca33cde5e86770135f768abaf84978.dir
size: 422447808
nfiles: 27
- path: metrics/fit_metrics.json
hash: md5
md5: 2bb16ac67de8778fbc08171d562b34d5
size: 184
md5: 41bfb8d2da8f06d1864d73ce125cc6aa
size: 221
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
@ -82,13 +82,13 @@ stages:
size: 3028
- path: data/model
hash: md5
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
md5: 14ca33cde5e86770135f768abaf84978.dir
size: 422447808
nfiles: 27
- path: data/prepared_data
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: e0be70d5025e40dd0d655d9949f72130.dir
size: 31800776
nfiles: 2
params:
configs/settings.yaml:
@ -100,8 +100,8 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
md5: 40d0c7a7fd4a15add0615e322cf341a0.dir
size: 352151
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
@ -112,13 +112,13 @@ stages:
size: 4487
- path: data/predictions
hash: md5
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
md5: 40d0c7a7fd4a15add0615e322cf341a0.dir
size: 352151
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: e0be70d5025e40dd0d655d9949f72130.dir
size: 31800776
nfiles: 2
params:
configs/settings.yaml:
@ -128,8 +128,8 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 2e13ae67759a64261d03224f1c0d4bf4
size: 185
md5: 4e023650240e78d6ad761f1db7aac922
size: 220
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:

View file

@ -38,7 +38,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
# Scatter of the target (x axis) against the starting heat demand (y axis).
heat_demand_vs_target = train_df[[target, "HEAT_DEMAND_STARTING"]]
heat_demand_vs_target.plot(x=target, y="HEAT_DEMAND_STARTING", style="o")
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
# Load the autogluon model and check feature importance
@ -176,6 +175,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
#
#
from core.MLMetrics import metrics_factory
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
import pandas as pd
@ -206,6 +207,9 @@ mix_df = pd.concat([test_df.copy(), predictions], axis=1)
# Absolute error per row, sorted so the worst predictions come first.
mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False)

# Score against the configured target column rather than a hard-coded name,
# so this stays correct if the target changes again.
# NOTE(review): the hard-coded "predictions" column presumably equals
# predictions_column_name - confirm. Also confirm the argument order that
# generate_metrics expects; MAPE is not symmetric in its arguments.
metrics = metrics_factory("Regression")
metrics.generate_metrics(mix_df["predictions"], mix_df[target])

# Feature-only frame: drop the prediction, the residual and the target.
# "SAP_ENDING" is kept in the exclusion list for runs that still target it;
# Index.difference ignores labels that are not present.
cosine_similarity_df = mix_df[
    mix_df.columns.difference(["predictions", "residual", "SAP_ENDING", target])
]