Compare commits

...

15 commits

Author SHA1 Message Date
quandanrepo
7554988070
Merge pull request #87 from Hestia-Homes/heat-dev-model
add restriction to datast
2023-11-28 15:00:46 +00:00
Michael Duong
9271df34e0 add restriction to datast 2023-11-28 14:51:55 +00:00
Github-Bot
7f984e6cbf Update Registry 2023-11-27 22:18:17 +00:00
Github-Bot
d8d5a66537 Update Registry 2023-11-27 22:17:29 +00:00
quandanrepo
676539e6a7
Merge pull request #86 from Hestia-Homes/heat-dev-model
Heat dev model
2023-11-27 22:16:44 +00:00
quandanrepo
890ca15193
Merge branch 'heat-dev' into heat-dev-model 2023-11-27 22:09:53 +00:00
Michael Duong
5a9eb608bd commit first heat-model 2023-11-27 22:06:18 +00:00
Michael Duong
f4f8dc2bf2 Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-11-27 21:51:03 +00:00
Github-Bot
2d331736a4 Update Registry 2023-10-10 12:47:01 +00:00
Github-Bot
7d685caaf5 Update Registry 2023-10-10 12:46:02 +00:00
quandanrepo
dffb01bf8e
Merge pull request #67 from Hestia-Homes/heat-dev-model
Heat dev model
2023-10-10 13:45:23 +01:00
Michael Duong
d2a7615e3b Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-10-10 12:33:51 +00:00
Michael Duong
4c6c5330d8 add new model, new branch 2023-10-10 12:33:44 +00:00
Michael Duong
9e7d0fa538 add new model 2023-10-10 12:32:25 +00:00
Michael Duong
ad2c266727 initial model for heat-dev 2023-10-09 17:52:47 +00:00
8 changed files with 102 additions and 47 deletions

View file

@ -8,6 +8,14 @@
"active": true
},
"sap": {
"version": "v0.2.6",
"stage": {
"dev": "v0.2.6"
},
"registered": true,
"active": true
},
"heat": {
"version": "v0.1.0",
"stage": {
"dev": "v0.1.0"
@ -15,10 +23,10 @@
"registered": true,
"active": true
},
"heat": {
"version": "v0.0.1",
"carbon": {
"version": "v0.1.0",
"stage": {
"dev": "v0.0.1"
"dev": "v0.1.0"
},
"registered": true,
"active": true

View file

@ -4,9 +4,7 @@ After the model is built, we can evaluate its performance
"""
import os
import yaml
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceMetrics import MLMetrics
from core.interface.InterfaceDataClient import DataClient

View file

@ -13,7 +13,7 @@ default:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 4000
time_limit: 400
presets: medium_quality
excluded_model_types: ['KNN', 'RF']
infer_limit: 0.05

View file

@ -18,6 +18,42 @@ def remove_starting_columns(df):
return df
def keep_negative_heat_change(df):
    """Keep only the rows whose HEAT_DEMAND_CHANGE is strictly negative.

    Args:
        df: DataFrame containing a "HEAT_DEMAND_CHANGE" column.

    Returns:
        The rows of ``df`` where HEAT_DEMAND_CHANGE < 0. Rows with a zero,
        positive or missing change are dropped; the original index labels
        are preserved.
    """
    return df[df["HEAT_DEMAND_CHANGE"] < 0]
def keep_negative_carbon_change(df):
    """Keep only the rows whose CARBON_CHANGE is strictly negative.

    Args:
        df: DataFrame containing a "CARBON_CHANGE" column.

    Returns:
        The rows of ``df`` where CARBON_CHANGE < 0. Rows with a zero,
        positive or missing change are dropped; the original index labels
        are preserved.
    """
    return df[df["CARBON_CHANGE"] < 0]
# TODO: Move to ETL pipeline
def remove_unreasonable_habitable_rooms(df):
    """Drop rows whose floor area per habitable room is implausibly small.

    The assumption is that the ratio of total floor area to the number of
    habitable rooms should be at least 6.5 m2.

    Args:
        df: DataFrame containing "TOTAL_FLOOR_AREA_ENDING" and
            "NUMBER_HABITABLE_ROOMS" columns.

    Returns:
        The rows of ``df`` that have a positive room count and at least
        6.5 m2 of floor area per habitable room. Rows where the ratio cannot
        be computed (missing values) are dropped.
    """
    rooms = df["NUMBER_HABITABLE_ROOMS"]
    area_per_room = df["TOTAL_FLOOR_AREA_ENDING"] / rooms
    # pandas evaluates x / 0 to inf, which would satisfy ">= 6.5" and let
    # properties with zero habitable rooms through; require a positive count.
    has_rooms = rooms > 0
    return df[has_rooms & (area_per_room >= 6.5)]
def remove_top_1_percent_heat_demand(df, threshold_value=860):
    """Drop rows whose starting heat demand is at or above a cut-off.

    Args:
        df: DataFrame containing a "HEAT_DEMAND_STARTING" column.
        threshold_value: Exclusive upper bound. A row is kept only when its
            HEAT_DEMAND_STARTING is strictly below this value. Defaults to
            860, the value that was previously hard-coded.

    Returns:
        The rows of ``df`` with HEAT_DEMAND_STARTING < ``threshold_value``.
    """
    # NOTE(review): the default is a fixed constant, not a percentile computed
    # from ``df``. It was presumably taken once from
    # df.describe(percentiles=[0.99])["HEAT_DEMAND_STARTING"]["99%"] -- confirm
    # it still matches the current dataset.
    return df[df["HEAT_DEMAND_STARTING"] < threshold_value]
def remove_top_1_percent_carbon(df, threshold_value=18):
    """Drop rows whose starting carbon value is at or above a cut-off.

    Args:
        df: DataFrame containing a "CARBON_STARTING" column.
        threshold_value: Exclusive upper bound. A row is kept only when its
            CARBON_STARTING is strictly below this value. Defaults to 18,
            the value that was previously hard-coded.

    Returns:
        The rows of ``df`` with CARBON_STARTING < ``threshold_value``.
    """
    # NOTE(review): the default is a fixed constant, not a percentile computed
    # from ``df``. It was presumably taken once from
    # df.describe(percentiles=[0.99])["CARBON_STARTING"]["99%"] -- confirm it
    # still matches the current dataset.
    return df[df["CARBON_STARTING"] < threshold_value]
# def keep_ending_columns(df):
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
# keep_columns = df.columns[ending_column_index].to_list()
@ -27,6 +63,11 @@ def remove_starting_columns(df):
# return df
# Registry mapping a rule name to the DataFrame filter function implementing it.
# Each registered function takes a DataFrame and returns the filtered DataFrame.
# NOTE(review): how the rules are selected and in what order they are applied
# is decided by the caller, which is not visible here -- confirm before relying
# on the ordering of this dict. The commented-out entries are disabled rules.
business_logic = {
"remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms,
"keep_negative_heat_change": keep_negative_heat_change,
"keep_negative_carbon_change": keep_negative_carbon_change,
"remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand,
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
# "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns
}

View file

@ -12,9 +12,11 @@ def clip_predictions_to_minimum_value(
predictions.name = "predictions"
predictions_df = pd.concat([data, predictions], axis=1)
# We expect every prediction to be at least a one-point improvement
replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
replace_index = (
predictions_df["predictions"] > predictions_df["HEAT_DEMAND_STARTING"] - 1
)
predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
predictions_df.loc[replace_index, "HEAT_DEMAND_STARTING"] - minimum_value
)
predictions_new = predictions_df["predictions"]

View file

@ -31,9 +31,9 @@ default:
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: SAP_ENDING
target: HEAT_DEMAND_ENDING
identifier_columns: ["UPRN"]
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "SAP_ENDING", "CARBON_ENDING"]
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null

View file

@ -5,22 +5,22 @@ stages:
deps:
- path: 1_prepare_data.py
hash: md5
md5: c9f030df733e318b80d1fa91b7732f79
size: 5132
md5: 896d3d88a4a9f68d174efe71dc089517
size: 4222
params:
configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns:
- HEAT_DEMAND_CHANGE
- CARBON_CHANGE
- RDSAP_CHANGE
- HEAT_DEMAND_ENDING
- SAP_ENDING
- CARBON_ENDING
default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: SAP_ENDING
default.feature_processor.feature_processor_config.target: HEAT_DEMAND_ENDING
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -29,20 +29,20 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: f235f38714fefcf6e4927ae95ba912c3.dir
size: 30774760
nfiles: 2
build_model:
cmd: python 2_build_model.py
deps:
- path: 2_build_model.py
hash: md5
md5: 84699d208874c52accaff61c6af9bb0a
size: 5359
md5: b824822475c222521516493e68eef9c5
size: 4149
- path: data/prepared_data
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: f235f38714fefcf6e4927ae95ba912c3.dir
size: 30774760
nfiles: 2
params:
configs/build_model.yaml:
@ -58,37 +58,39 @@ stages:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error
time_limit: 4000
time_limit: 400
presets: medium_quality
excluded_model_types:
- KNN
- RF
infer_limit: 0.05
infer_limit_batch_size: 10000
outs:
- path: data/model/
hash: md5
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
nfiles: 27
md5: a868845999b46e0272dc27f5cb5bc618.dir
size: 310555147
nfiles: 24
- path: metrics/fit_metrics.json
hash: md5
md5: 2bb16ac67de8778fbc08171d562b34d5
size: 184
md5: 809f27735c77cbcb62866b96018eedea
size: 216
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
- path: 3_generate_predictions.py
hash: md5
md5: 5ef2856a5a977304f1ec01f9b4205262
size: 3028
md5: 0a70ad4dfe99414a75d1261c75a177b9
size: 2464
- path: data/model
hash: md5
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
nfiles: 27
md5: a868845999b46e0272dc27f5cb5bc618.dir
size: 310555147
nfiles: 24
- path: data/prepared_data
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: f235f38714fefcf6e4927ae95ba912c3.dir
size: 30774760
nfiles: 2
params:
configs/settings.yaml:
@ -100,25 +102,25 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
md5: 2098fe82304751025e427f2cc241a2ff.dir
size: 295849
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
deps:
- path: 4_generate_metrics.py
hash: md5
md5: 2c9fb78955a8c19cff0a098976f81d1b
size: 4487
md5: 567b1acb819e2ff432b989cdbdd4a2bf
size: 3448
- path: data/predictions
hash: md5
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
md5: 2098fe82304751025e427f2cc241a2ff.dir
size: 295849
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: f235f38714fefcf6e4927ae95ba912c3.dir
size: 30774760
nfiles: 2
params:
configs/settings.yaml:
@ -128,15 +130,15 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 2e13ae67759a64261d03224f1c0d4bf4
size: 185
md5: aa671878e1bd8c6a8d4b5f9788c817c4
size: 219
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: fbb7e3b1b98b517c870f3e1df3e7f695
size: 1676
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data

View file

@ -38,7 +38,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
x=target, y="HEAT_DEMAND_STARTING", style="o"
)
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
# Load the autogluon model and check feature importance
@ -176,6 +175,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
#
#
from core.MLMetrics import metrics_factory
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
import pandas as pd
@ -206,6 +207,9 @@ mix_df = pd.concat([test_df.copy(), predictions], axis=1)
mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False)
metrics = metrics_factory("Regression")
metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"])
cosine_similarity_df = mix_df[
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
]