Compare commits

...

39 commits

Author SHA1 Message Date
KhalimCK
f6c775fdcf
Merge pull request #118 from Hestia-Homes/heat-dev-model
Heat dev model
2024-05-31 13:39:45 +01:00
Michael Duong
e695a10c14 remove the scenario data as we don't have these eprs 2024-05-30 21:04:32 +01:00
Michael Duong
8650840058 remove the scenario data as we don't have these eprs 2024-05-30 20:59:19 +01:00
Michael Duong
2298895a39 run a new heat model for new data 2024-05-30 20:54:03 +01:00
Michael Duong
45e21383fe run a new heat model for new data 2024-05-30 20:53:23 +01:00
Github-Bot
ff032f122f Update Registry 2024-03-28 17:23:00 +00:00
Github-Bot
a798385639 Update Registry 2024-03-28 17:22:22 +00:00
KhalimCK
6fa2625250
Merge pull request #109 from Hestia-Homes/heat-dev-model
Heat dev model
2024-03-28 17:21:46 +00:00
Michael Duong
5415cc972d add new model 2024-03-28 17:00:08 +00:00
Michael Duong
bc29731c69 add new model 2024-03-28 16:58:42 +00:00
Michael Duong
d8ff8cc16a add new model 2024-03-28 16:52:23 +00:00
Michael Duong
8e6b1c2690 use new data for heat 2024-03-28 16:26:26 +00:00
Github-Bot
5290a0c769 Update Registry 2024-01-30 10:38:50 +00:00
Github-Bot
11d2be463e Update Registry 2024-01-30 10:38:06 +00:00
KhalimCK
e8dea4c105
Merge pull request #95 from Hestia-Homes/heat-dev-model
Heat dev model
2024-01-30 10:37:20 +00:00
Michael Duong
7d44b82583 Merge branch 'heat-dev' of github.com:Hestia-Homes/ML into heat-dev-model 2024-01-29 20:37:53 +00:00
Michael Duong
66ff6e1e22 Using all permutation data with all data used in training, nteral cross validation 2024-01-29 20:37:13 +00:00
Github-Bot
273dcdad31 Update Registry 2024-01-18 10:38:15 +00:00
Github-Bot
4b81ce9374 Update Registry 2024-01-18 10:37:20 +00:00
KhalimCK
469f77d8fb
Merge pull request #93 from Hestia-Homes/heat-dev-model
Heat dev model
2024-01-18 10:36:22 +00:00
Michael Duong
55da3d0339 Merge branch 'heat-dev' of github.com:Hestia-Homes/ML into heat-dev-model 2024-01-18 00:14:36 +00:00
Michael Duong
66f54a92e2 train new 600 second model with new data 2024-01-18 00:14:20 +00:00
Github-Bot
ba1971498c Update Registry 2023-11-28 15:02:13 +00:00
Github-Bot
2cb28616bb Update Registry 2023-11-28 15:01:27 +00:00
quandanrepo
7554988070
Merge pull request #87 from Hestia-Homes/heat-dev-model
add restriction to datast
2023-11-28 15:00:46 +00:00
Michael Duong
9271df34e0 add restriction to datast 2023-11-28 14:51:55 +00:00
Github-Bot
7f984e6cbf Update Registry 2023-11-27 22:18:17 +00:00
Github-Bot
d8d5a66537 Update Registry 2023-11-27 22:17:29 +00:00
quandanrepo
676539e6a7
Merge pull request #86 from Hestia-Homes/heat-dev-model
Heat dev model
2023-11-27 22:16:44 +00:00
quandanrepo
890ca15193
Merge branch 'heat-dev' into heat-dev-model 2023-11-27 22:09:53 +00:00
Michael Duong
5a9eb608bd commit first heat-model 2023-11-27 22:06:18 +00:00
Michael Duong
f4f8dc2bf2 Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-11-27 21:51:03 +00:00
Github-Bot
2d331736a4 Update Registry 2023-10-10 12:47:01 +00:00
Github-Bot
7d685caaf5 Update Registry 2023-10-10 12:46:02 +00:00
quandanrepo
dffb01bf8e
Merge pull request #67 from Hestia-Homes/heat-dev-model
Heat dev model
2023-10-10 13:45:23 +01:00
Michael Duong
d2a7615e3b Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-10-10 12:33:51 +00:00
Michael Duong
4c6c5330d8 add new model, new branch 2023-10-10 12:33:44 +00:00
Michael Duong
9e7d0fa538 add new model 2023-10-10 12:32:25 +00:00
Michael Duong
ad2c266727 initial model for heat-dev 2023-10-09 17:52:47 +00:00
7 changed files with 76 additions and 59 deletions

View file

@ -4,9 +4,7 @@ After the model is built, we can evaluate its performance
"""
import os
import yaml
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceMetrics import MLMetrics
from core.interface.InterfaceDataClient import DataClient

View file

@ -18,30 +18,39 @@ def remove_starting_columns(df):
return df
def remove_floor_height_ending(df):
# df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
# shows bottom 0.5 percentile is 1.665
# So keep anything above this
df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
print("we in here")
def keep_negative_heat_change(df):
    """Return only the rows whose ``heat_demand_change`` is strictly negative.

    Rows with a zero or positive change are dropped. The surviving rows keep
    their original index labels (the index is not reset).
    """
    is_negative = df["heat_demand_change"] < 0
    return df[is_negative]
def remove_minimum_habitable_room_size(df):
# Need a minimum of 6.5 m² of floor area per habitable room
df = df[
df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
].reset_index(drop=True)
def keep_negative_carbon_change(df):
    """Return only the rows whose ``carbon_change`` is strictly negative.

    Rows with a zero or positive change are dropped. The surviving rows keep
    their original index labels (the index is not reset).
    """
    is_negative = df["carbon_change"] < 0
    return df[is_negative]
def keep_flats(df):
df = df[df["property_type"] == "Flat"]
# TODO: Move to ETL pipeline
def remove_unreasonable_habitable_rooms(df, minimum_room_size=6.5):
    """
    Drop rows whose floor area per habitable room is implausibly small.

    The assumption is that total floor area divided by the number of
    habitable rooms should be at least 6.5 m2.

    :param df: DataFrame with ``total_floor_area_ending`` and
        ``number_habitable_rooms`` columns.
    :param minimum_room_size: smallest acceptable floor area per habitable
        room (inclusive). Defaults to 6.5, the value that used to be
        hard-coded, so existing single-argument callers are unaffected.
    :return: the rows that satisfy the threshold, with their original index
        labels (the index is not reset).
    """
    area_per_room = df["total_floor_area_ending"] / df["number_habitable_rooms"]
    # A NaN ratio (missing input, or 0 / 0) compares False and is dropped.
    # NOTE(review): a room count of 0 with a positive floor area gives +inf,
    # which passes the check and is kept - confirm that is intended.
    return df[area_per_room >= minimum_room_size]
def keep_non_zero_rdsap(df):
df = df[df["rdsap_change"] != 0]
def remove_top_1_percent_heat_demand(df, threshold_value=860):
    """
    Drop rows whose starting heat demand is at or above ``threshold_value``.

    :param df: DataFrame with a ``heat_demand_starting`` column.
    :param threshold_value: exclusive upper bound on ``heat_demand_starting``.
        Defaults to 860, the value that used to be hard-coded, so existing
        single-argument callers are unaffected.
        NOTE(review): 860 was presumably read off
        ``df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%']`` for
        an earlier dataset - re-derive it when the training data changes.
    :return: the rows strictly below the threshold, with their original index
        labels (the index is not reset).
    """
    below_threshold = df["heat_demand_starting"] < threshold_value
    return df[below_threshold]
def remove_top_1_percent_carbon(df, threshold_value=18):
    """
    Drop rows whose starting carbon value is at or above ``threshold_value``.

    :param df: DataFrame with a ``carbon_starting`` column.
    :param threshold_value: exclusive upper bound on ``carbon_starting``.
        Defaults to 18, the value that used to be hard-coded, so existing
        single-argument callers are unaffected.
        NOTE(review): 18 was presumably read off
        ``df.describe(percentiles=[0.99])['CARBON_STARTING']['99%']`` for an
        earlier dataset - re-derive it when the training data changes.
    :return: the rows strictly below the threshold, with their original index
        labels (the index is not reset).
    """
    below_threshold = df["carbon_starting"] < threshold_value
    return df[below_threshold]
@ -54,10 +63,11 @@ def keep_non_zero_rdsap(df):
# return df
# Maps a filter name to the function that implements it. Entries that are
# commented out are not part of the mapping.
# NOTE(review): presumably every registered function is applied to the
# training DataFrame in turn - confirm against the code that reads this dict.
business_logic = {
# "keep_non_zero_rdsap": keep_non_zero_rdsap,
# "keep_flats": keep_flats,
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
# "remove_floor_height_ending": remove_floor_height_ending
"remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms,
"keep_negative_heat_change": keep_negative_heat_change,
"keep_negative_carbon_change": keep_negative_carbon_change,
"remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand,
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
# "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns
}

View file

@ -1,6 +1,7 @@
"""
After predictions, we may want to apply some post processing to the predictions
"""
import pandas as pd
@ -13,10 +14,11 @@ def clip_predictions_to_minimum_value(
predictions_df = pd.concat([data, predictions], axis=1)
# We expect every prediction to show at least a `minimum_value` improvement over the starting value
replace_index = (
predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
predictions_df["predictions"]
> predictions_df["heat_demand_starting"] - minimum_value
)
predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "sap_starting"] + minimum_value
predictions_df.loc[replace_index, "heat_demand_starting"] - minimum_value
)
predictions_new = predictions_df["predictions"]

View file

@ -8,6 +8,6 @@ default:
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -31,13 +31,13 @@ default:
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: sap_ending
target: heat_demand_ending
identifier_columns: ["uprn"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
"heat_demand_change", "carbon_change", "rdsap_change", "sap_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms']
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',

View file

@ -24,7 +24,7 @@ stages:
- heat_demand_change
- carbon_change
- rdsap_change
- heat_demand_ending
- sap_ending
- carbon_ending
- days_to_starting
- days_to_ending
@ -37,7 +37,7 @@ stages:
default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: sap_ending
default.feature_processor.feature_processor_config.target: heat_demand_ending
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
@ -49,8 +49,8 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
md5: 13cd955d579de20efe743f82bc434c7e.dir
size: 37294025
nfiles: 2
build_model:
cmd: python 2_build_model.py
@ -61,8 +61,8 @@ stages:
size: 4820
- path: data/prepared_data
hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
md5: 13cd955d579de20efe743f82bc434c7e.dir
size: 37294025
nfiles: 2
params:
configs/build_model.yaml:
@ -94,18 +94,18 @@ stages:
outs:
- path: data/fit_predictions/
hash: md5
md5: d9c9afc05e8780db47c0548b19bf7d19.dir
size: 3349989
md5: b9c9ca64ea6973c409c3a7b8f8ed0c3e.dir
size: 2902493
nfiles: 1
- path: data/model/
hash: md5
md5: 13c3100e1486c27a83a8a47491077842.dir
size: 773523079
md5: a9215bba342ed7ec3f97815dfef94e48.dir
size: 727501601
nfiles: 36
- path: metrics/fit_metrics.json
hash: md5
md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
size: 224
md5: 548a431d58cd4f5a3118235dec734372
size: 219
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
@ -115,13 +115,13 @@ stages:
size: 2464
- path: data/model
hash: md5
md5: 13c3100e1486c27a83a8a47491077842.dir
size: 773523079
md5: a9215bba342ed7ec3f97815dfef94e48.dir
size: 727501601
nfiles: 36
- path: data/prepared_data
hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
md5: 13cd955d579de20efe743f82bc434c7e.dir
size: 37294025
nfiles: 2
params:
configs/settings.yaml:
@ -133,25 +133,25 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 463197
md5: 484781d6b359e458a25e9ab728d6514d.dir
size: 380517
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
deps:
- path: 4_generate_metrics.py
hash: md5
md5: 4fedb86d89d528f0a6597934ba3890a0
size: 3484
md5: d61bb524f706917f6a3eb72b1ab8bc61
size: 3447
- path: data/predictions
hash: md5
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 463197
md5: 484781d6b359e458a25e9ab728d6514d.dir
size: 380517
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
md5: 13cd955d579de20efe743f82bc434c7e.dir
size: 37294025
nfiles: 2
params:
configs/settings.yaml:
@ -161,8 +161,8 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 3e08df02fd5c5d094bcf936e1338d596
size: 223
md5: 4d246765aff7c45079d02b4d8f7527f7
size: 220
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
@ -176,15 +176,14 @@ stages:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md
outs:
- path: metrics/scenario_metrics.md
hash: md5
md5: fa4d6d7bbd7818613800da5f8f37ea96
size: 363
md5: d41d8cd98f00b204e9800998ecf8427e
size: 0
- path: metrics/scenario_table.md
hash: md5
md5: d6baf100a1623cc2467c2f8221d314c9
size: 2133
md5: d41d8cd98f00b204e9800998ecf8427e
size: 0

View file

@ -1,6 +1,7 @@
"""
Doing some eda on dataset
"""
# Look at response variable
from matplotlib import pyplot as plt
@ -38,7 +39,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
x=target, y="HEAT_DEMAND_STARTING", style="o"
)
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
# Load the autogluon model and check feature importance
@ -176,6 +176,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
#
#
from core.MLMetrics import metrics_factory
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
import pandas as pd
@ -216,6 +218,12 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False)
cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])]
metrics = metrics_factory("Regression")
metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"])
cosine_similarity_df = mix_df[
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
]
from sklearn.metrics.pairwise import cosine_similarity
row_index = 0