Compare commits

...

25 commits

Author SHA1 Message Date
KhalimCK
e8dea4c105
Merge pull request #95 from Hestia-Homes/heat-dev-model
Heat dev model
2024-01-30 10:37:20 +00:00
Michael Duong
7d44b82583 Merge branch 'heat-dev' of github.com:Hestia-Homes/ML into heat-dev-model 2024-01-29 20:37:53 +00:00
Michael Duong
66ff6e1e22 Using all permutation data with all data used in training, internal cross validation 2024-01-29 20:37:13 +00:00
Github-Bot
273dcdad31 Update Registry 2024-01-18 10:38:15 +00:00
Github-Bot
4b81ce9374 Update Registry 2024-01-18 10:37:20 +00:00
KhalimCK
469f77d8fb
Merge pull request #93 from Hestia-Homes/heat-dev-model
Heat dev model
2024-01-18 10:36:22 +00:00
Michael Duong
55da3d0339 Merge branch 'heat-dev' of github.com:Hestia-Homes/ML into heat-dev-model 2024-01-18 00:14:36 +00:00
Michael Duong
66f54a92e2 train new 600 second model with new data 2024-01-18 00:14:20 +00:00
Github-Bot
ba1971498c Update Registry 2023-11-28 15:02:13 +00:00
Github-Bot
2cb28616bb Update Registry 2023-11-28 15:01:27 +00:00
quandanrepo
7554988070
Merge pull request #87 from Hestia-Homes/heat-dev-model
add restriction to dataset
2023-11-28 15:00:46 +00:00
Michael Duong
9271df34e0 add restriction to dataset 2023-11-28 14:51:55 +00:00
Github-Bot
7f984e6cbf Update Registry 2023-11-27 22:18:17 +00:00
Github-Bot
d8d5a66537 Update Registry 2023-11-27 22:17:29 +00:00
quandanrepo
676539e6a7
Merge pull request #86 from Hestia-Homes/heat-dev-model
Heat dev model
2023-11-27 22:16:44 +00:00
quandanrepo
890ca15193
Merge branch 'heat-dev' into heat-dev-model 2023-11-27 22:09:53 +00:00
Michael Duong
5a9eb608bd commit first heat-model 2023-11-27 22:06:18 +00:00
Michael Duong
f4f8dc2bf2 Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-11-27 21:51:03 +00:00
Github-Bot
2d331736a4 Update Registry 2023-10-10 12:47:01 +00:00
Github-Bot
7d685caaf5 Update Registry 2023-10-10 12:46:02 +00:00
quandanrepo
dffb01bf8e
Merge pull request #67 from Hestia-Homes/heat-dev-model
Heat dev model
2023-10-10 13:45:23 +01:00
Michael Duong
d2a7615e3b Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-10-10 12:33:51 +00:00
Michael Duong
4c6c5330d8 add new model, new branch 2023-10-10 12:33:44 +00:00
Michael Duong
9e7d0fa538 add new model 2023-10-10 12:32:25 +00:00
Michael Duong
ad2c266727 initial model for heat-dev 2023-10-09 17:52:47 +00:00
12 changed files with 150 additions and 64 deletions

View file

@ -8,17 +8,25 @@
"active": true
},
"sap": {
"version": "v0.1.0",
"version": "v0.4.0",
"stage": {
"dev": "v0.1.0"
"dev": "v0.4.0"
},
"registered": true,
"active": true
},
"heat": {
"version": "v0.0.1",
"version": "v0.3.0",
"stage": {
"dev": "v0.0.1"
"dev": "v0.3.0"
},
"registered": true,
"active": true
},
"carbon": {
"version": "v0.3.0",
"stage": {
"dev": "v0.2.0"
},
"registered": true,
"active": true

View file

@ -87,7 +87,8 @@ def prepare_data(
if train_proportion == 1:
train = data
test = None
# Sample 10% of the data for testing
test = data.sample(round(len(data) * 0.1))
else:
train, test = train_test_split(
data, train_size=train_proportion, test_size=(1 - train_proportion)

View file

@ -26,9 +26,12 @@ prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
feature_process_params = settings.feature_processor
generate_metrics_params = settings.generate_metrics
generate_predictions_params = settings.generate_predictions
model_type = build_model_params["model_type"]
target = feature_process_params["feature_processor_config"]["target"]
fit_predictions_filepath = build_model_params["fit_predictions_filepath"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
identifier_columns = feature_process_params["feature_processor_config"][
"identifier_columns"
]
@ -60,6 +63,8 @@ def build_model(
identifier_columns: List[str],
model_save_location: str,
model_hyperparameters: dict,
fit_predictions_filepath: str,
predictions_column_name: str,
fit_metrics_filepath: str,
train_filepath: Union[str, None] = None,
test_filepath: Union[str, None] = None,
@ -93,6 +98,15 @@ def build_model(
data=train_data, post_prediction_logic=post_prediction_logic
)
logger.info("--- Saving fit predictions ---")
predictions_df = pd.DataFrame(fit_predictions)
predictions_df.columns = [predictions_column_name]
dataclient.save_data(
obj=predictions_df, location=fit_predictions_filepath, save_config=None
)
logger.info("--- Generating fit metrics ---")
metrics_output = metrics.generate_metrics(
@ -128,6 +142,8 @@ if __name__ == "__main__":
train_filepath=train_filepath,
test_filepath=test_filepath,
fit_metrics_filepath=fit_metrics_filepath,
fit_predictions_filepath=fit_predictions_filepath,
predictions_column_name=predictions_column_name,
)
logger.info(f"--- {__file__} - Complete! ---")

View file

@ -4,9 +4,7 @@ After the model is built, we can evaluate its performance
"""
import os
import yaml
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceMetrics import MLMetrics
from core.interface.InterfaceDataClient import DataClient

View file

@ -3,6 +3,7 @@ default:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json
fit_predictions_filepath: ./data/fit_predictions/predictions.parquet
SKLearnLinearRegression: null
@ -15,6 +16,6 @@ default:
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 4000
presets: medium_quality
excluded_model_types: ['KNN', 'RF']
excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT']
infer_limit: 0.05
infer_limit_batch_size: 10000

View file

@ -9,15 +9,51 @@ Business Logic dict + functions
def remove_starting_columns(df):
keep_column_index = [
False if col_name.endswith("_STARTING") else True
False if col_name.endswith("_starting") else True
for col_name in list(df.columns)
]
keep_columns = df.columns[keep_column_index].to_list()
keep_columns.append("SAP_STARTING")
keep_columns.append("sap_starting")
df = df[keep_columns]
return df
def keep_negative_heat_change(df):
    """
    Keep only the rows whose "heat_demand_change" value is strictly below zero.
    """
    # Rows with a missing change value compare as False and are filtered out too
    is_negative_change = df["heat_demand_change"] < 0
    return df.loc[is_negative_change]
def keep_negative_carbon_change(df):
    """
    Keep only the rows whose "carbon_change" value is strictly below zero.
    """
    # Rows with a missing change value compare as False and are filtered out too
    is_negative_change = df["carbon_change"] < 0
    return df.loc[is_negative_change]
# TODO: Move to ETL pipeline
def remove_unreasonable_habitable_rooms(df, minimum_area_per_room=6.5):
    """
    Drop rows whose floor area per habitable room is implausibly small.

    Assumption is that the ratio of total floor area to the number of habitable
    rooms should be at least 6.5m2 per room.

    :param df: DataFrame holding the "total_floor_area_ending" and
        "number_habitable_rooms" columns
    :param minimum_area_per_room: smallest accepted floor area per habitable room;
        defaults to 6.5, the value that used to be hard-coded here
    :return: the rows of df whose area per room is greater than or equal to the minimum
    """
    area_per_room = df["total_floor_area_ending"] / df["number_habitable_rooms"]
    # A NaN ratio (missing area or room count) compares as False, so those rows are dropped.
    # NOTE(review): a positive area divided by zero rooms evaluates to inf and is
    # therefore kept - confirm whether zero-room records should survive this filter.
    return df[area_per_room >= minimum_area_per_room]
def remove_top_1_percent_heat_demand(df, threshold_value=860):
    """
    Drop rows whose "heat_demand_starting" is at or above a cut-off value.

    :param df: DataFrame holding a "heat_demand_starting" column
    :param threshold_value: exclusive upper bound; a row is kept only when its value
        is strictly below it. Defaults to 860, the value that used to be hard-coded here
    :return: the rows of df below the cut-off
    """
    # NOTE(review): the default is a fixed constant, presumably the 99th percentile of
    # an earlier dataset - confirm it still fits the current data. It was once derived with:
    # threshold_value = df.describe(percentiles=[0.99])['heat_demand_starting']['99%']
    return df[df["heat_demand_starting"] < threshold_value]
def remove_top_1_percent_carbon(df, threshold_value=18):
    """
    Drop rows whose "carbon_starting" is at or above a cut-off value.

    :param df: DataFrame holding a "carbon_starting" column
    :param threshold_value: exclusive upper bound; a row is kept only when its value
        is strictly below it. Defaults to 18, the value that used to be hard-coded here
    :return: the rows of df below the cut-off
    """
    # NOTE(review): the default is a fixed constant, presumably the 99th percentile of
    # an earlier dataset - confirm it still fits the current data. It was once derived with:
    # threshold_value = df.describe(percentiles=[0.99])['carbon_starting']['99%']
    return df[df["carbon_starting"] < threshold_value]
# def keep_ending_columns(df):
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
# keep_columns = df.columns[ending_column_index].to_list()
@ -27,6 +63,11 @@ def remove_starting_columns(df):
# return df
# Registry of business-logic steps: maps a step name to the function implementing it.
# Each registered function takes a DataFrame and returns a filtered DataFrame.
# NOTE(review): how the steps are selected and in what order they run is decided
# by the caller, which is not visible here - confirm before relying on dict order.
# Entries that are commented out are currently disabled.
business_logic = {
"remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms,
"keep_negative_heat_change": keep_negative_heat_change,
"keep_negative_carbon_change": keep_negative_carbon_change,
"remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand,
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
# "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns
}

View file

@ -5,16 +5,19 @@ import pandas as pd
def clip_predictions_to_minimum_value(
data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1
data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0
) -> pd.Series:
series_name = predictions.name
predictions.name = "predictions"
predictions_df = pd.concat([data, predictions], axis=1)
# We expect every prediction to be at least a one-point improvement
replace_index = predictions_df["SAP_STARTING"] + 1 > predictions_df["predictions"]
replace_index = (
predictions_df["predictions"]
> predictions_df["heat_demand_starting"] - minimum_value
)
predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
predictions_df.loc[replace_index, "heat_demand_starting"] - minimum_value
)
predictions_new = predictions_df["predictions"]

View file

@ -21,8 +21,9 @@ default:
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
train_proportion: 0.9
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
train_proportion: 1
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
@ -31,9 +32,9 @@ default:
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: SAP_ENDING
identifier_columns: ["UPRN"]
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
target: heat_demand_ending
identifier_columns: ["uprn"]
drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "sap_ending", "carbon_ending"]
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null

View file

@ -5,44 +5,44 @@ stages:
deps:
- path: 1_prepare_data.py
hash: md5
md5: c9f030df733e318b80d1fa91b7732f79
size: 5132
md5: 11a3b8bfdfe199ab7ecc39ccc5652649
size: 4298
params:
configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns:
- HEAT_DEMAND_CHANGE
- CARBON_CHANGE
- RDSAP_CHANGE
- HEAT_DEMAND_ENDING
- CARBON_ENDING
- heat_demand_change
- carbon_change
- rdsap_change
- sap_ending
- carbon_ending
default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: SAP_ENDING
default.feature_processor.feature_processor_config.target: heat_demand_ending
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
default.prepare_data.train_proportion: 0.9
default.prepare_data.train_proportion: 1
outs:
- path: data/prepared_data/
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: dcd41f841c67b474a81a14e683646237.dir
size: 36317761
nfiles: 2
build_model:
cmd: python 2_build_model.py
deps:
- path: 2_build_model.py
hash: md5
md5: 84699d208874c52accaff61c6af9bb0a
size: 5359
md5: 7231450b78920b0c5e7c6bada496b24a
size: 4820
- path: data/prepared_data
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: dcd41f841c67b474a81a14e683646237.dir
size: 36317761
nfiles: 2
params:
configs/build_model.yaml:
@ -51,6 +51,7 @@ stages:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json
fit_predictions_filepath: ./data/fit_predictions/predictions.parquet
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
@ -61,34 +62,45 @@ stages:
time_limit: 4000
presets: medium_quality
excluded_model_types:
- KNN
- RF
- FASTAI
- CAT
- NN_TORCH
- KNN
- XT
infer_limit: 0.05
infer_limit_batch_size: 10000
outs:
- path: data/fit_predictions/
hash: md5
md5: 89063bb3b725afe61b6ed5edb724bb06.dir
size: 3090627
nfiles: 1
- path: data/model/
hash: md5
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
nfiles: 27
md5: c90eef03b5a76175506c048e88a401dd.dir
size: 783489255
nfiles: 32
- path: metrics/fit_metrics.json
hash: md5
md5: 2bb16ac67de8778fbc08171d562b34d5
size: 184
md5: 33f18fa6b7dda535de09733d4792c0fc
size: 217
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
- path: 3_generate_predictions.py
hash: md5
md5: 5ef2856a5a977304f1ec01f9b4205262
size: 3028
md5: 0a70ad4dfe99414a75d1261c75a177b9
size: 2464
- path: data/model
hash: md5
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
nfiles: 27
md5: c90eef03b5a76175506c048e88a401dd.dir
size: 783489255
nfiles: 32
- path: data/prepared_data
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: dcd41f841c67b474a81a14e683646237.dir
size: 36317761
nfiles: 2
params:
configs/settings.yaml:
@ -100,25 +112,25 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
md5: 406e2ebe33d6abed9042f137d8c0d2bf.dir
size: 520735
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
deps:
- path: 4_generate_metrics.py
hash: md5
md5: 2c9fb78955a8c19cff0a098976f81d1b
size: 4487
md5: 567b1acb819e2ff432b989cdbdd4a2bf
size: 3448
- path: data/predictions
hash: md5
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
md5: 406e2ebe33d6abed9042f137d8c0d2bf.dir
size: 520735
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
md5: dcd41f841c67b474a81a14e683646237.dir
size: 36317761
nfiles: 2
params:
configs/settings.yaml:
@ -128,15 +140,15 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 2e13ae67759a64261d03224f1c0d4bf4
size: 185
md5: cc1ad408f2d9d3128df71822a38ea85e
size: 218
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: fbb7e3b1b98b517c870f3e1df3e7f695
size: 1676
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data

View file

@ -38,6 +38,7 @@ stages:
- configs/build_model.yaml:
outs:
- data/model/
- data/fit_predictions/
- metrics/fit_metrics.json
always_changed: true
generate_predictions:

View file

@ -38,7 +38,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
x=target, y="HEAT_DEMAND_STARTING", style="o"
)
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
# Load the autogluon model and check feature importance
@ -176,6 +175,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
#
#
from core.MLMetrics import metrics_factory
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
import pandas as pd
@ -206,6 +207,9 @@ mix_df = pd.concat([test_df.copy(), predictions], axis=1)
mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False)
metrics = metrics_factory("Regression")
metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"])
cosine_similarity_df = mix_df[
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
]

View file

@ -1,4 +1,4 @@
dvc==3.18.0
dvc-s3==2.23.0
gto==1.0.4
pyOpenSSL==23.2.0
dvc==3.36.0
dvc-s3==3.0.1
gto==1.6.1
pyOpenSSL==23.3.0