Compare commits

...

32 commits

Author SHA1 Message Date
KhalimCK
6fa2625250
Merge pull request #109 from Hestia-Homes/heat-dev-model
Heat dev model
2024-03-28 17:21:46 +00:00
Michael Duong
5415cc972d add new model 2024-03-28 17:00:08 +00:00
Michael Duong
bc29731c69 add new model 2024-03-28 16:58:42 +00:00
Michael Duong
d8ff8cc16a add new model 2024-03-28 16:52:23 +00:00
Michael Duong
8e6b1c2690 use new data for heat 2024-03-28 16:26:26 +00:00
Github-Bot
5290a0c769 Update Registry 2024-01-30 10:38:50 +00:00
Github-Bot
11d2be463e Update Registry 2024-01-30 10:38:06 +00:00
KhalimCK
e8dea4c105
Merge pull request #95 from Hestia-Homes/heat-dev-model
Heat dev model
2024-01-30 10:37:20 +00:00
Michael Duong
7d44b82583 Merge branch 'heat-dev' of github.com:Hestia-Homes/ML into heat-dev-model 2024-01-29 20:37:53 +00:00
Michael Duong
66ff6e1e22 Using all permutation data with all data used in training, internal cross validation 2024-01-29 20:37:13 +00:00
Github-Bot
273dcdad31 Update Registry 2024-01-18 10:38:15 +00:00
Github-Bot
4b81ce9374 Update Registry 2024-01-18 10:37:20 +00:00
KhalimCK
469f77d8fb
Merge pull request #93 from Hestia-Homes/heat-dev-model
Heat dev model
2024-01-18 10:36:22 +00:00
Michael Duong
55da3d0339 Merge branch 'heat-dev' of github.com:Hestia-Homes/ML into heat-dev-model 2024-01-18 00:14:36 +00:00
Michael Duong
66f54a92e2 train new 600 second model with new data 2024-01-18 00:14:20 +00:00
Github-Bot
ba1971498c Update Registry 2023-11-28 15:02:13 +00:00
Github-Bot
2cb28616bb Update Registry 2023-11-28 15:01:27 +00:00
quandanrepo
7554988070
Merge pull request #87 from Hestia-Homes/heat-dev-model
add restriction to dataset
2023-11-28 15:00:46 +00:00
Michael Duong
9271df34e0 add restriction to dataset 2023-11-28 14:51:55 +00:00
Github-Bot
7f984e6cbf Update Registry 2023-11-27 22:18:17 +00:00
Github-Bot
d8d5a66537 Update Registry 2023-11-27 22:17:29 +00:00
quandanrepo
676539e6a7
Merge pull request #86 from Hestia-Homes/heat-dev-model
Heat dev model
2023-11-27 22:16:44 +00:00
quandanrepo
890ca15193
Merge branch 'heat-dev' into heat-dev-model 2023-11-27 22:09:53 +00:00
Michael Duong
5a9eb608bd commit first heat-model 2023-11-27 22:06:18 +00:00
Michael Duong
f4f8dc2bf2 Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-11-27 21:51:03 +00:00
Github-Bot
2d331736a4 Update Registry 2023-10-10 12:47:01 +00:00
Github-Bot
7d685caaf5 Update Registry 2023-10-10 12:46:02 +00:00
quandanrepo
dffb01bf8e
Merge pull request #67 from Hestia-Homes/heat-dev-model
Heat dev model
2023-10-10 13:45:23 +01:00
Michael Duong
d2a7615e3b Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-10-10 12:33:51 +00:00
Michael Duong
4c6c5330d8 add new model, new branch 2023-10-10 12:33:44 +00:00
Michael Duong
9e7d0fa538 add new model 2023-10-10 12:32:25 +00:00
Michael Duong
ad2c266727 initial model for heat-dev 2023-10-09 17:52:47 +00:00
12 changed files with 120 additions and 93 deletions

View file

@ -16,15 +16,15 @@
"active": true
},
"heat": {
"version": "v0.3.0",
"version": "v0.4.0",
"stage": {
"dev": "v0.3.0"
"dev": "v0.5.0"
},
"registered": true,
"active": true
},
"carbon": {
"version": "v0.3.0",
"version": "v0.4.0",
"stage": {
"dev": "v0.3.0"
},

View file

@ -9,7 +9,7 @@ ARG RUNTIME_ENVIRONMENT
ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}
# Install necessary build tools - required to test locally
RUN yum install -y gcc python3-devel
RUN yum install -y gcc python3-devel gcc-c++
# Install python packages
COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt

View file

@ -4,9 +4,7 @@ After the model is built, we can evaluate its performance
"""
import os
import yaml
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceMetrics import MLMetrics
from core.interface.InterfaceDataClient import DataClient

View file

@ -18,30 +18,39 @@ def remove_starting_columns(df):
return df
def remove_floor_height_ending(df):
# df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
# shows bottom 0.5 percentile is 1.665
# So keep anything above this
df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
print("we in here")
def keep_negative_heat_change(df):
df = df[df["heat_demand_change"] < 0]
return df
def remove_minimum_habitable_room_size(df):
# Need a minimum of 6.5 m² of floor area per habitable room
df = df[
df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
].reset_index(drop=True)
def keep_negative_carbon_change(df):
df = df[df["carbon_change"] < 0]
return df
def keep_flats(df):
df = df[df["property_type"] == "Flat"]
# TODO: Move to ETL pipeline
def remove_unreasonable_habitable_rooms(df):
"""
Assumption is that proportion of floor area to habitable rooms should be at least 6.5m2
"""
minimum_room_size_index = (
df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5
)
df = df[minimum_room_size_index]
return df
def keep_non_zero_rdsap(df):
df = df[df["rdsap_change"] != 0]
def remove_top_1_percent_heat_demand(df):
# threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%']
threshold_value = 860
df = df[df["heat_demand_starting"] < threshold_value]
return df
def remove_top_1_percent_carbon(df):
# threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%']
threshold_value = 18
df = df[df["carbon_starting"] < threshold_value]
return df
@ -54,10 +63,11 @@ def keep_non_zero_rdsap(df):
# return df
business_logic = {
# "keep_non_zero_rdsap": keep_non_zero_rdsap,
# "keep_flats": keep_flats,
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
# "remove_floor_height_ending": remove_floor_height_ending
"remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms,
"keep_negative_heat_change": keep_negative_heat_change,
"keep_negative_carbon_change": keep_negative_carbon_change,
"remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand,
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
# "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns
}

View file

@ -1,6 +1,7 @@
"""
After predictions, we may want to apply some post processing to the predictions
"""
import pandas as pd
@ -13,10 +14,11 @@ def clip_predictions_to_minimum_value(
predictions_df = pd.concat([data, predictions], axis=1)
# We expect every prediction to be at least a one-point improvement
replace_index = (
predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
predictions_df["predictions"]
> predictions_df["heat_demand_starting"] - minimum_value
)
predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "sap_starting"] + minimum_value
predictions_df.loc[replace_index, "heat_demand_starting"] - minimum_value
)
predictions_new = predictions_df["predictions"]

View file

@ -18,13 +18,9 @@ default:
prepare_data:
input_dataclient_type: aws-s3
output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
train_proportion: 1
# data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
@ -33,9 +29,12 @@ default:
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: sap_ending
target: heat_demand_ending
identifier_columns: ["uprn"]
drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"]
drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "sap_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms']
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null

View file

@ -1,36 +1,56 @@
schema: '2.0'
stages:
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics
prepare_data:
cmd: python 1_prepare_data.py
deps:
- path: 1_prepare_data.py
hash: md5
md5: 1793a35e71751d3c84f9affc67ecb9a8
size: 4296
md5: 11a3b8bfdfe199ab7ecc39ccc5652649
size: 4298
params:
configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns:
- heat_demand_change
- carbon_change
- rdsap_change
- heat_demand_ending
- sap_ending
- carbon_ending
- days_to_starting
- days_to_ending
- number_habitable_rooms_starting
- number_habitable_rooms_ending
- number_heated_rooms_starting
- number_heated_rooms_ending
- number_habitable_rooms
- number_heated_rooms
default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: sap_ending
default.feature_processor.feature_processor_config.target: heat_demand_ending
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
default.prepare_data.train_proportion: 1
default.prepare_data.train_proportion: 0.9
outs:
- path: data/prepared_data/
hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir
size: 43859225
md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 36889932
nfiles: 2
build_model:
cmd: python 2_build_model.py
@ -41,8 +61,8 @@ stages:
size: 4820
- path: data/prepared_data
hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir
size: 43859225
md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 36889932
nfiles: 2
params:
configs/build_model.yaml:
@ -73,18 +93,18 @@ stages:
outs:
- path: data/fit_predictions/
hash: md5
md5: ede187e9d0bffdef054f573f3c2bd222.dir
size: 3578590
md5: 7dda2f1dd257a6c5beaaa0b74eab6d5d.dir
size: 2901760
nfiles: 1
- path: data/model/
hash: md5
md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir
size: 814720415
nfiles: 31
md5: 741f8aed57383e860c535feb8b0adb71.dir
size: 752079341
nfiles: 32
- path: metrics/fit_metrics.json
hash: md5
md5: c45b84f12971a0156e4f3d85d3e725f5
size: 218
md5: 8eaa72b08074f735a9e54de871edc6e6
size: 221
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
@ -94,13 +114,13 @@ stages:
size: 2464
- path: data/model
hash: md5
md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir
size: 814720415
nfiles: 31
md5: 741f8aed57383e860c535feb8b0adb71.dir
size: 752079341
nfiles: 32
- path: data/prepared_data
hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir
size: 43859225
md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 36889932
nfiles: 2
params:
configs/settings.yaml:
@ -112,25 +132,25 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 5e60ca251af51de6fef3d0c659f8bb27.dir
size: 627416
md5: d842fe5350a3330c4c17e7e21c6359b2.dir
size: 380489
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
deps:
- path: 4_generate_metrics.py
hash: md5
md5: 4fedb86d89d528f0a6597934ba3890a0
size: 3484
md5: d61bb524f706917f6a3eb72b1ab8bc61
size: 3447
- path: data/predictions
hash: md5
md5: 5e60ca251af51de6fef3d0c659f8bb27.dir
size: 627416
md5: d842fe5350a3330c4c17e7e21c6359b2.dir
size: 380489
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir
size: 43859225
md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 36889932
nfiles: 2
params:
configs/settings.yaml:
@ -140,16 +160,5 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 033efa4d4044b6b6fc92dd37194727fa
size: 225
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics
md5: 2632fa5d0a38763c177bf0466a670c8b
size: 220

View file

@ -1,6 +1,7 @@
"""
Doing some eda on dataset
"""
# Look at response variable
from matplotlib import pyplot as plt
@ -38,7 +39,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
x=target, y="HEAT_DEMAND_STARTING", style="o"
)
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
# Load the autogluon model and check feature importance
@ -176,6 +176,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
#
#
from core.MLMetrics import metrics_factory
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
import pandas as pd
@ -216,6 +218,12 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False)
cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])]
metrics = metrics_factory("Regression")
metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"])
cosine_similarity_df = mix_df[
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
]
from sklearn.metrics.pairwise import cosine_similarity
row_index = 0

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==1.5.3
autogluon==0.8.2
dynaconf==3.2.0
pandas==2.1.4
autogluon==1.0.0
dynaconf==3.2.1
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==1.5.3
autogluon==0.8.2
dynaconf==3.2.0
pandas==2.1.4
autogluon==1.0.0
dynaconf==3.2.1
pyarrow==13.0.0
PyYAML==6.0.1

View file

@ -1,9 +1,10 @@
joblib==1.3.2
boto3==1.28.17
pandas==1.5.3
autogluon==0.8.2
dynaconf==3.2.0
alibi==0.9.4
pandas==2.1.4
autogluon==1.0.0
ray==2.6.3
dynaconf==3.2.1
alibi==0.9.5
shap==0.42.1
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,4 +1,4 @@
boto3==1.28.41
pandas==1.5.3
autogluon==0.8.2
dynaconf==3.2.0
pandas==2.1.4
autogluon==1.0.0
dynaconf==3.2.1