Compare commits

...

32 commits

Author SHA1 Message Date
KhalimCK
6fa2625250
Merge pull request #109 from Hestia-Homes/heat-dev-model
Heat dev model
2024-03-28 17:21:46 +00:00
Michael Duong
5415cc972d add new model 2024-03-28 17:00:08 +00:00
Michael Duong
bc29731c69 add new model 2024-03-28 16:58:42 +00:00
Michael Duong
d8ff8cc16a add new model 2024-03-28 16:52:23 +00:00
Michael Duong
8e6b1c2690 use new data for heat 2024-03-28 16:26:26 +00:00
Github-Bot
5290a0c769 Update Registry 2024-01-30 10:38:50 +00:00
Github-Bot
11d2be463e Update Registry 2024-01-30 10:38:06 +00:00
KhalimCK
e8dea4c105
Merge pull request #95 from Hestia-Homes/heat-dev-model
Heat dev model
2024-01-30 10:37:20 +00:00
Michael Duong
7d44b82583 Merge branch 'heat-dev' of github.com:Hestia-Homes/ML into heat-dev-model 2024-01-29 20:37:53 +00:00
Michael Duong
66ff6e1e22 Using all permutation data with all data used in training, internal cross validation 2024-01-29 20:37:13 +00:00
Github-Bot
273dcdad31 Update Registry 2024-01-18 10:38:15 +00:00
Github-Bot
4b81ce9374 Update Registry 2024-01-18 10:37:20 +00:00
KhalimCK
469f77d8fb
Merge pull request #93 from Hestia-Homes/heat-dev-model
Heat dev model
2024-01-18 10:36:22 +00:00
Michael Duong
55da3d0339 Merge branch 'heat-dev' of github.com:Hestia-Homes/ML into heat-dev-model 2024-01-18 00:14:36 +00:00
Michael Duong
66f54a92e2 train new 600 second model with new data 2024-01-18 00:14:20 +00:00
Github-Bot
ba1971498c Update Registry 2023-11-28 15:02:13 +00:00
Github-Bot
2cb28616bb Update Registry 2023-11-28 15:01:27 +00:00
quandanrepo
7554988070
Merge pull request #87 from Hestia-Homes/heat-dev-model
add restriction to dataset
2023-11-28 15:00:46 +00:00
Michael Duong
9271df34e0 add restriction to dataset 2023-11-28 14:51:55 +00:00
Github-Bot
7f984e6cbf Update Registry 2023-11-27 22:18:17 +00:00
Github-Bot
d8d5a66537 Update Registry 2023-11-27 22:17:29 +00:00
quandanrepo
676539e6a7
Merge pull request #86 from Hestia-Homes/heat-dev-model
Heat dev model
2023-11-27 22:16:44 +00:00
quandanrepo
890ca15193
Merge branch 'heat-dev' into heat-dev-model 2023-11-27 22:09:53 +00:00
Michael Duong
5a9eb608bd commit first heat-model 2023-11-27 22:06:18 +00:00
Michael Duong
f4f8dc2bf2 Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-11-27 21:51:03 +00:00
Github-Bot
2d331736a4 Update Registry 2023-10-10 12:47:01 +00:00
Github-Bot
7d685caaf5 Update Registry 2023-10-10 12:46:02 +00:00
quandanrepo
dffb01bf8e
Merge pull request #67 from Hestia-Homes/heat-dev-model
Heat dev model
2023-10-10 13:45:23 +01:00
Michael Duong
d2a7615e3b Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-10-10 12:33:51 +00:00
Michael Duong
4c6c5330d8 add new model, new branch 2023-10-10 12:33:44 +00:00
Michael Duong
9e7d0fa538 add new model 2023-10-10 12:32:25 +00:00
Michael Duong
ad2c266727 initial model for heat-dev 2023-10-09 17:52:47 +00:00
12 changed files with 120 additions and 93 deletions

View file

@ -16,15 +16,15 @@
"active": true "active": true
}, },
"heat": { "heat": {
"version": "v0.3.0", "version": "v0.4.0",
"stage": { "stage": {
"dev": "v0.3.0" "dev": "v0.5.0"
}, },
"registered": true, "registered": true,
"active": true "active": true
}, },
"carbon": { "carbon": {
"version": "v0.3.0", "version": "v0.4.0",
"stage": { "stage": {
"dev": "v0.3.0" "dev": "v0.3.0"
}, },

View file

@ -9,7 +9,7 @@ ARG RUNTIME_ENVIRONMENT
ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}
# Install necessary build tools - required to test locally # Install necessary build tools - required to test locally
RUN yum install -y gcc python3-devel RUN yum install -y gcc python3-devel gcc-c++
# Install python packages # Install python packages
COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt

View file

@ -4,9 +4,7 @@ After the model is built, we can evaluate its performance
""" """
import os import os
import yaml
import pandas as pd import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceMetrics import MLMetrics from core.interface.InterfaceMetrics import MLMetrics
from core.interface.InterfaceDataClient import DataClient from core.interface.InterfaceDataClient import DataClient

View file

@ -18,30 +18,39 @@ def remove_starting_columns(df):
return df return df
def remove_floor_height_ending(df): def keep_negative_heat_change(df):
# df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'] df = df[df["heat_demand_change"] < 0]
# shows bottom 0.5 percentile is 1.665
# So keep anything above this
df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
print("we in here")
return df return df
def remove_minimum_habitable_room_size(df): def keep_negative_carbon_change(df):
# Need minimum of 6.5m per habitable room df = df[df["carbon_change"] < 0]
df = df[
df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
].reset_index(drop=True)
return df return df
def keep_flats(df): # TODO: Move to ETL pipeline
df = df[df["property_type"] == "Flat"] def remove_unreasonable_habitable_rooms(df):
"""
Assumption is that the ratio of total floor area to number of habitable rooms should be at least 6.5 m2 per room
"""
minimum_room_size_index = (
df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5
)
df = df[minimum_room_size_index]
return df return df
def keep_non_zero_rdsap(df): def remove_top_1_percent_heat_demand(df):
df = df[df["rdsap_change"] != 0] # threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%']
threshold_value = 860
df = df[df["heat_demand_starting"] < threshold_value]
return df
def remove_top_1_percent_carbon(df):
# threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%']
threshold_value = 18
df = df[df["carbon_starting"] < threshold_value]
return df return df
@ -54,10 +63,11 @@ def keep_non_zero_rdsap(df):
# return df # return df
business_logic = { business_logic = {
# "keep_non_zero_rdsap": keep_non_zero_rdsap, "remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms,
# "keep_flats": keep_flats, "keep_negative_heat_change": keep_negative_heat_change,
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, "keep_negative_carbon_change": keep_negative_carbon_change,
# "remove_floor_height_ending": remove_floor_height_ending "remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand,
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
# "remove_starting_columns": remove_starting_columns # "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns # "keep_ENDING_COLUMNS": keep_ending_columns
} }

View file

@ -1,6 +1,7 @@
""" """
After predictions, we may want to apply some post processing to the predictions After predictions, we may want to apply some post processing to the predictions
""" """
import pandas as pd import pandas as pd
@ -13,10 +14,11 @@ def clip_predictions_to_minimum_value(
predictions_df = pd.concat([data, predictions], axis=1) predictions_df = pd.concat([data, predictions], axis=1)
# We expect all predictions to be at least a one point improvement # We expect all predictions to be at least a one point improvement
replace_index = ( replace_index = (
predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"] predictions_df["predictions"]
> predictions_df["heat_demand_starting"] - minimum_value
) )
predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "sap_starting"] + minimum_value predictions_df.loc[replace_index, "heat_demand_starting"] - minimum_value
) )
predictions_new = predictions_df["predictions"] predictions_new = predictions_df["predictions"]

View file

@ -18,13 +18,9 @@ default:
prepare_data: prepare_data:
input_dataclient_type: aws-s3 input_dataclient_type: aws-s3
output_dataclient_type: local output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet train_proportion: 0.9
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
train_proportion: 1
output_train_filepath: ./data/prepared_data/train.parquet output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet output_test_filepath: ./data/prepared_data/test.parquet
@ -33,9 +29,12 @@ default:
feature_processor_config: feature_processor_config:
subsample_amount: null subsample_amount: null
subsample_seed: 0 subsample_seed: 0
target: sap_ending target: heat_demand_ending
identifier_columns: ["uprn"] identifier_columns: ["uprn"]
drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"] drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "sap_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms']
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null retain_features: null

View file

@ -1,36 +1,56 @@
schema: '2.0' schema: '2.0'
stages: stages:
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics
prepare_data: prepare_data:
cmd: python 1_prepare_data.py cmd: python 1_prepare_data.py
deps: deps:
- path: 1_prepare_data.py - path: 1_prepare_data.py
hash: md5 hash: md5
md5: 1793a35e71751d3c84f9affc67ecb9a8 md5: 11a3b8bfdfe199ab7ecc39ccc5652649
size: 4296 size: 4298
params: params:
configs/settings.yaml: configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns: default.feature_processor.feature_processor_config.drop_columns:
- heat_demand_change - heat_demand_change
- carbon_change - carbon_change
- rdsap_change - rdsap_change
- heat_demand_ending - sap_ending
- carbon_ending - carbon_ending
- days_to_starting
- days_to_ending
- number_habitable_rooms_starting
- number_habitable_rooms_ending
- number_heated_rooms_starting
- number_heated_rooms_ending
- number_habitable_rooms
- number_heated_rooms
default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_config.target: heat_demand_ending
default.feature_processor.feature_processor_type: dataframe default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
default.prepare_data.train_proportion: 1 default.prepare_data.train_proportion: 0.9
outs: outs:
- path: data/prepared_data/ - path: data/prepared_data/
hash: md5 hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 43859225 size: 36889932
nfiles: 2 nfiles: 2
build_model: build_model:
cmd: python 2_build_model.py cmd: python 2_build_model.py
@ -41,8 +61,8 @@ stages:
size: 4820 size: 4820
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 43859225 size: 36889932
nfiles: 2 nfiles: 2
params: params:
configs/build_model.yaml: configs/build_model.yaml:
@ -73,18 +93,18 @@ stages:
outs: outs:
- path: data/fit_predictions/ - path: data/fit_predictions/
hash: md5 hash: md5
md5: ede187e9d0bffdef054f573f3c2bd222.dir md5: 7dda2f1dd257a6c5beaaa0b74eab6d5d.dir
size: 3578590 size: 2901760
nfiles: 1 nfiles: 1
- path: data/model/ - path: data/model/
hash: md5 hash: md5
md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir md5: 741f8aed57383e860c535feb8b0adb71.dir
size: 814720415 size: 752079341
nfiles: 31 nfiles: 32
- path: metrics/fit_metrics.json - path: metrics/fit_metrics.json
hash: md5 hash: md5
md5: c45b84f12971a0156e4f3d85d3e725f5 md5: 8eaa72b08074f735a9e54de871edc6e6
size: 218 size: 221
generate_predictions: generate_predictions:
cmd: python 3_generate_predictions.py cmd: python 3_generate_predictions.py
deps: deps:
@ -94,13 +114,13 @@ stages:
size: 2464 size: 2464
- path: data/model - path: data/model
hash: md5 hash: md5
md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir md5: 741f8aed57383e860c535feb8b0adb71.dir
size: 814720415 size: 752079341
nfiles: 31 nfiles: 32
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 43859225 size: 36889932
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -112,25 +132,25 @@ stages:
outs: outs:
- path: data/predictions/ - path: data/predictions/
hash: md5 hash: md5
md5: 5e60ca251af51de6fef3d0c659f8bb27.dir md5: d842fe5350a3330c4c17e7e21c6359b2.dir
size: 627416 size: 380489
nfiles: 1 nfiles: 1
generate_metrics: generate_metrics:
cmd: python 4_generate_metrics.py cmd: python 4_generate_metrics.py
deps: deps:
- path: 4_generate_metrics.py - path: 4_generate_metrics.py
hash: md5 hash: md5
md5: 4fedb86d89d528f0a6597934ba3890a0 md5: d61bb524f706917f6a3eb72b1ab8bc61
size: 3484 size: 3447
- path: data/predictions - path: data/predictions
hash: md5 hash: md5
md5: 5e60ca251af51de6fef3d0c659f8bb27.dir md5: d842fe5350a3330c4c17e7e21c6359b2.dir
size: 627416 size: 380489
nfiles: 1 nfiles: 1
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 43859225 size: 36889932
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -140,16 +160,5 @@ stages:
outs: outs:
- path: metrics/metrics.json - path: metrics/metrics.json
hash: md5 hash: md5
md5: 033efa4d4044b6b6fc92dd37194727fa md5: 2632fa5d0a38763c177bf0466a670c8b
size: 225 size: 220
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics

View file

@ -1,6 +1,7 @@
""" """
Doing some eda on dataset Doing some eda on dataset
""" """
# Look at response variable # Look at response variable
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
@ -38,7 +39,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
train_df[[target, "HEAT_DEMAND_STARTING"]].plot( train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
x=target, y="HEAT_DEMAND_STARTING", style="o" x=target, y="HEAT_DEMAND_STARTING", style="o"
) )
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict # Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
# Load the autogluon model and check feature importance # Load the autogluon model and check feature importance
@ -176,6 +176,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
# #
# #
from core.MLMetrics import metrics_factory
from core.MLModels import model_factory from core.MLModels import model_factory
from core.DataClient import dataclient_factory from core.DataClient import dataclient_factory
import pandas as pd import pandas as pd
@ -216,6 +218,12 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False) mix_df = mix_df.sort_values("residual", ascending=False)
cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])] cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])]
metrics = metrics_factory("Regression")
metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"])
cosine_similarity_df = mix_df[
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
]
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
row_index = 0 row_index = 0

View file

@ -1,7 +1,7 @@
joblib==1.3.2 joblib==1.3.2
boto3==1.28.17 boto3==1.28.17
pandas==1.5.3 pandas==2.1.4
autogluon==0.8.2 autogluon==1.0.0
dynaconf==3.2.0 dynaconf==3.2.1
pyarrow==13.0.0 pyarrow==13.0.0
pre-commit==3.3.3 pre-commit==3.3.3

View file

@ -1,7 +1,7 @@
joblib==1.3.2 joblib==1.3.2
boto3==1.28.17 boto3==1.28.17
pandas==1.5.3 pandas==2.1.4
autogluon==0.8.2 autogluon==1.0.0
dynaconf==3.2.0 dynaconf==3.2.1
pyarrow==13.0.0 pyarrow==13.0.0
PyYAML==6.0.1 PyYAML==6.0.1

View file

@ -1,9 +1,10 @@
joblib==1.3.2 joblib==1.3.2
boto3==1.28.17 boto3==1.28.17
pandas==1.5.3 pandas==2.1.4
autogluon==0.8.2 autogluon==1.0.0
dynaconf==3.2.0 ray==2.6.3
alibi==0.9.4 dynaconf==3.2.1
alibi==0.9.5
shap==0.42.1 shap==0.42.1
pyarrow==13.0.0 pyarrow==13.0.0
pre-commit==3.3.3 pre-commit==3.3.3

View file

@ -1,4 +1,4 @@
boto3==1.28.41 boto3==1.28.41
pandas==1.5.3 pandas==2.1.4
autogluon==0.8.2 autogluon==1.0.0
dynaconf==3.2.0 dynaconf==3.2.1