Compare commits

...

37 commits

Author SHA1 Message Date
KhalimCK
c7edb7c611
Merge pull request #107 from Hestia-Homes/carbon-dev-model
Carbon dev model
2024-03-28 16:21:52 +00:00
Michael Duong
bb3af26c3f add binary to prediction docker, change requiremnets 2024-03-28 16:06:43 +00:00
Michael Duong
78bf0a490d use 0.9 training data 2024-03-27 23:43:07 +00:00
Michael Duong
2da24aa017 run carbon model with new data 2024-03-27 23:13:29 +00:00
Michael Duong
c0dc934be6 run carbon model with new data 2024-03-27 23:10:36 +00:00
Github-Bot
869a276d67 Update Registry 2024-01-30 10:39:26 +00:00
Github-Bot
96765cee05 Update Registry 2024-01-30 10:38:43 +00:00
KhalimCK
f99c0aee2c
Merge pull request #96 from Hestia-Homes/carbon-dev-model
Carbon dev model
2024-01-30 10:38:05 +00:00
Michael Duong
76d414417a Merge branch 'carbon-dev' of github.com:Hestia-Homes/ML into carbon-dev-model 2024-01-30 10:26:43 +00:00
Michael Duong
1887a52230 use new modesl with carbon model 2024-01-30 10:26:28 +00:00
Github-Bot
9880ebed4c Update Registry 2024-01-18 10:38:17 +00:00
Github-Bot
5d23992d05 Update Registry 2024-01-18 10:37:29 +00:00
KhalimCK
d4836e02cb
Merge pull request #92 from Hestia-Homes/carbon-dev-model
Carbon dev model
2024-01-18 10:36:46 +00:00
Michael Duong
9b29e838af update requirements for dvc 2024-01-17 23:45:07 +00:00
Michael Duong
79a55ba8b5 train 600 second model on new data 2024-01-17 23:35:50 +00:00
Michael Duong
e78a4bb30e Merge branch 'carbon-dev' of github.com:Hestia-Homes/ML into carbon-dev-model 2024-01-17 23:12:26 +00:00
Michael Duong
ae53499742 add keep only non negative carbon change to carbon model 2023-12-22 09:51:57 +00:00
Github-Bot
db29bece80 Update Registry 2023-11-28 15:27:34 +00:00
Github-Bot
65335468b4 Update Registry 2023-11-28 15:26:50 +00:00
quandanrepo
53afbd26d8
Merge pull request #88 from Hestia-Homes/carbon-dev-model
Carbon dev model
2023-11-28 15:26:04 +00:00
Michael Duong
718003b3d9 Merge branch 'carbon-dev' of github.com:Hestia-Homes/ML into carbon-dev-model 2023-11-28 15:14:09 +00:00
Michael Duong
888bfc30c6 Merge branch 'master' of github.com:Hestia-Homes/ML into carbon-dev-model 2023-11-28 15:13:50 +00:00
Michael Duong
2b1e8b912b restrict dataset 2023-11-28 15:13:42 +00:00
Github-Bot
62f2f83b0a Update Registry 2023-11-27 19:22:00 +00:00
Github-Bot
03322a13e7 Update Registry 2023-11-27 19:21:22 +00:00
KhalimCK
5f3d9efa92
Merge pull request #85 from Hestia-Homes/carbon-dev-model
Carbon dev model
2023-11-27 19:20:40 +00:00
Michael Duong
f29d6af6a2 change readme 2023-11-27 19:13:23 +00:00
Michael Duong
7afc4b06b2 Merge branch 'master' of github.com:Hestia-Homes/ML into carbon-dev-model 2023-11-27 19:12:40 +00:00
Michael Duong
217fb3dca8 add inference speed check 2023-11-27 18:52:47 +00:00
Michael Duong
9a04ffde3b Merge branch 'master' of github.com:Hestia-Homes/ML into carbon-dev-model 2023-11-27 18:30:10 +00:00
Michael Duong
e6c7b2f58c Merge branch 'carbon-dev' of github.com:Hestia-Homes/ML into carbon-dev-model 2023-10-12 08:39:24 +00:00
Michael Duong
f2cc32f4b4 using good model 4000s 2023-10-12 08:38:55 +00:00
Github-Bot
2f9092f447 Update Registry 2023-10-11 15:48:52 +00:00
Github-Bot
bb2db16f61 Update Registry 2023-10-11 15:48:04 +00:00
quandanrepo
5aaebd7f44
Merge pull request #71 from Hestia-Homes/carbon-dev-model
400 second model
2023-10-11 16:47:13 +01:00
Michael Duong
680e879503 400 second model 2023-10-11 15:38:55 +00:00
Michael Duong
f4e91162ec initial model 2023-10-11 13:23:54 +00:00
14 changed files with 126 additions and 94 deletions

View file

@ -16,17 +16,17 @@
"active": true "active": true
}, },
"heat": { "heat": {
"version": "v0.3.0", "version": "v0.4.0",
"stage": { "stage": {
"dev": "v0.3.0" "dev": "v0.4.0"
}, },
"registered": true, "registered": true,
"active": true "active": true
}, },
"carbon": { "carbon": {
"version": "v0.3.0", "version": "v0.4.0",
"stage": { "stage": {
"dev": "v0.3.0" "dev": "v0.4.0"
}, },
"registered": true, "registered": true,
"active": true "active": true

View file

@ -9,7 +9,7 @@ ARG RUNTIME_ENVIRONMENT
ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}
# Install necessary build tools - required to test locally # Install necessary build tools - required to test locally
RUN yum install -y gcc python3-devel RUN yum install -y gcc python3-devel gcc-c++
# Install python packages # Install python packages
COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt

View file

@ -1,3 +1,3 @@
# The generic reproducible ML-pipeline # The generic reproducible ML-pipeline!
Pipeline required to build a model to produce an output, that gets hashed via DVC Pipeline required to build a model to produce an output, that gets hashed via DVC

View file

@ -1,3 +1,4 @@
# Ignore dynaconf secret files # Ignore dynaconf secret files
.secrets.* .secrets.*
example.py

View file

@ -19,3 +19,4 @@ default:
excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT'] excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT']
infer_limit: 0.05 infer_limit: 0.05
infer_limit_batch_size: 10000 infer_limit_batch_size: 10000
ag_args_ensemble: {'num_folds_parallel': 2}

View file

@ -18,30 +18,44 @@ def remove_starting_columns(df):
return df return df
def remove_floor_height_ending(df): def keep_negative_heat_change(df):
# df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING'] df = df[df["heat_demand_change"] < 0]
# shows bottom 0.5 percentile is 1.665
# So keep anything above this
df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
print("we in here")
return df return df
def remove_minimum_habitable_room_size(df): def keep_non_negative_carbon_ending(df):
# Need minimum of 6.5m per habitable room df = df[df["carbon_ending"] > 0]
df = df[
df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
].reset_index(drop=True)
return df return df
def keep_flats(df): def keep_negative_carbon_change(df):
df = df[df["property_type"] == "Flat"] df = df[df["carbon_change"] < 0]
return df return df
def keep_non_zero_rdsap(df): # TODO: Move to ETL pipeline
df = df[df["rdsap_change"] != 0] def remove_unreasonable_habitable_rooms(df):
"""
Assumes the ratio of total floor area to number of habitable rooms should be at least 6.5 m2 per room
"""
minimum_room_size_index = (
df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5
)
df = df[minimum_room_size_index]
return df
def remove_top_1_percent_heat_demand(df):
# threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%']
threshold_value = 860
df = df[df["heat_demand_starting"] < threshold_value]
return df
def remove_top_1_percent_carbon(df):
# threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%']
threshold_value = 18
df = df[df["carbon_starting"] < threshold_value]
return df return df
@ -54,10 +68,12 @@ def keep_non_zero_rdsap(df):
# return df # return df
business_logic = { business_logic = {
# "keep_non_zero_rdsap": keep_non_zero_rdsap, "remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms,
# "keep_flats": keep_flats, "keep_negative_heat_change": keep_negative_heat_change,
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size, "keep_negative_carbon_change": keep_negative_carbon_change,
# "remove_floor_height_ending": remove_floor_height_ending "remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand,
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
"keep_non_negative_carbon_ending": keep_non_negative_carbon_ending,
# "remove_starting_columns": remove_starting_columns # "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns # "keep_ENDING_COLUMNS": keep_ending_columns
} }

View file

@ -1,23 +1,24 @@
""" """
After predictions, we may want to apply some post processing to the predictions After predictions, we may want to apply some post processing to the predictions
""" """
import pandas as pd import pandas as pd
def clip_predictions_to_minimum_value( def clip_predictions_to_minimum_value(
data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0 data: pd.DataFrame,
predictions: pd.Series,
) -> pd.Series: ) -> pd.Series:
series_name = predictions.name series_name = predictions.name
predictions.name = "predictions" predictions.name = "predictions"
predictions = predictions.astype(data["carbon_starting"].dtype)
predictions_df = pd.concat([data, predictions], axis=1) predictions_df = pd.concat([data, predictions], axis=1)
# We expect all predictions to be at least a one-point improvement # We expect all predictions to be at least a one-point improvement
replace_index = ( replace_index = predictions_df["predictions"] > predictions_df["carbon_starting"]
predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"] predictions_df.loc[replace_index, "predictions"] = predictions_df.loc[
) replace_index, "carbon_starting"
predictions_df.loc[replace_index, "predictions"] = ( ]
predictions_df.loc[replace_index, "sap_starting"] + minimum_value
)
predictions_new = predictions_df["predictions"] predictions_new = predictions_df["predictions"]
predictions_new.name = series_name predictions_new.name = series_name

View file

@ -18,13 +18,8 @@ default:
prepare_data: prepare_data:
input_dataclient_type: aws-s3 input_dataclient_type: aws-s3
output_dataclient_type: local output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet train_proportion: 0.9
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
train_proportion: 1
output_train_filepath: ./data/prepared_data/train.parquet output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet output_test_filepath: ./data/prepared_data/test.parquet
@ -33,9 +28,13 @@ default:
feature_processor_config: feature_processor_config:
subsample_amount: null subsample_amount: null
subsample_seed: 0 subsample_seed: 0
target: sap_ending target: carbon_ending
identifier_columns: ["uprn"] identifier_columns: ["uprn"]
drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending"] # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending"]
drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms']
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null retain_features: null

View file

@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel:
models = { models = {
"SKLearnLinearRegression": SKLearnLinearRegression(), "SKLearnLinearRegression": SKLearnLinearRegression(),
"SKLearnSVMRegression": SKLearnSVMRegression(), "SKLearnSVMRegression": SKLearnSVMRegression(),
"AutogluonAutoML": AutogluonAutoML() "AutogluonAutoML": AutogluonAutoML(),
# ADD OTHER MODELS HERE # ADD OTHER MODELS HERE
} }
@ -151,6 +151,7 @@ class AutogluonAutoML:
"excluded_model_types", "excluded_model_types",
"infer_limit", "infer_limit",
"infer_limit_batch_size", "infer_limit_batch_size",
"ag_args_ensemble",
] ]
def load_model(self, path: Union[Path, str]) -> None: def load_model(self, path: Union[Path, str]) -> None:
@ -207,6 +208,7 @@ class AutogluonAutoML:
excluded_model_types=model_hyperparameters["excluded_model_types"], excluded_model_types=model_hyperparameters["excluded_model_types"],
infer_limit=model_hyperparameters["infer_limit"], infer_limit=model_hyperparameters["infer_limit"],
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
) )
def predict( def predict(

View file

@ -1,12 +1,23 @@
schema: '2.0' schema: '2.0'
stages: stages:
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics
prepare_data: prepare_data:
cmd: python 1_prepare_data.py cmd: python 1_prepare_data.py
deps: deps:
- path: 1_prepare_data.py - path: 1_prepare_data.py
hash: md5 hash: md5
md5: 1793a35e71751d3c84f9affc67ecb9a8 md5: 11a3b8bfdfe199ab7ecc39ccc5652649
size: 4296 size: 4298
params: params:
configs/settings.yaml: configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns: default.feature_processor.feature_processor_config.drop_columns:
@ -14,23 +25,32 @@ stages:
- carbon_change - carbon_change
- rdsap_change - rdsap_change
- heat_demand_ending - heat_demand_ending
- carbon_ending - sap_ending
- days_to_starting
- days_to_ending
- number_habitable_rooms_starting
- number_habitable_rooms_ending
- number_heated_rooms_starting
- number_heated_rooms_ending
- number_habitable_rooms
- number_heated_rooms
default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: sap_ending default.feature_processor.feature_processor_config.target: carbon_ending
default.feature_processor.feature_processor_type: dataframe default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
default.prepare_data.train_proportion: 1 default.prepare_data.train_proportion: 0.9
outs: outs:
- path: data/prepared_data/ - path: data/prepared_data/
hash: md5 hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir md5: 824541f44e6538d2ef10e9d754c79743.dir
size: 43859225 size: 36691842
nfiles: 2 nfiles: 2
build_model: build_model:
cmd: python 2_build_model.py cmd: python 2_build_model.py
@ -41,8 +61,8 @@ stages:
size: 4820 size: 4820
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir md5: 824541f44e6538d2ef10e9d754c79743.dir
size: 43859225 size: 36691842
nfiles: 2 nfiles: 2
params: params:
configs/build_model.yaml: configs/build_model.yaml:
@ -70,21 +90,23 @@ stages:
- XT - XT
infer_limit: 0.05 infer_limit: 0.05
infer_limit_batch_size: 10000 infer_limit_batch_size: 10000
ag_args_ensemble:
num_folds_parallel: 2
outs: outs:
- path: data/fit_predictions/ - path: data/fit_predictions/
hash: md5 hash: md5
md5: ede187e9d0bffdef054f573f3c2bd222.dir md5: 5a3091120d3497fa00b994d91bc7e5eb.dir
size: 3578590 size: 3664806
nfiles: 1 nfiles: 1
- path: data/model/ - path: data/model/
hash: md5 hash: md5
md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir md5: 074da8dcfa515b9f3d082b21c7d76616.dir
size: 814720415 size: 721558897
nfiles: 31 nfiles: 31
- path: metrics/fit_metrics.json - path: metrics/fit_metrics.json
hash: md5 hash: md5
md5: c45b84f12971a0156e4f3d85d3e725f5 md5: 728a49dcef5a98182325df455f929a33
size: 218 size: 225
generate_predictions: generate_predictions:
cmd: python 3_generate_predictions.py cmd: python 3_generate_predictions.py
deps: deps:
@ -94,13 +116,13 @@ stages:
size: 2464 size: 2464
- path: data/model - path: data/model
hash: md5 hash: md5
md5: b2ad0b538dc4aef0de3d431fc9c40c4f.dir md5: 074da8dcfa515b9f3d082b21c7d76616.dir
size: 814720415 size: 721558897
nfiles: 31 nfiles: 31
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir md5: 824541f44e6538d2ef10e9d754c79743.dir
size: 43859225 size: 36691842
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -112,8 +134,8 @@ stages:
outs: outs:
- path: data/predictions/ - path: data/predictions/
hash: md5 hash: md5
md5: 5e60ca251af51de6fef3d0c659f8bb27.dir md5: 680f51234d214d4cab9e6a064c75fc5d.dir
size: 627416 size: 499546
nfiles: 1 nfiles: 1
generate_metrics: generate_metrics:
cmd: python 4_generate_metrics.py cmd: python 4_generate_metrics.py
@ -124,13 +146,13 @@ stages:
size: 3484 size: 3484
- path: data/predictions - path: data/predictions
hash: md5 hash: md5
md5: 5e60ca251af51de6fef3d0c659f8bb27.dir md5: 680f51234d214d4cab9e6a064c75fc5d.dir
size: 627416 size: 499546
nfiles: 1 nfiles: 1
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 84fa631bd02686b052d6a7144eafd38e.dir md5: 824541f44e6538d2ef10e9d754c79743.dir
size: 43859225 size: 36691842
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -140,16 +162,5 @@ stages:
outs: outs:
- path: metrics/metrics.json - path: metrics/metrics.json
hash: md5 hash: md5
md5: 033efa4d4044b6b6fc92dd37194727fa md5: 67b7ab30a4b0839d20bc6eb0c84e4dd1
size: 225 size: 226
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics

View file

@ -1,7 +1,7 @@
joblib==1.3.2 joblib==1.3.2
boto3==1.28.17 boto3==1.28.17
pandas==1.5.3 pandas==2.1.4
autogluon==0.8.2 autogluon==1.0.0
dynaconf==3.2.0 dynaconf==3.2.1
pyarrow==13.0.0 pyarrow==13.0.0
pre-commit==3.3.3 pre-commit==3.3.3

View file

@ -1,7 +1,7 @@
joblib==1.3.2 joblib==1.3.2
boto3==1.28.17 boto3==1.28.17
pandas==1.5.3 pandas==2.1.4
autogluon==0.8.2 autogluon==1.0.0
dynaconf==3.2.0 dynaconf==3.2.1
pyarrow==13.0.0 pyarrow==13.0.0
PyYAML==6.0.1 PyYAML==6.0.1

View file

@ -1,9 +1,10 @@
joblib==1.3.2 joblib==1.3.2
boto3==1.28.17 boto3==1.28.17
pandas==1.5.3 pandas==2.1.4
autogluon==0.8.2 autogluon==1.0.0
dynaconf==3.2.0 ray==2.6.3
alibi==0.9.4 dynaconf==3.2.1
alibi==0.9.5
shap==0.42.1 shap==0.42.1
pyarrow==13.0.0 pyarrow==13.0.0
pre-commit==3.3.3 pre-commit==3.3.3

View file

@ -1,4 +1,4 @@
boto3==1.28.41 boto3==1.28.41
pandas==1.5.3 pandas==2.1.4
autogluon==0.8.2 autogluon==1.0.0
dynaconf==3.2.0 dynaconf==3.2.1