run a new heat model for new data

This commit is contained in:
Michael Duong 2024-05-30 20:53:23 +01:00
commit 45e21383fe
24 changed files with 342 additions and 62 deletions

9
.dockerignore Normal file
View file

@ -0,0 +1,9 @@
modules/ml-pipeline/src/pipeline/data/predictions
modules/ml-pipeline/src/pipeline/data/fit_predictions
modules/ml-pipeline/src/pipeline/data/prepared_data
modules/ml-pipeline/src/pipeline/data/model/allmodels
modules/ml-pipeline/src/pipeline/metrics
modules/ml-pipeline/src/pipeline/__pycache__
modules/ml-pipeline/src/pipeline/.dvc
modules/ml-pipeline/src/pipeline/analysis
modules/ml-pipeline/src/pipeline/metrics

View file

@ -19,8 +19,8 @@ jobs:
- name: Install Serverless and plugins
run: |
npm install -g serverless
npm install -g serverless-domain-manager
npm install -g serverless@^3.38.0
npm install -g serverless-domain-manager@^7.3.8
- name: Install DVC
run: |

View file

@ -98,6 +98,16 @@ jobs:
git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH}
dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md
echo "## Scenario comparison" >> report.md
cat metrics/scenario_table.md >> report.md
echo "" >> report.md
echo "## Scenario metrics" >> report.md
cat metrics/scenario_metrics.md >> report.md
cml comment create report.md
# echo "## Residuals plot from model" >> report.md

View file

@ -8,6 +8,14 @@
"active": true
},
"sap": {
"version": "v0.14.0",
"stage": {
"dev": "v0.14.0"
},
"registered": true,
"active": true
},
"heat": {
"version": "v0.5.0",
"stage": {
"dev": "v0.5.0"
@ -15,20 +23,12 @@
"registered": true,
"active": true
},
"heat": {
"version": "v0.4.0",
"carbon": {
"version": "v0.5.0",
"stage": {
"dev": "v0.5.0"
},
"registered": true,
"active": true
},
"carbon": {
"version": "v0.4.0",
"stage": {
"dev": "v0.3.0"
},
"registered": true,
"active": true
}
}

View file

@ -1,4 +1,9 @@
modules/ml-pipeline/src/pipeline/data/predictions*
modules/ml-pipeline/src/pipeline/data/prepared_data*
modules/ml-pipeline/src/pipeline/data/model/allmodels*
modules/ml-pipeline/src/pipeline/metrics*
modules/ml-pipeline/src/pipeline/data/predictions
modules/ml-pipeline/src/pipeline/data/fit_predictions
modules/ml-pipeline/src/pipeline/data/prepared_data
modules/ml-pipeline/src/pipeline/data/model/allmodels
modules/ml-pipeline/src/pipeline/metrics
modules/ml-pipeline/src/__pycache__
modules/ml-pipeline/src/.dvc
modules/ml-pipeline/src/analysis
modules/ml-pipeline/src/metrics

View file

@ -1,4 +1,8 @@
pipeline/data/predictions*
pipeline/data/prepared_data/train.parquet*
pipeline/data/model/allmodels*
pipeline/metrics*
pipeline/data/predictions
pipeline/data/fit_predictions
pipeline/data/prepared_data/train.parquet
pipeline/data/fit_predictions
pipeline/data/model/allmodels
pipeline/metrics
pipeline/.dvc
pipeline/analysis

View file

@ -1,7 +1,7 @@
# Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow)
FROM python:3.10.12-slim
RUN apt-get update && apt-get install -y libgomp1
RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev
COPY pipeline/requirements/predictions/requirements.txt requirements.txt

View file

@ -0,0 +1,162 @@
"""
Fourth part of the pipeline:
After the model is built and metrics are generated,
we want to test this model against known scenarios
"""
import os
import pandas as pd
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.interface.InterfaceMetrics import MLMetrics
from configs.post_prediction_logic import post_prediction_logic
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.MLMetrics import metrics_factory
from core.Logger import logger
from config import settings
logger.info("--- Initiate Parameters ---")
RUNTIME_ENVIRONMENT = os.getenv("RUNTIME_ENVIRONMENT", "local")

# Sections of the Dynaconf settings object that this stage reads
client_params = settings.client
prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
generate_metrics_params = settings.generate_metrics
feature_process_params = settings.feature_processor
scenarios_params = settings.scenarios

# Individual values pulled out of those sections
model_filepath = build_model_params["model_save_filepath"]
target = feature_process_params["feature_processor_config"]["target"]
scenario_data_filepaths = scenarios_params["scenario_data_filepaths"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
comparison_output_filepath = scenarios_params["comparison_output_filepath"]
metrics_output_filepath = scenarios_params["metrics_output_filepath"]

logger.info("--- Initiate MLModel ---")
model = model_factory(model_type=build_model_params["model_type"])

logger.info("--- Initiate DataClient ---")
# Separate clients for reading and writing, each configured from the
# client section that matches its type (outputs are cached with dvc later)
input_dataclient_type = scenarios_params["input_dataclient_type"]
input_dataclient = dataclient_factory(
    dataclient_type=input_dataclient_type,
    dataclient_config=client_params[input_dataclient_type],
)
output_dataclient_type = scenarios_params["output_dataclient_type"]
output_dataclient = dataclient_factory(
    dataclient_type=output_dataclient_type,
    dataclient_config=client_params[output_dataclient_type],
)

logger.info("--- Initiate MLMetrics ---")
metrics = metrics_factory(generate_metrics_params["metrics_type"])
def generate_scenario_predictions(
    input_dataclient: DataClient,
    output_dataclient: DataClient,
    model: MLModel,
    metrics: MLMetrics,
    model_filepath: str,
    scenario_data_filepaths: list,
    predictions_column_name: str,
    comparison_output_filepath: str,
    metrics_output_filepath: str,
):
    """
    Given the new model, generate predictions for the expected scenarios
    and save a comparison table plus summary metrics.

    Args:
        input_dataclient: client used to load each scenario data file.
        output_dataclient: client used to save the two outputs.
        model: model wrapper; the model at ``model_filepath`` is loaded
            into it before predicting.
        metrics: metrics generator, called with the ``impact`` column and
            the computed ``predicted_impact`` column.
        model_filepath: location passed to ``model.load_model``.
        scenario_data_filepaths: locations of the scenario data files.
            ``None`` or an empty list means "no scenarios": empty
            dataframes are saved to both outputs and nothing is predicted.
        predictions_column_name: column name given to the predictions.
        comparison_output_filepath: where the per-row comparison
            (``uprn``, ``id``, ``impact``, ``predicted_impact``) is saved.
        metrics_output_filepath: where the metric/value table is saved.

    Returns:
        None. Results are written through ``output_dataclient``.
    """
    logger.info("--- Loading Scenario Data ---")

    # With no scenario data (None or an empty list) we still save empty
    # dataframes so that both output files exist.
    if not scenario_data_filepaths:
        logger.info("No scenario data filepaths provided")
        empty_data = pd.DataFrame()
        output_dataclient.save_data(
            obj=empty_data, location=comparison_output_filepath, save_config=None
        )
        output_dataclient.save_data(
            obj=empty_data, location=metrics_output_filepath, save_config=None
        )
        return

    # Can have multiple scenario data files: load them all, then stack once.
    # ignore_index gives the combined frame a unique 0..n-1 index; keeping
    # each file's own index would produce duplicate labels and break the
    # column-wise concat with the predictions below.
    scenario_frames = [
        input_dataclient.load_data(scenario_data_filepath, load_config=None)
        for scenario_data_filepath in scenario_data_filepaths
    ]
    scenario_data = pd.concat(scenario_frames, ignore_index=True)

    logger.info("--- Loading Model ---")
    model.load_model(model_filepath)

    logger.info("--- Generating Predictions ---")
    predictions = model.predict(
        data=scenario_data, post_prediction_logic=post_prediction_logic
    )

    logger.info("--- Generate Scenario Predicted Impact ---")
    # Align predictions to the scenario rows by position, whatever index
    # (if any) the model attached to its output.
    predictions_df = pd.DataFrame(predictions).reset_index(drop=True)
    predictions_df.columns = [predictions_column_name]
    scenario_data = pd.concat([scenario_data, predictions_df], axis=1)
    # NOTE(review): the baseline column is hard-coded to "sap_starting";
    # confirm this is the right baseline when the model target is not SAP.
    scenario_data["predicted_impact"] = abs(
        scenario_data[predictions_column_name] - scenario_data["sap_starting"]
    )

    logger.info("--- Generate Metrics ---")
    metrics_dict = metrics.generate_metrics(
        scenario_data["impact"], scenario_data["predicted_impact"]
    )
    # One row per metric, with columns "metric" and "value"
    metrics_df = pd.DataFrame(metrics_dict, index=[0]).T.reset_index()
    metrics_df.columns = ["metric", "value"]

    logger.info("--- Save prediction into metrics ---")
    output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]]
    output_dataclient.save_data(
        obj=output_df, location=comparison_output_filepath, save_config=None
    )
    output_dataclient.save_data(
        obj=metrics_df, location=metrics_output_filepath, save_config=None
    )
if __name__ == "__main__":
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("--- Generate Scenario Predictions ---")
    # All arguments are the module-level objects and settings values
    # initialised above; gather them once and hand them to the stage.
    scenario_stage_kwargs = {
        "input_dataclient": input_dataclient,
        "output_dataclient": output_dataclient,
        "model": model,
        "metrics": metrics,
        "model_filepath": model_filepath,
        "scenario_data_filepaths": scenario_data_filepaths,
        "predictions_column_name": predictions_column_name,
        "comparison_output_filepath": comparison_output_filepath,
        "metrics_output_filepath": metrics_output_filepath,
    }
    generate_scenario_predictions(**scenario_stage_kwargs)
    logger.info(f"--- {__file__} - Complete! ---")

View file

@ -37,3 +37,4 @@ Workflow:
- This experiment will have the corresponding .dvc files for the hashed model and data
- Use version control as normal
- git add, git commit etc
- To revert a change, check out the commit you want with `git checkout {COMMIT_HASH}`, then create a new branch from it with `git switch -c {NEW_BRANCH_NAME}`

View file

@ -7,6 +7,7 @@ settings = Dynaconf(
"./configs/settings.yaml",
"./configs/build_model.yaml",
"./configs/analysis.yaml",
"./configs/scenarios.yaml",
],
)

View file

@ -14,8 +14,9 @@ default:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 4000
time_limit: 1800
presets: medium_quality
excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT']
excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT']
infer_limit: 0.05
infer_limit_batch_size: 10000
ag_args_ensemble: {'num_folds_parallel': 2}

View file

@ -0,0 +1,13 @@
default:
scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
# - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -18,8 +18,10 @@ default:
prepare_data:
input_dataclient_type: aws-s3
output_dataclient_type: local
# data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
@ -37,6 +39,29 @@ default:
'number_habitable_rooms', 'number_heated_rooms']
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',
# 'walls_energy_eff_ending', 'secondheat_description_ending',
# 'property_type', 'mainheatc_energy_eff_ending', 'built_form',
# 'walls_insulation_thickness_ending', 'potential_energy_efficiency',
# 'transaction_type_ending',
# 'floor_thermal_transmittance_ending',
# 'low_energy_lighting_ending', 'heat_demand_starting',
# 'photo_supply_ending', 'carbon_starting',
# 'walls_thermal_transmittance_ending',
# 'roof_insulation_thickness_ending',
# 'total_floor_area_ending', 'number_open_fireplaces_ending',
# 'windows_energy_eff_ending',
# 'floor_height_ending',
# 'extension_count_ending',
# 'has_air_source_heat_pump_ending',
# 'charging_system_ending', 'construction_age_band', 'glazed_type_ending',
# 'roof_thermal_transmittance_ending',
# 'floor_insulation_thickness_ending', 'has_mains_gas_ending',
# 'estimated_perimeter_starting', 'energy_consumption_potential',
# 'environment_impact_potential', 'heater_type_ending',
# 'multi_glaze_proportion_ending',
# 'lighting_energy_eff_ending', 'fixed_lighting_outlets_count']
generate_predictions:
input_dataclient_type: local

View file

@ -245,7 +245,8 @@ class LocalClient:
save_methods = {
".parquet": self._save_parquet,
".json": self._save_json
".json": self._save_json,
".md": self._save_md,
# "": _save_directory(**save_config),
# ADD MORE save_methods HERE
}
@ -294,3 +295,10 @@ class LocalClient:
# Write the contents of the buffer to the local file
with open(location, "wb") as f:
f.write(buffer.getvalue())
def _save_md(self, obj: pd.DataFrame, location: str, save_config: dict):
    """
    Save object as markdown

    Args:
        obj: dataframe to write; its ``to_markdown`` method does the work.
        location: path of the markdown file to write.
        save_config: extra keyword arguments forwarded to ``to_markdown``.
            ``None`` is accepted and treated as "no extra arguments".
    """
    # Callers may pass save_config=None; unpacking None with ** would raise
    # TypeError, so fall back to an empty mapping.
    markdown_kwargs = save_config if save_config is not None else {}
    obj.to_markdown(location, **markdown_kwargs)

View file

@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel:
models = {
"SKLearnLinearRegression": SKLearnLinearRegression(),
"SKLearnSVMRegression": SKLearnSVMRegression(),
"AutogluonAutoML": AutogluonAutoML()
"AutogluonAutoML": AutogluonAutoML(),
# ADD OTHER MODELS HERE
}
@ -151,6 +151,7 @@ class AutogluonAutoML:
"excluded_model_types",
"infer_limit",
"infer_limit_batch_size",
"ag_args_ensemble",
]
def load_model(self, path: Union[Path, str]) -> None:
@ -207,6 +208,7 @@ class AutogluonAutoML:
excluded_model_types=model_hyperparameters["excluded_model_types"],
infer_limit=model_hyperparameters["infer_limit"],
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
)
def predict(

View file

@ -39,8 +39,8 @@ stages:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: heat_demand_ending
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -49,8 +49,8 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 36889932
md5: 13cd955d579de20efe743f82bc434c7e.dir
size: 37294025
nfiles: 2
build_model:
cmd: python 2_build_model.py
@ -61,8 +61,8 @@ stages:
size: 4820
- path: data/prepared_data
hash: md5
md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 36889932
md5: 13cd955d579de20efe743f82bc434c7e.dir
size: 37294025
nfiles: 2
params:
configs/build_model.yaml:
@ -79,32 +79,33 @@ stages:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error
time_limit: 4000
time_limit: 1800
presets: medium_quality
excluded_model_types:
- RF
- FASTAI
- CAT
- NN_TORCH
- KNN
- XT
infer_limit: 0.05
infer_limit_batch_size: 10000
ag_args_ensemble:
num_folds_parallel: 2
outs:
- path: data/fit_predictions/
hash: md5
md5: 7dda2f1dd257a6c5beaaa0b74eab6d5d.dir
size: 2901760
md5: b9c9ca64ea6973c409c3a7b8f8ed0c3e.dir
size: 2902493
nfiles: 1
- path: data/model/
hash: md5
md5: 741f8aed57383e860c535feb8b0adb71.dir
size: 752079341
nfiles: 32
md5: a9215bba342ed7ec3f97815dfef94e48.dir
size: 727501601
nfiles: 36
- path: metrics/fit_metrics.json
hash: md5
md5: 8eaa72b08074f735a9e54de871edc6e6
size: 221
md5: 548a431d58cd4f5a3118235dec734372
size: 219
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
@ -114,13 +115,13 @@ stages:
size: 2464
- path: data/model
hash: md5
md5: 741f8aed57383e860c535feb8b0adb71.dir
size: 752079341
nfiles: 32
md5: a9215bba342ed7ec3f97815dfef94e48.dir
size: 727501601
nfiles: 36
- path: data/prepared_data
hash: md5
md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 36889932
md5: 13cd955d579de20efe743f82bc434c7e.dir
size: 37294025
nfiles: 2
params:
configs/settings.yaml:
@ -132,8 +133,8 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: d842fe5350a3330c4c17e7e21c6359b2.dir
size: 380489
md5: 484781d6b359e458a25e9ab728d6514d.dir
size: 380517
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
@ -144,13 +145,13 @@ stages:
size: 3447
- path: data/predictions
hash: md5
md5: d842fe5350a3330c4c17e7e21c6359b2.dir
size: 380489
md5: 484781d6b359e458a25e9ab728d6514d.dir
size: 380517
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 4cec69f112537658f14eb3cb678f91e3.dir
size: 36889932
md5: 13cd955d579de20efe743f82bc434c7e.dir
size: 37294025
nfiles: 2
params:
configs/settings.yaml:
@ -160,5 +161,30 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 2632fa5d0a38763c177bf0466a670c8b
md5: 4d246765aff7c45079d02b4d8f7527f7
size: 220
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- path: 5_generate_scenarios.py
hash: md5
md5: 40506749fefd926d47c60ff5b16db307
size: 5337
params:
configs/scenarios.yaml:
default.scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md
outs:
- path: metrics/scenario_metrics.md
hash: md5
md5: d9fbb5c725258b82c465ddd9f86f9c16
size: 377
- path: metrics/scenario_table.md
hash: md5
md5: 396d20b1a049d5f93fc38a409c4ca497
size: 2133

View file

@ -71,6 +71,17 @@ stages:
outs:
- metrics/metrics.json
always_changed: true
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- 5_generate_scenarios.py
params:
- configs/scenarios.yaml:
- default.scenarios
outs:
- metrics/scenario_table.md
- metrics/scenario_metrics.md
always_changed: true
metrics:
- metrics/metrics.json
- metrics/fit_metrics.json

View file

@ -1,2 +1,4 @@
/fit_metrics.json
/metrics.json
/scenario_table.md
/scenario_metrics.md

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon==1.0.0
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon==1.0.0
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1
pyarrow==13.0.0
PyYAML==6.0.1

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon==1.0.0
autogluon.tabular[all]==1.0.0
ray==2.6.3
dynaconf==3.2.1
alibi==0.9.5

View file

@ -1,4 +1,4 @@
boto3==1.28.41
pandas==2.1.4
autogluon==1.0.0
dynaconf==3.2.1
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1

View file

@ -1,4 +1,4 @@
dvc==3.36.0
dvc-s3==3.0.1
gto==1.6.1
dvc==3.51.0
dvc-s3==3.2.0
gto==1.7.1
pyOpenSSL==23.3.0