Merge pull request #104 from Hestia-Homes/sap-dev-model

Sap dev model
This commit is contained in:
KhalimCK 2024-03-28 09:29:53 +00:00 committed by GitHub
commit e746352977
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 348 additions and 58 deletions

View file

@ -98,6 +98,16 @@ jobs:
git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH}
dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md
echo "## Scenario comparison" >> report.md
cat metrics/scenario_table.md >> report.md
echo "" >> report.md
echo "## Scenario metrics" >> report.md
cat metrics/scenario_metrics.md >> report.md
cml comment create report.md
# echo "## Residuals plot from model" >> report.md

View file

@ -0,0 +1,150 @@
"""
Fifth part of the pipeline:
After the model is built and metrics are generated,
we want to test this model against known scenarios
"""
import os
import pandas as pd
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.interface.InterfaceMetrics import MLMetrics
from configs.post_prediction_logic import post_prediction_logic
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.MLMetrics import metrics_factory
from core.Logger import logger
from config import settings
# Module-level setup: runs on import, before the ``__main__`` guard, and
# builds the objects that are later passed to ``generate_scenario_predictions``.
logger.info("--- Initiate Parameters ---")
# NOTE(review): not referenced again in the visible part of this script.
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

# Configuration sections read from ``config.settings``.
client_params = settings.client
prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
generate_metrics_params = settings.generate_metrics
feature_process_params = settings.feature_processor
scenarios_params = settings.scenarios

# Individual values pulled out of the sections above.
model_filepath = build_model_params["model_save_filepath"]
target = feature_process_params["feature_processor_config"]["target"]
scenario_data_filepaths = scenarios_params["scenario_data_filepaths"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
comparison_output_filepath = scenarios_params["comparison_output_filepath"]
metrics_output_filepath = scenarios_params["metrics_output_filepath"]

logger.info("--- Initiate MLModel ---")
model = model_factory(build_model_params["model_type"])

logger.info("--- Initiate DataClient ---")
# Use data client for input and output, as we use dvc to cache later to the cloud
input_dataclient_type = scenarios_params["input_dataclient_type"]
input_dataclient = dataclient_factory(
    dataclient_type=input_dataclient_type,
    dataclient_config=client_params[input_dataclient_type],
)
output_dataclient_type = scenarios_params["output_dataclient_type"]
output_dataclient = dataclient_factory(
    dataclient_type=output_dataclient_type,
    dataclient_config=client_params[output_dataclient_type],
)

logger.info("--- Initiate MLMetrics ---")
metrics = metrics_factory(generate_metrics_params["metrics_type"])
def generate_scenario_predictions(
    input_dataclient: DataClient,
    output_dataclient: DataClient,
    model: MLModel,
    metrics: MLMetrics,
    model_filepath: str,
    scenario_data_filepaths: list,
    predictions_column_name: str,
    comparison_output_filepath: str,
    metrics_output_filepath: str,
):
    """
    Given the new model, generate predictions for the expected scenarios,
    compare them with the recorded impact and save both the comparison
    table and the resulting metrics.

    Args:
        input_dataclient: Client used to load each scenario data file.
        output_dataclient: Client used to save the two output tables.
        model: Model wrapper; ``load_model`` and ``predict`` are called on it.
        metrics: Metrics object; ``generate_metrics`` is called on it.
        model_filepath: Location handed to ``model.load_model``.
        scenario_data_filepaths: One or more scenario data locations. All
            files are stacked row-wise into a single frame.
        predictions_column_name: Column name given to the model predictions.
        comparison_output_filepath: Where the per-row comparison table
            (``uprn``, ``id``, ``impact``, ``predicted_impact``) is saved.
        metrics_output_filepath: Where the metric/value table is saved.

    Raises:
        ValueError: If ``scenario_data_filepaths`` is empty.

    The scenario data must contain the columns ``sap_starting``, ``impact``,
    ``uprn`` and ``id``, all of which are read below.
    """
    logger.info("--- Loading Scenario Data ---")
    if not scenario_data_filepaths:
        raise ValueError("scenario_data_filepaths must contain at least one path")

    # Can have multiple scenario data files: load them all, then stack once.
    scenario_frames = [
        input_dataclient.load_data(scenario_data_filepath, load_config=None)
        for scenario_data_filepath in scenario_data_filepaths
    ]
    # ignore_index gives every row a unique label, so the column-wise concat
    # with the predictions below cannot hit duplicate labels across files.
    scenario_data = pd.concat(scenario_frames, ignore_index=True)

    logger.info("--- Loading Model ---")
    model.load_model(model_filepath)

    logger.info("--- Generating Predictions ---")
    predictions = model.predict(
        data=scenario_data, post_prediction_logic=post_prediction_logic
    )

    logger.info("--- Generate Scenario Predicted Impact ---")
    predictions_df = pd.DataFrame(predictions)
    predictions_df.columns = [predictions_column_name]
    scenario_data = pd.concat([scenario_data, predictions_df], axis=1)
    # Predicted impact is the absolute difference between the predicted value
    # and the starting SAP value.
    scenario_data["predicted_impact"] = (
        scenario_data[predictions_column_name] - scenario_data["sap_starting"]
    ).abs()

    logger.info("--- Generate Metrics ---")
    metrics_dict = metrics.generate_metrics(
        scenario_data["impact"], scenario_data["predicted_impact"]
    )
    # Reshape the metrics mapping into a two-column (metric, value) table.
    metrics_df = pd.DataFrame(metrics_dict, index=[0]).T.reset_index()
    metrics_df.columns = ["metric", "value"]

    logger.info("--- Save prediction into metrics ---")
    output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]]
    output_dataclient.save_data(
        obj=output_df, location=comparison_output_filepath, save_config=None
    )
    output_dataclient.save_data(
        obj=metrics_df, location=metrics_output_filepath, save_config=None
    )
if __name__ == "__main__":
    logger.info(f"--- {__file__} - Start! ---")
    logger.info("--- Generate Scenario Predictions ---")
    # Gather the module-level objects built above into one keyword mapping
    # and hand them to the scenario run in a single call.
    scenario_run_kwargs = {
        "input_dataclient": input_dataclient,
        "output_dataclient": output_dataclient,
        "model": model,
        "metrics": metrics,
        "model_filepath": model_filepath,
        "scenario_data_filepaths": scenario_data_filepaths,
        "predictions_column_name": predictions_column_name,
        "comparison_output_filepath": comparison_output_filepath,
        "metrics_output_filepath": metrics_output_filepath,
    }
    generate_scenario_predictions(**scenario_run_kwargs)
    logger.info(f"--- {__file__} - Complete! ---")

View file

@ -7,6 +7,7 @@ settings = Dynaconf(
"./configs/settings.yaml",
"./configs/build_model.yaml",
"./configs/analysis.yaml",
"./configs/scenarios.yaml",
],
)

View file

@ -14,8 +14,9 @@ default:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 4000
time_limit: 1800
presets: medium_quality
excluded_model_types: ['RF', 'FASTAI', 'CAT', 'NN_TORCH', 'KNN', 'XT']
excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT']
infer_limit: 0.05
infer_limit_batch_size: 10000
ag_args_ensemble: {'num_folds_parallel': 2}

View file

@ -0,0 +1,10 @@
default:
scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
# - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -18,13 +18,8 @@ default:
prepare_data:
input_dataclient_type: aws-s3
output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
# data_filepath: s3://retrofit-datalake-dev/dataset_with0perm_all.parquet
train_proportion: 1
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
@ -35,9 +30,35 @@ default:
subsample_seed: 0
target: sap_ending
identifier_columns: ["uprn"]
drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms']
retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',
# 'walls_energy_eff_ending', 'secondheat_description_ending',
# 'property_type', 'mainheatc_energy_eff_ending', 'built_form',
# 'walls_insulation_thickness_ending', 'potential_energy_efficiency',
# 'transaction_type_ending',
# 'floor_thermal_transmittance_ending',
# 'low_energy_lighting_ending', 'heat_demand_starting',
# 'photo_supply_ending', 'carbon_starting',
# 'walls_thermal_transmittance_ending',
# 'roof_insulation_thickness_ending',
# 'total_floor_area_ending', 'number_open_fireplaces_ending',
# 'windows_energy_eff_ending',
# 'floor_height_ending',
# 'extension_count_ending',
# 'has_air_source_heat_pump_ending',
# 'charging_system_ending', 'construction_age_band', 'glazed_type_ending',
# 'roof_thermal_transmittance_ending',
# 'floor_insulation_thickness_ending', 'has_mains_gas_ending',
# 'estimated_perimeter_starting', 'energy_consumption_potential',
# 'environment_impact_potential', 'heater_type_ending',
# 'multi_glaze_proportion_ending',
# 'lighting_energy_eff_ending', 'fixed_lighting_outlets_count']
generate_predictions:
input_dataclient_type: local

View file

@ -245,7 +245,8 @@ class LocalClient:
save_methods = {
".parquet": self._save_parquet,
".json": self._save_json
".json": self._save_json,
".md": self._save_md,
# "": _save_directory(**save_config),
# ADD MORE save_methods HERE
}
@ -294,3 +295,10 @@ class LocalClient:
# Write the contents of the buffer to the local file
with open(location, "wb") as f:
f.write(buffer.getvalue())
def _save_md(self, obj: pd.DataFrame, location: str, save_config: dict):
    """
    Save object as markdown

    Args:
        obj: DataFrame to write; its ``to_markdown`` method does the work.
        location: Output target passed to ``to_markdown``.
        save_config: Extra keyword arguments forwarded to ``to_markdown``.
            ``None`` is accepted and treated as "no extra options".
    """
    # ``**None`` raises TypeError, so fall back to an empty mapping when the
    # caller supplies no save options.
    obj.to_markdown(location, **(save_config or {}))

View file

@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel:
models = {
"SKLearnLinearRegression": SKLearnLinearRegression(),
"SKLearnSVMRegression": SKLearnSVMRegression(),
"AutogluonAutoML": AutogluonAutoML()
"AutogluonAutoML": AutogluonAutoML(),
# ADD OTHER MODELS HERE
}
@ -151,6 +151,7 @@ class AutogluonAutoML:
"excluded_model_types",
"infer_limit",
"infer_limit_batch_size",
"ag_args_ensemble",
]
def load_model(self, path: Union[Path, str]) -> None:
@ -207,6 +208,7 @@ class AutogluonAutoML:
excluded_model_types=model_hyperparameters["excluded_model_types"],
infer_limit=model_hyperparameters["infer_limit"],
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
)
def predict(

View file

@ -1,5 +1,16 @@
schema: '2.0'
stages:
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics
prepare_data:
cmd: python 1_prepare_data.py
deps:
@ -17,22 +28,69 @@ stages:
- carbon_ending
- days_to_starting
- days_to_ending
- number_habitable_rooms_starting
- number_habitable_rooms_ending
- number_heated_rooms_starting
- number_heated_rooms_ending
- number_habitable_rooms
- number_heated_rooms
default.feature_processor.feature_processor_config.retain_features:
- uprn
- sap_starting
- hot_water_energy_eff_ending
- mainheat_energy_eff_ending
- constituency
- roof_energy_eff_ending
- walls_energy_eff_ending
- secondheat_description_ending
- property_type
- mainheatc_energy_eff_ending
- built_form
- walls_insulation_thickness_ending
- potential_energy_efficiency
- transaction_type_ending
- floor_thermal_transmittance_ending
- low_energy_lighting_ending
- heat_demand_starting
- photo_supply_ending
- carbon_starting
- walls_thermal_transmittance_ending
- roof_insulation_thickness_ending
- total_floor_area_ending
- number_open_fireplaces_ending
- windows_energy_eff_ending
- floor_height_ending
- extension_count_ending
- has_air_source_heat_pump_ending
- charging_system_ending
- construction_age_band
- glazed_type_ending
- roof_thermal_transmittance_ending
- floor_insulation_thickness_ending
- has_mains_gas_ending
- estimated_perimeter_starting
- energy_consumption_potential
- environment_impact_potential
- heater_type_ending
- multi_glaze_proportion_ending
- lighting_energy_eff_ending
- fixed_lighting_outlets_count
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: sap_ending
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
default.prepare_data.output_train_filepath: ./data/prepared_data/train.parquet
default.prepare_data.train_proportion: 1
default.prepare_data.train_proportion: 0.9
outs:
- path: data/prepared_data/
hash: md5
md5: 3c77fa10cd1cd503eb4d2540394629f6.dir
size: 42626894
md5: efa416abea618ae6220a0c3d597603cf.dir
size: 44750997
nfiles: 2
build_model:
cmd: python 2_build_model.py
@ -43,8 +101,8 @@ stages:
size: 4820
- path: data/prepared_data
hash: md5
md5: 3c77fa10cd1cd503eb4d2540394629f6.dir
size: 42626894
md5: efa416abea618ae6220a0c3d597603cf.dir
size: 44750997
nfiles: 2
params:
configs/build_model.yaml:
@ -61,32 +119,33 @@ stages:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error
time_limit: 4000
time_limit: 1800
presets: medium_quality
excluded_model_types:
- RF
- FASTAI
- CAT
- NN_TORCH
- KNN
- XT
infer_limit: 0.05
infer_limit_batch_size: 10000
ag_args_ensemble:
num_folds_parallel: 2
outs:
- path: data/fit_predictions/
hash: md5
md5: e0a11ac6e4adf69d6180c0217c639a0e.dir
size: 3680908
md5: de46250d454c4d713ab580b10ff3fd31.dir
size: 3349318
nfiles: 1
- path: data/model/
hash: md5
md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir
size: 805896324
nfiles: 31
md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir
size: 735951861
nfiles: 35
- path: metrics/fit_metrics.json
hash: md5
md5: 0ed5b1141bbb8bc3156e7c056b29f3cd
size: 225
md5: 8a952a5e884c268e6059357a627b9251
size: 224
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
@ -96,13 +155,13 @@ stages:
size: 2464
- path: data/model
hash: md5
md5: bdaaf823857f9dc7b6ee2d4b88927cc1.dir
size: 805896324
nfiles: 31
md5: 18bd7a93ece75a65d3a950b7dfdab4fb.dir
size: 735951861
nfiles: 35
- path: data/prepared_data
hash: md5
md5: 3c77fa10cd1cd503eb4d2540394629f6.dir
size: 42626894
md5: efa416abea618ae6220a0c3d597603cf.dir
size: 44750997
nfiles: 2
params:
configs/settings.yaml:
@ -114,8 +173,8 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 38707d16ae1e2330cc03f524db9cdd60.dir
size: 648730
md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir
size: 463563
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
@ -126,13 +185,13 @@ stages:
size: 3484
- path: data/predictions
hash: md5
md5: 38707d16ae1e2330cc03f524db9cdd60.dir
size: 648730
md5: 07ef721a0dc94a52e3ba7a70ac45b8ff.dir
size: 463563
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 3c77fa10cd1cd503eb4d2540394629f6.dir
size: 42626894
md5: efa416abea618ae6220a0c3d597603cf.dir
size: 44750997
nfiles: 2
params:
configs/settings.yaml:
@ -142,16 +201,30 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 145e7ac84ab4a4407b23695a632b4d91
size: 226
startup_cleanup:
cmd: python 0_startup_cleanup.py
md5: 9f863f47799d42c101eba3b03a179455
size: 224
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- path: 0_startup_cleanup.py
- path: 5_generate_scenarios.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
md5: a18f6c6ae2082f038df47386cf3e418e
size: 4896
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics
configs/scenarios.yaml:
default.scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md
outs:
- path: metrics/scenario_metrics.md
hash: md5
md5: 64e7db945ff655ae03c20c9845f19106
size: 363
- path: metrics/scenario_table.md
hash: md5
md5: d4f8afe07b774374aeaa48f1b7b8a5fc
size: 2133

View file

@ -71,6 +71,17 @@ stages:
outs:
- metrics/metrics.json
always_changed: true
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- 5_generate_scenarios.py
params:
- configs/scenarios.yaml:
- default.scenarios
outs:
- metrics/scenario_table.md
- metrics/scenario_metrics.md
always_changed: true
metrics:
- metrics/metrics.json
- metrics/fit_metrics.json

View file

@ -1,2 +1,4 @@
/fit_metrics.json
/metrics.json
/scenario_table.md
/scenario_metrics.md

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==1.5.3
autogluon==0.8.2
pandas==2.1.4
autogluon==1.0.0
dynaconf==3.2.0
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==1.5.3
autogluon==0.8.2
pandas==2.1.4
autogluon==1.0.0
dynaconf==3.2.0
pyarrow==13.0.0
PyYAML==6.0.1

View file

@ -1,9 +1,10 @@
joblib==1.3.2
boto3==1.28.17
pandas==1.5.3
autogluon==0.8.2
pandas==2.1.4
autogluon==1.0.0
ray==2.6.3
dynaconf==3.2.0
alibi==0.9.4
alibi==0.9.5
shap==0.42.1
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,4 +1,4 @@
boto3==1.28.41
pandas==1.5.3
autogluon==0.8.2
pandas==2.1.4
autogluon==1.0.0
dynaconf==3.2.0