Compare commits

..

11 commits

Author SHA1 Message Date
quandanrepo
676539e6a7
Merge pull request #86 from Hestia-Homes/heat-dev-model
Heat dev model
2023-11-27 22:16:44 +00:00
quandanrepo
890ca15193
Merge branch 'heat-dev' into heat-dev-model 2023-11-27 22:09:53 +00:00
Michael Duong
5a9eb608bd commit first heat-model 2023-11-27 22:06:18 +00:00
Michael Duong
f4f8dc2bf2 Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-11-27 21:51:03 +00:00
Github-Bot
2d331736a4 Update Registry 2023-10-10 12:47:01 +00:00
Github-Bot
7d685caaf5 Update Registry 2023-10-10 12:46:02 +00:00
quandanrepo
dffb01bf8e
Merge pull request #67 from Hestia-Homes/heat-dev-model
Heat dev model
2023-10-10 13:45:23 +01:00
Michael Duong
d2a7615e3b Merge branch 'master' of github.com:Hestia-Homes/ML into heat-dev-model 2023-10-10 12:33:51 +00:00
Michael Duong
4c6c5330d8 add new model, new branch 2023-10-10 12:33:44 +00:00
Michael Duong
9e7d0fa538 add new model 2023-10-10 12:32:25 +00:00
Michael Duong
ad2c266727 initial model for heat-dev 2023-10-09 17:52:47 +00:00
36 changed files with 119 additions and 484 deletions

View file

@ -1,9 +0,0 @@
modules/ml-pipeline/src/pipeline/data/predictions
modules/ml-pipeline/src/pipeline/data/fit_predictions
modules/ml-pipeline/src/pipeline/data/prepared_data
modules/ml-pipeline/src/pipeline/data/model/allmodels
modules/ml-pipeline/src/pipeline/metrics
modules/ml-pipeline/src/pipeline/__pycache__
modules/ml-pipeline/src/pipeline/.dvc
modules/ml-pipeline/src/pipeline/analysis
modules/ml-pipeline/src/pipeline/metrics

View file

@ -19,8 +19,8 @@ jobs:
- name: Install Serverless and plugins
run: |
npm install -g serverless@^3.38.0
npm install -g serverless-domain-manager@^7.3.8
npm install -g serverless
npm install -g serverless-domain-manager
- name: Install DVC
run: |

View file

@ -98,16 +98,6 @@ jobs:
git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH}
dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md
echo "## Scenario comparison" >> report.md
cat metrics/scenario_table.md >> report.md
echo "" >> report.md
echo "## Scenario metrics" >> report.md
cat metrics/scenario_metrics.md >> report.md
cml comment create report.md
# echo "## Residuals plot from model" >> report.md

View file

@ -8,25 +8,17 @@
"active": true
},
"sap": {
"version": "v0.14.0",
"version": "v0.1.0",
"stage": {
"dev": "v0.14.0"
"dev": "v0.1.0"
},
"registered": true,
"active": true
},
"heat": {
"version": "v0.5.0",
"version": "v0.0.1",
"stage": {
"dev": "v0.5.0"
},
"registered": true,
"active": true
},
"carbon": {
"version": "v0.5.0",
"stage": {
"dev": "v0.5.0"
"dev": "v0.0.1"
},
"registered": true,
"active": true

View file

@ -1,9 +0,0 @@
modules/ml-pipeline/src/pipeline/data/predictions
modules/ml-pipeline/src/pipeline/data/fit_predictions
modules/ml-pipeline/src/pipeline/data/prepared_data
modules/ml-pipeline/src/pipeline/data/model/allmodels
modules/ml-pipeline/src/pipeline/metrics
modules/ml-pipeline/src/__pycache__
modules/ml-pipeline/src/.dvc
modules/ml-pipeline/src/analysis
modules/ml-pipeline/src/metrics

View file

@ -9,7 +9,7 @@ ARG RUNTIME_ENVIRONMENT
ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT}
# Install necessary build tools - required to test locally
RUN yum install -y gcc python3-devel gcc-c++
RUN yum install -y gcc python3-devel
# Install python packages
COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt

3
modules/ml-pipeline/.dvc/.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
/config.local
/tmp
/cache

View file

@ -0,0 +1,2 @@
['remote "myremote"']
url = /tmp/dvcstore

View file

@ -0,0 +1,3 @@
# Add patterns of files dvc should ignore, which could improve
# the performance. Learn more at
# https://dvc.org/doc/user-guide/dvcignore

2
modules/ml-pipeline/.gto Normal file
View file

@ -0,0 +1,2 @@
# .gto config file
stages: [dev, stage, prod] # list of allowed Stages

View file

@ -1,8 +0,0 @@
pipeline/data/predictions
pipeline/data/fit_predictions
pipeline/data/prepared_data/train.parquet
pipeline/data/fit_predictions
pipeline/data/model/allmodels
pipeline/metrics
pipeline/.dvc
pipeline/analysis

View file

@ -1,7 +1,7 @@
# Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow)
FROM python:3.10.12-slim
RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev
RUN apt-get update && apt-get install -y libgomp1
COPY pipeline/requirements/predictions/requirements.txt requirements.txt

View file

@ -87,8 +87,7 @@ def prepare_data(
if train_proportion == 1:
train = data
# Sample 10% of the data for testing
test = data.sample(round(len(data) * 0.1))
test = None
else:
train, test = train_test_split(
data, train_size=train_proportion, test_size=(1 - train_proportion)

View file

@ -26,12 +26,9 @@ prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
feature_process_params = settings.feature_processor
generate_metrics_params = settings.generate_metrics
generate_predictions_params = settings.generate_predictions
model_type = build_model_params["model_type"]
target = feature_process_params["feature_processor_config"]["target"]
fit_predictions_filepath = build_model_params["fit_predictions_filepath"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
identifier_columns = feature_process_params["feature_processor_config"][
"identifier_columns"
]
@ -63,8 +60,6 @@ def build_model(
identifier_columns: List[str],
model_save_location: str,
model_hyperparameters: dict,
fit_predictions_filepath: str,
predictions_column_name: str,
fit_metrics_filepath: str,
train_filepath: Union[str, None] = None,
test_filepath: Union[str, None] = None,
@ -98,15 +93,6 @@ def build_model(
data=train_data, post_prediction_logic=post_prediction_logic
)
logger.info("--- Saving fit predictions ---")
predictions_df = pd.DataFrame(fit_predictions)
predictions_df.columns = [predictions_column_name]
dataclient.save_data(
obj=predictions_df, location=fit_predictions_filepath, save_config=None
)
logger.info("--- Generating fit metrics ---")
metrics_output = metrics.generate_metrics(
@ -142,8 +128,6 @@ if __name__ == "__main__":
train_filepath=train_filepath,
test_filepath=test_filepath,
fit_metrics_filepath=fit_metrics_filepath,
fit_predictions_filepath=fit_predictions_filepath,
predictions_column_name=predictions_column_name,
)
logger.info(f"--- {__file__} - Complete! ---")

View file

@ -4,9 +4,7 @@ After the model is built, we can evaluate its performance
"""
import os
import yaml
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceMetrics import MLMetrics
from core.interface.InterfaceDataClient import DataClient
@ -33,6 +31,7 @@ predictions_output_filepath = generate_predictions_params["predictions_output_fi
predictions_column_name = generate_predictions_params["predictions_column_name"]
metrics_output_filepath = generate_metrics_params["metrics_output_filepath"]
logger.info(f"--- Initiate MLModel ---")
model = model_factory(build_model_params["model_type"])

View file

@ -1,162 +0,0 @@
"""
Fourth part of the pipeline:
After the model is built and metrics are generated,
we want to test this model against known scenarios
"""
import os
import pandas as pd
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.interface.InterfaceMetrics import MLMetrics
from configs.post_prediction_logic import post_prediction_logic
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.MLMetrics import metrics_factory
from core.Logger import logger
from config import settings
logger.info(f"--- Initiate Parameters ---")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
client_params = settings.client
prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
generate_metrics_params = settings.generate_metrics
feature_process_params = settings.feature_processor
scenarios_params = settings.scenarios
model_filepath = build_model_params["model_save_filepath"]
target = feature_process_params["feature_processor_config"]["target"]
scenario_data_filepaths = scenarios_params["scenario_data_filepaths"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
comparison_output_filepath = scenarios_params["comparison_output_filepath"]
metrics_output_filepath = scenarios_params["metrics_output_filepath"]
logger.info(f"--- Initiate MLModel ---")
model = model_factory(build_model_params["model_type"])
logger.info(f"--- Initiate DataClient ---")
# Use data client for input and output, as we use dvc to cache later to the cloud
input_dataclient_type = scenarios_params["input_dataclient_type"]
input_dataclient = dataclient_factory(
dataclient_type=input_dataclient_type,
dataclient_config=client_params[input_dataclient_type],
)
output_dataclient_type = scenarios_params["output_dataclient_type"]
output_dataclient = dataclient_factory(
dataclient_type=output_dataclient_type,
dataclient_config=client_params[output_dataclient_type],
)
logger.info(f"--- Initiate MLMetrics ---")
metrics = metrics_factory(generate_metrics_params["metrics_type"])
def generate_scenario_predictions(
input_dataclient: DataClient,
output_dataclient: DataClient,
model: MLModel,
metrics: MLMetrics,
model_filepath: str,
scenario_data_filepaths: list,
predictions_column_name: str,
comparison_output_filepath: str,
metrics_output_filepath: str,
):
"""
Given the new model, we generate prediction for expected scenarios
"""
logger.info("--- Loading Scenario Data ---")
scenario_data = pd.DataFrame()
# If we have no scenario data, we can save empty dataframes
if scenario_data_filepaths is None:
logger.info("No scenario data filepaths provided")
output_dataclient.save_data(
obj=scenario_data, location=comparison_output_filepath, save_config=None
)
output_dataclient.save_data(
obj=scenario_data, location=metrics_output_filepath, save_config=None
)
return
# Can have multiple scenario data files
for scenario_data_filepath in scenario_data_filepaths:
scenario_data = pd.concat(
[
scenario_data,
input_dataclient.load_data(scenario_data_filepath, load_config=None),
]
)
logger.info("--- Loading Model ---")
model.load_model(model_filepath)
logger.info("--- Generating Predictions ---")
predictions = model.predict(
data=scenario_data, post_prediction_logic=post_prediction_logic
)
logger.info("--- Generate Scenario Predicted Impact ---")
predictions_df = pd.DataFrame(predictions)
predictions_df.columns = [predictions_column_name]
scenario_data = pd.concat([scenario_data, predictions_df], axis=1)
scenario_data["predicted_impact"] = abs(
scenario_data[predictions_column_name] - scenario_data["sap_starting"]
)
logger.info("--- Generate Metrics ---")
metrics_dict = metrics.generate_metrics(
scenario_data["impact"], scenario_data["predicted_impact"]
)
metrics_df = pd.DataFrame(metrics_dict, index=[0]).T.reset_index()
metrics_df.columns = ["metric", "value"]
logger.info("--- Save prediction into metrics ---")
output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]]
output_dataclient.save_data(
obj=output_df, location=comparison_output_filepath, save_config=None
)
output_dataclient.save_data(
obj=metrics_df, location=metrics_output_filepath, save_config=None
)
if __name__ == "__main__":
logger.info(f"--- {__file__} - Start! ---")
logger.info(f"--- Generate Scenario Predictions ---")
generate_scenario_predictions(
input_dataclient=input_dataclient,
output_dataclient=output_dataclient,
model=model,
metrics=metrics,
model_filepath=model_filepath,
scenario_data_filepaths=scenario_data_filepaths,
predictions_column_name=predictions_column_name,
comparison_output_filepath=comparison_output_filepath,
metrics_output_filepath=metrics_output_filepath,
)
logger.info(f"--- {__file__} - Complete! ---")

View file

@ -37,4 +37,3 @@ Workflow:
- This experiment will have the corresponding .dvc files for the hashed model and data
- Use version control as normal
- git add, git commit etc
- To revert change, use `git checkout {COMMIT_HASH}`, followed by `git switch -c {NEW_BRANCH_NAME}`

View file

@ -7,7 +7,6 @@ settings = Dynaconf(
"./configs/settings.yaml",
"./configs/build_model.yaml",
"./configs/analysis.yaml",
"./configs/scenarios.yaml",
],
)

View file

@ -13,4 +13,4 @@ default:
dataclient_type: local
nshap_samples: 100 # how many samples to use to approximate each Shapely value, larger values will be slower
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
row_index: [20695, 50243, 7653] # index of an example datapoint
row_index: [0, 10, 20] # index of an example datapoint

View file

@ -3,7 +3,6 @@ default:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json
fit_predictions_filepath: ./data/fit_predictions/predictions.parquet
SKLearnLinearRegression: null
@ -14,9 +13,8 @@ default:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 1800
time_limit: 400
presets: medium_quality
excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT']
excluded_model_types: ['KNN', 'RF']
infer_limit: 0.05
infer_limit_batch_size: 10000
ag_args_ensemble: {'num_folds_parallel': 2}

View file

@ -9,39 +9,17 @@ Business Logic dict + functions
def remove_starting_columns(df):
keep_column_index = [
False if col_name.endswith("_starting") else True
False if col_name.endswith("_STARTING") else True
for col_name in list(df.columns)
]
keep_columns = df.columns[keep_column_index].to_list()
keep_columns.append("sap_starting")
keep_columns.append("SAP_STARTING")
df = df[keep_columns]
return df
def remove_floor_height_ending(df):
# df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
# shows bottom 0.5 percentile is 1.665
# So keep anything above this
df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
print("we in here")
return df
def remove_minimum_habitable_room_size(df):
# Need minimum of 6.5m per habitable room
df = df[
df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
].reset_index(drop=True)
return df
def keep_flats(df):
df = df[df["property_type"] == "Flat"]
return df
def keep_non_zero_rdsap(df):
df = df[df["rdsap_change"] != 0]
def keep_negative_heat_change(df):
df = df[df["HEAT_DEMAND_CHANGE"] < 0]
return df
@ -54,10 +32,7 @@ def keep_non_zero_rdsap(df):
# return df
business_logic = {
# "keep_non_zero_rdsap": keep_non_zero_rdsap,
# "keep_flats": keep_flats,
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
# "remove_floor_height_ending": remove_floor_height_ending
"keep_negative_heat_change": keep_negative_heat_change
# "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns
}

View file

@ -5,7 +5,7 @@ import pandas as pd
def clip_predictions_to_minimum_value(
data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0
data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1
) -> pd.Series:
series_name = predictions.name
@ -13,10 +13,10 @@ def clip_predictions_to_minimum_value(
predictions_df = pd.concat([data, predictions], axis=1)
# We expect all prediction to be atleast one point improvement
replace_index = (
predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
predictions_df["predictions"] > predictions_df["HEAT_DEMAND_STARTING"] - 1
)
predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "sap_starting"] + minimum_value
predictions_df.loc[replace_index, "HEAT_DEMAND_STARTING"] - minimum_value
)
predictions_new = predictions_df["predictions"]

View file

@ -1,13 +0,0 @@
default:
scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
# - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -18,10 +18,10 @@ default:
prepare_data:
input_dataclient_type: aws-s3
output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
@ -31,37 +31,11 @@ default:
feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: sap_ending
identifier_columns: ["uprn"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms']
target: HEAT_DEMAND_ENDING
identifier_columns: ["UPRN"]
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "SAP_ENDING", "CARBON_ENDING"]
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',
# 'walls_energy_eff_ending', 'secondheat_description_ending',
# 'property_type', 'mainheatc_energy_eff_ending', 'built_form',
# 'walls_insulation_thickness_ending', 'potential_energy_efficiency',
# 'transaction_type_ending',
# 'floor_thermal_transmittance_ending',
# 'low_energy_lighting_ending', 'heat_demand_starting',
# 'photo_supply_ending', 'carbon_starting',
# 'walls_thermal_transmittance_ending',
# 'roof_insulation_thickness_ending',
# 'total_floor_area_ending', 'number_open_fireplaces_ending',
# 'windows_energy_eff_ending',
# 'floor_height_ending',
# 'extension_count_ending',
# 'has_air_source_heat_pump_ending',
# 'charging_system_ending', 'construction_age_band', 'glazed_type_ending',
# 'roof_thermal_transmittance_ending',
# 'floor_insulation_thickness_ending', 'has_mains_gas_ending',
# 'estimated_perimeter_starting', 'energy_consumption_potential',
# 'environment_impact_potential', 'heater_type_ending',
# 'multi_glaze_proportion_ending',
# 'lighting_energy_eff_ending', 'fixed_lighting_outlets_count']
generate_predictions:
input_dataclient_type: local

View file

@ -245,8 +245,7 @@ class LocalClient:
save_methods = {
".parquet": self._save_parquet,
".json": self._save_json,
".md": self._save_md,
".json": self._save_json
# "": _save_directory(**save_config),
# ADD MORE save_methods HERE
}
@ -295,10 +294,3 @@ class LocalClient:
# Write the contents of the buffer to the local file
with open(location, "wb") as f:
f.write(buffer.getvalue())
def _save_md(self, obj: pd.DataFrame, location: str, save_config: dict):
"""
Save object as markdown
"""
obj.to_markdown(location, **save_config)

View file

@ -25,7 +25,7 @@ def model_factory(model_type: str) -> MLModel:
models = {
"SKLearnLinearRegression": SKLearnLinearRegression(),
"SKLearnSVMRegression": SKLearnSVMRegression(),
"AutogluonAutoML": AutogluonAutoML(),
"AutogluonAutoML": AutogluonAutoML()
# ADD OTHER MODELS HERE
}
@ -151,7 +151,6 @@ class AutogluonAutoML:
"excluded_model_types",
"infer_limit",
"infer_limit_batch_size",
"ag_args_ensemble",
]
def load_model(self, path: Union[Path, str]) -> None:
@ -208,7 +207,6 @@ class AutogluonAutoML:
excluded_model_types=model_hyperparameters["excluded_model_types"],
infer_limit=model_hyperparameters["infer_limit"],
infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"],
ag_args_ensemble=model_hyperparameters["ag_args_ensemble"],
)
def predict(

View file

@ -1,46 +1,26 @@
schema: '2.0'
stages:
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 0_startup_cleanup.py
hash: md5
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics
prepare_data:
cmd: python 1_prepare_data.py
deps:
- path: 1_prepare_data.py
hash: md5
md5: 11a3b8bfdfe199ab7ecc39ccc5652649
size: 4298
md5: 896d3d88a4a9f68d174efe71dc089517
size: 4222
params:
configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns:
- heat_demand_change
- carbon_change
- rdsap_change
- heat_demand_ending
- carbon_ending
- days_to_starting
- days_to_ending
- number_habitable_rooms_starting
- number_habitable_rooms_ending
- number_heated_rooms_starting
- number_heated_rooms_ending
- number_habitable_rooms
- number_heated_rooms
- HEAT_DEMAND_CHANGE
- CARBON_CHANGE
- RDSAP_CHANGE
- SAP_ENDING
- CARBON_ENDING
default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: sap_ending
default.feature_processor.feature_processor_config.target: HEAT_DEMAND_ENDING
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -49,20 +29,20 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir
size: 34480114
nfiles: 2
build_model:
cmd: python 2_build_model.py
deps:
- path: 2_build_model.py
hash: md5
md5: 7231450b78920b0c5e7c6bada496b24a
size: 4820
md5: b824822475c222521516493e68eef9c5
size: 4149
- path: data/prepared_data
hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir
size: 34480114
nfiles: 2
params:
configs/build_model.yaml:
@ -71,7 +51,6 @@ stages:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json
fit_predictions_filepath: ./data/fit_predictions/predictions.parquet
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
@ -79,33 +58,23 @@ stages:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error
time_limit: 1800
time_limit: 400
presets: medium_quality
excluded_model_types:
- RF
- CAT
- NN_TORCH
- KNN
- XT
- RF
infer_limit: 0.05
infer_limit_batch_size: 10000
ag_args_ensemble:
num_folds_parallel: 2
outs:
- path: data/fit_predictions/
hash: md5
md5: d9c9afc05e8780db47c0548b19bf7d19.dir
size: 3349989
nfiles: 1
- path: data/model/
hash: md5
md5: 13c3100e1486c27a83a8a47491077842.dir
size: 773523079
nfiles: 36
md5: 452eba2d92233e81d321814aacefe5c2.dir
size: 323127043
nfiles: 24
- path: metrics/fit_metrics.json
hash: md5
md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
size: 224
md5: 888124b56e0c5008a6423e290fc5cc71
size: 222
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
@ -115,13 +84,13 @@ stages:
size: 2464
- path: data/model
hash: md5
md5: 13c3100e1486c27a83a8a47491077842.dir
size: 773523079
nfiles: 36
md5: 452eba2d92233e81d321814aacefe5c2.dir
size: 323127043
nfiles: 24
- path: data/prepared_data
hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir
size: 34480114
nfiles: 2
params:
configs/settings.yaml:
@ -133,25 +102,25 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 463197
md5: f852550a0a51f0c2b120b0680c1a9b54.dir
size: 325890
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
deps:
- path: 4_generate_metrics.py
hash: md5
md5: 4fedb86d89d528f0a6597934ba3890a0
size: 3484
md5: 567b1acb819e2ff432b989cdbdd4a2bf
size: 3448
- path: data/predictions
hash: md5
md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 463197
md5: f852550a0a51f0c2b120b0680c1a9b54.dir
size: 325890
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 45056059
md5: 6f9c63363ad52a836524dbb6fae7a2ac.dir
size: 34480114
nfiles: 2
params:
configs/settings.yaml:
@ -161,30 +130,16 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 3e08df02fd5c5d094bcf936e1338d596
size: 223
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
md5: ed3012943593fac4ac7ad9a5499ac18f
size: 219
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
- path: 5_generate_scenarios.py
- path: 0_startup_cleanup.py
hash: md5
md5: 40506749fefd926d47c60ff5b16db307
size: 5337
md5: b1b12f6b6393fbf8b83d23684df0a3d4
size: 1220
params:
configs/scenarios.yaml:
default.scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md
outs:
- path: metrics/scenario_metrics.md
hash: md5
md5: fa4d6d7bbd7818613800da5f8f37ea96
size: 363
- path: metrics/scenario_table.md
hash: md5
md5: d6baf100a1623cc2467c2f8221d314c9
size: 2133
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics

View file

@ -38,7 +38,6 @@ stages:
- configs/build_model.yaml:
outs:
- data/model/
- data/fit_predictions/
- metrics/fit_metrics.json
always_changed: true
generate_predictions:
@ -71,17 +70,6 @@ stages:
outs:
- metrics/metrics.json
always_changed: true
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- 5_generate_scenarios.py
params:
- configs/scenarios.yaml:
- default.scenarios
outs:
- metrics/scenario_table.md
- metrics/scenario_metrics.md
always_changed: true
metrics:
- metrics/metrics.json
- metrics/fit_metrics.json

View file

@ -38,7 +38,6 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
x=target, y="HEAT_DEMAND_STARTING", style="o"
)
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
# Load the autogluon model and check feature importance
@ -176,6 +175,8 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
#
#
from core.MLMetrics import metrics_factory
from core.MLModels import model_factory
from core.DataClient import dataclient_factory
import pandas as pd
@ -190,35 +191,31 @@ prediction_analysis_params = settings.prediction_analysis
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])
dataclient_type = prediction_analysis_params["dataclient_type"]
# dataclient_type = 'aws-s3'
# dataclient = dataclient_factory(
# dataclient_type=dataclient_type,
# dataclient_config=client_params[dataclient_type],
# )
# data = dataclient.load_data("s3://retrofit-data-dev/sap_change_model/dataset.parquet")
dataclient = dataclient_factory(
dataclient_type=dataclient_type,
dataclient_config=client_params[dataclient_type],
)
target = feature_process_params["feature_processor_config"]["target"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
output_test_filepath = prepare_data_params["output_test_filepath"]
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
# score_data = dataclient.load_data("s3://retrofit-data-dev/carbon_change_predictions/51/2023-11-28T21:01:21.869339.parquet")
local_dataclient = dataclient_factory(
dataclient_type="local",
dataclient_config=client_params["local"],
)
test_df = local_dataclient.load_data(output_test_filepath)
predictions = local_dataclient.load_data(predictions_output_filepath)
test_df = dataclient.load_data(output_test_filepath)
predictions = dataclient.load_data(predictions_output_filepath)
mix_df = pd.concat([test_df.copy(), predictions], axis=1)
mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False)
cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])]
metrics = metrics_factory("Regression")
metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"])
cosine_similarity_df = mix_df[
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
]
from sklearn.metrics.pairwise import cosine_similarity
row_index = 0
row_index = 58199
from sklearn.preprocessing import LabelEncoder
@ -232,17 +229,7 @@ feature_vector = cosine_similarity_df.loc[[row_index]]
cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
similar_index = (
cosine_similarity_df.sort_values("cosine", ascending=False).head(15).index
cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
)
check_df = mix_df.loc[similar_index]
columns_to_check = [
"LOW_ENERGY_LIGHTING_ENDING",
"walls_thermal_transmittance_ENDING",
"floor_thermal_transmittance_ENDING",
"roof_thermal_transmittance_ENDING",
"roof_insulation_thickness_ENDING",
]
cosine_similarity_df = mix_df[columns_to_check]

View file

@ -1,4 +1,2 @@
/fit_metrics.json
/metrics.json
/scenario_table.md
/scenario_metrics.md

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1
pandas==1.5.3
autogluon==0.8.2
dynaconf==3.2.0
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,7 +1,7 @@
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1
pandas==1.5.3
autogluon==0.8.2
dynaconf==3.2.0
pyarrow==13.0.0
PyYAML==6.0.1

View file

@ -1,10 +1,9 @@
joblib==1.3.2
boto3==1.28.17
pandas==2.1.4
autogluon.tabular[all]==1.0.0
ray==2.6.3
dynaconf==3.2.1
alibi==0.9.5
pandas==1.5.3
autogluon==0.8.2
dynaconf==3.2.0
alibi==0.9.4
shap==0.42.1
pyarrow==13.0.0
pre-commit==3.3.3

View file

@ -1,4 +1,4 @@
boto3==1.28.41
pandas==2.1.4
autogluon.tabular[all]==1.0.0
dynaconf==3.2.1
pandas==1.5.3
autogluon==0.8.2
dynaconf==3.2.0

View file

@ -1,4 +1,4 @@
dvc==3.51.0
dvc-s3==3.2.0
gto==1.7.1
pyOpenSSL==23.3.0
dvc==3.18.0
dvc-s3==2.23.0
gto==1.0.4
pyOpenSSL==23.2.0