Merge pull request #60 from Hestia-Homes/sap_change-model

Sap change model
This commit is contained in:
quandanrepo 2023-10-04 11:24:39 +01:00 committed by GitHub
commit b0cfc2d184
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 124 additions and 51 deletions

View file

@ -11,6 +11,9 @@ on:
- closed
branches:
- "master"
- "sap_change-dev"
- "heat_change-dev"
- "carbon_change-dev"
permissions: write-all

View file

@ -5,7 +5,7 @@ on:
# branches:
# - "model-**"
pull_request:
branches: [ "master" ]
branches: [ "master", "sap_change-dev", "heat_change-dev", "carbon_change-dev"]
label:
types: ["created", "edited"]
@ -89,13 +89,14 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ github.base_ref }}
run: |
cd modules/ml-pipeline/src/pipeline
echo "## Model metrics" > report.md
# Compare metrics to master
git fetch --depth=1 origin master:master
dvc metrics diff --md --all master >> report.md
git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH}
dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md
cml comment create report.md

View file

@ -1,3 +1,3 @@
# The generic reproducible ML-pipeline
Pipeline required to build a model to produce an output
Pipeline that builds a model and produces an output, which is then hashed via DVC

View file

@ -6,7 +6,7 @@ Once we have the features, we build a model
import os
import yaml
import pandas as pd
from typing import Union
from typing import Union, List
from pathlib import Path
from core.Logger import logger
from core.interface.InterfaceMetrics import MLMetrics
@ -31,6 +31,9 @@ generate_metrics_params = settings.generate_metrics
model_type = build_model_params["model_type"]
target = feature_process_params["feature_processor_config"]["target"]
identifier_columns = feature_process_params["feature_processor_config"][
"identifier_columns"
]
model_save_location = build_model_params["model_save_filepath"]
model_hyperparameters = build_model_params[model_type]
train_filepath = prepare_data_params["output_train_filepath"]
@ -62,6 +65,7 @@ def build_model(
model: MLModel,
metrics: MLMetrics,
target: str,
identifier_columns: List[str],
model_save_location: str,
model_hyperparameters: dict,
fit_metrics_filepath: str,
@ -90,17 +94,17 @@ def build_model(
logger.info("----------------------")
model.train_model(
data=train_data, target=target, model_hyperparameters=model_hyperparameters
data=train_data.drop(columns=identifier_columns),
target=target,
model_hyperparameters=model_hyperparameters,
)
logger.info("----------------------------------")
logger.info("--- Generating fit predictions ---")
logger.info("----------------------------------")
prediction_data = train_data.drop(columns=target)
fit_predictions = model.predict(
data=prediction_data, post_prediction_logic=post_prediction_logic
data=train_data, post_prediction_logic=post_prediction_logic
)
logger.info("------------------------------")
@ -142,6 +146,7 @@ if __name__ == "__main__":
model=model,
metrics=metrics,
target=target,
identifier_columns=identifier_columns,
model_save_location=model_save_location,
model_hyperparameters=model_hyperparameters,
train_filepath=train_filepath,

View file

@ -1,7 +1,7 @@
default:
build_model:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json
SKLearnLinearRegression: null
@ -10,9 +10,9 @@ default:
kernel: "linear"
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 1000
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 4000
presets: medium_quality
excluded_model_types: ['KNN']
excluded_model_types: ['KNN', 'RF']

View file

@ -27,7 +27,7 @@ def remove_starting_columns(df):
# return df
business_logic = {
"remove_starting_columns": remove_starting_columns
# "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns
}

View file

@ -18,7 +18,10 @@ default:
prepare_data:
input_dataclient_type: aws-s3
output_dataclient_type: local
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
@ -29,7 +32,9 @@ default:
subsample_amount: null
subsample_seed: 0
target: SAP_ENDING
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
identifier_columns: ["UPRN"]
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null
generate_predictions:

View file

@ -165,8 +165,12 @@ class AutogluonAutoML:
if self.model is None:
raise KeyError("No model trained/ loaded - unable to save")
logger.info("In local development mode - no need for s3 client")
logger.info("Using AutoGluon Model - Model saving already occured")
logger.info(
"Using AutoGluon Model - Model saving is using optimised deployment mode"
)
logger.info("Saving optimised model")
self.model.clone_for_deployment(str(path))
return str(path)

View file

@ -10,7 +10,6 @@ stages:
params:
configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns:
- UPRN
- HEAT_DEMAND_CHANGE
- CARBON_CHANGE
- RDSAP_CHANGE
@ -21,7 +20,7 @@ stages:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: SAP_ENDING
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -30,65 +29,66 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
size: 21115444
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
nfiles: 2
build_model:
cmd: python 2_build_model.py
deps:
- path: 2_build_model.py
hash: md5
md5: 039578b629d7cd204016e92cd079ea90
size: 5181
md5: 84699d208874c52accaff61c6af9bb0a
size: 5359
- path: data/prepared_data
hash: md5
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
size: 21115444
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
nfiles: 2
params:
configs/build_model.yaml:
default:
build_model:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 1000
eval_metric: mean_squared_error
time_limit: 4000
presets: medium_quality
excluded_model_types:
- KNN
- RF
outs:
- path: data/model/
hash: md5
md5: d073af40ba5c7c2d9b8064665062f51e.dir
size: 363710367
nfiles: 20
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
nfiles: 27
- path: metrics/fit_metrics.json
hash: md5
md5: dcd9ea03a2771077e1bd14018bb7fd18
size: 183
md5: 2bb16ac67de8778fbc08171d562b34d5
size: 184
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
- path: 3_generate_predictions.py
hash: md5
md5: 238b3fa9f3c6f3720e77c116857070ae
size: 4720
md5: 5ef2856a5a977304f1ec01f9b4205262
size: 3028
- path: data/model
hash: md5
md5: d073af40ba5c7c2d9b8064665062f51e.dir
size: 363710367
nfiles: 20
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
nfiles: 27
- path: data/prepared_data
hash: md5
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
size: 21115444
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
nfiles: 2
params:
configs/settings.yaml:
@ -100,8 +100,8 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: a2ecfae1e418fe9cb9fe044c148bbb37.dir
size: 381538
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
@ -112,13 +112,13 @@ stages:
size: 4487
- path: data/predictions
hash: md5
md5: a2ecfae1e418fe9cb9fe044c148bbb37.dir
size: 381538
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
nfiles: 1
- path: data/prepared_data
hash: md5
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
size: 21115444
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
nfiles: 2
params:
configs/settings.yaml:
@ -128,8 +128,8 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: ec02774fd01243fa4706189c60087ccf
size: 182
md5: 2e13ae67759a64261d03224f1c0d4bf4
size: 185
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:

View file

@ -175,3 +175,57 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96
#
#
# --- Residual / nearest-neighbour analysis ---------------------------------
# Loads the test set and its predictions, ranks rows by absolute residual and
# then finds the rows whose features are most similar (cosine similarity) to
# one chosen row, so a badly predicted row can be compared with its
# neighbours.
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

from core.MLModels import model_factory
from core.DataClient import dataclient_factory
from config import settings

# Pull every settings section this analysis reads.
client_params = settings.client
prepare_data_params = settings.prepare_data
feature_process_params = settings.feature_processor
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
prediction_analysis_params = settings.prediction_analysis

# NOTE(review): `model` is not used further down in this section; presumably
# it is kept loaded for interactive explanation work - confirm before removing.
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])

dataclient_type = prediction_analysis_params["dataclient_type"]
dataclient = dataclient_factory(
    dataclient_type=dataclient_type,
    dataclient_config=client_params[dataclient_type],
)

target = feature_process_params["feature_processor_config"]["target"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
output_test_filepath = prepare_data_params["output_test_filepath"]
predictions_output_filepath = generate_predictions_params[
    "predictions_output_filepath"
]

test_df = dataclient.load_data(output_test_filepath)
predictions = dataclient.load_data(predictions_output_filepath)

# Column-wise concat aligns on the index.
# NOTE(review): assumes the predictions frame shares the test frame's index - confirm.
mix_df = pd.concat([test_df.copy(), predictions], axis=1)
mix_df["residual"] = (mix_df[predictions_column_name] - mix_df[target]).abs()
mix_df = mix_df.sort_values("residual", ascending=False)

# Feature-only view for the similarity search. The excluded names come from
# the same settings used to compute the residual above, so a renamed target or
# prediction column can never leak into the features. `.copy()` makes the
# frame independent of `mix_df` before it is modified below.
non_feature_columns = [predictions_column_name, "residual", target]
cosine_similarity_df = mix_df[mix_df.columns.difference(non_feature_columns)].copy()

# Index label of the row whose neighbours we want to inspect; `.loc` below
# raises KeyError if this label is not present in the test set.
row_index = 58199

# Cosine similarity needs numeric input, so object columns are label-encoded
# column by column (the encoder is re-fitted for each column).
object_column_names = cosine_similarity_df.select_dtypes(["object"]).columns
cosine_similarity_df[object_column_names] = cosine_similarity_df[
    object_column_names
].apply(LabelEncoder().fit_transform)

# Take the reference row BEFORE the "cosine" column is added so both operands
# of the similarity have the same set of feature columns.
feature_vector = cosine_similarity_df.loc[[row_index]]

# cosine_similarity returns an (n_rows, 1) matrix here; flatten it so it is
# stored as an ordinary one-dimensional column.
cosine_similarity_df["cosine"] = cosine_similarity(
    cosine_similarity_df, feature_vector
).ravel()

# Index labels of the five most similar rows (the reference row itself is
# expected to be among them, with similarity 1).
similar_index = (
    cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
)
check_df = mix_df.loc[similar_index]

View file

@ -1,3 +1,4 @@
dvc==3.18.0
dvc-s3==2.23.0
gto==1.0.4
pyOpenSSL==23.2.0