Merge pull request #65 from Hestia-Homes/sap-dev

Sap dev
This commit is contained in:
quandanrepo 2023-10-07 09:56:42 +01:00 committed by GitHub
commit ba4d1bcc8b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 136 additions and 68 deletions

View file

@ -2,7 +2,7 @@ name: Sap Change Model Deploy
on:
push:
branches: [ sap_change-dev, sap_change-prod ]
branches: [ sap-dev, sap-prod ]
jobs:
deploy:
@ -54,10 +54,12 @@ jobs:
- name: Set stack_name
id: set_stack_name
run: |
if [[ "${{ github.ref_name }}" == "sap_change-dev" || "${{ github.ref_name }}" == "sap_change-prod" ]]; then
echo "::set-output name=stack_name::sapmodel"
else
# Take branch prefix and add "model" for stack name
stack_name=$( echo ${{ github.ref_name }} | awk -F"-" '{print $1}' | sed 's/$/model/g')
if [ -z "${stack_name}" ]; then
echo "::set-output name=stack_name::"
else
echo "::set-output name=stack_name::${stack_name}"
fi
- name: Set runtime_environment

View file

@ -10,7 +10,9 @@ on:
types:
- closed
branches:
- "master"
- "sap-dev"
- "heat-dev"
- "carbon-dev"
permissions: write-all
@ -176,6 +178,8 @@ jobs:
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Register Model
env:
TARGET_BRANCH: ${{ github.base_ref }}
run: |
REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}')
@ -203,11 +207,11 @@ jobs:
git tag -a ${new_tag} -m "Assigning stage dev to artifact ${REGISTER_MODEL_NAME} version ${latest_version}"
git push origin ${new_tag}
git checkout master
git checkout ${TARGET_BRANCH}
git fetch --all
git pull
gto show --json > MODEL_REGISTRY.md
git add .
git commit -m "Update Registry"
git push origin master
git push origin ${TARGET_BRANCH}

View file

@ -5,7 +5,7 @@ on:
# branches:
# - "model-**"
pull_request:
branches: [ "master" ]
branches: ["sap-dev", "heat-dev", "carbon-dev"]
label:
types: ["created", "edited"]
@ -89,13 +89,14 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
TARGET_BRANCH: ${{ github.base_ref }}
run: |
cd modules/ml-pipeline/src/pipeline
echo "## Model metrics" > report.md
# Compare metrics to the pull request's target branch
git fetch --depth=1 origin master:master
dvc metrics diff --md --all master >> report.md
git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH}
dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md
cml comment create report.md

View file

@ -8,17 +8,9 @@
"active": true
},
"sap": {
"version": "v1.0.0",
"version": "v0.0.3",
"stage": {
"dev": "v1.0.0"
},
"registered": true,
"active": true
},
"migrate": {
"version": null,
"stage": {
"dev": "f320b9e0e9f3ea7735aed1abee07b1fb498c39c3"
"dev": "v0.0.3"
},
"registered": true,
"active": true

View file

@ -1,3 +1,3 @@
# The generic reproducible ML-pipeline
Pipeline required to build a model to produce an output
Pipeline required to build a model and produce an output, which is then hashed via DVC

View file

@ -6,7 +6,7 @@ Once we have the features, we build a model
import os
import yaml
import pandas as pd
from typing import Union
from typing import Union, List
from pathlib import Path
from core.Logger import logger
from core.interface.InterfaceMetrics import MLMetrics
@ -31,6 +31,9 @@ generate_metrics_params = settings.generate_metrics
model_type = build_model_params["model_type"]
target = feature_process_params["feature_processor_config"]["target"]
identifier_columns = feature_process_params["feature_processor_config"][
"identifier_columns"
]
model_save_location = build_model_params["model_save_filepath"]
model_hyperparameters = build_model_params[model_type]
train_filepath = prepare_data_params["output_train_filepath"]
@ -62,6 +65,7 @@ def build_model(
model: MLModel,
metrics: MLMetrics,
target: str,
identifier_columns: List[str],
model_save_location: str,
model_hyperparameters: dict,
fit_metrics_filepath: str,
@ -90,17 +94,17 @@ def build_model(
logger.info("----------------------")
model.train_model(
data=train_data, target=target, model_hyperparameters=model_hyperparameters
data=train_data.drop(columns=identifier_columns),
target=target,
model_hyperparameters=model_hyperparameters,
)
logger.info("----------------------------------")
logger.info("--- Generating fit predictions ---")
logger.info("----------------------------------")
prediction_data = train_data.drop(columns=target)
fit_predictions = model.predict(
data=prediction_data, post_prediction_logic=post_prediction_logic
data=train_data, post_prediction_logic=post_prediction_logic
)
logger.info("------------------------------")
@ -142,6 +146,7 @@ if __name__ == "__main__":
model=model,
metrics=metrics,
target=target,
identifier_columns=identifier_columns,
model_save_location=model_save_location,
model_hyperparameters=model_hyperparameters,
train_filepath=train_filepath,

View file

@ -1,7 +1,7 @@
default:
build_model:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json
SKLearnLinearRegression: null
@ -10,9 +10,9 @@ default:
kernel: "linear"
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 1000
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 4000
presets: medium_quality
excluded_model_types: ['KNN']
excluded_model_types: ['KNN', 'RF']

View file

@ -27,7 +27,7 @@ def remove_starting_columns(df):
# return df
business_logic = {
"remove_starting_columns": remove_starting_columns
# "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns
}

View file

@ -18,7 +18,10 @@ default:
prepare_data:
input_dataclient_type: aws-s3
output_dataclient_type: local
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet
@ -29,7 +32,9 @@ default:
subsample_amount: null
subsample_seed: 0
target: SAP_ENDING
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
identifier_columns: ["UPRN"]
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null
generate_predictions:

View file

@ -165,8 +165,12 @@ class AutogluonAutoML:
if self.model is None:
raise KeyError("No model trained/ loaded - unable to save")
logger.info("In local development mode - no need for s3 client")
logger.info("Using AutoGluon Model - Model saving already occured")
logger.info(
"Using AutoGluon Model - Model saving is using optimised deployment mode"
)
logger.info("Saving optimised model")
self.model.clone_for_deployment(str(path))
return str(path)

View file

@ -10,7 +10,6 @@ stages:
params:
configs/settings.yaml:
default.feature_processor.feature_processor_config.drop_columns:
- UPRN
- HEAT_DEMAND_CHANGE
- CARBON_CHANGE
- RDSAP_CHANGE
@ -21,7 +20,7 @@ stages:
default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: SAP_ENDING
default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -30,65 +29,66 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
size: 21115444
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
nfiles: 2
build_model:
cmd: python 2_build_model.py
deps:
- path: 2_build_model.py
hash: md5
md5: 039578b629d7cd204016e92cd079ea90
size: 5181
md5: 84699d208874c52accaff61c6af9bb0a
size: 5359
- path: data/prepared_data
hash: md5
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
size: 21115444
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
nfiles: 2
params:
configs/build_model.yaml:
default:
build_model:
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
model_save_filepath: ./data/model/optimised/
fit_metrics_filepath: ./metrics/fit_metrics.json
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 1000
eval_metric: mean_squared_error
time_limit: 4000
presets: medium_quality
excluded_model_types:
- KNN
- RF
outs:
- path: data/model/
hash: md5
md5: d073af40ba5c7c2d9b8064665062f51e.dir
size: 363710367
nfiles: 20
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
nfiles: 27
- path: metrics/fit_metrics.json
hash: md5
md5: dcd9ea03a2771077e1bd14018bb7fd18
size: 183
md5: 2bb16ac67de8778fbc08171d562b34d5
size: 184
generate_predictions:
cmd: python 3_generate_predictions.py
deps:
- path: 3_generate_predictions.py
hash: md5
md5: 238b3fa9f3c6f3720e77c116857070ae
size: 4720
md5: 5ef2856a5a977304f1ec01f9b4205262
size: 3028
- path: data/model
hash: md5
md5: d073af40ba5c7c2d9b8064665062f51e.dir
size: 363710367
nfiles: 20
md5: 7bb5156243b4db39349e80a01ffecde4.dir
size: 473398662
nfiles: 27
- path: data/prepared_data
hash: md5
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
size: 21115444
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
nfiles: 2
params:
configs/settings.yaml:
@ -100,8 +100,8 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: a2ecfae1e418fe9cb9fe044c148bbb37.dir
size: 381538
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
@ -112,13 +112,13 @@ stages:
size: 4487
- path: data/predictions
hash: md5
md5: a2ecfae1e418fe9cb9fe044c148bbb37.dir
size: 381538
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
size: 374532
nfiles: 1
- path: data/prepared_data
hash: md5
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
size: 21115444
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
size: 33881619
nfiles: 2
params:
configs/settings.yaml:
@ -128,8 +128,8 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: ec02774fd01243fa4706189c60087ccf
size: 182
md5: 2e13ae67759a64261d03224f1c0d4bf4
size: 185
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:

View file

@ -175,3 +175,57 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96
#
#
# ---------------------------------------------------------------------------
# Residual / nearest-neighbour inspection
#
# Loads the saved model, the prepared test set and its predictions, ranks the
# rows by absolute residual, and then pulls out the rows whose features are
# most cosine-similar to one chosen row so they can be compared side by side.
# ---------------------------------------------------------------------------
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

from core.MLModels import model_factory
from core.DataClient import dataclient_factory
from config import settings

client_params = settings.client
prepare_data_params = settings.prepare_data
feature_process_params = settings.feature_processor
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
prediction_analysis_params = settings.prediction_analysis

# The model is loaded for interactive use; nothing below calls it directly.
model = model_factory(build_model_params["model_type"])
model.load_model(build_model_params["model_save_filepath"])

dataclient_type = prediction_analysis_params["dataclient_type"]
dataclient = dataclient_factory(
    dataclient_type=dataclient_type,
    dataclient_config=client_params[dataclient_type],
)

target = feature_process_params["feature_processor_config"]["target"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
output_test_filepath = prepare_data_params["output_test_filepath"]
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]

test_df = dataclient.load_data(output_test_filepath)
predictions = dataclient.load_data(predictions_output_filepath)

# NOTE(review): concat(axis=1) aligns on the index, so this assumes the
# predictions frame carries the same index as the test frame - confirm.
mix_df = pd.concat([test_df.copy(), predictions], axis=1)
mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False)

# Keep only the feature columns. The excluded names come from the same
# settings used above rather than hard-coded literals, so the prediction and
# target columns can never leak into the similarity computation if the
# configuration changes. `.copy()` gives an independent frame, so the
# encoding below neither warns about nor writes through to `mix_df`.
non_feature_columns = [predictions_column_name, "residual", target]
cosine_similarity_df = mix_df[mix_df.columns.difference(non_feature_columns)].copy()

# Row whose neighbours we want to inspect; must be an index label of the
# test set, otherwise `.loc` below raises KeyError.
row_index = 58199

# Cosine similarity needs numeric input: integer-encode every object column,
# fitting a fresh encoder per column.
object_columns = cosine_similarity_df.select_dtypes(["object"]).columns
cosine_similarity_df[object_columns] = cosine_similarity_df[object_columns].apply(
    lambda column: LabelEncoder().fit_transform(column)
)

feature_vector = cosine_similarity_df.loc[[row_index]]

# cosine_similarity returns an (n_rows, 1) array here; flatten it so it is
# stored as an ordinary 1-D column.
cosine_similarity_df["cosine"] = cosine_similarity(
    cosine_similarity_df, feature_vector
).ravel()

similar_index = (
    cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
)
check_df = mix_df.loc[similar_index]

View file

@ -1,3 +1,4 @@
dvc==3.18.0
dvc-s3==2.23.0
gto==1.0.4
pyOpenSSL==23.2.0