mirror of
https://github.com/Hestia-Homes/ML.git
synced 2026-06-08 11:17:25 +00:00
commit
ba4d1bcc8b
13 changed files with 136 additions and 68 deletions
10
.github/workflows/Deploy.yml
vendored
10
.github/workflows/Deploy.yml
vendored
|
|
@ -2,7 +2,7 @@ name: Sap Change Model Deploy
|
|||
|
||||
on:
|
||||
push:
|
||||
branches: [ sap_change-dev, sap_change-prod ]
|
||||
branches: [ sap-dev, sap-prod ]
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
|
|
@ -54,10 +54,12 @@ jobs:
|
|||
- name: Set stack_name
|
||||
id: set_stack_name
|
||||
run: |
|
||||
if [[ "${{ github.ref_name }}" == "sap_change-dev" || "${{ github.ref_name }}" == "sap_change-prod" ]]; then
|
||||
echo "::set-output name=stack_name::sapmodel"
|
||||
else
|
||||
# Take branch prefix and add "model" for stack name
|
||||
stack_name=$( echo ${{ github.ref_name }} | awk -F"-" '{print $1}' | sed 's/$/model/g')
|
||||
if [ -z "${stack_name}" ]; then
|
||||
echo "::set-output name=stack_name::"
|
||||
else
|
||||
echo "::set-output name=stack_name::${stack_name}"
|
||||
fi
|
||||
|
||||
- name: Set runtime_environment
|
||||
|
|
|
|||
10
.github/workflows/MLPipelinePostMerge.yml
vendored
10
.github/workflows/MLPipelinePostMerge.yml
vendored
|
|
@ -10,7 +10,9 @@ on:
|
|||
types:
|
||||
- closed
|
||||
branches:
|
||||
- "master"
|
||||
- "sap-dev"
|
||||
- "heat-dev"
|
||||
- "carbon-dev"
|
||||
|
||||
permissions: write-all
|
||||
|
||||
|
|
@ -176,6 +178,8 @@ jobs:
|
|||
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
|
||||
|
||||
- name: Register Model
|
||||
env:
|
||||
TARGET_BRANCH: ${{ github.base_ref }}
|
||||
run: |
|
||||
|
||||
REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}')
|
||||
|
|
@ -203,11 +207,11 @@ jobs:
|
|||
git tag -a ${new_tag} -m "Assigning stage dev to artifact ${REGISTER_MODEL_NAME} version ${latest_version}"
|
||||
git push origin ${new_tag}
|
||||
|
||||
git checkout master
|
||||
git checkout ${TARGET_BRANCH}
|
||||
git fetch --all
|
||||
git pull
|
||||
|
||||
gto show --json > MODEL_REGISTRY.md
|
||||
git add .
|
||||
git commit -m "Update Registry"
|
||||
git push origin master
|
||||
git push origin ${TARGET_BRANCH}
|
||||
|
|
|
|||
7
.github/workflows/MLPipelinePullRequest.yml
vendored
7
.github/workflows/MLPipelinePullRequest.yml
vendored
|
|
@ -5,7 +5,7 @@ on:
|
|||
# branches:
|
||||
# - "model-**"
|
||||
pull_request:
|
||||
branches: [ "master" ]
|
||||
branches: ["sap-dev", "heat-dev", "carbon-dev"]
|
||||
label:
|
||||
types: ["created", "edited"]
|
||||
|
||||
|
|
@ -89,13 +89,14 @@ jobs:
|
|||
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
|
||||
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
|
||||
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
TARGET_BRANCH: ${{ github.base_ref }}
|
||||
run: |
|
||||
cd modules/ml-pipeline/src/pipeline
|
||||
echo "## Model metrics" > report.md
|
||||
|
||||
# Compare metrics to the pull request's target branch
|
||||
git fetch --depth=1 origin master:master
|
||||
dvc metrics diff --md --all master >> report.md
|
||||
git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH}
|
||||
dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md
|
||||
|
||||
cml comment create report.md
|
||||
|
||||
|
|
|
|||
|
|
@ -8,17 +8,9 @@
|
|||
"active": true
|
||||
},
|
||||
"sap": {
|
||||
"version": "v1.0.0",
|
||||
"version": "v0.0.3",
|
||||
"stage": {
|
||||
"dev": "v1.0.0"
|
||||
},
|
||||
"registered": true,
|
||||
"active": true
|
||||
},
|
||||
"migrate": {
|
||||
"version": null,
|
||||
"stage": {
|
||||
"dev": "f320b9e0e9f3ea7735aed1abee07b1fb498c39c3"
|
||||
"dev": "v0.0.3"
|
||||
},
|
||||
"registered": true,
|
||||
"active": true
|
||||
|
|
|
|||
|
|
@ -1,3 +1,3 @@
|
|||
# The generic reproducible ML-pipeline
|
||||
|
||||
Pipeline required to build a model to produce an output
|
||||
Pipeline required to build a model to produce an output, which gets hashed via DVC
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ Once we have the features, we build a model
|
|||
import os
|
||||
import yaml
|
||||
import pandas as pd
|
||||
from typing import Union
|
||||
from typing import Union, List
|
||||
from pathlib import Path
|
||||
from core.Logger import logger
|
||||
from core.interface.InterfaceMetrics import MLMetrics
|
||||
|
|
@ -31,6 +31,9 @@ generate_metrics_params = settings.generate_metrics
|
|||
|
||||
model_type = build_model_params["model_type"]
|
||||
target = feature_process_params["feature_processor_config"]["target"]
|
||||
identifier_columns = feature_process_params["feature_processor_config"][
|
||||
"identifier_columns"
|
||||
]
|
||||
model_save_location = build_model_params["model_save_filepath"]
|
||||
model_hyperparameters = build_model_params[model_type]
|
||||
train_filepath = prepare_data_params["output_train_filepath"]
|
||||
|
|
@ -62,6 +65,7 @@ def build_model(
|
|||
model: MLModel,
|
||||
metrics: MLMetrics,
|
||||
target: str,
|
||||
identifier_columns: List[str],
|
||||
model_save_location: str,
|
||||
model_hyperparameters: dict,
|
||||
fit_metrics_filepath: str,
|
||||
|
|
@ -90,17 +94,17 @@ def build_model(
|
|||
logger.info("----------------------")
|
||||
|
||||
model.train_model(
|
||||
data=train_data, target=target, model_hyperparameters=model_hyperparameters
|
||||
data=train_data.drop(columns=identifier_columns),
|
||||
target=target,
|
||||
model_hyperparameters=model_hyperparameters,
|
||||
)
|
||||
|
||||
logger.info("----------------------------------")
|
||||
logger.info("--- Generating fit predictions ---")
|
||||
logger.info("----------------------------------")
|
||||
|
||||
prediction_data = train_data.drop(columns=target)
|
||||
|
||||
fit_predictions = model.predict(
|
||||
data=prediction_data, post_prediction_logic=post_prediction_logic
|
||||
data=train_data, post_prediction_logic=post_prediction_logic
|
||||
)
|
||||
|
||||
logger.info("------------------------------")
|
||||
|
|
@ -142,6 +146,7 @@ if __name__ == "__main__":
|
|||
model=model,
|
||||
metrics=metrics,
|
||||
target=target,
|
||||
identifier_columns=identifier_columns,
|
||||
model_save_location=model_save_location,
|
||||
model_hyperparameters=model_hyperparameters,
|
||||
train_filepath=train_filepath,
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
default:
|
||||
build_model:
|
||||
model_type: AutogluonAutoML
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
model_save_filepath: ./data/model/optimised/
|
||||
fit_metrics_filepath: ./metrics/fit_metrics.json
|
||||
|
||||
SKLearnLinearRegression: null
|
||||
|
|
@ -10,9 +10,9 @@ default:
|
|||
kernel: "linear"
|
||||
|
||||
AutogluonAutoML:
|
||||
output_filepath: ./data/model/autogluonmodel/
|
||||
output_filepath: ./data/model/allmodels/
|
||||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 1000
|
||||
eval_metric: mean_squared_error #mean_absolute_error
|
||||
time_limit: 4000
|
||||
presets: medium_quality
|
||||
excluded_model_types: ['KNN']
|
||||
excluded_model_types: ['KNN', 'RF']
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ def remove_starting_columns(df):
|
|||
# return df
|
||||
|
||||
business_logic = {
|
||||
"remove_starting_columns": remove_starting_columns
|
||||
# "remove_starting_columns": remove_starting_columns
|
||||
# "keep_ENDING_COLUMNS": keep_ending_columns
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -18,7 +18,10 @@ default:
|
|||
prepare_data:
|
||||
input_dataclient_type: aws-s3
|
||||
output_dataclient_type: local
|
||||
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
|
||||
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
|
||||
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
|
||||
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
|
||||
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
|
||||
train_proportion: 0.9
|
||||
output_train_filepath: ./data/prepared_data/train.parquet
|
||||
output_test_filepath: ./data/prepared_data/test.parquet
|
||||
|
|
@ -29,7 +32,9 @@ default:
|
|||
subsample_amount: null
|
||||
subsample_seed: 0
|
||||
target: SAP_ENDING
|
||||
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
|
||||
identifier_columns: ["UPRN"]
|
||||
drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
|
||||
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
|
||||
retain_features: null
|
||||
|
||||
generate_predictions:
|
||||
|
|
|
|||
|
|
@ -165,8 +165,12 @@ class AutogluonAutoML:
|
|||
if self.model is None:
|
||||
raise KeyError("No model trained/ loaded - unable to save")
|
||||
|
||||
logger.info("In local development mode - no need for s3 client")
|
||||
logger.info("Using AutoGluon Model - Model saving already occured")
|
||||
logger.info(
|
||||
"Using AutoGluon Model - Model saving is using optimised deployment mode"
|
||||
)
|
||||
|
||||
logger.info("Saving optimised model")
|
||||
self.model.clone_for_deployment(str(path))
|
||||
|
||||
return str(path)
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,6 @@ stages:
|
|||
params:
|
||||
configs/settings.yaml:
|
||||
default.feature_processor.feature_processor_config.drop_columns:
|
||||
- UPRN
|
||||
- HEAT_DEMAND_CHANGE
|
||||
- CARBON_CHANGE
|
||||
- RDSAP_CHANGE
|
||||
|
|
@ -21,7 +20,7 @@ stages:
|
|||
default.feature_processor.feature_processor_config.subsample_seed: 0
|
||||
default.feature_processor.feature_processor_config.target: SAP_ENDING
|
||||
default.feature_processor.feature_processor_type: dataframe
|
||||
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
|
||||
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
|
||||
default.prepare_data.input_dataclient_type: aws-s3
|
||||
default.prepare_data.output_dataclient_type: local
|
||||
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
|
||||
|
|
@ -30,65 +29,66 @@ stages:
|
|||
outs:
|
||||
- path: data/prepared_data/
|
||||
hash: md5
|
||||
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
|
||||
size: 21115444
|
||||
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
|
||||
size: 33881619
|
||||
nfiles: 2
|
||||
build_model:
|
||||
cmd: python 2_build_model.py
|
||||
deps:
|
||||
- path: 2_build_model.py
|
||||
hash: md5
|
||||
md5: 039578b629d7cd204016e92cd079ea90
|
||||
size: 5181
|
||||
md5: 84699d208874c52accaff61c6af9bb0a
|
||||
size: 5359
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
|
||||
size: 21115444
|
||||
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
|
||||
size: 33881619
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/build_model.yaml:
|
||||
default:
|
||||
build_model:
|
||||
model_type: AutogluonAutoML
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
model_save_filepath: ./data/model/optimised/
|
||||
fit_metrics_filepath: ./metrics/fit_metrics.json
|
||||
SKLearnLinearRegression:
|
||||
SKLearnSVMRegression:
|
||||
kernel: linear
|
||||
AutogluonAutoML:
|
||||
output_filepath: ./data/model/autogluonmodel/
|
||||
output_filepath: ./data/model/allmodels/
|
||||
problem_type: regression
|
||||
eval_metric: mean_absolute_error
|
||||
time_limit: 1000
|
||||
eval_metric: mean_squared_error
|
||||
time_limit: 4000
|
||||
presets: medium_quality
|
||||
excluded_model_types:
|
||||
- KNN
|
||||
- RF
|
||||
outs:
|
||||
- path: data/model/
|
||||
hash: md5
|
||||
md5: d073af40ba5c7c2d9b8064665062f51e.dir
|
||||
size: 363710367
|
||||
nfiles: 20
|
||||
md5: 7bb5156243b4db39349e80a01ffecde4.dir
|
||||
size: 473398662
|
||||
nfiles: 27
|
||||
- path: metrics/fit_metrics.json
|
||||
hash: md5
|
||||
md5: dcd9ea03a2771077e1bd14018bb7fd18
|
||||
size: 183
|
||||
md5: 2bb16ac67de8778fbc08171d562b34d5
|
||||
size: 184
|
||||
generate_predictions:
|
||||
cmd: python 3_generate_predictions.py
|
||||
deps:
|
||||
- path: 3_generate_predictions.py
|
||||
hash: md5
|
||||
md5: 238b3fa9f3c6f3720e77c116857070ae
|
||||
size: 4720
|
||||
md5: 5ef2856a5a977304f1ec01f9b4205262
|
||||
size: 3028
|
||||
- path: data/model
|
||||
hash: md5
|
||||
md5: d073af40ba5c7c2d9b8064665062f51e.dir
|
||||
size: 363710367
|
||||
nfiles: 20
|
||||
md5: 7bb5156243b4db39349e80a01ffecde4.dir
|
||||
size: 473398662
|
||||
nfiles: 27
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
|
||||
size: 21115444
|
||||
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
|
||||
size: 33881619
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/settings.yaml:
|
||||
|
|
@ -100,8 +100,8 @@ stages:
|
|||
outs:
|
||||
- path: data/predictions/
|
||||
hash: md5
|
||||
md5: a2ecfae1e418fe9cb9fe044c148bbb37.dir
|
||||
size: 381538
|
||||
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
|
||||
size: 374532
|
||||
nfiles: 1
|
||||
generate_metrics:
|
||||
cmd: python 4_generate_metrics.py
|
||||
|
|
@ -112,13 +112,13 @@ stages:
|
|||
size: 4487
|
||||
- path: data/predictions
|
||||
hash: md5
|
||||
md5: a2ecfae1e418fe9cb9fe044c148bbb37.dir
|
||||
size: 381538
|
||||
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
|
||||
size: 374532
|
||||
nfiles: 1
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: f9ef7ad073b43b249b43faa75c62fe07.dir
|
||||
size: 21115444
|
||||
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
|
||||
size: 33881619
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/settings.yaml:
|
||||
|
|
@ -128,8 +128,8 @@ stages:
|
|||
outs:
|
||||
- path: metrics/metrics.json
|
||||
hash: md5
|
||||
md5: ec02774fd01243fa4706189c60087ccf
|
||||
size: 182
|
||||
md5: 2e13ae67759a64261d03224f1c0d4bf4
|
||||
size: 185
|
||||
startup_cleanup:
|
||||
cmd: python 0_startup_cleanup.py
|
||||
deps:
|
||||
|
|
|
|||
|
|
@ -175,3 +175,57 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
|
|||
# Use shap package to explain why 9158 has a 35 prediction when its sap ending is 96
|
||||
#
|
||||
#
|
||||
|
||||
from core.MLModels import model_factory
|
||||
from core.DataClient import dataclient_factory
|
||||
import pandas as pd
|
||||
from config import settings
|
||||
|
||||
client_params = settings.client
|
||||
prepare_data_params = settings.prepare_data
|
||||
feature_process_params = settings.feature_processor
|
||||
build_model_params = settings.build_model
|
||||
generate_predictions_params = settings.generate_predictions
|
||||
prediction_analysis_params = settings.prediction_analysis
|
||||
model = model_factory(build_model_params["model_type"])
|
||||
model.load_model(build_model_params["model_save_filepath"])
|
||||
dataclient_type = prediction_analysis_params["dataclient_type"]
|
||||
dataclient = dataclient_factory(
|
||||
dataclient_type=dataclient_type,
|
||||
dataclient_config=client_params[dataclient_type],
|
||||
)
|
||||
|
||||
target = feature_process_params["feature_processor_config"]["target"]
|
||||
predictions_column_name = generate_predictions_params["predictions_column_name"]
|
||||
output_test_filepath = prepare_data_params["output_test_filepath"]
|
||||
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
|
||||
|
||||
test_df = dataclient.load_data(output_test_filepath)
|
||||
predictions = dataclient.load_data(predictions_output_filepath)
|
||||
mix_df = pd.concat([test_df.copy(), predictions], axis=1)
|
||||
mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
|
||||
mix_df = mix_df.sort_values("residual", ascending=False)
|
||||
|
||||
cosine_similarity_df = mix_df[
|
||||
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
|
||||
]
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
row_index = 58199
|
||||
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
|
||||
object_columns = cosine_similarity_df.select_dtypes(["object"])
|
||||
|
||||
cosine_similarity_df[object_columns.columns] = cosine_similarity_df[
|
||||
object_columns.columns
|
||||
].apply(LabelEncoder().fit_transform)
|
||||
|
||||
feature_vector = cosine_similarity_df.loc[[row_index]]
|
||||
|
||||
cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
|
||||
similar_index = (
|
||||
cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
|
||||
)
|
||||
|
||||
check_df = mix_df.loc[similar_index]
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
dvc==3.18.0
|
||||
dvc-s3==2.23.0
|
||||
gto==1.0.4
|
||||
pyOpenSSL==23.2.0
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue