diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index 7630316..98f41d1 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -11,6 +11,9 @@ on: - closed branches: - "master" + - "sap_change-dev" + - "heat_change-dev" + - "carbon_change-dev" permissions: write-all diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 3d5b24e..3cf830b 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -5,7 +5,7 @@ on: # branches: # - "model-**" pull_request: - branches: [ "master" ] + branches: [ "master", "sap_change-dev", "heat_change-dev", "carbon_change-dev"] label: types: ["created", "edited"] @@ -89,13 +89,14 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} + TARGET_BRANCH: ${{ github.base_ref }} run: | cd modules/ml-pipeline/src/pipeline echo "## Model metrics" > report.md # Compare metrics to master - git fetch --depth=1 origin master:master - dvc metrics diff --md --all master >> report.md + git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH} + dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md cml comment create report.md diff --git a/modules/ml-pipeline/src/README.md b/modules/ml-pipeline/src/README.md index 377206f..d7afc6a 100644 --- a/modules/ml-pipeline/src/README.md +++ b/modules/ml-pipeline/src/README.md @@ -1,3 +1,3 @@ # The generic reproducible ML-pipeline -Pipeline required to build a model to produce an output +Pipeline required to build a model to produce an output, that gets hashed via DVC diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index f7746f9..cae5cfd 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ 
b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -6,7 +6,7 @@ Once we have the features, we build a model import os import yaml import pandas as pd -from typing import Union +from typing import Union, List from pathlib import Path from core.Logger import logger from core.interface.InterfaceMetrics import MLMetrics @@ -31,6 +31,9 @@ generate_metrics_params = settings.generate_metrics model_type = build_model_params["model_type"] target = feature_process_params["feature_processor_config"]["target"] +identifier_columns = feature_process_params["feature_processor_config"][ + "identifier_columns" +] model_save_location = build_model_params["model_save_filepath"] model_hyperparameters = build_model_params[model_type] train_filepath = prepare_data_params["output_train_filepath"] @@ -62,6 +65,7 @@ def build_model( model: MLModel, metrics: MLMetrics, target: str, + identifier_columns: List[str], model_save_location: str, model_hyperparameters: dict, fit_metrics_filepath: str, @@ -90,17 +94,17 @@ def build_model( logger.info("----------------------") model.train_model( - data=train_data, target=target, model_hyperparameters=model_hyperparameters + data=train_data.drop(columns=identifier_columns), + target=target, + model_hyperparameters=model_hyperparameters, ) logger.info("----------------------------------") logger.info("--- Generating fit predictions ---") logger.info("----------------------------------") - prediction_data = train_data.drop(columns=target) - fit_predictions = model.predict( - data=prediction_data, post_prediction_logic=post_prediction_logic + data=train_data, post_prediction_logic=post_prediction_logic ) logger.info("------------------------------") @@ -142,6 +146,7 @@ if __name__ == "__main__": model=model, metrics=metrics, target=target, + identifier_columns=identifier_columns, model_save_location=model_save_location, model_hyperparameters=model_hyperparameters, train_filepath=train_filepath, diff --git 
a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 49fbac4..d296e6a 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -1,7 +1,7 @@ default: build_model: model_type: AutogluonAutoML - model_save_filepath: ./data/model/autogluonmodel/ + model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json SKLearnLinearRegression: null @@ -10,9 +10,9 @@ default: kernel: "linear" AutogluonAutoML: - output_filepath: ./data/model/autogluonmodel/ + output_filepath: ./data/model/allmodels/ problem_type: regression - eval_metric: mean_absolute_error - time_limit: 1000 + eval_metric: mean_squared_error #mean_absolute_error + time_limit: 4000 presets: medium_quality - excluded_model_types: ['KNN'] + excluded_model_types: ['KNN', 'RF'] diff --git a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py index 1f97005..c32d2fe 100644 --- a/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py +++ b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py @@ -27,7 +27,7 @@ def remove_starting_columns(df): # return df business_logic = { - "remove_starting_columns": remove_starting_columns + # "remove_starting_columns": remove_starting_columns # "keep_ENDING_COLUMNS": keep_ending_columns } diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 588dd9a..a84c095 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -18,7 +18,10 @@ default: prepare_data: input_dataclient_type: aws-s3 output_dataclient_type: local - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet + # data_filepath: 
s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet + # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet @@ -29,7 +32,9 @@ default: subsample_amount: null subsample_seed: 0 target: SAP_ENDING - drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] + identifier_columns: ["UPRN"] + drop_columns: ["HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"] + # retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"] retain_features: null generate_predictions: diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 4d9a9e9..4cf8b08 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -165,8 +165,12 @@ class AutogluonAutoML: if self.model is None: raise KeyError("No model trained/ loaded - unable to save") - logger.info("In local development mode - no need for s3 client") - logger.info("Using AutoGluon Model - Model saving already occured") + logger.info( + "Using AutoGluon Model - Model saving is using optimised deployment mode" + ) + + logger.info("Saving optimised model") + self.model.clone_for_deployment(str(path)) return str(path) diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 9b9a3e0..c499874 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -10,7 +10,6 @@ stages: params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: - - UPRN 
- HEAT_DEMAND_CHANGE - CARBON_CHANGE - RDSAP_CHANGE @@ -21,7 +20,7 @@ stages: default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.target: SAP_ENDING default.feature_processor.feature_processor_type: dataframe - default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet + default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.output_dataclient_type: local default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet @@ -30,65 +29,66 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: f9ef7ad073b43b249b43faa75c62fe07.dir - size: 21115444 + md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir + size: 33881619 nfiles: 2 build_model: cmd: python 2_build_model.py deps: - path: 2_build_model.py hash: md5 - md5: 039578b629d7cd204016e92cd079ea90 - size: 5181 + md5: 84699d208874c52accaff61c6af9bb0a + size: 5359 - path: data/prepared_data hash: md5 - md5: f9ef7ad073b43b249b43faa75c62fe07.dir - size: 21115444 + md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir + size: 33881619 nfiles: 2 params: configs/build_model.yaml: default: build_model: model_type: AutogluonAutoML - model_save_filepath: ./data/model/autogluonmodel/ + model_save_filepath: ./data/model/optimised/ fit_metrics_filepath: ./metrics/fit_metrics.json SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear AutogluonAutoML: - output_filepath: ./data/model/autogluonmodel/ + output_filepath: ./data/model/allmodels/ problem_type: regression - eval_metric: mean_absolute_error - time_limit: 1000 + eval_metric: mean_squared_error + time_limit: 4000 presets: medium_quality excluded_model_types: - KNN + - RF outs: - path: data/model/ hash: md5 - md5: d073af40ba5c7c2d9b8064665062f51e.dir - size: 363710367 - nfiles: 20 + md5: 7bb5156243b4db39349e80a01ffecde4.dir + size: 473398662 + 
nfiles: 27 - path: metrics/fit_metrics.json hash: md5 - md5: dcd9ea03a2771077e1bd14018bb7fd18 - size: 183 + md5: 2bb16ac67de8778fbc08171d562b34d5 + size: 184 generate_predictions: cmd: python 3_generate_predictions.py deps: - path: 3_generate_predictions.py hash: md5 - md5: 238b3fa9f3c6f3720e77c116857070ae - size: 4720 + md5: 5ef2856a5a977304f1ec01f9b4205262 + size: 3028 - path: data/model hash: md5 - md5: d073af40ba5c7c2d9b8064665062f51e.dir - size: 363710367 - nfiles: 20 + md5: 7bb5156243b4db39349e80a01ffecde4.dir + size: 473398662 + nfiles: 27 - path: data/prepared_data hash: md5 - md5: f9ef7ad073b43b249b43faa75c62fe07.dir - size: 21115444 + md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir + size: 33881619 nfiles: 2 params: configs/settings.yaml: @@ -100,8 +100,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: a2ecfae1e418fe9cb9fe044c148bbb37.dir - size: 381538 + md5: 0bb3cf991906953def81c8204cdcfaf0.dir + size: 374532 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -112,13 +112,13 @@ stages: size: 4487 - path: data/predictions hash: md5 - md5: a2ecfae1e418fe9cb9fe044c148bbb37.dir - size: 381538 + md5: 0bb3cf991906953def81c8204cdcfaf0.dir + size: 374532 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f9ef7ad073b43b249b43faa75c62fe07.dir - size: 21115444 + md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir + size: 33881619 nfiles: 2 params: configs/settings.yaml: @@ -128,8 +128,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: ec02774fd01243fa4706189c60087ccf - size: 182 + md5: 2e13ae67759a64261d03224f1c0d4bf4 + size: 185 startup_cleanup: cmd: python 0_startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/eda.py b/modules/ml-pipeline/src/pipeline/eda.py index 1260d09..2fdd8be 100644 --- a/modules/ml-pipeline/src/pipeline/eda.py +++ b/modules/ml-pipeline/src/pipeline/eda.py @@ -175,3 +175,57 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6}) # Use shap package to explain why 9158 has a 35 
prediction when its sap ending is 96
 #
 #
+
+# Similarity check: which test rows look most like one badly-predicted row?
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+from sklearn.preprocessing import LabelEncoder
+
+from config import settings
+from core.DataClient import dataclient_factory
+from core.MLModels import model_factory
+
+client_params = settings.client
+prepare_data_params = settings.prepare_data
+feature_process_params = settings.feature_processor
+build_model_params = settings.build_model
+generate_predictions_params = settings.generate_predictions
+prediction_analysis_params = settings.prediction_analysis
+# The model is loaded here but is not used by the similarity lookup below.
+model = model_factory(build_model_params["model_type"])
+model.load_model(build_model_params["model_save_filepath"])
+dataclient_type = prediction_analysis_params["dataclient_type"]
+dataclient = dataclient_factory(
+    dataclient_type=dataclient_type,
+    dataclient_config=client_params[dataclient_type],
+)
+
+feature_processor_config = feature_process_params["feature_processor_config"]
+target = feature_processor_config["target"]
+identifier_columns = feature_processor_config["identifier_columns"]
+predictions_column_name = generate_predictions_params["predictions_column_name"]
+output_test_filepath = prepare_data_params["output_test_filepath"]
+predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
+
+test_df = dataclient.load_data(output_test_filepath)
+predictions = dataclient.load_data(predictions_output_filepath)
+mix_df = pd.concat([test_df, predictions], axis=1)
+mix_df["residual"] = (mix_df[predictions_column_name] - mix_df[target]).abs()
+mix_df = mix_df.sort_values("residual", ascending=False)
+
+# Features only: exclude prediction/target/id columns (absent names are ignored).
+non_feature_columns = [predictions_column_name, "residual", target, *identifier_columns]
+cosine_similarity_df = mix_df[mix_df.columns.difference(non_feature_columns)].copy()
+
+# Encode text columns on the copy; cosine_similarity needs numeric input.
+object_columns = cosine_similarity_df.select_dtypes(["object"])
+cosine_similarity_df[object_columns.columns] = cosine_similarity_df[
+    object_columns.columns
+].apply(LabelEncoder().fit_transform)
+
+row_index = 58199  # index label of the row whose nearest neighbours we want
+feature_vector = cosine_similarity_df.loc[[row_index]]
+scores = cosine_similarity(cosine_similarity_df, feature_vector).ravel()  # (n, 1) -> (n,)
+cosine_similarity_df["cosine"] = scores
+similar_index = cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
+check_df = mix_df.loc[similar_index]
diff --git a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
index 8459d38..91cb005 100644
--- a/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
+++ b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
@@ -1,3 +1,4 @@
 dvc==3.18.0
 dvc-s3==2.23.0
 gto==1.0.4
+pyOpenSSL==23.2.0