From 57934d0ae37daf61d67534a2671eda318659ba45 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 12:35:34 +0100 Subject: [PATCH 01/16] fixed buffer bug and add id --- deployment/handlers/prediction_app.py | 1 + .../ml-pipeline/src/pipeline/configs/settings.yaml | 1 + modules/ml-pipeline/src/pipeline/core/DataClient.py | 6 ++++++ .../ml-pipeline/src/pipeline/generate_predictions.py | 11 +++++++++-- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 6f9d162..36a906c 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -107,6 +107,7 @@ def handler(event, context): predictions_column_name=generate_predictions_params[ "predictions_column_name" ], + identifier_column=generate_predictions_params["identifier_column"], ) return { diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index a84c095..ce7ed2c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -43,6 +43,7 @@ default: test_data_filepath: ./data/prepared_data/test.parquet predictions_output_filepath: ./data/predictions/predictions.parquet predictions_column_name: predictions + identifier_column: id generate_metrics: dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/core/DataClient.py b/modules/ml-pipeline/src/pipeline/core/DataClient.py index 28ffff7..53f4072 100644 --- a/modules/ml-pipeline/src/pipeline/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/core/DataClient.py @@ -142,9 +142,15 @@ class AWSS3Client: buffer = BytesIO() obj.to_parquet(buffer, index=False) + # Reset the buffer position to the beginning + buffer.seek(0) + bucket, key = location.strip("s3://").split("/", 1) self.client.upload_fileobj(buffer, bucket, key) + # Close the buffer + buffer.close() + def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame: """ Load a parquet file diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py index 85b3022..83ea103 100644 --- a/modules/ml-pipeline/src/pipeline/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -14,6 +14,7 @@ def generate_predictions( test_data_filepath: str, predictions_output_filepath: str, predictions_column_name: str, + identifier_column: str = "id", ): """ For a given model, we generate prediction and evaluate this against the true target @@ -52,6 +53,12 @@ def generate_predictions( predictions_df = pd.DataFrame(predictions) predictions_df.columns = [predictions_column_name] - output_dataclient.save_data( - obj=predictions_df, location=predictions_output_filepath, save_config=None + output_df = ( + pd.concat([test_data[identifier_column], predictions_df], axis=1) + if identifier_column in test_data.columns + else predictions_df + ) + + output_dataclient.save_data( + obj=output_df, location=predictions_output_filepath, save_config=None ) From 69c5c77a8805ca9b95a49bcab783696b437b10d6 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 11:48:13 +0000 Subject: [PATCH 02/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 9ab31e5..e0f4245 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.0.3", + "version": "v0.0.4", "stage": { "dev": "v0.0.3" }, From 7a1b9aed5ff82b7173ba8a2c25484e3b04f4001e Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 11:49:02 +0000 Subject: [PATCH 03/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index e0f4245..abd2ddb 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.0.4", "stage": { - "dev": "v0.0.3" + "dev": "v0.0.4" }, "registered": true, "active": true From 051f07df77a1e1f1775df32c59b3b96a7438d72a Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Tue, 10 Oct 2023 14:02:54 +0100 Subject: [PATCH 04/16] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 55cae8e..22a6002 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ tracking and a model registry - A bolt-on service that can implement model monitoring There are multiple protected branches which adapt the generic pipeline to produce different models: -- sap_change-** -- heat_change-** -- carbon_change-** +- sap-{dev/staging/prod}-** +- heat-{dev/staging/prod}-** +- carbon-{dev/staging/prod}-** These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline - There can be different additional logic for each branch but the pipeline will be the same. From 8dd784255a57d36da94b59778057e57e5fbf2c80 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:28:30 +0100 Subject: [PATCH 05/16] add smape --- .../ml-pipeline/src/pipeline/core/MLMetrics.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py index 4b14386..845b819 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py +++ b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py @@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods: - Generate Plot Suite """ +import numpy as np import pandas as pd from typing import Union from sklearn.metrics import ( @@ -14,6 +15,18 @@ from sklearn.metrics import ( ) from core.interface.InterfaceMetrics import MLMetrics +# Define the function to return the SMAPE value +def symmetric_mape(actual, predicted) -> float: + + # Convert actual and predicted to numpy + # array data type if not already + if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]): + actual, predicted = np.array(actual), np.array(predicted) + + return np.mean( + np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual)) / 2) + ) + def metrics_factory(metrics_type: str) -> MLMetrics: metrics = { @@ -34,7 +47,7 @@ class RegressionMetrics: median_absolute_error, mean_squared_error, mean_absolute_percentage_error, - # max_error + symmetric_mape, ] def generate_metrics( From 6552e975552d99b5fe9fb8757e672c5c94fa833e Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:41:06 +0100 Subject: [PATCH 06/16] fix the register increments --- .github/workflows/MLPipelinePostMerge.yml | 27 ++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index f7c4a8f..daef5d9 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -42,7 +42,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="1.0.0" else - increment_version=$(echo ${latest_version} | awk -F'.' '{OFS="."; $1+=1; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + major = $1 + 1 # Increment the major version + print major, "0", "0" # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} @@ -80,7 +87,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="0.1.0" else - increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$2++; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + minor = $2 + 1 # Increment the minor version + print $1, minor, "0" # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} @@ -118,7 +132,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="0.0.1" else - increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$3++; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + patch = $3 + 1 # Increment the patch version + print $1, $2, patch # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} From 755d00e0e40c15fc9f2844bfe47e297b00d14d6a Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 22:42:45 +0000 Subject: [PATCH 07/16] Update Registry --- MODEL_REGISTRY.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index abd2ddb..10b6ccd 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,11 +8,19 @@ "active": true }, "sap": { - "version": "v0.0.4", + "version": "v0.0.5", "stage": { "dev": "v0.0.4" }, "registered": true, "active": true + }, + "heat": { + "version": "v0.0.1", + "stage": { + "dev": "v0.0.1" + }, + "registered": true, + "active": true } } From 7a113f790e7222642e4bf6fcb42ee6f610dd7a6f Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 22:43:36 +0000 Subject: [PATCH 08/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 10b6ccd..0770e78 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.0.5", "stage": { - "dev": "v0.0.4" + "dev": "v0.0.5" }, "registered": true, "active": true From 8bdedf25a240e4e4c55948a491ea075bf0cf8337 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:56:35 +0100 Subject: [PATCH 09/16] final fix for workflow on post merge --- .github/workflows/MLPipelinePostMerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index daef5d9..506b93b 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -217,7 +217,7 @@ jobs: fi new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version} - latest_version=$(gto show model@latest --ref | awk -F"@" '{print $2}') + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}') echo ${new_tag} From bd80c3d69d9cd1d463967413b8f12d340b9949ce Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:58:07 +0100 Subject: [PATCH 10/16] final fix for workflow on post merge --- .github/workflows/MLPipelinePostMerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index 506b93b..e4411b3 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -209,7 +209,7 @@ jobs: git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/') + latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}') if [ -z "${latest_dev_version}" ]; then increment_version="1" else From c668e4227c30cca24744bdd24a6ddbf92f268049 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 22:59:21 +0000 Subject: [PATCH 11/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 0770e78..ec8859d 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.0.5", + "version": "v0.1.0", "stage": { "dev": "v0.0.5" }, From 4597c12795017a19eab68095711c0dc2b51f5e65 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 23:00:04 +0000 Subject: [PATCH 12/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index ec8859d..b3ad75a 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.1.0", "stage": { - "dev": "v0.0.5" + "dev": "v0.1.0" }, "registered": true, "active": true From 7589977cda6530f2c6837b6a68e9035122449ca2 Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Thu, 12 Oct 2023 10:19:22 +0100 Subject: [PATCH 13/16] Update Makefile --- modules/ml-pipeline/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 5c5d563..6ccb4c4 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -10,15 +10,15 @@ init: dev-conda dev-conda: # conda deactivate || echo "Not in conda environment" # conda remove --name $CONDA_ENV --all -y || echo "No environment created previously" - conda create --name $CONDA_ENV python=$(PYTHON_VERSION) -y + conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda init bash - conda run -vvvv -n $CONDA_ENV pip install --upgrade pip - conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/training/requirements-dev.txt - conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/version_control/requirements.txt - conda run -vvvv -n $CONDA_ENV pre-commit install - conda run -vvvv -n $CONDA_ENV pip install ipykernel + conda run -vvvv -n ${CONDA_ENV} pip install --upgrade pip + conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt + conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt + conda run -vvvv -n ${CONDA_ENV} pre-commit install + conda run -vvvv -n ${CONDA_ENV} pip install ipykernel echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" - echo "conda activate $CONDA_ENV" + echo "conda activate ${CONDA_ENV}" .PHONY: dev-pyenv From 96153f82489c4aa09008262589840b98c770fccf Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Tue, 17 Oct 2023 03:08:01 +0100 Subject: [PATCH 14/16] Update Makefile --- modules/ml-pipeline/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 6ccb4c4..0bef7d6 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -9,14 +9,14 @@ init: dev-conda .PHONY: dev-conda dev-conda: # conda deactivate || echo "Not in conda environment" - # conda remove --name $CONDA_ENV --all -y || echo "No environment created previously" + # conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously" conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y conda init bash - conda run -vvvv -n ${CONDA_ENV} pip install --upgrade pip - conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt - conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt - conda run -vvvv -n ${CONDA_ENV} pre-commit install - conda run -vvvv -n ${CONDA_ENV} pip install ipykernel + conda run -v -n ${CONDA_ENV} pip install --upgrade pip + conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt + conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt + conda run -v -n ${CONDA_ENV} pre-commit install + conda run -v -n ${CONDA_ENV} pip install ipykernel echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "conda activate ${CONDA_ENV}" From e9417ca73d0cdb6f73132fea8231dc9a36e85177 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 27 Nov 2023 15:17:01 +0000 Subject: [PATCH 15/16] Added additional workflows for new models --- .github/workflows/Deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 48375c3..6e34d36 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -2,7 +2,7 @@ name: Sap Change Model Deploy on: push: - branches: [ sap-dev, sap-prod ] + branches: [ sap-dev, sap-prod, heat-dev, heat-prod, carbon-dev, carbon-prod] jobs: deploy: From 639ba9dd113f9402c28223bc7594568e47b29c5d Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 27 Nov 2023 21:50:08 +0000 Subject: [PATCH 16/16] add infernce limit --- deployment/handlers/prediction_app.py | 6 ----- .../src/pipeline/0_startup_cleanup.py | 10 ------- .../src/pipeline/1_prepare_data.py | 20 -------------- .../ml-pipeline/src/pipeline/2_build_model.py | 26 ------------------- .../src/pipeline/3_generate_predictions.py | 12 --------- .../src/pipeline/4_generate_metrics.py | 22 ---------------- .../src/pipeline/configs/build_model.yaml | 2 ++ .../src/pipeline/configs/settings.yaml | 2 +- .../ml-pipeline/src/pipeline/core/Logger.py | 1 + .../ml-pipeline/src/pipeline/core/MLModels.py | 4 +++ .../src/pipeline/generate_predictions.py | 8 ------ 11 files changed, 8 insertions(+), 105 deletions(-) diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 36a906c..ac397b9 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -69,9 +69,7 @@ def handler(event, context): storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet" - logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") - logger.info("-------------------------") build_model_params = settings.build_model client_params = settings.client @@ -80,17 +78,13 @@ def handler(event, context): model = model_factory(build_model_params["model_type"]) - logger.info("----------------------------") logger.info(f"--- Initiate Input DataClient ---") - logger.info("----------------------------") input_dataclient = dataclient_factory( dataclient_type="aws-s3", dataclient_config=client_params["aws-s3"], ) - logger.info("----------------------------") logger.info(f"--- Initiate Output DataClient ---") - logger.info("----------------------------") output_dataclient = dataclient_factory( dataclient_type="aws-s3", dataclient_config=client_params["aws-s3"], diff --git a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py index 0bfa37f..32e8a1b 100644 --- a/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py +++ b/modules/ml-pipeline/src/pipeline/0_startup_cleanup.py @@ -16,13 +16,9 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: Remove the directory where artefacts are stored """ - logger.info("---------------------") logger.info(f"--- Run Clean up ---") - logger.info("---------------------") - logger.info("-------------------------") logger.info(f"--- Delete artefacts ---") - logger.info("-------------------------") artefact_directory_path = Path(artefacts_directory) @@ -31,9 +27,7 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: logger.info(f"Removing the directory: {artefacts_directory}") shutil.rmtree(artefact_directory_path) - logger.info("-----------------------") logger.info(f"--- Delete metrics ---") - logger.info("-----------------------") metrics_directory_path = Path(metrics_directory) @@ -45,15 +39,11 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") run_cleanup( artefacts_directory=startup_cleanup_params["artefacts"], metrics_directory=startup_cleanup_params["metrics"], ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index 32daa19..ed7e057 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -17,9 +17,7 @@ from core.DataClient import dataclient_factory from core.FeatureProcessor import feature_processor_factory from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -33,9 +31,7 @@ output_train_filepath = prepare_data_params["output_train_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"] feature_processor_config = feature_process_params["feature_processor_config"] -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") input_dataclient_type = prepare_data_params["input_dataclient_type"] output_dataclient_type = prepare_data_params["output_dataclient_type"] @@ -49,9 +45,7 @@ output_dataclient = dataclient_factory( dataclient_config=client_params[output_dataclient_type], ) -logger.info("----------------------------------") logger.info(f"--- Initiate FeatureProcessor ---") -logger.info("----------------------------------") feature_processor = feature_processor_factory( feature_process_params["feature_processor_type"] @@ -76,15 +70,11 @@ def prepare_data( :param pipeline_mode: bool, Default False, this caches out the file for experimentation, objects returned in pipeline mode """ - logger.info("--------------------") logger.info("--- Loading data ---") - logger.info("--------------------") data = input_dataclient.load_data(location=data_filepath, load_config={}) - logger.info("--------------------------") logger.info("--- Feature Processing ---") - logger.info("--------------------------") data = feature_processor.feature_process( data, @@ -93,9 +83,7 @@ def prepare_data( new_feature_funcs=new_feature_funcs, ) - logger.info("----------------------") logger.info("--- Splitting data ---") - logger.info("----------------------") if train_proportion == 1: train = data @@ -108,9 +96,7 @@ def prepare_data( train = train.reset_index(drop=True) - logger.info("-----------------------") logger.info("--- Outputting data ---") - logger.info("-----------------------") output_dataclient.save_data( obj=train, location=output_train_filepath, save_config=None @@ -126,13 +112,9 @@ def prepare_data( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("---------------------------") logger.info(f"--- Prepare Data Stage ---") - logger.info("---------------------------") prepare_data( input_dataclient=input_dataclient, @@ -147,6 +129,4 @@ if __name__ == "__main__": new_feature_funcs=new_feature_funcs, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/2_build_model.py b/modules/ml-pipeline/src/pipeline/2_build_model.py index cae5cfd..7ca4951 100644 --- a/modules/ml-pipeline/src/pipeline/2_build_model.py +++ b/modules/ml-pipeline/src/pipeline/2_build_model.py @@ -18,9 +18,7 @@ from core.MLMetrics import metrics_factory from configs.post_prediction_logic import post_prediction_logic from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -40,22 +38,16 @@ train_filepath = prepare_data_params["output_train_filepath"] test_filepath = prepare_data_params["output_test_filepath"] fit_metrics_filepath = build_model_params["fit_metrics_filepath"] -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # Output of previous prepare data step, will be where the data is dataclient = dataclient_factory(prepare_data_params["output_dataclient_type"]) -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(model_type) -logger.info("-------------------------") logger.info(f"--- Initiate Metrics ---") -logger.info("-------------------------") metrics = metrics_factory(generate_metrics_params["metrics_type"]) @@ -75,9 +67,7 @@ def build_model( test_data: Union[pd.DataFrame, None] = None, pipeline_mode: bool = False, ): - logger.info("--------------------------------------") logger.info("--- Loading Data for build process ---") - logger.info("--------------------------------------") if train_data is None: if train_filepath is None: @@ -89,9 +79,7 @@ def build_model( raise ValueError(f"Need {test_filepath} if no data supplied") test_data = dataclient.load_data(location=test_filepath, load_config=None) - logger.info("----------------------") logger.info("--- Training model ---") - logger.info("----------------------") model.train_model( data=train_data.drop(columns=identifier_columns), @@ -99,32 +87,24 @@ def build_model( model_hyperparameters=model_hyperparameters, ) - logger.info("----------------------------------") logger.info("--- Generating fit predictions ---") - logger.info("----------------------------------") fit_predictions = model.predict( data=train_data, post_prediction_logic=post_prediction_logic ) - logger.info("------------------------------") logger.info("--- Generating fit metrics ---") - logger.info("------------------------------") metrics_output = metrics.generate_metrics( target=train_data[target], predictions=pd.Series(fit_predictions), ) - logger.info("--------------------") logger.info("--- Saving model ---") - logger.info("--------------------") model.save_model(path=Path(model_save_location)) - logger.info("--------------------------") logger.info("--- Saving fit metrics ---") - logger.info("--------------------------") dataclient.save_data( obj=metrics_output, location=fit_metrics_filepath, save_config=None @@ -133,13 +113,9 @@ def build_model( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("--------------------------") logger.info(f"--- Build Model Stage ---") - logger.info("--------------------------") build_model( dataclient=dataclient, @@ -154,6 +130,4 @@ if __name__ == "__main__": fit_metrics_filepath=fit_metrics_filepath, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py index 9461392..acb9e99 100644 --- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py @@ -10,9 +10,7 @@ from core.Logger import logger from config import settings from generate_predictions import generate_predictions -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -33,15 +31,11 @@ model_filepath = build_model_params["model_save_filepath"] predictions_output_filepath = generate_predictions_params["predictions_output_filepath"] predictions_column_name = generate_predictions_params["predictions_column_name"] -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(build_model_params["model_type"]) -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # We may have different locations of loading hence why we use one specified in generate_predictions.yaml # I.e. for metric runs, this will be a local data client @@ -59,13 +53,9 @@ output_dataclient = dataclient_factory( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("----------------------------------") logger.info(f"--- Generate Predictions Stage---") - logger.info("----------------------------------") generate_predictions( input_dataclient=input_dataclient, @@ -78,6 +68,4 @@ if __name__ == "__main__": predictions_column_name=predictions_column_name, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py index 7b115a2..937c5be 100644 --- a/modules/ml-pipeline/src/pipeline/4_generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/4_generate_metrics.py @@ -16,9 +16,7 @@ from core.MLMetrics import metrics_factory from core.Logger import logger from config import settings -logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") -logger.info("----------------------------") RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") @@ -36,15 +34,11 @@ predictions_column_name = generate_predictions_params["predictions_column_name"] metrics_output_filepath = generate_metrics_params["metrics_output_filepath"] -logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") -logger.info("-------------------------") model = model_factory(build_model_params["model_type"]) -logger.info("----------------------------") logger.info(f"--- Initiate DataClient ---") -logger.info("----------------------------") # Use data client for input and output, as we use dvc to cache later to the cloud dataclient_type = generate_metrics_params["dataclient_type"] @@ -53,9 +47,7 @@ dataclient = dataclient_factory( dataclient_config=client_params[dataclient_type], ) -logger.info("---------------------------") logger.info(f"--- Initiate MLMetrics ---") -logger.info("---------------------------") metrics = metrics_factory(generate_metrics_params["metrics_type"]) @@ -75,34 +67,26 @@ def generate_metrics( For a given model, we generate prediction and evaluate this against the true target """ - logger.info("-------------------------") logger.info("--- Loading test data ---") - logger.info("-------------------------") test_data = input_dataclient.load_data( location=test_data_filepath, load_config=None ) - logger.info("---------------------------") logger.info("--- Loading predictions ---") - logger.info("---------------------------") predictions = input_dataclient.load_data( location=predictions_output_filepath, load_config=None ) - logger.info("--------------------------") logger.info("--- Generating metrics ---") - logger.info("--------------------------") metrics_output = metrics.generate_metrics( target=test_data[target], predictions=pd.Series(predictions[predictions_column_name]), ) - logger.info("----------------------") logger.info("--- Saving metrics ---") - logger.info("----------------------") output_dataclient.save_data( obj=metrics_output, location=metrics_output_filepath, save_config=None @@ -111,13 +95,9 @@ def generate_metrics( if __name__ == "__main__": - logger.info("----------------------------") logger.info(f"--- {__file__} - Start! ---") - logger.info("----------------------------") - logger.info("------------------------------") logger.info(f"--- Generate Metrics Stage---") - logger.info("------------------------------") generate_metrics( input_dataclient=dataclient, @@ -131,6 +111,4 @@ if __name__ == "__main__": metrics_output_filepath=metrics_output_filepath, ) - logger.info("-------------------------------") logger.info(f"--- {__file__} - Complete! ---") - logger.info("-------------------------------") diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index d296e6a..1ebb62d 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -16,3 +16,5 @@ default: time_limit: 4000 presets: medium_quality excluded_model_types: ['KNN', 'RF'] + infer_limit: 0.05 + infer_limit_batch_size: 10000 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index ce7ed2c..9333c46 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -21,7 +21,7 @@ default: # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet - data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet + data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/core/Logger.py b/modules/ml-pipeline/src/pipeline/core/Logger.py index a0fc231..d2f6c61 100644 --- a/modules/ml-pipeline/src/pipeline/core/Logger.py +++ b/modules/ml-pipeline/src/pipeline/core/Logger.py @@ -21,6 +21,7 @@ def setup_logger(): # Add the stream handler to the logger logger.addHandler(stream_handler) + logger.propagate = False return logger diff --git a/modules/ml-pipeline/src/pipeline/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py index 4cf8b08..4fc572a 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLModels.py +++ b/modules/ml-pipeline/src/pipeline/core/MLModels.py @@ -149,6 +149,8 @@ class AutogluonAutoML: "time_limit", "presets", "excluded_model_types", + "infer_limit", + "infer_limit_batch_size", ] def load_model(self, path: Union[Path, str]) -> None: @@ -203,6 +205,8 @@ class AutogluonAutoML: time_limit=model_hyperparameters["time_limit"], presets=model_hyperparameters["presets"], excluded_model_types=model_hyperparameters["excluded_model_types"], + infer_limit=model_hyperparameters["infer_limit"], + infer_limit_batch_size=model_hyperparameters["infer_limit_batch_size"], ) def predict( diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py index 83ea103..59ce732 100644 --- a/modules/ml-pipeline/src/pipeline/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -20,23 +20,17 @@ def generate_predictions( For a given model, we generate prediction and evaluate this against the true target """ - logger.info("-------------------------") logger.info("--- Loading test data ---") - logger.info("-------------------------") test_data = input_dataclient.load_data( location=test_data_filepath, load_config=None ) - logger.info("---------------------") logger.info("--- Loading model ---") - logger.info("---------------------") model.load_model(model_filepath) - logger.info("------------------------------") logger.info("--- Generating predictions ---") - logger.info("------------------------------") prediction_data = ( test_data.drop(columns=target) if target in test_data.columns else test_data @@ -46,9 +40,7 @@ def generate_predictions( data=prediction_data, post_prediction_logic=post_prediction_logic ) - logger.info("--------------------------") logger.info("--- Saving predictions ---") - logger.info("--------------------------") predictions_df = pd.DataFrame(predictions) predictions_df.columns = [predictions_column_name]