From 57934d0ae37daf61d67534a2671eda318659ba45 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 12:35:34 +0100 Subject: [PATCH 01/11] fixed buffer bug and add id --- deployment/handlers/prediction_app.py | 1 + .../ml-pipeline/src/pipeline/configs/settings.yaml | 1 + modules/ml-pipeline/src/pipeline/core/DataClient.py | 6 ++++++ .../ml-pipeline/src/pipeline/generate_predictions.py | 11 +++++++++-- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 6f9d162..36a906c 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -107,6 +107,7 @@ def handler(event, context): predictions_column_name=generate_predictions_params[ "predictions_column_name" ], + identifier_column=generate_predictions_params["identifier_column"], ) return { diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index a84c095..ce7ed2c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -43,6 +43,7 @@ default: test_data_filepath: ./data/prepared_data/test.parquet predictions_output_filepath: ./data/predictions/predictions.parquet predictions_column_name: predictions + identifier_column: id generate_metrics: dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/core/DataClient.py b/modules/ml-pipeline/src/pipeline/core/DataClient.py index 28ffff7..53f4072 100644 --- a/modules/ml-pipeline/src/pipeline/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/core/DataClient.py @@ -142,9 +142,15 @@ class AWSS3Client: buffer = BytesIO() obj.to_parquet(buffer, index=False) + # Reset the buffer position to the beginning + buffer.seek(0) + bucket, key = location.strip("s3://").split("/", 1) self.client.upload_fileobj(buffer, bucket, key) + # Close the buffer + buffer.close() + def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame: """ Load a parquet file diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py index 85b3022..83ea103 100644 --- a/modules/ml-pipeline/src/pipeline/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -14,6 +14,7 @@ def generate_predictions( test_data_filepath: str, predictions_output_filepath: str, predictions_column_name: str, + identifier_column: str = "id", ): """ For a given model, we generate prediction and evaluate this against the true target @@ -52,6 +53,12 @@ def generate_predictions( predictions_df = pd.DataFrame(predictions) predictions_df.columns = [predictions_column_name] - output_dataclient.save_data( - obj=predictions_df, location=predictions_output_filepath, save_config=None + output_df = ( + pd.concat([test_data[identifier_column], predictions_df], axis=1) + if identifier_column in test_data.columns + else predictions_df + ) + + output_dataclient.save_data( + obj=output_df, location=predictions_output_filepath, save_config=None ) From 69c5c77a8805ca9b95a49bcab783696b437b10d6 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 11:48:13 +0000 Subject: [PATCH 02/11] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 9ab31e5..e0f4245 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.0.3", + "version": "v0.0.4", "stage": { "dev": "v0.0.3" }, From 7a1b9aed5ff82b7173ba8a2c25484e3b04f4001e Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 11:49:02 +0000 Subject: [PATCH 03/11] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index e0f4245..abd2ddb 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.0.4", "stage": { - "dev": "v0.0.3" + "dev": "v0.0.4" }, "registered": true, "active": true From 8dd784255a57d36da94b59778057e57e5fbf2c80 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:28:30 +0100 Subject: [PATCH 04/11] add smape --- .../ml-pipeline/src/pipeline/core/MLMetrics.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py index 4b14386..845b819 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py +++ b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py @@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods: - Generate Plot Suite """ +import numpy as np import pandas as pd from typing import Union from sklearn.metrics import ( @@ -14,6 +15,18 @@ from sklearn.metrics import ( ) from core.interface.InterfaceMetrics import MLMetrics +# Define the function to return the SMAPE value +def symmetric_mape(actual, predicted) -> float: + + # Convert actual and predicted to numpy + # array data type if not already + if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]): + actual, predicted = np.array(actual), np.array(predicted) + + return np.mean( + np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual)) / 2) + ) + def metrics_factory(metrics_type: str) -> MLMetrics: metrics = { @@ -34,7 +47,7 @@ class RegressionMetrics: median_absolute_error, mean_squared_error, mean_absolute_percentage_error, - # max_error + symmetric_mape, ] def generate_metrics( From 6552e975552d99b5fe9fb8757e672c5c94fa833e Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:41:06 +0100 Subject: [PATCH 05/11] fix the register increments --- .github/workflows/MLPipelinePostMerge.yml | 27 ++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index f7c4a8f..daef5d9 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -42,7 +42,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="1.0.0" else - increment_version=$(echo ${latest_version} | awk -F'.' '{OFS="."; $1+=1; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + major = $1 + 1 # Increment the major version + print major, "0", "0" # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} @@ -80,7 +87,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="0.1.0" else - increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$2++; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + minor = $2 + 1 # Increment the minor version + print $1, minor, "0" # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} @@ -118,7 +132,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="0.0.1" else - increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$3++; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + patch = $3 + 1 # Increment the patch version + print $1, $2, patch # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} From 755d00e0e40c15fc9f2844bfe47e297b00d14d6a Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 22:42:45 +0000 Subject: [PATCH 06/11] Update Registry --- MODEL_REGISTRY.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index abd2ddb..10b6ccd 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,11 +8,19 @@ "active": true }, "sap": { - "version": "v0.0.4", + "version": "v0.0.5", "stage": { "dev": "v0.0.4" }, "registered": true, "active": true + }, + "heat": { + "version": "v0.0.1", + "stage": { + "dev": "v0.0.1" + }, + "registered": true, + "active": true } } From 7a113f790e7222642e4bf6fcb42ee6f610dd7a6f Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 22:43:36 +0000 Subject: [PATCH 07/11] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 10b6ccd..0770e78 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.0.5", "stage": { - "dev": "v0.0.4" + "dev": "v0.0.5" }, "registered": true, "active": true From 8bdedf25a240e4e4c55948a491ea075bf0cf8337 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:56:35 +0100 Subject: [PATCH 08/11] final fix for workflow on post merge --- .github/workflows/MLPipelinePostMerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index daef5d9..506b93b 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -217,7 +217,7 @@ jobs: fi new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version} - latest_version=$(gto show model@latest --ref | awk -F"@" '{print $2}') + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}') echo ${new_tag} From bd80c3d69d9cd1d463967413b8f12d340b9949ce Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:58:07 +0100 Subject: [PATCH 09/11] final fix for workflow on post merge --- .github/workflows/MLPipelinePostMerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index 506b93b..e4411b3 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -209,7 +209,7 @@ jobs: git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/') + latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}') if [ -z "${latest_dev_version}" ]; then increment_version="1" else From c668e4227c30cca24744bdd24a6ddbf92f268049 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 22:59:21 +0000 Subject: [PATCH 10/11] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 0770e78..ec8859d 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.0.5", + "version": "v0.1.0", "stage": { "dev": "v0.0.5" }, From 4597c12795017a19eab68095711c0dc2b51f5e65 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 23:00:04 +0000 Subject: [PATCH 11/11] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index ec8859d..b3ad75a 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.1.0", "stage": { - "dev": "v0.0.5" + "dev": "v0.1.0" }, "registered": true, "active": true