From dda9065a88b604b1c2ccf8e163d5289cbe328d5e Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Tue, 10 Oct 2023 11:45:50 +0100 Subject: [PATCH 01/16] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 5879c41..6a63c61 100644 --- a/README.md +++ b/README.md @@ -81,4 +81,8 @@ To test the Lambda function, use the following curl command: ```json curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"' ``` + +```json +curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}' +``` This will send a POST request to the running Lambda function and pass in the required data as JSON. From d3b1bb4bb9bf03a1d5708e1c4f77642d4e33112a Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Tue, 10 Oct 2023 11:49:37 +0100 Subject: [PATCH 02/16] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6a63c61..a37f5fb 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,6 @@ curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d ``` ```json -curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}' +curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}' ``` This will send a POST request to the running Lambda function and pass in the required data as JSON. From 391cc6643591c4eeb1dd516d4d0465d0b9ea817a Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Tue, 10 Oct 2023 11:53:52 +0100 Subject: [PATCH 03/16] Update README.md --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index a37f5fb..7f018ad 100644 --- a/README.md +++ b/README.md @@ -79,10 +79,7 @@ Sets the RUNTIME_ENVIRONMENT variable to dev. To test the Lambda function, use the following curl command: ```json -curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"' +curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}' ``` -```json -curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}' -``` This will send a POST request to the running Lambda function and pass in the required data as JSON. From 70b3008dc5f19214cecbbec377c53529cadd428d Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Tue, 10 Oct 2023 11:56:56 +0100 Subject: [PATCH 04/16] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7f018ad..55cae8e 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ In order for this to be set up, some key environment variables needs to be inser secrets. Each different model and protected branch has its own set of secrets which allows for flexibility between different pipelines. -For example, for the branch sap_change-dev, the prefix=SAP_CHANGE_DEV, and the following secrets are: +For example, for the branch sap-dev, the prefix=SAP_DEV, and the following secrets are: - {prefix}_ECR_URI, which is the URI of the ECR repository to push to. For example, for the sap change model this is the lambda-sap-prediction-dev repository. @@ -58,7 +58,7 @@ First, navigate to the root directory of the repository. Open a terminal and exe 2. command to build the Docker image: ```bash -docker build -t sap_change -f deployment/Dockerfile.prediction.lambda . +docker build -t sap -f deployment/Dockerfile.prediction.lambda . ``` This will build a Docker image tagged as sap_change using the Dockerfile.prediction.lambda located @@ -68,7 +68,7 @@ in the deployment directory. Once the image is built, you can run it using the following command: ```bash -docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev sap_change +docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev sap ``` This command does the following: From 57934d0ae37daf61d67534a2671eda318659ba45 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 12:35:34 +0100 Subject: [PATCH 05/16] fixed buffer bug and add id --- deployment/handlers/prediction_app.py | 1 + .../ml-pipeline/src/pipeline/configs/settings.yaml | 1 + modules/ml-pipeline/src/pipeline/core/DataClient.py | 6 ++++++ .../ml-pipeline/src/pipeline/generate_predictions.py | 11 +++++++++-- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 6f9d162..36a906c 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -107,6 +107,7 @@ def handler(event, context): predictions_column_name=generate_predictions_params[ "predictions_column_name" ], + identifier_column=generate_predictions_params["identifier_column"], ) return { diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index a84c095..ce7ed2c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -43,6 +43,7 @@ default: test_data_filepath: ./data/prepared_data/test.parquet predictions_output_filepath: ./data/predictions/predictions.parquet predictions_column_name: predictions + identifier_column: id generate_metrics: dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/core/DataClient.py b/modules/ml-pipeline/src/pipeline/core/DataClient.py index 28ffff7..53f4072 100644 --- a/modules/ml-pipeline/src/pipeline/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/core/DataClient.py @@ -142,9 +142,15 @@ class AWSS3Client: buffer = BytesIO() obj.to_parquet(buffer, index=False) + # Reset the buffer position to the beginning + buffer.seek(0) + bucket, key = location.strip("s3://").split("/", 1) self.client.upload_fileobj(buffer, bucket, key) + # Close the buffer + buffer.close() + def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame: """ Load a parquet file diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py index 85b3022..83ea103 100644 --- a/modules/ml-pipeline/src/pipeline/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -14,6 +14,7 @@ def generate_predictions( test_data_filepath: str, predictions_output_filepath: str, predictions_column_name: str, + identifier_column: str = "id", ): """ For a given model, we generate prediction and evaluate this against the true target @@ -52,6 +53,12 @@ def generate_predictions( predictions_df = pd.DataFrame(predictions) predictions_df.columns = [predictions_column_name] - output_dataclient.save_data( - obj=predictions_df, location=predictions_output_filepath, save_config=None + output_df = ( + pd.concat([test_data[identifier_column], predictions_df], axis=1) + if identifier_column in test_data.columns + else predictions_df + ) + + output_dataclient.save_data( + obj=output_df, location=predictions_output_filepath, save_config=None ) From 69c5c77a8805ca9b95a49bcab783696b437b10d6 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 11:48:13 +0000 Subject: [PATCH 06/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 9ab31e5..e0f4245 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.0.3", + "version": "v0.0.4", "stage": { "dev": "v0.0.3" }, From 7a1b9aed5ff82b7173ba8a2c25484e3b04f4001e Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 11:49:02 +0000 Subject: [PATCH 07/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index e0f4245..abd2ddb 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.0.4", "stage": { - "dev": "v0.0.3" + "dev": "v0.0.4" }, "registered": true, "active": true From 051f07df77a1e1f1775df32c59b3b96a7438d72a Mon Sep 17 00:00:00 2001 From: quandanrepo <45804868+quandanrepo@users.noreply.github.com> Date: Tue, 10 Oct 2023 14:02:54 +0100 Subject: [PATCH 08/16] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 55cae8e..22a6002 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ tracking and a model registry - A bolt-on service that can implement model monitoring There are multiple protected branches which adapt the generic pipeline to produce different models: -- sap_change-** -- heat_change-** -- carbon_change-** +- sap-{dev/staging/prod}-** +- heat-{dev/staging/prod}-** +- carbon-{dev/staging/prod}-** These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline - There can be different additional logic for each branch but the pipeline will be the same. From 8dd784255a57d36da94b59778057e57e5fbf2c80 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:28:30 +0100 Subject: [PATCH 09/16] add smape --- .../ml-pipeline/src/pipeline/core/MLMetrics.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py index 4b14386..845b819 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py +++ b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py @@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods: - Generate Plot Suite """ +import numpy as np import pandas as pd from typing import Union from sklearn.metrics import ( @@ -14,6 +15,18 @@ from sklearn.metrics import ( ) from core.interface.InterfaceMetrics import MLMetrics +# Define the function to return the SMAPE value +def symmetric_mape(actual, predicted) -> float: + + # Convert actual and predicted to numpy + # array data type if not already + if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]): + actual, predicted = np.array(actual), np.array(predicted) + + return np.mean( + np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual)) / 2) + ) + def metrics_factory(metrics_type: str) -> MLMetrics: metrics = { @@ -34,7 +47,7 @@ class RegressionMetrics: median_absolute_error, mean_squared_error, mean_absolute_percentage_error, - # max_error + symmetric_mape, ] def generate_metrics( From 6552e975552d99b5fe9fb8757e672c5c94fa833e Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:41:06 +0100 Subject: [PATCH 10/16] fix the register increments --- .github/workflows/MLPipelinePostMerge.yml | 27 ++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index f7c4a8f..daef5d9 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -42,7 +42,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="1.0.0" else - increment_version=$(echo ${latest_version} | awk -F'.' '{OFS="."; $1+=1; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + major = $1 + 1 # Increment the major version + print major, "0", "0" # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} @@ -80,7 +87,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="0.1.0" else - increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$2++; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + minor = $2 + 1 # Increment the minor version + print $1, minor, "0" # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} @@ -118,7 +132,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="0.0.1" else - increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$3++; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + patch = $3 + 1 # Increment the patch version + print $1, $2, patch # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} From 755d00e0e40c15fc9f2844bfe47e297b00d14d6a Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 22:42:45 +0000 Subject: [PATCH 11/16] Update Registry --- MODEL_REGISTRY.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index abd2ddb..10b6ccd 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,11 +8,19 @@ "active": true }, "sap": { - "version": "v0.0.4", + "version": "v0.0.5", "stage": { "dev": "v0.0.4" }, "registered": true, "active": true + }, + "heat": { + "version": "v0.0.1", + "stage": { + "dev": "v0.0.1" + }, + "registered": true, + "active": true } } From 7a113f790e7222642e4bf6fcb42ee6f610dd7a6f Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 22:43:36 +0000 Subject: [PATCH 12/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 10b6ccd..0770e78 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.0.5", "stage": { - "dev": "v0.0.4" + "dev": "v0.0.5" }, "registered": true, "active": true From 8bdedf25a240e4e4c55948a491ea075bf0cf8337 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:56:35 +0100 Subject: [PATCH 13/16] final fix for workflow on post merge --- .github/workflows/MLPipelinePostMerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index daef5d9..506b93b 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -217,7 +217,7 @@ jobs: fi new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version} - latest_version=$(gto show model@latest --ref | awk -F"@" '{print $2}') + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}') echo ${new_tag} From bd80c3d69d9cd1d463967413b8f12d340b9949ce Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 10 Oct 2023 23:58:07 +0100 Subject: [PATCH 14/16] final fix for workflow on post merge --- .github/workflows/MLPipelinePostMerge.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index 506b93b..e4411b3 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -209,7 +209,7 @@ jobs: git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/') + latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}') if [ -z "${latest_dev_version}" ]; then increment_version="1" else From c668e4227c30cca24744bdd24a6ddbf92f268049 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 22:59:21 +0000 Subject: [PATCH 15/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 0770e78..ec8859d 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,7 +8,7 @@ "active": true }, "sap": { - "version": "v0.0.5", + "version": "v0.1.0", "stage": { "dev": "v0.0.5" }, From 4597c12795017a19eab68095711c0dc2b51f5e65 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 10 Oct 2023 23:00:04 +0000 Subject: [PATCH 16/16] Update Registry --- MODEL_REGISTRY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index ec8859d..b3ad75a 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -10,7 +10,7 @@ "sap": { "version": "v0.1.0", "stage": { - "dev": "v0.0.5" + "dev": "v0.1.0" }, "registered": true, "active": true