diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index f7c4a8f..e4411b3 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -42,7 +42,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="1.0.0" else - increment_version=$(echo ${latest_version} | awk -F'.' '{OFS="."; $1+=1; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + major = $1 + 1 # Increment the major version + print major, "0", "0" # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} @@ -80,7 +87,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="0.1.0" else - increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$2++; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + minor = $2 + 1 # Increment the minor version + print $1, minor, "0" # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} @@ -118,7 +132,14 @@ jobs: if [ -z "${latest_version}" ]; then increment_version="0.0.1" else - increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$3++; print}') + increment_version=$(echo ${latest_version} | awk 'BEGIN { + FS="\\." # Set the field separator to a period + OFS="." # Set the output field separator to a period + } + { + patch = $3 + 1 # Increment the patch version + print $1, $2, patch # Print the new version + }') fi new_tag=${REGISTER_MODEL_NAME}@v${increment_version} @@ -188,7 +209,7 @@ jobs: git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/') + latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}') if [ -z "${latest_dev_version}" ]; then increment_version="1" else @@ -196,7 +217,7 @@ jobs: fi new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version} - latest_version=$(gto show model@latest --ref | awk -F"@" '{print $2}') + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}') echo ${new_tag} diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index 9ab31e5..b3ad75a 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -8,9 +8,17 @@ "active": true }, "sap": { - "version": "v0.0.3", + "version": "v0.1.0", "stage": { - "dev": "v0.0.3" + "dev": "v0.1.0" + }, + "registered": true, + "active": true + }, + "heat": { + "version": "v0.0.1", + "stage": { + "dev": "v0.0.1" }, "registered": true, "active": true diff --git a/README.md b/README.md index 5879c41..22a6002 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ tracking and a model registry - A bolt-on service that can implement model monitoring There are multiple protected branches which adapt the generic pipeline to produce different models: -- sap_change-** -- heat_change-** -- carbon_change-** +- sap-{dev/staging/prod}-** +- heat-{dev/staging/prod}-** +- carbon-{dev/staging/prod}-** These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline - There can be different additional logic for each branch but the pipeline will be the same. @@ -31,7 +31,7 @@ In order for this to be set up, some key environment variables needs to be inser secrets. Each different model and protected branch has its own set of secrets which allows for flexibility between different pipelines. -For example, for the branch sap_change-dev, the prefix=SAP_CHANGE_DEV, and the following secrets are: +For example, for the branch sap-dev, the prefix=SAP_DEV, and the following secrets are: - {prefix}_ECR_URI, which is the URI of the ECR repository to push to. For example, for the sap change model this is the lambda-sap-prediction-dev repository. @@ -58,7 +58,7 @@ First, navigate to the root directory of the repository. Open a terminal and exe 2. command to build the Docker image: ```bash -docker build -t sap_change -f deployment/Dockerfile.prediction.lambda . +docker build -t sap -f deployment/Dockerfile.prediction.lambda . ``` This will build a Docker image tagged as sap_change using the Dockerfile.prediction.lambda located @@ -68,7 +68,7 @@ in the deployment directory. Once the image is built, you can run it using the following command: ```bash -docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev sap_change +docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev sap ``` This command does the following: @@ -79,6 +79,7 @@ Sets the RUNTIME_ENVIRONMENT variable to dev. To test the Lambda function, use the following curl command: ```json -curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"' +curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}' ``` + This will send a POST request to the running Lambda function and pass in the required data as JSON. diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 6f9d162..36a906c 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -107,6 +107,7 @@ def handler(event, context): predictions_column_name=generate_predictions_params[ "predictions_column_name" ], + identifier_column=generate_predictions_params["identifier_column"], ) return { diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index a84c095..ce7ed2c 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -43,6 +43,7 @@ default: test_data_filepath: ./data/prepared_data/test.parquet predictions_output_filepath: ./data/predictions/predictions.parquet predictions_column_name: predictions + identifier_column: id generate_metrics: dataclient_type: local diff --git a/modules/ml-pipeline/src/pipeline/core/DataClient.py b/modules/ml-pipeline/src/pipeline/core/DataClient.py index 28ffff7..53f4072 100644 --- a/modules/ml-pipeline/src/pipeline/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/core/DataClient.py @@ -142,9 +142,15 @@ class AWSS3Client: buffer = BytesIO() obj.to_parquet(buffer, index=False) + # Reset the buffer position to the beginning + buffer.seek(0) + bucket, key = location.strip("s3://").split("/", 1) self.client.upload_fileobj(buffer, bucket, key) + # Close the buffer + buffer.close() + def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame: """ Load a parquet file diff --git a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py index 4b14386..845b819 100644 --- a/modules/ml-pipeline/src/pipeline/core/MLMetrics.py +++ b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py @@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods: - Generate Plot Suite """ +import numpy as np import pandas as pd from typing import Union from sklearn.metrics import ( @@ -14,6 +15,18 @@ from sklearn.metrics import ( ) from core.interface.InterfaceMetrics import MLMetrics +# Define the function to return the SMAPE value +def symmetric_mape(actual, predicted) -> float: + + # Convert actual and predicted to numpy + # array data type if not already + if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]): + actual, predicted = np.array(actual), np.array(predicted) + + return np.mean( + np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual)) / 2) + ) + def metrics_factory(metrics_type: str) -> MLMetrics: metrics = { @@ -34,7 +47,7 @@ class RegressionMetrics: median_absolute_error, mean_squared_error, mean_absolute_percentage_error, - # max_error + symmetric_mape, ] def generate_metrics( diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py index 85b3022..83ea103 100644 --- a/modules/ml-pipeline/src/pipeline/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -14,6 +14,7 @@ def generate_predictions( test_data_filepath: str, predictions_output_filepath: str, predictions_column_name: str, + identifier_column: str = "id", ): """ For a given model, we generate prediction and evaluate this against the true target @@ -52,6 +53,12 @@ def generate_predictions( predictions_df = pd.DataFrame(predictions) predictions_df.columns = [predictions_column_name] - output_dataclient.save_data( - obj=predictions_df, location=predictions_output_filepath, save_config=None + output_df = ( + pd.concat([test_data[identifier_column], predictions_df], axis=1) + if identifier_column in test_data.columns + else predictions_df + ) + + output_dataclient.save_data( + obj=output_df, location=predictions_output_filepath, save_config=None )