Merge branch 'sap-dev' of github.com:Hestia-Homes/ML into sap-dev-model

This commit is contained in:
Michael Duong 2023-10-17 23:53:43 +00:00
commit 17fad3cf0a
8 changed files with 75 additions and 17 deletions

View file

@ -42,7 +42,14 @@ jobs:
if [ -z "${latest_version}" ]; then
increment_version="1.0.0"
else
increment_version=$(echo ${latest_version} | awk -F'.' '{OFS="."; $1+=1; print}')
increment_version=$(echo ${latest_version} | awk 'BEGIN {
FS="\\." # Set the field separator to a period
OFS="." # Set the output field separator to a period
}
{
major = $1 + 1 # Increment the major version
print major, "0", "0" # Print the new version
}')
fi
new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -80,7 +87,14 @@ jobs:
if [ -z "${latest_version}" ]; then
increment_version="0.1.0"
else
increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$2++; print}')
increment_version=$(echo ${latest_version} | awk 'BEGIN {
FS="\\." # Set the field separator to a period
OFS="." # Set the output field separator to a period
}
{
minor = $2 + 1 # Increment the minor version
print $1, minor, "0" # Print the new version
}')
fi
new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -118,7 +132,14 @@ jobs:
if [ -z "${latest_version}" ]; then
increment_version="0.0.1"
else
increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$3++; print}')
increment_version=$(echo ${latest_version} | awk 'BEGIN {
FS="\\." # Set the field separator to a period
OFS="." # Set the output field separator to a period
}
{
patch = $3 + 1 # Increment the patch version
print $1, $2, patch # Print the new version
}')
fi
new_tag=${REGISTER_MODEL_NAME}@v${increment_version}
@ -188,7 +209,7 @@ jobs:
git config user.name "Github-Bot"
git config user.email "Github-Bot@no-reply.com"
latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/')
latest_dev_version=$(gto history ${REGISTER_MODEL_NAME} --asc --plain | awk '{print $NF}' | awk '/dev/' | awk 'END {print}')
if [ -z "${latest_dev_version}" ]; then
increment_version="1"
else
@ -196,7 +217,7 @@ jobs:
fi
new_tag=${REGISTER_MODEL_NAME}#dev#${increment_version}
latest_version=$(gto show model@latest --ref | awk -F"@" '{print $2}')
latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@" '{print $2}')
echo ${new_tag}

View file

@ -8,9 +8,17 @@
"active": true
},
"sap": {
"version": "v0.0.3",
"version": "v0.1.0",
"stage": {
"dev": "v0.0.3"
"dev": "v0.1.0"
},
"registered": true,
"active": true
},
"heat": {
"version": "v0.0.1",
"stage": {
"dev": "v0.0.1"
},
"registered": true,
"active": true

View file

@ -10,9 +10,9 @@ tracking and a model registry
- A bolt-on service that can implement model monitoring
There are multiple protected branches which adapt the generic pipeline to produce different models:
- sap_change-**
- heat_change-**
- carbon_change-**
- sap-{dev/staging/prod}-**
- heat-{dev/staging/prod}-**
- carbon-{dev/staging/prod}-**
These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline
- There can be different additional logic for each branch but the pipeline will be the same.
@ -31,7 +31,7 @@ In order for this to be set up, some key environment variables needs to be inser
secrets. Each different model and protected branch has its own set of secrets which allows for flexibility
between different pipelines.
For example, for the branch sap_change-dev, the prefix=SAP_CHANGE_DEV, and the following secrets are:
For example, for the branch sap-dev, the prefix=SAP_DEV, and the following secrets are:
- {prefix}_ECR_URI, which is the URI of the ECR repository to push to. For example, for the
sap change model this is the lambda-sap-prediction-dev repository.
@ -58,7 +58,7 @@ First, navigate to the root directory of the repository. Open a terminal and exe
2. command to build the Docker image:
```bash
docker build -t sap_change -f deployment/Dockerfile.prediction.lambda .
docker build -t sap -f deployment/Dockerfile.prediction.lambda .
```
This will build a Docker image tagged as sap_change using the Dockerfile.prediction.lambda located
@ -68,7 +68,7 @@ in the deployment directory.
Once the image is built, you can run it using the following command:
```bash
docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev sap_change
docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev sap
```
This command does the following:
@ -79,6 +79,7 @@ Sets the RUNTIME_ENVIRONMENT variable to dev.
To test the Lambda function, use the following curl command:
```json
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"'
curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/sap_change_model/one_sample_test_dataset.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"}'
```
This will send a POST request to the running Lambda function and pass in the required data as JSON.

View file

@ -107,6 +107,7 @@ def handler(event, context):
predictions_column_name=generate_predictions_params[
"predictions_column_name"
],
identifier_column=generate_predictions_params["identifier_column"],
)
return {

View file

@ -43,6 +43,7 @@ default:
test_data_filepath: ./data/prepared_data/test.parquet
predictions_output_filepath: ./data/predictions/predictions.parquet
predictions_column_name: predictions
identifier_column: id
generate_metrics:
dataclient_type: local

View file

@ -142,9 +142,15 @@ class AWSS3Client:
buffer = BytesIO()
obj.to_parquet(buffer, index=False)
# Reset the buffer position to the beginning
buffer.seek(0)
bucket, key = location.strip("s3://").split("/", 1)
self.client.upload_fileobj(buffer, bucket, key)
# Close the buffer
buffer.close()
def _load_parquet(self, location: str, load_config: dict) -> pd.DataFrame:
"""
Load a parquet file

View file

@ -4,6 +4,7 @@ Implementation of MLMetrics, all of which will have two methods:
- Generate Plot Suite
"""
import numpy as np
import pandas as pd
from typing import Union
from sklearn.metrics import (
@ -14,6 +15,18 @@ from sklearn.metrics import (
)
from core.interface.InterfaceMetrics import MLMetrics
# Define the function to return the SMAPE value
def symmetric_mape(actual, predicted) -> float:
# Convert actual and predicted to numpy
# array data type if not already
if not all([isinstance(actual, np.ndarray), isinstance(predicted, np.ndarray)]):
actual, predicted = np.array(actual), np.array(predicted)
return np.mean(
np.abs(predicted - actual) / ((np.abs(predicted) + np.abs(actual)) / 2)
)
def metrics_factory(metrics_type: str) -> MLMetrics:
metrics = {
@ -34,7 +47,7 @@ class RegressionMetrics:
median_absolute_error,
mean_squared_error,
mean_absolute_percentage_error,
# max_error
symmetric_mape,
]
def generate_metrics(

View file

@ -14,6 +14,7 @@ def generate_predictions(
test_data_filepath: str,
predictions_output_filepath: str,
predictions_column_name: str,
identifier_column: str = "id",
):
"""
For a given model, we generate prediction and evaluate this against the true target
@ -52,6 +53,12 @@ def generate_predictions(
predictions_df = pd.DataFrame(predictions)
predictions_df.columns = [predictions_column_name]
output_dataclient.save_data(
obj=predictions_df, location=predictions_output_filepath, save_config=None
output_df = (
pd.concat([test_data[identifier_column], predictions_df], axis=1)
if identifier_column in test_data.columns
else predictions_df
)
output_dataclient.save_data(
obj=output_df, location=predictions_output_filepath, save_config=None
)