From e2ce04aa0db275c8ce0b26bc419fb06e744f0281 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 11:45:51 +0100 Subject: [PATCH 01/16] formatted --- .github/workflows/Deploy.yml | 115 ++++++++++++++++++++++++ README.md | 25 +++++- deployment/Dockerfile.prediction.lambda | 21 +++++ deployment/handlers/prediction_app.py | 93 +++++++++++++++++++ deployment/serverless.yml | 56 ++++++++++++ 5 files changed, 307 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/Deploy.yml create mode 100644 deployment/Dockerfile.prediction.lambda create mode 100644 deployment/handlers/prediction_app.py create mode 100644 deployment/serverless.yml diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml new file mode 100644 index 0000000..f97bddc --- /dev/null +++ b/.github/workflows/Deploy.yml @@ -0,0 +1,115 @@ +name: Sap Change Model Deploy + +on: + push: + branches: [ sap_change-dev, sap_change-prod ] + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.10.12 + + - name: Install Serverless and plugins + run: | + npm install -g serverless + npm install -g serverless-domain-manager + + - name: AWS credentials for dev + if: github.ref == 'refs/heads/dev' + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + aws-region: eu-west-2 + + - name: AWS credentials for prod + if: github.ref == 'refs/heads/prod' + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }} + aws-region: eu-west-2 + + # Set up all of the secrets required for the deployment + - name: set secret prefix which is used across multiple steps + id: secret_prefix + run: | + # Convert 
branch name to uppercase and replace hyphens with underscores + echo "::set-output name=secret_prefix::$(echo "${{ github.ref_name }}" | tr 'a-z-' 'A-Z_')" + + - name: Set domain name + id: set_domain + run: echo "::set-output name=domain::${{ secrets[format('{0}_DOMAIN_NAME', steps.secret_prefix.outputs.secret_prefix)] }}" + + - name: Set ECR credentials + id: set_ecr_credentials + run: | + # Fetch the secret using the secret prefix + echo "::set-output name=ecr_uri::${{ secrets[format('{0}_ECR_URI', steps.secret_prefix.outputs.secret_prefix)] }}" + + - name: Set S3 buckets + id: set_s3_buckets + run: | + # Fetch the secret using the secret prefix + echo "::set-output name=data_bucket::${{ secrets[format('{0}_DATA_BUCKET', steps.secret_prefix.outputs.secret_prefix)] }}" + echo "::set-output name=predictions_bucket::${{ secrets[format('{0}_PREDICTIONS_BUCKET', steps.secret_prefix.outputs.secret_prefix)] }}" + echo "::set-output name=model_directory_bucket::${{ secrets[format('{0}_MODEL_DIRECTORY_BUCKET', steps.secret_prefix.outputs.secret_prefix)] }}" + + - name: Set stack_name + id: set_stack_name + run: | + if [[ "${{ github.ref_name }}" == "sap_change-dev" || "${{ github.ref_name }}" == "sap_change-prod" ]]; then + echo "::set-output name=stack_name::sapmodel" + else + echo "::set-output name=stack_name::" + fi + + - name: Set runtime_environment + id: set_runtime_environment + run: | + # Extract the suffix after the hyphen from the branch name + runtime_environment=$(echo "${{ github.ref_name }}" | awk -F'-' '{print $NF}') + echo "::set-output name=runtime_environment::$runtime_environment" + + + - name: Setup Docker + uses: docker/setup-buildx-action@v1 + + - name: Login to ECR + run: | + aws ecr get-login-password --region eu-west-2 | docker login --username AWS --password-stdin ${{ steps.set_ecr_credentials.outputs.ecr_uri }} + + # Building and pushing Docker image with caching + - name: Build and push Docker image + uses: docker/build-push-action@v3 + with: 
+ context: ./model_data/simulation_system + file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda + push: true + tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }} + cache-from: type=gha + cache-to: type=gha,mode=max + platform: linux/amd64 + provenance: false + + - name: Deploy to AWS Lambda via Serverless + env: + RUNTIME_ENVIRONMENT: ${{ steps.set_runtime_environment.outputs.runtime_environment }} + MODEL_DIRECTORY_BUCKET: ${{ steps.s3_buckets.outputs.model_directory_bucket }} + PREDICTIONS_BUCKET: ${{ steps.s3_buckets.outputs.predictions_bucket }} + DATA_BUCKET: ${{ steps.s3_buckets.outputs.data_bucket }} + DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }} + ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }} + GITHUB_SHA: ${{ github.sha }} + STACK_NAME: ${{ steps.set_stack_name.outputs.stack_name }} + run: | + # Deploy to AWS Lambda via Serverless + sls deploy --config serverless.yml --stage ${{ steps.set_runtime_environment.outputs.runtime_environment }} --verbose diff --git a/README.md b/README.md index 35242a0..638520a 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Creating a ML-toolkit that can be reused: - ML pipeline: - - A generic pipeline that has data version control, experiment + - A generic pipeline that has data version control, experiment tracking and a model registry - ML monitoring: @@ -17,7 +17,26 @@ There are multiple protected branches which adapt the generic pipeline to produc These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline - There can be different additional logic for each branch but the pipeline will be the same. -# Deployment +# Deployment -TBD +Scripts associated to deployment can be found in the deployment/ folder. +Deployment is automated via Github Actions, where a deployment is triggered by a push to one of the +protected branch, with one of dev or prod as the suffix, describing the target environment. 
+ +The github actions file will build and push a docker image to ECR and then deploy a lambda +which produces predictions for the relevant model. + +In order for this to be set up, some key environment variables need to be inserted into Github +secrets. Each different model and protected branch has its own set of secrets which allows for flexibility +between different pipelines. + +For example, for the branch sap_change-dev, the prefix=SAP_CHANGE_DEV, and the following secrets are: + +- {prefix}_ECR_URI, which is the URI of the ECR repository to push to. For example, for the + sap change model this is the lambda-sap-prediction-dev repository. +- {prefix}_DOMAIN_NAME, is the custom domain name. This is likely going to be the same across the different + models, but is still included in the secrets for flexibility. +- {prefix}_DATA_BUCKET, is the name of the s3 data bucket where data to be scored by the model is stored +- {prefix}_MODEL_BUCKET, is the name of the s3 bucket where the model is stored +- {prefix}_PREDICTIONS_BUCKET, is the name of the s3 bucket where the predictions are stored diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda new file mode 100644 index 0000000..8a759fd --- /dev/null +++ b/deployment/Dockerfile.prediction.lambda @@ -0,0 +1,21 @@ +FROM public.ecr.aws/lambda/python:3.10 + +# Set the working directory +WORKDIR ${LAMBDA_TASK_ROOT} +ENV PYTHONPATH "${PYTHONPATH}:${LAMBDA_TASK_ROOT}" + +# Install necessary build tools - required to test locally +RUN yum install -y gcc python3-devel + +# Install python packages +COPY modules/ml-pipeline/src/requirements/predictions/requirements.txt ./requirements.txt +RUN pip install --no-cache-dir -r ./requirements.txt + +# Copy the project code +COPY modules/ml-pipeline/src/pipeline ./pipeline +# Copy the handler +COPY deployment/handlers/prediction_app.py prediction_app.py +# Get the model +# RUN dev pull -r ${RUNTIME_ENVIRONMENT} + +CMD [ 
"prediction_handler.handler" ] diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py new file mode 100644 index 0000000..da73742 --- /dev/null +++ b/deployment/handlers/prediction_app.py @@ -0,0 +1,93 @@ +""" +This script is the handler for the lambda prediction function, responsible +for producting predictions for a model +""" + +import boto3 +from botocore.exceptions import NoCredentialsError +import json +from io import StringIO +import os +import logging + +logger = logging.getLogger() +logger.setLevel(logging.INFO) + +RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "dev") + + +def upload_dataframe_to_s3(df, bucket, s3_file_name): + """ + Upload a pandas DataFrame to an S3 bucket as CSV + + :param df: DataFrame to upload + :param bucket: Bucket to upload to + :param s3_file_name: S3 object name + :return: True if file was uploaded, else False + """ + + # Initialize the S3 client + s3 = boto3.client("s3") + csv_buffer = StringIO() + + # Write the DataFrame to the buffer as CSV + df.to_csv(csv_buffer, index=False) + + try: + # Upload the CSV from the buffer to S3 + s3.put_object(Bucket=bucket, Key=s3_file_name, Body=csv_buffer.getvalue()) + print(f"Successfully uploaded DataFrame to {bucket}/{s3_file_name}") + return True + except NoCredentialsError: + print("Credentials not available") + return False + + +def handler(event, context): + """ + Take in event and trigger the prediction pipeline + """ + + logger.info("received event: " + str(event)) + + try: + body = ( + json.loads(event["body"]) + if not isinstance(event["body"], dict) + else event["body"] + ) + + logger.info("Inside handler with body: " + str(body)) + + data_path = body["file_location"] + property_id = body["property_id"] + portfolio_id = body["portfolio_id"] + created_at = body["created_at"] + + # TODO: Implement the loading of the model and prediction + + storage_filepath = f"{portfolio_id}/{property_id}/{created_at}.csv" + + # 
upload_dataframe_to_s3( + # df=outputs, + # bucket=f"retrofit-sap-predictions-{RUNTIME_ENVIRONMENT}", + # s3_file_name=storage_filepath + # ) + + return { + "statusCode": 200, + "body": json.dumps( + { + "message": "Successfully processed input", + "storage_filepath": storage_filepath, + } + ), + } + + except (Exception, KeyError, ValueError) as e: + logger.info("Prediction failed") + logger.info(e) + return { + "statusCode": 500, + "body": json.dumps({"message": "Prediction failed", "error": str(e)}), + } diff --git a/deployment/serverless.yml b/deployment/serverless.yml new file mode 100644 index 0000000..e546f13 --- /dev/null +++ b/deployment/serverless.yml @@ -0,0 +1,56 @@ +service: ${env:STACK_NAME} + +provider: + name: aws + region: eu-west-2 + architecture: x86_64 + environment: + RUNTIME_ENVIRONMENT: ${env:RUNTIME_ENVIRONMENT} + MODEL_DIRECTORY_BUCKET: ${env:MODEL_DIRECTORY_BUCKET} + PREDICTIONS_BUCKET: ${env:PREDICTIONS_BUCKET} + DATA_BUCKET: ${env:DATA_BUCKET} + DOMAIN_NAME: ${env:DOMAIN_NAME} + ECR_URI: ${env:ECR_URI} + GITHUB_SHA: ${env:GITHUB_SHA} + iam: + role: + name: ${env:STACK_NAME}_s3_access + statements: + # Allow reading from MODEL_DIRECTORY_BUCKET and DATA_BUCKET + - Effect: Allow + Action: + - s3:* + Resource: + - arn:aws:s3:::${env:MODEL_DIRECTORY_BUCKET} + - arn:aws:s3:::${env:MODEL_DIRECTORY_BUCKET}/* + - arn:aws:s3:::${env:DATA_BUCKET} + - arn:aws:s3:::${env:DATA_BUCKET}/* + # Allow reading and writing to PREDICTIONS_BUCKET + - Effect: Allow + Action: + - s3:* + Resource: + - arn:aws:s3:::${env:PREDICTIONS_BUCKET} + - arn:aws:s3:::${env:PREDICTIONS_BUCKET}/* + + + +plugins: + - serverless-domain-manager + +custom: + customDomain: + domainName: api.${self:provider.environment.DOMAIN_NAME} + basePath: ${env:STACK_NAME} + createRoute53Record: true + certificateArn: ${ssm:/ssl_certificate_arn} + +functions: + sap_prediction_lambda: + image: + uri: ${env:ECR_URI}:${env:GITHUB_SHA} + events: + - http: + path: /predict + method: POST + timeout: 
120 # Set max run time to 2 minutes - we shouldn't need this much time so this can be reviewed From 6b96c084c2cb1485b6f66106c6a9ecb13a7d65fc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 11:49:08 +0100 Subject: [PATCH 02/16] added build arguments to github actions --- .github/workflows/Deploy.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index f97bddc..67d8cb6 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -91,14 +91,16 @@ jobs: - name: Build and push Docker image uses: docker/build-push-action@v3 with: - context: ./model_data/simulation_system - file: ./model_data/simulation_system/Dockerfiles/Dockerfile.prediction.lambda + context: ./modules/ml-pipeline/src/pipeline + file: ./deployment/Dockerfile.prediction.lambda push: true tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }} cache-from: type=gha cache-to: type=gha,mode=max platform: linux/amd64 provenance: false + build-args: | + RUNTIME_ENVIRONMENT=${{ steps.set_runtime_environment.outputs.runtime_environment }} - name: Deploy to AWS Lambda via Serverless env: @@ -112,4 +114,4 @@ jobs: STACK_NAME: ${{ steps.set_stack_name.outputs.stack_name }} run: | # Deploy to AWS Lambda via Serverless - sls deploy --config serverless.yml --stage ${{ steps.set_runtime_environment.outputs.runtime_environment }} --verbose + sls deploy --config deployment/serverless.yml --stage ${{ steps.set_runtime_environment.outputs.runtime_environment }} --verbose From a6f9954125201cc45d2c2efd7bc91e57a5045f94 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 11:52:24 +0100 Subject: [PATCH 03/16] added environment variables to docker --- deployment/Dockerfile.prediction.lambda | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda index 
8a759fd..a011d55 100644 --- a/deployment/Dockerfile.prediction.lambda +++ b/deployment/Dockerfile.prediction.lambda @@ -4,6 +4,10 @@ FROM public.ecr.aws/lambda/python:3.10 WORKDIR ${LAMBDA_TASK_ROOT} ENV PYTHONPATH "${PYTHONPATH}:${LAMBDA_TASK_ROOT}" +# Environment variables +ARG RUNTIME_ENVIRONMENT +ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} + # Install necessary build tools - required to test locally RUN yum install -y gcc python3-devel @@ -16,6 +20,6 @@ COPY modules/ml-pipeline/src/pipeline ./pipeline # Copy the handler COPY deployment/handlers/prediction_app.py prediction_app.py # Get the model -# RUN dev pull -r ${RUNTIME_ENVIRONMENT} +# RUN dvc pull -r ${RUNTIME_ENVIRONMENT} CMD [ "prediction_handler.handler" ] From a3bd1967b6ad6a8deecdf6f0a686a399730e1f07 Mon Sep 17 00:00:00 2001 From: Github-Bot Date: Tue, 3 Oct 2023 10:54:48 +0000 Subject: [PATCH 04/16] Update Registry --- MODEL_REGISTRY.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md index d924d90..f4bd4d7 100644 --- a/MODEL_REGISTRY.md +++ b/MODEL_REGISTRY.md @@ -6,5 +6,13 @@ }, "registered": true, "active": true + }, + "migrate": { + "version": null, + "stage": { + "dev": "f320b9e0e9f3ea7735aed1abee07b1fb498c39c3" + }, + "registered": true, + "active": true } } From 21645968ad12256916cad21e76c2234cf090c6fa Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 12:03:07 +0100 Subject: [PATCH 05/16] Setting aws credentials --- .github/workflows/Deploy.yml | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 67d8cb6..52cce89 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -22,22 +22,6 @@ jobs: npm install -g serverless npm install -g serverless-domain-manager - - name: AWS credentials for dev - if: github.ref == 'refs/heads/dev' - uses: aws-actions/configure-aws-credentials@v1 - with: - 
aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} - aws-region: eu-west-2 - - - name: AWS credentials for prod - if: github.ref == 'refs/heads/prod' - uses: aws-actions/configure-aws-credentials@v1 - with: - aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }} - aws-region: eu-west-2 - # Set up all of the secrets required for the deployment - name: set secret prefix which is used across multiple steps id: secret_prefix @@ -79,6 +63,21 @@ jobs: runtime_environment=$(echo "${{ github.ref_name }}" | awk -F'-' '{print $NF}') echo "::set-output name=runtime_environment::$runtime_environment" + - name: AWS credentials for dev + if: ${{ steps.set_runtime_environment.outputs.runtime_environment }} == 'dev' + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + aws-region: eu-west-2 + + - name: AWS credentials for prod + if: ${{ steps.set_runtime_environment.outputs.runtime_environment }} == 'prod' + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.PROD_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }} + aws-region: eu-west-2 - name: Setup Docker uses: docker/setup-buildx-action@v1 From c4d1d074b517973a7a58151d17ba0e1e1d4a4dd8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 12:12:36 +0100 Subject: [PATCH 06/16] getting docker context to the root --- .github/workflows/Deploy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 52cce89..21888e8 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -90,13 +90,13 @@ jobs: - name: Build and push Docker image uses: docker/build-push-action@v3 with: - 
context: ./modules/ml-pipeline/src/pipeline + context: . file: ./deployment/Dockerfile.prediction.lambda push: true tags: ${{ steps.set_ecr_credentials.outputs.ecr_uri }}:${{ github.sha }} cache-from: type=gha cache-to: type=gha,mode=max - platform: linux/amd64 + platforms: linux/amd64 provenance: false build-args: | RUNTIME_ENVIRONMENT=${{ steps.set_runtime_environment.outputs.runtime_environment }} From 749e824a9d8414157cc5b26e47a646482d348fd6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 12:48:00 +0100 Subject: [PATCH 07/16] fixed docker file and added instructions --- README.md | 42 +++++++++++++++++++++++++ deployment/Dockerfile.prediction.lambda | 4 +-- deployment/handlers/prediction_app.py | 2 -- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 638520a..5879c41 100644 --- a/README.md +++ b/README.md @@ -40,3 +40,45 @@ For example, for the branch sap_change-dev, the prefix=SAP_CHANGE_DEV, and the f - {prefix}_DATA_BUCKET, is the name of the s3 data bucket where data to be scored by the model is stored - {prefix}_MODEL_BUCKET, is the name of the s3 bucket where the model is stored - {prefix}_PREDICTIONS_BUCKET, is the name of the s3 bucket where the predictions are stored + + +# Building and Testing the Prediction Lambda Function Locally +TODO: Generalise these instructions for the various different pipelines + +This guide outlines the steps to build and test the Lambda function locally using Docker. These instructions assume you're working with a machine that has Docker installed. + +### Prerequisites +Docker: Make sure Docker is installed and running on your machine. +AWS Credentials: Ensure you have AWS credentials set up on your local machine, typically stored +in ~/.aws/credentials. +Root Directory: All commands should be run from the root directory of the repository. +Step-by-Step Guide +1. Building the Docker Image +First, navigate to the root directory of the repository. 
Open a terminal and execute the following +command to build the Docker image: + +```bash +docker build -t sap_change -f deployment/Dockerfile.prediction.lambda . +``` + +This will build a Docker image tagged as sap_change using the Dockerfile.prediction.lambda located +in the deployment directory. + +2. Running the Docker Image +Once the image is built, you can run it using the following command: + +```bash +docker run -p 9000:8080 -v ~/.aws/credentials:/root/.aws/credentials:ro -e RUNTIME_ENVIRONMENT=dev sap_change +``` +This command does the following: + +Maps port 9000 on your local machine to port 8080 on the Docker container. +Mounts your AWS credentials into the Docker container in read-only mode. +Sets the RUNTIME_ENVIRONMENT variable to dev. +3. Testing the Lambda Function +To test the Lambda function, use the following curl command: + +```bash +curl -XPOST "http://localhost:9000/2015-03-31/functions/function/invocations" -d '{"body": "{\"file_location\": \"s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data_with_id.parquet\", \"property_id\": 1, \"portfolio_id\": 4, \"created_at\": \"now\"}"' +``` +This will send a POST request to the running Lambda function and pass in the required data as JSON. 
diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda index a011d55..b4f2323 100644 --- a/deployment/Dockerfile.prediction.lambda +++ b/deployment/Dockerfile.prediction.lambda @@ -12,7 +12,7 @@ ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} RUN yum install -y gcc python3-devel # Install python packages -COPY modules/ml-pipeline/src/requirements/predictions/requirements.txt ./requirements.txt +COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt RUN pip install --no-cache-dir -r ./requirements.txt # Copy the project code @@ -22,4 +22,4 @@ COPY deployment/handlers/prediction_app.py prediction_app.py # Get the model # RUN dvc pull -r ${RUNTIME_ENVIRONMENT} -CMD [ "prediction_handler.handler" ] +CMD [ "prediction_app.handler" ] diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index da73742..31b5139 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -57,8 +57,6 @@ def handler(event, context): else event["body"] ) - logger.info("Inside handler with body: " + str(body)) - data_path = body["file_location"] property_id = body["property_id"] portfolio_id = body["portfolio_id"] From fd111146745b7499bd2ab42597be1282e185362c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 14:03:50 +0100 Subject: [PATCH 08/16] changing to the deployment directory for sls deploy --- .github/workflows/Deploy.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 21888e8..330855c 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -113,4 +113,5 @@ jobs: STACK_NAME: ${{ steps.set_stack_name.outputs.stack_name }} run: | # Deploy to AWS Lambda via Serverless - sls deploy --config deployment/serverless.yml --stage ${{ steps.set_runtime_environment.outputs.runtime_environment }} --verbose + cd 
deployment + sls deploy --config serverless.yml --stage ${{ steps.set_runtime_environment.outputs.runtime_environment }} --verbose From 9501130419568aa35a7a804779501f521b182e5a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 16:30:44 +0100 Subject: [PATCH 09/16] Trying dvc pull in github actions and copying into docker --- .github/workflows/Deploy.yml | 9 +++++++++ deployment/Dockerfile.prediction.lambda | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 330855c..5f42eae 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -22,6 +22,10 @@ jobs: npm install -g serverless npm install -g serverless-domain-manager + - name: Install DVC + run: | + pip install dvc + # Set up all of the secrets required for the deployment - name: set secret prefix which is used across multiple steps id: secret_prefix @@ -79,6 +83,11 @@ jobs: aws-secret-access-key: ${{ secrets.PROD_AWS_SECRET_ACCESS_KEY }} aws-region: eu-west-2 + - name: DVC Pull + run: | + cd modules/ml-pipeline/src/pipeline + dvc pull -r ${{ steps.set_runtime_environment.outputs.runtime_environment }} + - name: Setup Docker uses: docker/setup-buildx-action@v1 diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda index b4f2323..35fc756 100644 --- a/deployment/Dockerfile.prediction.lambda +++ b/deployment/Dockerfile.prediction.lambda @@ -14,12 +14,12 @@ RUN yum install -y gcc python3-devel # Install python packages COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt RUN pip install --no-cache-dir -r ./requirements.txt +RUN pip install --no-cache-dir -r ./requirements-vc.txt # Copy the project code COPY modules/ml-pipeline/src/pipeline ./pipeline # Copy the handler COPY deployment/handlers/prediction_app.py prediction_app.py -# Get the model -# RUN dvc pull -r ${RUNTIME_ENVIRONMENT} + CMD [ 
"prediction_app.handler" ] From 5960ebbf22729c8d96550ff9fedfccc790e6c3b2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 16:31:40 +0100 Subject: [PATCH 10/16] remove install of version control requirements --- deployment/Dockerfile.prediction.lambda | 1 - 1 file changed, 1 deletion(-) diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda index 35fc756..58c3a88 100644 --- a/deployment/Dockerfile.prediction.lambda +++ b/deployment/Dockerfile.prediction.lambda @@ -14,7 +14,6 @@ RUN yum install -y gcc python3-devel # Install python packages COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt RUN pip install --no-cache-dir -r ./requirements.txt -RUN pip install --no-cache-dir -r ./requirements-vc.txt # Copy the project code COPY modules/ml-pipeline/src/pipeline ./pipeline From 1400e6843c9e11264e0d358b144a3dd6018dd178 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 16:35:05 +0100 Subject: [PATCH 11/16] install vc requirements --- .github/workflows/Deploy.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 5f42eae..9ec57af 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -24,7 +24,8 @@ jobs: - name: Install DVC run: | - pip install dvc + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt # Set up all of the secrets required for the deployment - name: set secret prefix which is used across multiple steps From fd076055025a4f43e1bf6a46948f303662a1caae Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 17:08:48 +0100 Subject: [PATCH 12/16] Got deployment working --- deployment/Dockerfile.prediction.lambda | 3 +- deployment/handlers/prediction_app.py | 45 +++++++++++--- modules/ml-pipeline/.gitignore | 1 + 
.../src/pipeline/3_generate_predictions.py | 59 +------------------ .../ml-pipeline/src/pipeline/data/.gitignore | 3 - .../src/pipeline/generate_predictions.py | 57 ++++++++++++++++++ 6 files changed, 98 insertions(+), 70 deletions(-) delete mode 100644 modules/ml-pipeline/src/pipeline/data/.gitignore create mode 100644 modules/ml-pipeline/src/pipeline/generate_predictions.py diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda index 58c3a88..a2520ba 100644 --- a/deployment/Dockerfile.prediction.lambda +++ b/deployment/Dockerfile.prediction.lambda @@ -18,7 +18,8 @@ RUN pip install --no-cache-dir -r ./requirements.txt # Copy the project code COPY modules/ml-pipeline/src/pipeline ./pipeline # Copy the handler -COPY deployment/handlers/prediction_app.py prediction_app.py +COPY deployment/handlers/prediction_app.py ./pipeline/prediction_app.py +WORKDIR ${LAMBDA_TASK_ROOT}/pipeline CMD [ "prediction_app.handler" ] diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 31b5139..fb64b83 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -9,12 +9,14 @@ import json from io import StringIO import os import logging +from generate_predictions import generate_predictions +from core.MLModels import model_factory +from config import settings +from core.DataClient import dataclient_factory logger = logging.getLogger() logger.setLevel(logging.INFO) -RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "dev") - def upload_dataframe_to_s3(df, bucket, s3_file_name): """ @@ -57,7 +59,6 @@ def handler(event, context): else event["body"] ) - data_path = body["file_location"] property_id = body["property_id"] portfolio_id = body["portfolio_id"] created_at = body["created_at"] @@ -66,11 +67,39 @@ def handler(event, context): storage_filepath = f"{portfolio_id}/{property_id}/{created_at}.csv" - # upload_dataframe_to_s3( - # df=outputs, - # 
bucket=f"retrofit-sap-predictions-{RUNTIME_ENVIRONMENT}", - # s3_file_name=storage_filepath - # ) + logger.info("-------------------------") + logger.info(f"--- Initiate MLModel ---") + logger.info("-------------------------") + + build_model_params = settings.build_model + client_params = settings.client + feature_process_params = settings.feature_processor + generate_predictions_params = settings.generate_predictions + + model = model_factory(build_model_params["model_type"]) + + input_dataclient = dataclient_factory( + dataclient_type="aws-s3", + dataclient_config=client_params["aws-s3"], + ) + + output_dataclient = dataclient_factory( + dataclient_type="aws-s3", + dataclient_config=client_params["aws-s3"], + ) + + generate_predictions( + input_dataclient=input_dataclient, + output_dataclient=output_dataclient, + model=model, + target=feature_process_params["feature_processor_config"]["target"], + model_filepath=build_model_params["model_save_filepath"], + test_data_filepath=body["file_location"], + predictions_output_filepath=storage_filepath, + predictions_column_name=generate_predictions_params[ + "predictions_column_name" + ], + ) return { "statusCode": 200, diff --git a/modules/ml-pipeline/.gitignore b/modules/ml-pipeline/.gitignore index 664bc8d..435bf5b 100644 --- a/modules/ml-pipeline/.gitignore +++ b/modules/ml-pipeline/.gitignore @@ -3,3 +3,4 @@ __pycache__/ .DS_Store .vscode/ +data/ diff --git a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py index f977d9a..9461392 100644 --- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py @@ -4,16 +4,11 @@ After the model is built, we can evaluate its performance """ import os -import yaml -import pandas as pd -from pathlib import Path -from core.interface.InterfaceModels import MLModel -from core.interface.InterfaceDataClient import DataClient from core.DataClient 
import dataclient_factory from core.MLModels import model_factory from core.Logger import logger -from configs.post_prediction_logic import post_prediction_logic from config import settings +from generate_predictions import generate_predictions logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") @@ -62,58 +57,6 @@ output_dataclient = dataclient_factory( ) -def generate_predictions( - input_dataclient: DataClient, - output_dataclient: DataClient, - model: MLModel, - target: str, - model_filepath: str, - test_data_filepath: str, - predictions_output_filepath: str, - predictions_column_name: str, -): - """ - For a given model, we generate prediction and evaluate this against the true target - """ - - logger.info("-------------------------") - logger.info("--- Loading test data ---") - logger.info("-------------------------") - - test_data = input_dataclient.load_data( - location=test_data_filepath, load_config=None - ) - - logger.info("---------------------") - logger.info("--- Loading model ---") - logger.info("---------------------") - - model.load_model(model_filepath) - - logger.info("------------------------------") - logger.info("--- Generating predictions ---") - logger.info("------------------------------") - - prediction_data = ( - test_data.drop(columns=target) if target in test_data.columns else test_data - ) - - predictions = model.predict( - data=prediction_data, post_prediction_logic=post_prediction_logic - ) - - logger.info("--------------------------") - logger.info("--- Saving predictions ---") - logger.info("--------------------------") - - predictions_df = pd.DataFrame(predictions) - predictions_df.columns = [predictions_column_name] - - output_dataclient.save_data( - obj=predictions_df, location=predictions_output_filepath, save_config=None - ) - - if __name__ == "__main__": logger.info("----------------------------") diff --git a/modules/ml-pipeline/src/pipeline/data/.gitignore 
b/modules/ml-pipeline/src/pipeline/data/.gitignore deleted file mode 100644 index 7c8e294..0000000 --- a/modules/ml-pipeline/src/pipeline/data/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/prepared_data -/model -/predictions diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py new file mode 100644 index 0000000..85b3022 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -0,0 +1,57 @@ +import pandas as pd +from configs.post_prediction_logic import post_prediction_logic +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from core.Logger import logger + + +def generate_predictions( + input_dataclient: DataClient, + output_dataclient: DataClient, + model: MLModel, + target: str, + model_filepath: str, + test_data_filepath: str, + predictions_output_filepath: str, + predictions_column_name: str, +): + """ + For a given model, we generate prediction and evaluate this against the true target + """ + + logger.info("-------------------------") + logger.info("--- Loading test data ---") + logger.info("-------------------------") + + test_data = input_dataclient.load_data( + location=test_data_filepath, load_config=None + ) + + logger.info("---------------------") + logger.info("--- Loading model ---") + logger.info("---------------------") + + model.load_model(model_filepath) + + logger.info("------------------------------") + logger.info("--- Generating predictions ---") + logger.info("------------------------------") + + prediction_data = ( + test_data.drop(columns=target) if target in test_data.columns else test_data + ) + + predictions = model.predict( + data=prediction_data, post_prediction_logic=post_prediction_logic + ) + + logger.info("--------------------------") + logger.info("--- Saving predictions ---") + logger.info("--------------------------") + + predictions_df = pd.DataFrame(predictions) + 
predictions_df.columns = [predictions_column_name] + + output_dataclient.save_data( + obj=predictions_df, location=predictions_output_filepath, save_config=None + ) From 57ed666ea729b36e8c102ce75693ee6af5da97ab Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 17:38:56 +0100 Subject: [PATCH 13/16] removed redundant bucket and fixed storage filepath --- .github/workflows/Deploy.yml | 1 - deployment/handlers/prediction_app.py | 12 +++++++++++- deployment/serverless.yml | 1 - 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 9ec57af..0f1003f 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -114,7 +114,6 @@ jobs: - name: Deploy to AWS Lambda via Serverless env: RUNTIME_ENVIRONMENT: ${{ steps.set_runtime_environment.outputs.runtime_environment }} - MODEL_DIRECTORY_BUCKET: ${{ steps.s3_buckets.outputs.model_directory_bucket }} PREDICTIONS_BUCKET: ${{ steps.s3_buckets.outputs.predictions_bucket }} DATA_BUCKET: ${{ steps.s3_buckets.outputs.data_bucket }} DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }} diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index fb64b83..969439c 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -17,6 +17,8 @@ from core.DataClient import dataclient_factory logger = logging.getLogger() logger.setLevel(logging.INFO) +PREDICTIONS_BUCKET = os.getenv("PREDICTIONS_BUCKET", None) + def upload_dataframe_to_s3(df, bucket, s3_file_name): """ @@ -65,7 +67,9 @@ def handler(event, context): # TODO: Implement the loading of the model and prediction - storage_filepath = f"{portfolio_id}/{property_id}/{created_at}.csv" + storage_filepath = ( + f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.csv" + ) logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---") @@ -78,11 +82,17 @@ def 
handler(event, context): model = model_factory(build_model_params["model_type"]) + logger.info("----------------------------") + logger.info(f"--- Initiate Input DataClient ---") + logger.info("----------------------------") input_dataclient = dataclient_factory( dataclient_type="aws-s3", dataclient_config=client_params["aws-s3"], ) + logger.info("----------------------------") + logger.info(f"--- Initiate Output DataClient ---") + logger.info("----------------------------") output_dataclient = dataclient_factory( dataclient_type="aws-s3", dataclient_config=client_params["aws-s3"], diff --git a/deployment/serverless.yml b/deployment/serverless.yml index e546f13..7116a42 100644 --- a/deployment/serverless.yml +++ b/deployment/serverless.yml @@ -6,7 +6,6 @@ provider: architecture: x86_64 environment: RUNTIME_ENVIRONMENT: ${env:RUNTIME_ENVIRONMENT} - MODEL_DIRECTORY_BUCKET: ${env:MODEL_DIRECTORY_BUCKET} PREDICTIONS_BUCKET: ${env:PREDICTIONS_BUCKET} DATA_BUCKET: ${env:DATA_BUCKET} DOMAIN_NAME: ${env:DOMAIN_NAME} From b21a221f3b312a8d3b082cd4bc603b7d79870925 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 17:46:04 +0100 Subject: [PATCH 14/16] remove model bucket from serverless --- .github/workflows/Deploy.yml | 1 - deployment/serverless.yml | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 0f1003f..4b063ec 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -50,7 +50,6 @@ jobs: # Fetch the secret using the secret prefix echo "::set-output name=data_bucket::${{ secrets[format('{0}_DATA_BUCKET', steps.secret_prefix.outputs.secret_prefix)] }}" echo "::set-output name=predictions_bucket::${{ secrets[format('{0}_PREDICTIONS_BUCKET', steps.secret_prefix.outputs.secret_prefix)] }}" - echo "::set-output name=model_directory_bucket::${{ secrets[format('{0}_MODEL_DIRECTORY_BUCKET', steps.secret_prefix.outputs.secret_prefix)] }}" - name: Set 
stack_name id: set_stack_name diff --git a/deployment/serverless.yml b/deployment/serverless.yml index 7116a42..b23158d 100644 --- a/deployment/serverless.yml +++ b/deployment/serverless.yml @@ -15,13 +15,11 @@ provider: role: name: ${env:STACK_NAME}_s3_access statements: - # Allow reading from MODEL_DIRECTORY_BUCKET and DATA_BUCKET + # Allow reading from the DATA_BUCKET - Effect: Allow Action: - s3:* Resource: - - arn:aws:s3:::${env:MODEL_DIRECTORY_BUCKET} - - arn:aws:s3:::${env:MODEL_DIRECTORY_BUCKET}/* - arn:aws:s3:::${env:DATA_BUCKET} - arn:aws:s3:::${env:DATA_BUCKET}/* # Allow reading and writing to PREDICTIONS_BUCKET From e4352bda1ea8f3b1fb4a96cece575df68400bae4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 18:05:21 +0100 Subject: [PATCH 15/16] corrected reference to s3 bucekts --- .github/workflows/Deploy.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/Deploy.yml b/.github/workflows/Deploy.yml index 4b063ec..cc5c716 100644 --- a/.github/workflows/Deploy.yml +++ b/.github/workflows/Deploy.yml @@ -113,8 +113,8 @@ jobs: - name: Deploy to AWS Lambda via Serverless env: RUNTIME_ENVIRONMENT: ${{ steps.set_runtime_environment.outputs.runtime_environment }} - PREDICTIONS_BUCKET: ${{ steps.s3_buckets.outputs.predictions_bucket }} - DATA_BUCKET: ${{ steps.s3_buckets.outputs.data_bucket }} + PREDICTIONS_BUCKET: ${{ steps.set_s3_buckets.outputs.predictions_bucket }} + DATA_BUCKET: ${{ steps.set_s3_buckets.outputs.data_bucket }} DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }} ECR_URI: ${{ steps.set_ecr_credentials.outputs.ecr_uri }} GITHUB_SHA: ${{ github.sha }} From 5e62b2d43ed29df8050486296b7dcd2466eae632 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 18:23:51 +0100 Subject: [PATCH 16/16] updated save filetype to parquet --- deployment/handlers/prediction_app.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 969439c..6f9d162 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -67,9 +67,7 @@ def handler(event, context): # TODO: Implement the loading of the model and prediction - storage_filepath = ( - f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.csv" - ) + storage_filepath = f"s3://{PREDICTIONS_BUCKET}/{portfolio_id}/{property_id}/{created_at}.parquet" logger.info("-------------------------") logger.info(f"--- Initiate MLModel ---")