From fd076055025a4f43e1bf6a46948f303662a1caae Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Oct 2023 17:08:48 +0100 Subject: [PATCH] Got deployment working --- deployment/Dockerfile.prediction.lambda | 3 +- deployment/handlers/prediction_app.py | 45 +++++++++++--- modules/ml-pipeline/.gitignore | 1 + .../src/pipeline/3_generate_predictions.py | 59 +------------------ .../ml-pipeline/src/pipeline/data/.gitignore | 3 - .../src/pipeline/generate_predictions.py | 57 ++++++++++++++++++ 6 files changed, 98 insertions(+), 70 deletions(-) delete mode 100644 modules/ml-pipeline/src/pipeline/data/.gitignore create mode 100644 modules/ml-pipeline/src/pipeline/generate_predictions.py diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda index 58c3a88..a2520ba 100644 --- a/deployment/Dockerfile.prediction.lambda +++ b/deployment/Dockerfile.prediction.lambda @@ -18,7 +18,8 @@ RUN pip install --no-cache-dir -r ./requirements.txt # Copy the project code COPY modules/ml-pipeline/src/pipeline ./pipeline # Copy the handler -COPY deployment/handlers/prediction_app.py prediction_app.py +COPY deployment/handlers/prediction_app.py ./pipeline/prediction_app.py +WORKDIR ${LAMBDA_TASK_ROOT}/pipeline CMD [ "prediction_app.handler" ] diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py index 31b5139..fb64b83 100644 --- a/deployment/handlers/prediction_app.py +++ b/deployment/handlers/prediction_app.py @@ -9,12 +9,14 @@ import json from io import StringIO import os import logging +from generate_predictions import generate_predictions +from core.MLModels import model_factory +from config import settings +from core.DataClient import dataclient_factory logger = logging.getLogger() logger.setLevel(logging.INFO) -RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "dev") - def upload_dataframe_to_s3(df, bucket, s3_file_name): """ @@ -57,7 +59,6 @@ def handler(event, context): else event["body"] ) - data_path = body["file_location"] property_id = body["property_id"] portfolio_id = body["portfolio_id"] created_at = body["created_at"] @@ -66,11 +67,39 @@ def handler(event, context): storage_filepath = f"{portfolio_id}/{property_id}/{created_at}.csv" - # upload_dataframe_to_s3( - # df=outputs, - # bucket=f"retrofit-sap-predictions-{RUNTIME_ENVIRONMENT}", - # s3_file_name=storage_filepath - # ) + logger.info("-------------------------") + logger.info(f"--- Initiate MLModel ---") + logger.info("-------------------------") + + build_model_params = settings.build_model + client_params = settings.client + feature_process_params = settings.feature_processor + generate_predictions_params = settings.generate_predictions + + model = model_factory(build_model_params["model_type"]) + + input_dataclient = dataclient_factory( + dataclient_type="aws-s3", + dataclient_config=client_params["aws-s3"], + ) + + output_dataclient = dataclient_factory( + dataclient_type="aws-s3", + dataclient_config=client_params["aws-s3"], + ) + + generate_predictions( + input_dataclient=input_dataclient, + output_dataclient=output_dataclient, + model=model, + target=feature_process_params["feature_processor_config"]["target"], + model_filepath=build_model_params["model_save_filepath"], + test_data_filepath=body["file_location"], + predictions_output_filepath=storage_filepath, + predictions_column_name=generate_predictions_params[ + "predictions_column_name" + ], + ) return { "statusCode": 200, diff --git a/modules/ml-pipeline/.gitignore b/modules/ml-pipeline/.gitignore index 664bc8d..435bf5b 100644 --- a/modules/ml-pipeline/.gitignore +++ b/modules/ml-pipeline/.gitignore @@ -3,3 +3,4 @@ __pycache__/ .DS_Store .vscode/ +data/ diff --git a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py index f977d9a..9461392 100644 --- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py @@ -4,16 +4,11 @@ After the model is built, we can evaluate its performance """ import os -import yaml -import pandas as pd -from pathlib import Path -from core.interface.InterfaceModels import MLModel -from core.interface.InterfaceDataClient import DataClient from core.DataClient import dataclient_factory from core.MLModels import model_factory from core.Logger import logger -from configs.post_prediction_logic import post_prediction_logic from config import settings +from generate_predictions import generate_predictions logger.info("----------------------------") logger.info(f"--- Initiate Parameters ---") @@ -62,58 +57,6 @@ output_dataclient = dataclient_factory( ) -def generate_predictions( - input_dataclient: DataClient, - output_dataclient: DataClient, - model: MLModel, - target: str, - model_filepath: str, - test_data_filepath: str, - predictions_output_filepath: str, - predictions_column_name: str, -): - """ - For a given model, we generate prediction and evaluate this against the true target - """ - - logger.info("-------------------------") - logger.info("--- Loading test data ---") - logger.info("-------------------------") - - test_data = input_dataclient.load_data( - location=test_data_filepath, load_config=None - ) - - logger.info("---------------------") - logger.info("--- Loading model ---") - logger.info("---------------------") - - model.load_model(model_filepath) - - logger.info("------------------------------") - logger.info("--- Generating predictions ---") - logger.info("------------------------------") - - prediction_data = ( - test_data.drop(columns=target) if target in test_data.columns else test_data - ) - - predictions = model.predict( - data=prediction_data, post_prediction_logic=post_prediction_logic - ) - - logger.info("--------------------------") - logger.info("--- Saving predictions ---") - logger.info("--------------------------") - - predictions_df = pd.DataFrame(predictions) - predictions_df.columns = [predictions_column_name] - - output_dataclient.save_data( - obj=predictions_df, location=predictions_output_filepath, save_config=None - ) - - if __name__ == "__main__": logger.info("----------------------------") diff --git a/modules/ml-pipeline/src/pipeline/data/.gitignore b/modules/ml-pipeline/src/pipeline/data/.gitignore deleted file mode 100644 index 7c8e294..0000000 --- a/modules/ml-pipeline/src/pipeline/data/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -/prepared_data -/model -/predictions diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py new file mode 100644 index 0000000..85b3022 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -0,0 +1,57 @@ +import pandas as pd +from configs.post_prediction_logic import post_prediction_logic +from core.interface.InterfaceModels import MLModel +from core.interface.InterfaceDataClient import DataClient +from core.Logger import logger + + +def generate_predictions( + input_dataclient: DataClient, + output_dataclient: DataClient, + model: MLModel, + target: str, + model_filepath: str, + test_data_filepath: str, + predictions_output_filepath: str, + predictions_column_name: str, +): + """ + For a given model, we generate prediction and evaluate this against the true target + """ + + logger.info("-------------------------") + logger.info("--- Loading test data ---") + logger.info("-------------------------") + + test_data = input_dataclient.load_data( + location=test_data_filepath, load_config=None + ) + + logger.info("---------------------") + logger.info("--- Loading model ---") + logger.info("---------------------") + + model.load_model(model_filepath) + + logger.info("------------------------------") + logger.info("--- Generating predictions ---") + logger.info("------------------------------") + + prediction_data = ( + test_data.drop(columns=target) if target in test_data.columns else test_data + ) + + predictions = model.predict( + data=prediction_data, post_prediction_logic=post_prediction_logic + ) + + logger.info("--------------------------") + logger.info("--- Saving predictions ---") + logger.info("--------------------------") + + predictions_df = pd.DataFrame(predictions) + predictions_df.columns = [predictions_column_name] + + output_dataclient.save_data( + obj=predictions_df, location=predictions_output_filepath, save_config=None + )