From fd076055025a4f43e1bf6a46948f303662a1caae Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 3 Oct 2023 17:08:48 +0100
Subject: [PATCH] Got prediction Lambda deployment working

---
 deployment/Dockerfile.prediction.lambda       |  3 +-
 deployment/handlers/prediction_app.py         | 45 +++++++++++---
 modules/ml-pipeline/.gitignore                |  1 +
 .../src/pipeline/3_generate_predictions.py    | 59 +------------------
 .../ml-pipeline/src/pipeline/data/.gitignore  |  3 -
 .../src/pipeline/generate_predictions.py      | 57 ++++++++++++++++++
 6 files changed, 98 insertions(+), 70 deletions(-)
 delete mode 100644 modules/ml-pipeline/src/pipeline/data/.gitignore
 create mode 100644 modules/ml-pipeline/src/pipeline/generate_predictions.py

diff --git a/deployment/Dockerfile.prediction.lambda b/deployment/Dockerfile.prediction.lambda
index 58c3a88..a2520ba 100644
--- a/deployment/Dockerfile.prediction.lambda
+++ b/deployment/Dockerfile.prediction.lambda
@@ -18,7 +18,8 @@ RUN pip install --no-cache-dir -r ./requirements.txt
 # Copy the project code
 COPY modules/ml-pipeline/src/pipeline ./pipeline
 # Copy the handler
-COPY deployment/handlers/prediction_app.py prediction_app.py
+COPY deployment/handlers/prediction_app.py ./pipeline/prediction_app.py
+WORKDIR ${LAMBDA_TASK_ROOT}/pipeline
 
 
 CMD [ "prediction_app.handler" ]
diff --git a/deployment/handlers/prediction_app.py b/deployment/handlers/prediction_app.py
index 31b5139..fb64b83 100644
--- a/deployment/handlers/prediction_app.py
+++ b/deployment/handlers/prediction_app.py
@@ -9,12 +9,14 @@ import json
 from io import StringIO
 import os
 import logging
+from generate_predictions import generate_predictions
+from core.MLModels import model_factory
+from config import settings
+from core.DataClient import dataclient_factory
 
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
-RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "dev")
-
 
 def upload_dataframe_to_s3(df, bucket, s3_file_name):
     """
@@ -57,7 +59,6 @@ def handler(event, context):
             else event["body"]
         )
 
-        data_path = body["file_location"]
         property_id = body["property_id"]
         portfolio_id = body["portfolio_id"]
         created_at = body["created_at"]
@@ -66,11 +67,39 @@ def handler(event, context):
 
         storage_filepath = f"{portfolio_id}/{property_id}/{created_at}.csv"
 
-        # upload_dataframe_to_s3(
-        #     df=outputs,
-        #     bucket=f"retrofit-sap-predictions-{RUNTIME_ENVIRONMENT}",
-        #     s3_file_name=storage_filepath
-        # )
+        logger.info("-------------------------")
+        logger.info(f"--- Initiate MLModel ---")
+        logger.info("-------------------------")
+
+        build_model_params = settings.build_model
+        client_params = settings.client
+        feature_process_params = settings.feature_processor
+        generate_predictions_params = settings.generate_predictions
+
+        model = model_factory(build_model_params["model_type"])
+
+        input_dataclient = dataclient_factory(
+            dataclient_type="aws-s3",
+            dataclient_config=client_params["aws-s3"],
+        )
+
+        output_dataclient = dataclient_factory(
+            dataclient_type="aws-s3",
+            dataclient_config=client_params["aws-s3"],
+        )
+
+        generate_predictions(
+            input_dataclient=input_dataclient,
+            output_dataclient=output_dataclient,
+            model=model,
+            target=feature_process_params["feature_processor_config"]["target"],
+            model_filepath=build_model_params["model_save_filepath"],
+            test_data_filepath=body["file_location"],
+            predictions_output_filepath=storage_filepath,
+            predictions_column_name=generate_predictions_params[
+                "predictions_column_name"
+            ],
+        )
 
         return {
             "statusCode": 200,
diff --git a/modules/ml-pipeline/.gitignore b/modules/ml-pipeline/.gitignore
index 664bc8d..435bf5b 100644
--- a/modules/ml-pipeline/.gitignore
+++ b/modules/ml-pipeline/.gitignore
@@ -3,3 +3,4 @@
 __pycache__/
 .DS_Store
 .vscode/
+data/
diff --git a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py
index f977d9a..9461392 100644
--- a/modules/ml-pipeline/src/pipeline/3_generate_predictions.py
+++ b/modules/ml-pipeline/src/pipeline/3_generate_predictions.py
@@ -4,16 +4,11 @@ After the model is built, we can evaluate its performance
 """
 
 import os
-import yaml
-import pandas as pd
-from pathlib import Path
-from core.interface.InterfaceModels import MLModel
-from core.interface.InterfaceDataClient import DataClient
 from core.DataClient import dataclient_factory
 from core.MLModels import model_factory
 from core.Logger import logger
-from configs.post_prediction_logic import post_prediction_logic
 from config import settings
+from generate_predictions import generate_predictions
 
 logger.info("----------------------------")
 logger.info(f"--- Initiate Parameters ---")
@@ -62,58 +57,6 @@ output_dataclient = dataclient_factory(
 )
 
 
-def generate_predictions(
-    input_dataclient: DataClient,
-    output_dataclient: DataClient,
-    model: MLModel,
-    target: str,
-    model_filepath: str,
-    test_data_filepath: str,
-    predictions_output_filepath: str,
-    predictions_column_name: str,
-):
-    """
-    For a given model, we generate prediction and evaluate this against the true target
-    """
-
-    logger.info("-------------------------")
-    logger.info("--- Loading test data ---")
-    logger.info("-------------------------")
-
-    test_data = input_dataclient.load_data(
-        location=test_data_filepath, load_config=None
-    )
-
-    logger.info("---------------------")
-    logger.info("--- Loading model ---")
-    logger.info("---------------------")
-
-    model.load_model(model_filepath)
-
-    logger.info("------------------------------")
-    logger.info("--- Generating predictions ---")
-    logger.info("------------------------------")
-
-    prediction_data = (
-        test_data.drop(columns=target) if target in test_data.columns else test_data
-    )
-
-    predictions = model.predict(
-        data=prediction_data, post_prediction_logic=post_prediction_logic
-    )
-
-    logger.info("--------------------------")
-    logger.info("--- Saving predictions ---")
-    logger.info("--------------------------")
-
-    predictions_df = pd.DataFrame(predictions)
-    predictions_df.columns = [predictions_column_name]
-
-    output_dataclient.save_data(
-        obj=predictions_df, location=predictions_output_filepath, save_config=None
-    )
-
-
 if __name__ == "__main__":
 
     logger.info("----------------------------")
diff --git a/modules/ml-pipeline/src/pipeline/data/.gitignore b/modules/ml-pipeline/src/pipeline/data/.gitignore
deleted file mode 100644
index 7c8e294..0000000
--- a/modules/ml-pipeline/src/pipeline/data/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-/prepared_data
-/model
-/predictions
diff --git a/modules/ml-pipeline/src/pipeline/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py
new file mode 100644
index 0000000..85b3022
--- /dev/null
+++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py
@@ -0,0 +1,57 @@
+import pandas as pd
+from configs.post_prediction_logic import post_prediction_logic
+from core.interface.InterfaceModels import MLModel
+from core.interface.InterfaceDataClient import DataClient
+from core.Logger import logger
+
+
+def generate_predictions(
+    input_dataclient: DataClient,
+    output_dataclient: DataClient,
+    model: MLModel,
+    target: str,
+    model_filepath: str,
+    test_data_filepath: str,
+    predictions_output_filepath: str,
+    predictions_column_name: str,
+):
+    """
+    Load the test data and the saved model, generate predictions (without the target column) and save them
+    """
+
+    logger.info("-------------------------")
+    logger.info("--- Loading test data ---")
+    logger.info("-------------------------")
+
+    test_data = input_dataclient.load_data(
+        location=test_data_filepath, load_config=None
+    )
+
+    logger.info("---------------------")
+    logger.info("--- Loading model ---")
+    logger.info("---------------------")
+
+    model.load_model(model_filepath)
+
+    logger.info("------------------------------")
+    logger.info("--- Generating predictions ---")
+    logger.info("------------------------------")
+
+    prediction_data = (
+        test_data.drop(columns=target) if target in test_data.columns else test_data
+    )
+
+    predictions = model.predict(
+        data=prediction_data, post_prediction_logic=post_prediction_logic
+    )
+
+    logger.info("--------------------------")
+    logger.info("--- Saving predictions ---")
+    logger.info("--------------------------")
+
+    predictions_df = pd.DataFrame(predictions)
+    predictions_df.columns = [predictions_column_name]
+
+    output_dataclient.save_data(
+        obj=predictions_df, location=predictions_output_filepath, save_config=None
+    )