Merge pull request #54 from Hestia-Homes/master

Got deployment working
This commit is contained in:
KhalimCK 2023-10-03 17:09:36 +01:00 committed by GitHub
commit f8409ac63b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 98 additions and 70 deletions

View file

@ -18,7 +18,8 @@ RUN pip install --no-cache-dir -r ./requirements.txt
# Copy the project code
COPY modules/ml-pipeline/src/pipeline ./pipeline
# Copy the handler
COPY deployment/handlers/prediction_app.py prediction_app.py
COPY deployment/handlers/prediction_app.py ./pipeline/prediction_app.py
WORKDIR ${LAMBDA_TASK_ROOT}/pipeline
CMD [ "prediction_app.handler" ]

View file

@ -9,12 +9,14 @@ import json
from io import StringIO
import os
import logging
from generate_predictions import generate_predictions
from core.MLModels import model_factory
from config import settings
from core.DataClient import dataclient_factory
logger = logging.getLogger()
logger.setLevel(logging.INFO)
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "dev")
def upload_dataframe_to_s3(df, bucket, s3_file_name):
"""
@ -57,7 +59,6 @@ def handler(event, context):
else event["body"]
)
data_path = body["file_location"]
property_id = body["property_id"]
portfolio_id = body["portfolio_id"]
created_at = body["created_at"]
@ -66,11 +67,39 @@ def handler(event, context):
storage_filepath = f"{portfolio_id}/{property_id}/{created_at}.csv"
# upload_dataframe_to_s3(
# df=outputs,
# bucket=f"retrofit-sap-predictions-{RUNTIME_ENVIRONMENT}",
# s3_file_name=storage_filepath
# )
logger.info("-------------------------")
logger.info(f"--- Initiate MLModel ---")
logger.info("-------------------------")
build_model_params = settings.build_model
client_params = settings.client
feature_process_params = settings.feature_processor
generate_predictions_params = settings.generate_predictions
model = model_factory(build_model_params["model_type"])
input_dataclient = dataclient_factory(
dataclient_type="aws-s3",
dataclient_config=client_params["aws-s3"],
)
output_dataclient = dataclient_factory(
dataclient_type="aws-s3",
dataclient_config=client_params["aws-s3"],
)
generate_predictions(
input_dataclient=input_dataclient,
output_dataclient=output_dataclient,
model=model,
target=feature_process_params["feature_processor_config"]["target"],
model_filepath=build_model_params["model_save_filepath"],
test_data_filepath=body["file_location"],
predictions_output_filepath=storage_filepath,
predictions_column_name=generate_predictions_params[
"predictions_column_name"
],
)
return {
"statusCode": 200,

View file

@ -3,3 +3,4 @@
__pycache__/
.DS_Store
.vscode/
data/

View file

@ -4,16 +4,11 @@ After the model is built, we can evaluate its performance
"""
import os
import yaml
import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.Logger import logger
from configs.post_prediction_logic import post_prediction_logic
from config import settings
from generate_predictions import generate_predictions
logger.info("----------------------------")
logger.info(f"--- Initiate Parameters ---")
@ -62,58 +57,6 @@ output_dataclient = dataclient_factory(
)
def generate_predictions(
input_dataclient: DataClient,
output_dataclient: DataClient,
model: MLModel,
target: str,
model_filepath: str,
test_data_filepath: str,
predictions_output_filepath: str,
predictions_column_name: str,
):
"""
Load a dataset and a saved model, generate predictions and save them.

The data is read from `test_data_filepath` through `input_dataclient`, the
model state is restored from `model_filepath`, and the predictions are
written to `predictions_output_filepath` through `output_dataclient` as a
frame whose single column is named `predictions_column_name`.

No evaluation against the true target happens here: `target` is only used
to drop that column from the loaded data when it is present.
"""
logger.info("-------------------------")
logger.info("--- Loading test data ---")
logger.info("-------------------------")
test_data = input_dataclient.load_data(
location=test_data_filepath, load_config=None
)
logger.info("---------------------")
logger.info("--- Loading model ---")
logger.info("---------------------")
model.load_model(model_filepath)
logger.info("------------------------------")
logger.info("--- Generating predictions ---")
logger.info("------------------------------")
# The target column is optional in the input; drop it when present so it is
# not handed to the model alongside the features.
prediction_data = (
test_data.drop(columns=target) if target in test_data.columns else test_data
)
predictions = model.predict(
data=prediction_data, post_prediction_logic=post_prediction_logic
)
logger.info("--------------------------")
logger.info("--- Saving predictions ---")
logger.info("--------------------------")
predictions_df = pd.DataFrame(predictions)
# Assigning a one-element list requires the predictions frame to have
# exactly one column.
predictions_df.columns = [predictions_column_name]
output_dataclient.save_data(
obj=predictions_df, location=predictions_output_filepath, save_config=None
)
if __name__ == "__main__":
logger.info("----------------------------")

View file

@ -1,3 +0,0 @@
/prepared_data
/model
/predictions

View file

@ -0,0 +1,57 @@
import pandas as pd
from configs.post_prediction_logic import post_prediction_logic
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from core.Logger import logger
def generate_predictions(
    input_dataclient: DataClient,
    output_dataclient: DataClient,
    model: MLModel,
    target: str,
    model_filepath: str,
    test_data_filepath: str,
    predictions_output_filepath: str,
    predictions_column_name: str,
):
    """
    Score a dataset with a previously saved model and persist the output.

    Steps, in order:
      1. Load the data at `test_data_filepath` through `input_dataclient`.
      2. Restore the model from `model_filepath`.
      3. Drop the `target` column if the data contains it, then predict.
      4. Wrap the predictions in a single-column frame named
         `predictions_column_name` and save it to
         `predictions_output_filepath` through `output_dataclient`.

    The function does not compare predictions with the target; it returns
    nothing and its only outputs are the saved frame and the log lines.
    """
    for message in (
        "-------------------------",
        "--- Loading test data ---",
        "-------------------------",
    ):
        logger.info(message)
    raw_frame = input_dataclient.load_data(
        location=test_data_filepath,
        load_config=None,
    )

    for message in (
        "---------------------",
        "--- Loading model ---",
        "---------------------",
    ):
        logger.info(message)
    model.load_model(model_filepath)

    for message in (
        "------------------------------",
        "--- Generating predictions ---",
        "------------------------------",
    ):
        logger.info(message)
    # The target column is optional in the input; remove it only when present.
    features = raw_frame
    if target in raw_frame.columns:
        features = raw_frame.drop(columns=target)
    raw_predictions = model.predict(
        data=features,
        post_prediction_logic=post_prediction_logic,
    )

    for message in (
        "--------------------------",
        "--- Saving predictions ---",
        "--------------------------",
    ):
        logger.info(message)
    output_frame = pd.DataFrame(raw_predictions)
    # Rename after construction (not via the constructor) to keep the
    # original semantics: the frame must already have exactly one column.
    output_frame.columns = [predictions_column_name]
    output_dataclient.save_data(
        obj=output_frame,
        location=predictions_output_filepath,
        save_config=None,
    )