mirror of
https://github.com/Hestia-Homes/ML.git
synced 2026-06-08 11:17:25 +00:00
commit
f8409ac63b
6 changed files with 98 additions and 70 deletions
|
|
@ -18,7 +18,8 @@ RUN pip install --no-cache-dir -r ./requirements.txt
|
|||
# Copy the project code
|
||||
COPY modules/ml-pipeline/src/pipeline ./pipeline
|
||||
# Copy the handler
|
||||
COPY deployment/handlers/prediction_app.py prediction_app.py
|
||||
COPY deployment/handlers/prediction_app.py ./pipeline/prediction_app.py
|
||||
WORKDIR ${LAMBDA_TASK_ROOT}/pipeline
|
||||
|
||||
|
||||
CMD [ "prediction_app.handler" ]
|
||||
|
|
|
|||
|
|
@ -9,12 +9,14 @@ import json
|
|||
from io import StringIO
|
||||
import os
|
||||
import logging
|
||||
from generate_predictions import generate_predictions
|
||||
from core.MLModels import model_factory
|
||||
from config import settings
|
||||
from core.DataClient import dataclient_factory
|
||||
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "dev")
|
||||
|
||||
|
||||
def upload_dataframe_to_s3(df, bucket, s3_file_name):
|
||||
"""
|
||||
|
|
@ -57,7 +59,6 @@ def handler(event, context):
|
|||
else event["body"]
|
||||
)
|
||||
|
||||
data_path = body["file_location"]
|
||||
property_id = body["property_id"]
|
||||
portfolio_id = body["portfolio_id"]
|
||||
created_at = body["created_at"]
|
||||
|
|
@ -66,11 +67,39 @@ def handler(event, context):
|
|||
|
||||
storage_filepath = f"{portfolio_id}/{property_id}/{created_at}.csv"
|
||||
|
||||
# upload_dataframe_to_s3(
|
||||
# df=outputs,
|
||||
# bucket=f"retrofit-sap-predictions-{RUNTIME_ENVIRONMENT}",
|
||||
# s3_file_name=storage_filepath
|
||||
# )
|
||||
logger.info("-------------------------")
|
||||
logger.info(f"--- Initiate MLModel ---")
|
||||
logger.info("-------------------------")
|
||||
|
||||
build_model_params = settings.build_model
|
||||
client_params = settings.client
|
||||
feature_process_params = settings.feature_processor
|
||||
generate_predictions_params = settings.generate_predictions
|
||||
|
||||
model = model_factory(build_model_params["model_type"])
|
||||
|
||||
input_dataclient = dataclient_factory(
|
||||
dataclient_type="aws-s3",
|
||||
dataclient_config=client_params["aws-s3"],
|
||||
)
|
||||
|
||||
output_dataclient = dataclient_factory(
|
||||
dataclient_type="aws-s3",
|
||||
dataclient_config=client_params["aws-s3"],
|
||||
)
|
||||
|
||||
generate_predictions(
|
||||
input_dataclient=input_dataclient,
|
||||
output_dataclient=output_dataclient,
|
||||
model=model,
|
||||
target=feature_process_params["feature_processor_config"]["target"],
|
||||
model_filepath=build_model_params["model_save_filepath"],
|
||||
test_data_filepath=body["file_location"],
|
||||
predictions_output_filepath=storage_filepath,
|
||||
predictions_column_name=generate_predictions_params[
|
||||
"predictions_column_name"
|
||||
],
|
||||
)
|
||||
|
||||
return {
|
||||
"statusCode": 200,
|
||||
|
|
|
|||
1
modules/ml-pipeline/.gitignore
vendored
1
modules/ml-pipeline/.gitignore
vendored
|
|
@ -3,3 +3,4 @@
|
|||
__pycache__/
|
||||
.DS_Store
|
||||
.vscode/
|
||||
data/
|
||||
|
|
|
|||
|
|
@ -4,16 +4,11 @@ After the model is built, we can evaluate its performance
|
|||
"""
|
||||
|
||||
import os
|
||||
import yaml
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from core.interface.InterfaceModels import MLModel
|
||||
from core.interface.InterfaceDataClient import DataClient
|
||||
from core.DataClient import dataclient_factory
|
||||
from core.MLModels import model_factory
|
||||
from core.Logger import logger
|
||||
from configs.post_prediction_logic import post_prediction_logic
|
||||
from config import settings
|
||||
from generate_predictions import generate_predictions
|
||||
|
||||
logger.info("----------------------------")
|
||||
logger.info(f"--- Initiate Parameters ---")
|
||||
|
|
@ -62,58 +57,6 @@ output_dataclient = dataclient_factory(
|
|||
)
|
||||
|
||||
|
||||
def generate_predictions(
    input_dataclient: DataClient,
    output_dataclient: DataClient,
    model: MLModel,
    target: str,
    model_filepath: str,
    test_data_filepath: str,
    predictions_output_filepath: str,
    predictions_column_name: str,
):
    """
    Load a dataset and a saved model, run predictions, and store the result.

    Steps, in order:
      1. Read the dataset at ``test_data_filepath`` through ``input_dataclient``.
      2. Ask ``model`` to load its weights/state from ``model_filepath``.
      3. Drop the ``target`` column from the dataset when it is present, then
         call ``model.predict`` with the module-level ``post_prediction_logic``.
      4. Wrap the predictions in a DataFrame whose single column is named
         ``predictions_column_name`` and write it to
         ``predictions_output_filepath`` through ``output_dataclient``.

    Nothing is returned; the only outputs are the saved predictions and the
    progress messages written to the logger.
    """

    for banner_line in (
        "-------------------------",
        "--- Loading test data ---",
        "-------------------------",
    ):
        logger.info(banner_line)

    loaded_frame = input_dataclient.load_data(
        location=test_data_filepath,
        load_config=None,
    )

    for banner_line in (
        "---------------------",
        "--- Loading model ---",
        "---------------------",
    ):
        logger.info(banner_line)

    model.load_model(model_filepath)

    for banner_line in (
        "------------------------------",
        "--- Generating predictions ---",
        "------------------------------",
    ):
        logger.info(banner_line)

    # The target column is optional in the input; it must not be fed to the model.
    if target in loaded_frame.columns:
        model_input = loaded_frame.drop(columns=target)
    else:
        model_input = loaded_frame

    raw_predictions = model.predict(
        data=model_input,
        post_prediction_logic=post_prediction_logic,
    )

    for banner_line in (
        "--------------------------",
        "--- Saving predictions ---",
        "--------------------------",
    ):
        logger.info(banner_line)

    output_frame = pd.DataFrame(raw_predictions)
    # Assigning a one-element list requires the predictions to form one column.
    output_frame.columns = [predictions_column_name]

    output_dataclient.save_data(
        obj=output_frame,
        location=predictions_output_filepath,
        save_config=None,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info("----------------------------")
|
||||
|
|
|
|||
|
|
@ -1,3 +0,0 @@
|
|||
/prepared_data
|
||||
/model
|
||||
/predictions
|
||||
57
modules/ml-pipeline/src/pipeline/generate_predictions.py
Normal file
57
modules/ml-pipeline/src/pipeline/generate_predictions.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
import pandas as pd
|
||||
from configs.post_prediction_logic import post_prediction_logic
|
||||
from core.interface.InterfaceModels import MLModel
|
||||
from core.interface.InterfaceDataClient import DataClient
|
||||
from core.Logger import logger
|
||||
|
||||
|
||||
def generate_predictions(
    input_dataclient: DataClient,
    output_dataclient: DataClient,
    model: MLModel,
    target: str,
    model_filepath: str,
    test_data_filepath: str,
    predictions_output_filepath: str,
    predictions_column_name: str,
):
    """
    Load a dataset and a saved model, run predictions, and store the result.

    Steps, in order:
      1. Read the dataset at ``test_data_filepath`` through ``input_dataclient``.
      2. Ask ``model`` to load its weights/state from ``model_filepath``.
      3. Drop the ``target`` column from the dataset when it is present, then
         call ``model.predict`` with the module-level ``post_prediction_logic``.
      4. Wrap the predictions in a DataFrame whose single column is named
         ``predictions_column_name`` and write it to
         ``predictions_output_filepath`` through ``output_dataclient``.

    Nothing is returned; the only outputs are the saved predictions and the
    progress messages written to the logger.
    """

    for banner_line in (
        "-------------------------",
        "--- Loading test data ---",
        "-------------------------",
    ):
        logger.info(banner_line)

    loaded_frame = input_dataclient.load_data(
        location=test_data_filepath,
        load_config=None,
    )

    for banner_line in (
        "---------------------",
        "--- Loading model ---",
        "---------------------",
    ):
        logger.info(banner_line)

    model.load_model(model_filepath)

    for banner_line in (
        "------------------------------",
        "--- Generating predictions ---",
        "------------------------------",
    ):
        logger.info(banner_line)

    # The target column is optional in the input; it must not be fed to the model.
    if target in loaded_frame.columns:
        model_input = loaded_frame.drop(columns=target)
    else:
        model_input = loaded_frame

    raw_predictions = model.predict(
        data=model_input,
        post_prediction_logic=post_prediction_logic,
    )

    for banner_line in (
        "--------------------------",
        "--- Saving predictions ---",
        "--------------------------",
    ):
        logger.info(banner_line)

    output_frame = pd.DataFrame(raw_predictions)
    # Assigning a one-element list requires the predictions to form one column.
    output_frame.columns = [predictions_column_name]

    output_dataclient.save_data(
        obj=output_frame,
        location=predictions_output_filepath,
        save_config=None,
    )
|
||||
Loading…
Add table
Reference in a new issue