From 2ff57a83ede37495c0c35d4b3132c9bdb190d10e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Aug 2023 16:29:24 +0100 Subject: [PATCH] handling relative paths for autogluon --- .../simulation_system/MLModel/Models.py | 8 ++--- model_data/simulation_system/core/Helpers.py | 17 ++++++++++ model_data/simulation_system/predictions.py | 33 +++++++++++-------- .../requirements/prediction.txt | 0 model_data/simulation_system/training.py | 7 ++-- 5 files changed, 43 insertions(+), 22 deletions(-) create mode 100644 model_data/simulation_system/core/Helpers.py create mode 100644 model_data/simulation_system/requirements/prediction.txt diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py index 89bbe762..ccf6fdf8 100644 --- a/model_data/simulation_system/MLModel/Models.py +++ b/model_data/simulation_system/MLModel/Models.py @@ -122,17 +122,15 @@ class AutogluonModel: return metrics_df - def optimise_model_for_deployment(self, deployment_path: Path = None) -> None: + def optimise_model_for_deployment(self, deployment_path: Path = None) -> str: """ We can optimise the deployment for a autogluon model """ if self.model is None: - logger.error("No model to optimise for deployment") - exit(1) + raise ValueError("No model to optimise for deployment") if deployment_path is None: - logger.error("Deployment path required") - exit(1) + raise ValueError("Deployment path required") # This will return a string path of the location return self.model.clone_for_deployment(deployment_path) diff --git a/model_data/simulation_system/core/Helpers.py b/model_data/simulation_system/core/Helpers.py new file mode 100644 index 00000000..65491c42 --- /dev/null +++ b/model_data/simulation_system/core/Helpers.py @@ -0,0 +1,17 @@ +from pathlib import Path + + +def ensure_relative_path(file_path: str, relative_to: str | Path = None) -> Path: + """ + Convert the given path to a relative path. 
+ + :param file_path: The path to check and possibly convert. + :param relative_to: Optional path to which the given path should be made relative. + If not provided, the current working directory is used. + :return: The relative path. + """ + path = Path(file_path) + if path.is_absolute(): + base_path = Path(relative_to) if relative_to else Path.cwd() + return path.relative_to(base_path) + return path diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py index bc1b113b..aa6c2d0f 100644 --- a/model_data/simulation_system/predictions.py +++ b/model_data/simulation_system/predictions.py @@ -4,14 +4,13 @@ Script to load MLModel class and generate predictions import json import argparse -from MLModel.Models import AutogluonModel -from core.Logger import logger -from core.DataLoader import DataLoader -from pathlib import Path +from model_data.simulation_system.MLModel.Models import AutogluonModel +from model_data.simulation_system.core.Logger import logger +from model_data.simulation_system.core.DataLoader import DataLoader import pandas as pd from typing import Optional from datetime import datetime -from core.Settings import ( +from model_data.simulation_system.core.Settings import ( BASE_REGISTRY_PATH, REGISTRY_FILE, PREDICTION_LOCATION, @@ -19,10 +18,12 @@ from core.Settings import ( METADATA_FILE ) -TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") +TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # FOR TESTING -# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame) +# For now just loading data first and then passing into function (i.e. 
as if we receive json data and convert to +# DataFrame) # TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet") # DATA = TEST_DATA.sample(1) @@ -33,18 +34,20 @@ def ingest_arguments() -> argparse.Namespace: """ parser = argparse.ArgumentParser(description='Inputs for training script') - parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE') - parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here') + parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', + choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE') + parser.add_argument('--model-path', type=str, + help='If you wish to use a specific model, specify the model path here') parser.add_argument('--data', type=str, help='Json data for predictions') parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training') args = parser.parse_args() return args - -def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None): +def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, + data_path: Optional[str] = None): """ Main pipeline function """ @@ -93,6 +96,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data logger.info("--- Loading Model ---") model = AutogluonModel() + model.load_model(filepath=model_location) logger.info("--- Generating Predictions ---") @@ -125,10 +129,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data return json_prediction -if __name__ == "__main__": +if __name__ == "__main__": args = ingest_arguments() # Data can be passed in as JSON string: python3 predictions.py 
--data '{"TOTAL_FLOOR_AREA": 1}' - # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet - prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path) \ No newline at end of file + # Data path can be passed as so: python3 predictions.py --data-path + # ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet + prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path) diff --git a/model_data/simulation_system/requirements/prediction.txt b/model_data/simulation_system/requirements/prediction.txt new file mode 100644 index 00000000..e69de29b diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index 4d751c9b..d67a7e58 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -1,16 +1,13 @@ import argparse # import boto3 -import os from pathlib import Path from datetime import datetime -from typing import List from model_data.simulation_system.core.Logger import logger from model_data.simulation_system.core.DataLoader import DataLoader from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor from model_data.simulation_system.MLModel.Models import AutogluonModel import pandas as pd from model_data.simulation_system.core.Settings import ( - MODEL_DIRECTORY, BASE_REGISTRY_PATH, REGISTRY_FILE, MODEL_FOLDER, @@ -19,6 +16,7 @@ from model_data.simulation_system.core.Settings import ( SUBSAMPLE_FACTOR, MODEL_HYPERPARAMETERS ) +from model_data.simulation_system.core.Helpers import ensure_relative_path import seaborn as sns import matplotlib.pyplot as plt @@ -159,6 +157,9 @@ def training( logger.info("--- Optimising model for deployment ---") deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / 
DEPLOYMENT_FOLDER) + # AutoGluon requires models to be stored at relative paths. This will likely be S3 eventually; until then we + # make sure the path is relative to the location of this script. + deployment_model_path = ensure_relative_path(deployment_model_path, Path(__file__).parent) logger.info(f"Optimised version of best model can be found at: {deployment_model_path}") # TODO: Need a model registry - for now have this as a CSV