From 2ff57a83ede37495c0c35d4b3132c9bdb190d10e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Aug 2023 16:29:24 +0100
Subject: [PATCH] handling relative paths for autogluon

---
 .../simulation_system/MLModel/Models.py       |  8 ++---
 model_data/simulation_system/core/Helpers.py  | 17 ++++++++++
 model_data/simulation_system/predictions.py   | 33 +++++++++++--------
 .../requirements/prediction.txt               |  0
 model_data/simulation_system/training.py      |  7 ++--
 5 files changed, 43 insertions(+), 22 deletions(-)
 create mode 100644 model_data/simulation_system/core/Helpers.py
 create mode 100644 model_data/simulation_system/requirements/prediction.txt

diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py
index 89bbe762..ccf6fdf8 100644
--- a/model_data/simulation_system/MLModel/Models.py
+++ b/model_data/simulation_system/MLModel/Models.py
@@ -122,17 +122,15 @@ class AutogluonModel:
 
         return metrics_df
 
-    def optimise_model_for_deployment(self, deployment_path: Path = None) -> None:
+    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
         """
         We can optimise the deployment for a autogluon model
         """
         if self.model is None:
-            logger.error("No model to optimise for deployment")
-            exit(1)
+            raise ValueError("No model to optimise for deployment")
 
         if deployment_path is None:
-            logger.error("Deployment path required")
-            exit(1)
+            raise ValueError("Deployment path required")
 
         # This will return a string path of the location
         return self.model.clone_for_deployment(deployment_path)
diff --git a/model_data/simulation_system/core/Helpers.py b/model_data/simulation_system/core/Helpers.py
new file mode 100644
index 00000000..65491c42
--- /dev/null
+++ b/model_data/simulation_system/core/Helpers.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+
+def ensure_relative_path(file_path: str | Path, relative_to: str | Path = None) -> Path:
+    """
+    Convert the given path to a relative path.
+
+    :param file_path: The path to check and possibly convert; a path that is already relative is returned as-is.
+    :param relative_to: Optional base path for converting an absolute path. Defaults to the current working directory.
+    :return: The relative path, using ".." components when file_path is not located below the base path.
+    """
+    import os  # local import so this helper does not depend on the module's import block
+    path = Path(file_path)
+    if not path.is_absolute():
+        return path
+    # os.path.relpath, unlike Path.relative_to, does not raise ValueError for a path outside the base directory
+    return Path(os.path.relpath(path, Path(relative_to) if relative_to else Path.cwd()))
diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py
index bc1b113b..aa6c2d0f 100644
--- a/model_data/simulation_system/predictions.py
+++ b/model_data/simulation_system/predictions.py
@@ -4,14 +4,13 @@ Script to load MLModel class and generate predictions
 
 import json
 import argparse
-from MLModel.Models import AutogluonModel
-from core.Logger import logger
-from core.DataLoader import DataLoader
-from pathlib import Path
+from model_data.simulation_system.MLModel.Models import AutogluonModel
+from model_data.simulation_system.core.Logger import logger
+from model_data.simulation_system.core.DataLoader import DataLoader
 import pandas as pd
 from typing import Optional
 from datetime import datetime
-from core.Settings import (
+from model_data.simulation_system.core.Settings import (
     BASE_REGISTRY_PATH,
     REGISTRY_FILE,
     PREDICTION_LOCATION,
@@ -19,10 +18,12 @@ from core.Settings import (
     METADATA_FILE
 )
 
-TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
+TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
 
 # FOR TESTING
-# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
+# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
+# DataFrame)
 # TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
 # DATA = TEST_DATA.sample(1)
 
@@ -33,18 +34,20 @@ def ingest_arguments() -> argparse.Namespace:
     """
 
     parser = argparse.ArgumentParser(description='Inputs for training script')
-    parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
-    parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
+    parser.add_argument('--target-column', type=str, help='The response variable you are predicting for',
+                        choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
+    parser.add_argument('--model-path', type=str,
+                        help='If you wish to use a specific model, specify the model path here')
     parser.add_argument('--data', type=str, help='Json data for predictions')
     parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
 
     args = parser.parse_args()
 
     return args
-            
 
 
-def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
+def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None,
+               data_path: Optional[str] = None):
     """
     Main pipeline function
     """
@@ -93,6 +96,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
 
     logger.info("--- Loading Model ---")
     model = AutogluonModel()
+
     model.load_model(filepath=model_location)
 
     logger.info("--- Generating Predictions ---")
@@ -125,10 +129,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
 
     return json_prediction
 
-if __name__ == "__main__":
 
+if __name__ == "__main__":
     args = ingest_arguments()
 
     # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
-    # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
-    prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
\ No newline at end of file
+    # Data path can be passed as so: python3 predictions.py --data-path
+    # ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
+    prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
diff --git a/model_data/simulation_system/requirements/prediction.txt b/model_data/simulation_system/requirements/prediction.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index 4d751c9b..d67a7e58 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -1,16 +1,13 @@
 import argparse
 # import boto3
-import os
 from pathlib import Path
 from datetime import datetime
-from typing import List
 from model_data.simulation_system.core.Logger import logger
 from model_data.simulation_system.core.DataLoader import DataLoader
 from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
 from model_data.simulation_system.MLModel.Models import AutogluonModel
 import pandas as pd
 from model_data.simulation_system.core.Settings import (
-    MODEL_DIRECTORY,
     BASE_REGISTRY_PATH,
     REGISTRY_FILE,
     MODEL_FOLDER,
@@ -19,6 +16,7 @@ from model_data.simulation_system.core.Settings import (
     SUBSAMPLE_FACTOR,
     MODEL_HYPERPARAMETERS
 )
+from model_data.simulation_system.core.Helpers import ensure_relative_path
 import seaborn as sns
 import matplotlib.pyplot as plt
 
@@ -159,6 +157,9 @@ def training(
     logger.info("--- Optimising model for deployment ---")
 
     deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
+    # Autogluon requires models to be stored at relative paths. Storage will likely move to S3 eventually; until
+    # then we make sure the path is relative to the directory containing this script
+    deployment_model_path = ensure_relative_path(deployment_model_path, Path(__file__).parent)
     logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
 
     # TODO: Need a model registry - for now have this as a CSV