From 81d7e6afb7d3cf18c9e3f8750a830f526f9fe81a Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Aug 2023 15:21:17 +0100
Subject: [PATCH 1/6] ensure registry directory exists before saving and made
 some minor style changes

---
 .idea/Model.iml                               |   2 +-
 .idea/misc.xml                                |   2 +-
 __init__.py                                   |   0
 .../simulation_system/core/DataLoader.py      |  14 +-
 .../simulation_system/core/DataProcessor.py   |   1 +
 .../requirements/training.txt                 |   3 +
 model_data/simulation_system/training.py      | 126 ++++++++++--------
 7 files changed, 82 insertions(+), 66 deletions(-)
 create mode 100644 __init__.py
 create mode 100644 model_data/simulation_system/requirements/training.txt
diff --git a/.idea/Model.iml b/.idea/Model.iml
index 05b9012b..03f5e8e2 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (simulation_system)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 3b05c6ac..daffedc9 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (simulation_system)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/model_data/simulation_system/core/DataLoader.py b/model_data/simulation_system/core/DataLoader.py
index 1e811f8d..dcd7af16 100644
--- a/model_data/simulation_system/core/DataLoader.py
+++ b/model_data/simulation_system/core/DataLoader.py
@@ -1,13 +1,18 @@
 import pandas as pd
-from core.Logger import logger
+import os
 
-class DataLoader():
+
+class DataLoader:
 
     @staticmethod
     def load(filepath: str, index_col: str = None) -> pd.DataFrame:
         """
         Load different datasets
         """
+
+        if not os.path.exists(filepath):
+            raise FileNotFoundError(f"File not found: {filepath}")
+
         if filepath.endswith('.parquet'):
             df = pd.read_parquet(filepath)
             if index_col is not None:
@@ -15,7 +20,6 @@ class DataLoader():
         elif filepath.endswith('.csv'):
             df = pd.read_csv(filepath, index_col=index_col)
         else:
-            logger.error('Not implemented!')
-            exit(1)
+            raise ValueError(f"File format not supported for file: {filepath}")
 
-        return df
\ No newline at end of file
+        return df
diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py
index 1ac53517..7b50f486 100644
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@@ -23,6 +23,7 @@ class DataProcessor:
 
     def __init__(self, filepath: Path) -> None:
         self.filepath = filepath
+        self.data = None
 
     def load_data(self, low_memory=False) -> None:
         self.data = pd.read_csv(self.filepath, low_memory=low_memory)
diff --git a/model_data/simulation_system/requirements/training.txt b/model_data/simulation_system/requirements/training.txt
new file mode 100644
index 00000000..17e4c8da
--- /dev/null
+++ b/model_data/simulation_system/requirements/training.txt
@@ -0,0 +1,3 @@
+autogluon==0.8.2
+pandas==1.5.3
+seaborn==0.12.2
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index b37e7154..d41e6c56 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -1,16 +1,15 @@
-
 import argparse
 # import boto3
-import os 
+import os
 from pathlib import Path
 from datetime import datetime
 from typing import List
-from core.Logger import logger
-from core.DataLoader import DataLoader
-from core.FeatureProcessor import FeatureProcessor
+from model_data.simulation_system.core.Logger import logger
+from model_data.simulation_system.core.DataLoader import DataLoader
+from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
 from MLModel.Models import AutogluonModel
 import pandas as pd
-from core.Settings import (
+from model_data.simulation_system.core.Settings import (
     MODEL_DIRECTORY,
     BASE_REGISTRY_PATH,
     REGISTRY_FILE,
@@ -23,7 +22,8 @@ from core.Settings import (
 import seaborn as sns
 import matplotlib.pyplot as plt
 
-TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")     
+TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
 
 # FOR TESTING
 # train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
@@ -52,23 +52,27 @@ def ingest_arguments() -> argparse.Namespace:
 
     parser = argparse.ArgumentParser(description='Inputs for training script')
 
-    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
-    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
-    parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
-    parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
+    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training',
+                        required=True)
+    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing',
+                        required=True)
+    parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"],
+                        default="autogluon")
+    parser.add_argument('--target-column', type=str, help='The response variable',
+                        choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
 
     args = parser.parse_args()
 
     return args
-            
+
 
 def training(
-        train_filepath: str, 
-        test_filepath: str, 
-        target_column: str = "RDSAP_CHANGE", 
-        model_type: str = "autogluon", 
-        hyperparameters: dict = None
-        ) -> None:
+    train_filepath: str,
+    test_filepath: str,
+    target_column: str = "RDSAP_CHANGE",
+    model_type: str = "autogluon",
+    hyperparameters: dict = None
+) -> None:
     """
     Pipeline to run training on the dataset
     """
@@ -77,12 +81,12 @@ def training(
     dataloader = DataLoader()
     train_df = dataloader.load(filepath=train_filepath)
     test_df = dataloader.load(filepath=test_filepath)
- 
+
     logger.info('--- Feature processing ---')
 
     feature_processor = FeatureProcessor()
 
-    subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)
+    subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)
 
     train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
     test_df = feature_processor.process(test_df, target_column=target_column)
@@ -98,65 +102,63 @@ def training(
 
     if model_type == "autogluon":
         model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
-        output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root 
+        output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
 
         model = AutogluonModel(
-            output_filepath = output_base / MODEL_FOLDER
-            )
-    else:
-        logger.error("No alternative model implemented yet")
-        exit(1)
-    
-    model.train_model(
-        data=train_df, 
-        target_column=target_column, 
-        hyperparameters=hyperparameters
+            output_filepath=output_base / MODEL_FOLDER
         )
-    
+    else:
+        raise ValueError("No alternative model implemented yet")
+
+    model.train_model(
+        data=train_df,
+        target_column=target_column,
+        hyperparameters=hyperparameters
+    )
+
     logger.info("--- Save Model ---")
     model.save_model(output_filepath=model.output_filepath)
 
     logger.info('--- Generate evaluation metrics ---')
     metrics_df = model.model_evaluation(
-        validation_data=test_df, 
+        validation_data=test_df,
         target_column=target_column,
-        metrics_location = output_base / METRICS_FOLDER
-        )
-    
+        metrics_location=output_base / METRICS_FOLDER
+    )
+
     logger.info("--- Generate metric outputs using predictions ---")
     # TODO: can have a model.metric_outputs method
     # FOr not just do it here
     residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred'])
-    
+
     # image formatting
     # TODO: move to settings file , AXIS_FONT, TITLE_FONT
-    axis_fs = 18 #fontsize
-    title_fs = 22 #fontsize
+    axis_fs = 18  # fontsize
+    title_fs = 22  # fontsize
     sns.set(style="whitegrid")
-    ax = sns.scatterplot(x="true", y="pred",data=residual_df)
+    ax = sns.scatterplot(x="true", y="pred", data=residual_df)
     ax.set_aspect('equal')
-    ax.set_xlabel(f'True {target_column}',fontsize = axis_fs) 
-    ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel
-    ax.set_title('Residuals', fontsize = title_fs)
+    ax.set_xlabel(f'True {target_column}', fontsize=axis_fs)
+    ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs)  # ylabel
+    ax.set_title('Residuals', fontsize=title_fs)
 
     # Square aspect ratio
     ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)
 
     plt.tight_layout()
     RESIDUAL_FILE = "residuals.png"
-    plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120) 
+    plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
 
     # TODO: for cml, we might want to have class that outputs all data and plots to add to the report
     # If we want residual plot/ any plots, we will need to self host
     # plt.savefig(RESIDUAL_FILE, dpi=120) 
 
-    
     # TODO: introduce a seperate script for model optimisation, and from there, optimise for deployment
     # Imagining for now that the model trained here is the best model amongst all models built
 
     logger.info("--- Optimising model for deployment ---")
 
-    deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
+    deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
     logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
 
     # TODO: Need a model registry - for now have this as a CSV
@@ -170,43 +172,49 @@ def training(
         registry_df = pd.read_csv(registry_path, index_col=None)
     else:
         # TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns 
-        registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
+        registry_df = pd.DataFrame(
+            columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error',
+                     'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
 
     model_details_df = pd.DataFrame(
         [{
-            'model_type': model_type, 
-            'model_name': model_root, 
+            'model_type': model_type,
+            'model_name': model_root,
             'model_location': deployment_model_path
         }]
-        )
-    
+    )
+
     registry_row = pd.concat([model_details_df, metrics_df], axis=1)
     registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
 
-    # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
+    # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and
+    #       regenerate new metrics
     # TODO: decide metric to optimise to
     registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
-    registry_df['best_model'] = [False]*len(registry_df)
+    registry_df['best_model'] = [False] * len(registry_df)
     registry_df.loc[0, 'best_model'] = True
 
     logger.info("--- Saving new model to registry ---")
+    # Ensure the directory exists
+    registry_path.parent.mkdir(parents=True, exist_ok=True)
     registry_df.to_csv(registry_path, index=False)
 
     logger.info("--- Training Pipeline Complete --- ")
 
 
 if __name__ == "__main__":
-
     logger.info('---Begin Pipeline---')
 
     logger.info('---Ingest Arguments---')
     args = ingest_arguments()
 
-    # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
+    # To run script: python3 training.py --train-filepath
+    # ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath
+    # ./model_build_data/change_data/rdsap_full/test_data.parquet
     # TODO: Ingest hyper parameters from somewhere - currently change at the top of script
     training(
-        train_filepath=args.train_filepath, 
-        test_filepath=args.test_filepath, 
-        target_column=args.target_column, 
+        train_filepath=args.train_filepath,
+        test_filepath=args.test_filepath,
+        target_column=args.target_column,
         model_type=args.model_type
-        )
+    )

From 0e755626ded6a4010ee78ff7ed145a498c3ea333 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Aug 2023 15:22:55 +0100
Subject: [PATCH 2/6] updated import for FeatureProcessor

---
 .../core/FeatureProcessor.py                  | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/model_data/simulation_system/core/FeatureProcessor.py b/model_data/simulation_system/core/FeatureProcessor.py
index aef9605f..cefcee9b 100644
--- a/model_data/simulation_system/core/FeatureProcessor.py
+++ b/model_data/simulation_system/core/FeatureProcessor.py
@@ -4,13 +4,14 @@ Create additional features from the dataset
 
 import pandas as pd
 from typing import List
-from core.Logger import logger
+from model_data.simulation_system.core.Logger import logger
 
 RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
 HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
 
-RANDOM_SEED = 0 
-   
+RANDOM_SEED = 0
+
+
 class FeatureProcessor:
     """
     Handle all feature manipulation before modelling
@@ -38,11 +39,11 @@ class FeatureProcessor:
             if not set(features).issubset(df.columns):
                 logger.error('Features defined is not contained in data')
                 exit(1)
-        
+
         df = df[features]
 
         return df
-    
+
     @staticmethod
     def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
         """
@@ -53,14 +54,13 @@ class FeatureProcessor:
             df = df.sample(subsample_amount, random_state=RANDOM_SEED)
         return df
 
-    
     def process(
-            self, 
-            df: pd.DataFrame, 
-            target_column: str = "RDSAP_CHANGE", 
-            features: List[str] = None,
-            subsample_amount: int = None
-            ) -> pd.DataFrame:
+        self,
+        df: pd.DataFrame,
+        target_column: str = "RDSAP_CHANGE",
+        features: List[str] = None,
+        subsample_amount: int = None
+    ) -> pd.DataFrame:
         """
         Pipeline to get data ready for building a model
         """

From d6562bfab9b53f3bfb9dfdf2a920109ac768cae7 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Aug 2023 15:25:35 +0100
Subject: [PATCH 3/6] updating imports for MLModel

---
 .gitignore                                    |  1 +
 .../simulation_system/MLModel/Models.py       | 48 +++++++++----------
 model_data/simulation_system/training.py      |  2 +-
 3 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
index cb17846e..be9da3aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -252,6 +252,7 @@ backend/.idea
 open_uprn/.idea/
 conservation_areas/.idea/
 model_data/.idea/
+model_data/simulation_system/.idea/
 
 model_data/simulation_system/data*
 
diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py
index 137f2f20..89bbe762 100644
--- a/model_data/simulation_system/MLModel/Models.py
+++ b/model_data/simulation_system/MLModel/Models.py
@@ -13,15 +13,17 @@ from pathlib import Path
 import pandas as pd
 from autogluon.tabular import TabularDataset, TabularPredictor
 from sklearn.metrics import mean_absolute_percentage_error
-from core.Logger import logger
+from model_data.simulation_system.core.Logger import logger
 
 AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
 METRIC_FILENAME = "metrics.csv"
 
+
 class AutogluonModel:
     """
     Autogluon model that implements the MLModel Protocol
     """
+
     def __init__(self, output_filepath: Path = None) -> None:
         self.model = None
         self.output_filepath = output_filepath
@@ -40,10 +42,10 @@ class AutogluonModel:
         logger.info("Using AutoGluon Model - Model saving already occured")
 
     def train_model(
-            self, 
-            data: pd.DataFrame, 
-            target_column: str, 
-            hyperparameters: dict = None) -> None:
+        self,
+        data: pd.DataFrame,
+        target_column: str,
+        hyperparameters: dict = None) -> None:
         """
         For the given data and hyperparameters, a model is trained
         """
@@ -58,17 +60,16 @@ class AutogluonModel:
         AGdata = TabularDataset(data=data)
 
         self.model = TabularPredictor(
-            label=target_column, 
-            path=self.output_filepath, 
+            label=target_column,
+            path=self.output_filepath,
             problem_type=hyperparameters['problem_type'],
             eval_metric=hyperparameters['eval_metric']
-            ).fit(
-            AGdata, 
-            time_limit=hyperparameters['time_limit'], 
-            presets=hyperparameters['presets'], 
+        ).fit(
+            AGdata,
+            time_limit=hyperparameters['time_limit'],
+            presets=hyperparameters['presets'],
             excluded_model_types=hyperparameters['excluded_model_types']
-            )
-
+        )
 
     def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
         """
@@ -84,12 +85,12 @@ class AutogluonModel:
         return predictions
 
     def model_evaluation(
-            self, 
-            validation_data: pd.DataFrame, 
-            target_column: str, 
-            metrics_location: Path = None, 
-            metric_filename: str = METRIC_FILENAME
-            ) -> pd.DataFrame:
+        self,
+        validation_data: pd.DataFrame,
+        target_column: str,
+        metrics_location: Path = None,
+        metric_filename: str = METRIC_FILENAME
+    ) -> pd.DataFrame:
         """
         For any validation data, a set of predictions and metrics are return
         """
@@ -105,7 +106,7 @@ class AutogluonModel:
 
         logger.info("Prediction used for evaluations are saved in self.prediction")
         self.predictions = predictions
-        
+
         # TODO: Can have a custom metric class that defines all different metrics we want 
         metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions)
 
@@ -117,7 +118,7 @@ class AutogluonModel:
         metrics_df = pd.DataFrame([performance])
         metrics_df.to_csv(metrics_location / metric_filename)
         markdown_filename = metric_filename.split(".")[0] + ".md"
-        metrics_df.to_markdown(metrics_location/ markdown_filename)
+        metrics_df.to_markdown(metrics_location / markdown_filename)
 
         return metrics_df
 
@@ -135,8 +136,3 @@ class AutogluonModel:
 
         # This will return a string path of the location
         return self.model.clone_for_deployment(deployment_path)
-
-        
-
-
-        
\ No newline at end of file
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index d41e6c56..561d1e1d 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -7,7 +7,7 @@ from typing import List
 from model_data.simulation_system.core.Logger import logger
 from model_data.simulation_system.core.DataLoader import DataLoader
 from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
-from MLModel.Models import AutogluonModel
+from model_data.simulation_system.MLModel.Models import AutogluonModel
 import pandas as pd
 from model_data.simulation_system.core.Settings import (
     MODEL_DIRECTORY,

From 67fd184ac570824a56406d14462e28e37e126f29 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Aug 2023 15:33:29 +0100
Subject: [PATCH 4/6] consolidated location of output storage

---
 model_data/simulation_system/training.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index 561d1e1d..4d751c9b 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -102,7 +102,7 @@ def training(
 
     if model_type == "autogluon":
         model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
-        output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
+        output_base = BASE_REGISTRY_PATH / target_column / model_type / model_root
 
         model = AutogluonModel(
             output_filepath=output_base / MODEL_FOLDER
@@ -164,7 +164,6 @@ def training(
     # TODO: Need a model registry - for now have this as a CSV
     # Save this in the model directory
     logger.info("--- Append registry with new model ---")
-
     registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
 
     if registry_path.exists():

From 2ff57a83ede37495c0c35d4b3132c9bdb190d10e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Aug 2023 16:29:24 +0100
Subject: [PATCH 5/6] handling relative paths for autogluon

---
 .../simulation_system/MLModel/Models.py       |  8 ++---
 model_data/simulation_system/core/Helpers.py  | 17 ++++++++++
 model_data/simulation_system/predictions.py   | 33 +++++++++++--------
 .../requirements/prediction.txt               |  0
 model_data/simulation_system/training.py      |  7 ++--
 5 files changed, 43 insertions(+), 22 deletions(-)
 create mode 100644 model_data/simulation_system/core/Helpers.py
 create mode 100644 model_data/simulation_system/requirements/prediction.txt

diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py
index 89bbe762..ccf6fdf8 100644
--- a/model_data/simulation_system/MLModel/Models.py
+++ b/model_data/simulation_system/MLModel/Models.py
@@ -122,17 +122,15 @@ class AutogluonModel:
 
         return metrics_df
 
-    def optimise_model_for_deployment(self, deployment_path: Path = None) -> None:
+    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
         """
         We can optimise the deployment for a autogluon model
         """
         if self.model is None:
-            logger.error("No model to optimise for deployment")
-            exit(1)
+            raise ValueError("No model to optimise for deployment")
 
         if deployment_path is None:
-            logger.error("Deployment path required")
-            exit(1)
+            raise ValueError("Deployment path required")
 
         # This will return a string path of the location
         return self.model.clone_for_deployment(deployment_path)
diff --git a/model_data/simulation_system/core/Helpers.py b/model_data/simulation_system/core/Helpers.py
new file mode 100644
index 00000000..65491c42
--- /dev/null
+++ b/model_data/simulation_system/core/Helpers.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+
+
+def ensure_relative_path(file_path: str, relative_to: str | Path = None) -> Path:
+    """
+    Convert the given path to a relative path.
+
+    :param file_path: The path to check and possibly convert.
+    :param relative_to: Optional path to which the given path should be made relative.
+                        If not provided, the current working directory is used.
+    :return: The relative path.
+    """
+    path = Path(file_path)
+    if path.is_absolute():
+        base_path = Path(relative_to) if relative_to else Path.cwd()
+        return path.relative_to(base_path)
+    return path
diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py
index bc1b113b..aa6c2d0f 100644
--- a/model_data/simulation_system/predictions.py
+++ b/model_data/simulation_system/predictions.py
@@ -4,14 +4,13 @@ Script to load MLModel class and generate predictions
 
 import json
 import argparse
-from MLModel.Models import AutogluonModel
-from core.Logger import logger
-from core.DataLoader import DataLoader
-from pathlib import Path
+from model_data.simulation_system.MLModel.Models import AutogluonModel
+from model_data.simulation_system.core.Logger import logger
+from model_data.simulation_system.core.DataLoader import DataLoader
 import pandas as pd
 from typing import Optional
 from datetime import datetime
-from core.Settings import (
+from model_data.simulation_system.core.Settings import (
     BASE_REGISTRY_PATH,
     REGISTRY_FILE,
     PREDICTION_LOCATION,
@@ -19,10 +18,12 @@ from core.Settings import (
     METADATA_FILE
 )
 
-TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
+TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+
 
 # FOR TESTING
-# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
+# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
+# DataFrame)
 # TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
 # DATA = TEST_DATA.sample(1)
 
@@ -33,18 +34,20 @@ def ingest_arguments() -> argparse.Namespace:
     """
 
     parser = argparse.ArgumentParser(description='Inputs for training script')
-    parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
-    parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
+    parser.add_argument('--target-column', type=str, help='The response variable you are predicting for',
+                        choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
+    parser.add_argument('--model-path', type=str,
+                        help='If you wish to use a specific model, specify the model path here')
     parser.add_argument('--data', type=str, help='Json data for predictions')
     parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
 
     args = parser.parse_args()
 
     return args
-            
 
 
-def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
+def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None,
+               data_path: Optional[str] = None):
     """
     Main pipeline function
     """
@@ -93,6 +96,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
 
     logger.info("--- Loading Model ---")
     model = AutogluonModel()
+
     model.load_model(filepath=model_location)
 
     logger.info("--- Generating Predictions ---")
@@ -125,10 +129,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
 
     return json_prediction
 
-if __name__ == "__main__":
 
+if __name__ == "__main__":
     args = ingest_arguments()
 
     # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
-    # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
-    prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
\ No newline at end of file
+    # Data path can be passed as so: python3 predictions.py --data-path
+    # ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
+    prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
diff --git a/model_data/simulation_system/requirements/prediction.txt b/model_data/simulation_system/requirements/prediction.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index 4d751c9b..d67a7e58 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -1,16 +1,13 @@
 import argparse
 # import boto3
-import os
 from pathlib import Path
 from datetime import datetime
-from typing import List
 from model_data.simulation_system.core.Logger import logger
 from model_data.simulation_system.core.DataLoader import DataLoader
 from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
 from model_data.simulation_system.MLModel.Models import AutogluonModel
 import pandas as pd
 from model_data.simulation_system.core.Settings import (
-    MODEL_DIRECTORY,
     BASE_REGISTRY_PATH,
     REGISTRY_FILE,
     MODEL_FOLDER,
@@ -19,6 +16,7 @@ from model_data.simulation_system.core.Settings import (
     SUBSAMPLE_FACTOR,
     MODEL_HYPERPARAMETERS
 )
+from model_data.simulation_system.core.Helpers import ensure_relative_path
 import seaborn as sns
 import matplotlib.pyplot as plt
 
@@ -159,6 +157,9 @@ def training(
     logger.info("--- Optimising model for deployment ---")
 
     deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
+    # Autogluon requires models to be stored at relative paths. This will likely eventually be s3 however we
+    # make sure the path is relative to the location of this script
+    deployment_model_path = ensure_relative_path(deployment_model_path, Path(__file__).parent)
     logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
 
     # TODO: Need a model registry - for now have this as a CSV

From a5062b24f0bfb53d4fb254a85a7874da37b058d1 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Fri, 25 Aug 2023 18:08:56 +0100
Subject: [PATCH 6/6] got the predictions working

---
 .idea/Model.iml                                 |  2 +-
 .idea/misc.xml                                  |  2 +-
 model_data/simulation_system/core/Helpers.py    | 17 -----------------
 .../requirements/prediction.txt                 |  2 ++
 model_data/simulation_system/training.py        |  7 ++-----
 5 files changed, 6 insertions(+), 24 deletions(-)
 delete mode 100644 model_data/simulation_system/core/Helpers.py

diff --git a/.idea/Model.iml b/.idea/Model.iml
index 03f5e8e2..0ded8e60 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (simulation_system)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (simulation_system_prediction)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index daffedc9..ae87bfde 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (simulation_system)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (simulation_system_prediction)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>
diff --git a/model_data/simulation_system/core/Helpers.py b/model_data/simulation_system/core/Helpers.py
deleted file mode 100644
index 65491c42..00000000
--- a/model_data/simulation_system/core/Helpers.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from pathlib import Path
-
-
-def ensure_relative_path(file_path: str, relative_to: str | Path = None) -> Path:
-    """
-    Convert the given path to a relative path.
-
-    :param file_path: The path to check and possibly convert.
-    :param relative_to: Optional path to which the given path should be made relative.
-                        If not provided, the current working directory is used.
-    :return: The relative path.
-    """
-    path = Path(file_path)
-    if path.is_absolute():
-        base_path = Path(relative_to) if relative_to else Path.cwd()
-        return path.relative_to(base_path)
-    return path
diff --git a/model_data/simulation_system/requirements/prediction.txt b/model_data/simulation_system/requirements/prediction.txt
index e69de29b..f9ce32bf 100644
--- a/model_data/simulation_system/requirements/prediction.txt
+++ b/model_data/simulation_system/requirements/prediction.txt
@@ -0,0 +1,2 @@
+autogluon==0.8.2
+pandas==1.5.3
\ No newline at end of file
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index d67a7e58..2a1dfcfa 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -8,6 +8,7 @@ from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
 from model_data.simulation_system.MLModel.Models import AutogluonModel
 import pandas as pd
 from model_data.simulation_system.core.Settings import (
+    MODEL_DIRECTORY,
     BASE_REGISTRY_PATH,
     REGISTRY_FILE,
     MODEL_FOLDER,
@@ -16,7 +17,6 @@ from model_data.simulation_system.core.Settings import (
     SUBSAMPLE_FACTOR,
     MODEL_HYPERPARAMETERS
 )
-from model_data.simulation_system.core.Helpers import ensure_relative_path
 import seaborn as sns
 import matplotlib.pyplot as plt
 
@@ -100,7 +100,7 @@ def training(
 
     if model_type == "autogluon":
         model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
-        output_base = BASE_REGISTRY_PATH / target_column / model_type / model_root
+        output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
 
         model = AutogluonModel(
             output_filepath=output_base / MODEL_FOLDER
@@ -157,9 +157,6 @@ def training(
     logger.info("--- Optimising model for deployment ---")
 
     deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
-    # Autogluon requires models to be stored at relative paths. This will likely eventually be s3 however we
-    # make sure the path is relative to the location of this script
-    deployment_model_path = ensure_relative_path(deployment_model_path, Path(__file__).parent)
     logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
 
     # TODO: Need a model registry - for now have this as a CSV