added checking for directory before creation and made some minor style changes

2026-06-08 11:17:27 +00:00 · 2023-08-25 15:21:17 +01:00 · 2023-08-25 15:21:17 +01:00 · 81d7e6afb7
commit 81d7e6afb7
parent 4a73ebfb74
7 changed files with 82 additions and 66 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (simulation_system)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (simulation_system)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/init.py
+++ b/init.py
--- a/model_data/simulation_system/core/DataLoader.py
+++ b/model_data/simulation_system/core/DataLoader.py
@ -1,13 +1,18 @@
 import pandas as pd
-from core.Logger import logger
+import os

-class DataLoader():
+
+class DataLoader:

    @staticmethod
    def load(filepath: str, index_col: str = None) -> pd.DataFrame:
        """
        Load different datasets
        """
+
+        if not os.path.exists(filepath):
+            raise FileNotFoundError(f"File not found: {filepath}")
+
        if filepath.endswith('.parquet'):
            df = pd.read_parquet(filepath)
            if index_col is not None:
@ -15,7 +20,6 @@ class DataLoader():
        elif filepath.endswith('.csv'):
            df = pd.read_csv(filepath, index_col=index_col)
        else:
-            logger.error('Not implemented!')
-            exit(1)
+            raise ValueError(f"File format not supported for file: {filepath}")

-        return df
+        return df
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@ -23,6 +23,7 @@ class DataProcessor:

    def __init__(self, filepath: Path) -> None:
        self.filepath = filepath
+        self.data = None

    def load_data(self, low_memory=False) -> None:
        self.data = pd.read_csv(self.filepath, low_memory=low_memory)
--- a/model_data/simulation_system/requirements/training.txt
+++ b/model_data/simulation_system/requirements/training.txt
@ -0,0 +1,3 @@
+autogluon==0.8.2
+pandas==1.5.3
+seaborn==0.12.2
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@ -1,16 +1,15 @@
-
 import argparse
 # import boto3
-import os 
+import os
 from pathlib import Path
 from datetime import datetime
 from typing import List
-from core.Logger import logger
-from core.DataLoader import DataLoader
-from core.FeatureProcessor import FeatureProcessor
+from model_data.simulation_system.core.Logger import logger
+from model_data.simulation_system.core.DataLoader import DataLoader
+from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
 from MLModel.Models import AutogluonModel
 import pandas as pd
-from core.Settings import (
+from model_data.simulation_system.core.Settings import (
    MODEL_DIRECTORY,
    BASE_REGISTRY_PATH,
    REGISTRY_FILE,
@ -23,7 +22,8 @@ from core.Settings import (
 import seaborn as sns
 import matplotlib.pyplot as plt

-TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")     
+TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+

 # FOR TESTING
 # train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
@ -52,23 +52,27 @@ def ingest_arguments() -> argparse.Namespace:

    parser = argparse.ArgumentParser(description='Inputs for training script')

-    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
-    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
-    parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
-    parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
+    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training',
+                        required=True)
+    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing',
+                        required=True)
+    parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"],
+                        default="autogluon")
+    parser.add_argument('--target-column', type=str, help='The response variable',
+                        choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')

    args = parser.parse_args()

    return args
-            
+

 def training(
-        train_filepath: str, 
-        test_filepath: str, 
-        target_column: str = "RDSAP_CHANGE", 
-        model_type: str = "autogluon", 
-        hyperparameters: dict = None
-        ) -> None:
+    train_filepath: str,
+    test_filepath: str,
+    target_column: str = "RDSAP_CHANGE",
+    model_type: str = "autogluon",
+    hyperparameters: dict = None
+) -> None:
    """
    Pipeline to run training on the dataset
    """
@ -77,12 +81,12 @@ def training(
    dataloader = DataLoader()
    train_df = dataloader.load(filepath=train_filepath)
    test_df = dataloader.load(filepath=test_filepath)
- 
+
    logger.info('--- Feature processing ---')

    feature_processor = FeatureProcessor()

-    subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)
+    subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)

    train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
    test_df = feature_processor.process(test_df, target_column=target_column)
@ -98,65 +102,63 @@ def training(

    if model_type == "autogluon":
        model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
-        output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root 
+        output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root

        model = AutogluonModel(
-            output_filepath = output_base / MODEL_FOLDER
-            )
-    else:
-        logger.error("No alternative model implemented yet")
-        exit(1)
-    
-    model.train_model(
-        data=train_df, 
-        target_column=target_column, 
-        hyperparameters=hyperparameters
+            output_filepath=output_base / MODEL_FOLDER
        )
-    
+    else:
+        raise ValueError("No alternative model implemented yet")
+
+    model.train_model(
+        data=train_df,
+        target_column=target_column,
+        hyperparameters=hyperparameters
+    )
+
    logger.info("--- Save Model ---")
    model.save_model(output_filepath=model.output_filepath)

    logger.info('--- Generate evaluation metrics ---')
    metrics_df = model.model_evaluation(
-        validation_data=test_df, 
+        validation_data=test_df,
        target_column=target_column,
-        metrics_location = output_base / METRICS_FOLDER
-        )
-    
+        metrics_location=output_base / METRICS_FOLDER
+    )
+
    logger.info("--- Generate metric outputs using predictions ---")
    # TODO: can have a model.metric_outputs method
    # For now just do it here
    residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred'])
-    
+
    # image formatting
    # TODO: move to settings file , AXIS_FONT, TITLE_FONT
-    axis_fs = 18 #fontsize
-    title_fs = 22 #fontsize
+    axis_fs = 18  # fontsize
+    title_fs = 22  # fontsize
    sns.set(style="whitegrid")
-    ax = sns.scatterplot(x="true", y="pred",data=residual_df)
+    ax = sns.scatterplot(x="true", y="pred", data=residual_df)
    ax.set_aspect('equal')
-    ax.set_xlabel(f'True {target_column}',fontsize = axis_fs) 
-    ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel
-    ax.set_title('Residuals', fontsize = title_fs)
+    ax.set_xlabel(f'True {target_column}', fontsize=axis_fs)
+    ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs)  # ylabel
+    ax.set_title('Residuals', fontsize=title_fs)

    # Diagonal reference line (y = x) marking where predicted equals true
    ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)

    plt.tight_layout()
    RESIDUAL_FILE = "residuals.png"
-    plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120) 
+    plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)

    # TODO: for cml, we might want to have class that outputs all data and plots to add to the report
    # If we want residual plot/ any plots, we will need to self host
    # plt.savefig(RESIDUAL_FILE, dpi=120) 

-    
    # TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
    # Imagining for now that the model trained here is the best model amongst all models built

    logger.info("--- Optimising model for deployment ---")

-    deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
+    deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
    logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")

    # TODO: Need a model registry - for now have this as a CSV
@ -170,43 +172,49 @@ def training(
        registry_df = pd.read_csv(registry_path, index_col=None)
    else:
        # TODO: Move columns into settings: MODEL_DETAILS and Metrics class columns
-        registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
+        registry_df = pd.DataFrame(
+            columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error',
+                     'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])

    model_details_df = pd.DataFrame(
        [{
-            'model_type': model_type, 
-            'model_name': model_root, 
+            'model_type': model_type,
+            'model_name': model_root,
            'model_location': deployment_model_path
        }]
-        )
-    
+    )
+
    registry_row = pd.concat([model_details_df, metrics_df], axis=1)
    registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)

-    # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
+    # TODO: will need a metrics rebuild script, i.e. if we add new metrics, we will want to load models and
+    #       regenerate new metrics
    # TODO: decide metric to optimise to
    registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
-    registry_df['best_model'] = [False]*len(registry_df)
+    registry_df['best_model'] = [False] * len(registry_df)
    registry_df.loc[0, 'best_model'] = True

    logger.info("--- Saving new model to registry ---")
+    # Ensure the directory exists
+    registry_path.parent.mkdir(parents=True, exist_ok=True)
    registry_df.to_csv(registry_path, index=False)

    logger.info("--- Training Pipeline Complete --- ")


 if __name__ == "__main__":
-
    logger.info('---Begin Pipeline---')

    logger.info('---Ingest Arguments---')
    args = ingest_arguments()

-    # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
+    # To run script: python3 training.py --train-filepath
+    # ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath
+    # ./model_build_data/change_data/rdsap_full/test_data.parquet
    # TODO: Ingest hyper parameters from somewhere - currently change at the top of script
    training(
-        train_filepath=args.train_filepath, 
-        test_filepath=args.test_filepath, 
-        target_column=args.target_column, 
+        train_filepath=args.train_filepath,
+        test_filepath=args.test_filepath,
+        target_column=args.target_column,
        model_type=args.model_type
-        )
+    )