From 81d7e6afb7d3cf18c9e3f8750a830f526f9fe81a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Aug 2023 15:21:17 +0100 Subject: [PATCH] added checking for directory before creation and made some minor style changes --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- __init__.py | 0 .../simulation_system/core/DataLoader.py | 14 +- .../simulation_system/core/DataProcessor.py | 1 + .../requirements/training.txt | 3 + model_data/simulation_system/training.py | 126 ++++++++++-------- 7 files changed, 82 insertions(+), 66 deletions(-) create mode 100644 __init__.py create mode 100644 model_data/simulation_system/requirements/training.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index 05b9012b..03f5e8e2 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 3b05c6ac..daffedc9 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/model_data/simulation_system/core/DataLoader.py b/model_data/simulation_system/core/DataLoader.py index 1e811f8d..dcd7af16 100644 --- a/model_data/simulation_system/core/DataLoader.py +++ b/model_data/simulation_system/core/DataLoader.py @@ -1,13 +1,18 @@ import pandas as pd -from core.Logger import logger +import os -class DataLoader(): + +class DataLoader: @staticmethod def load(filepath: str, index_col: str = None) -> pd.DataFrame: """ Load different datasets """ + + if not os.path.exists(filepath): + raise FileNotFoundError(f"File not found: {filepath}") + if filepath.endswith('.parquet'): df = pd.read_parquet(filepath) if index_col is not None: @@ -15,7 +20,6 @@ class DataLoader(): elif filepath.endswith('.csv'): df = pd.read_csv(filepath, index_col=index_col) else: - logger.error('Not implemented!') - exit(1) + raise ValueError(f"File format not supported for file: {filepath}") - return df \ No newline at end of file + return df diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py index 1ac53517..7b50f486 100644 --- a/model_data/simulation_system/core/DataProcessor.py +++ b/model_data/simulation_system/core/DataProcessor.py @@ -23,6 +23,7 @@ class DataProcessor: def __init__(self, filepath: Path) -> None: self.filepath = filepath + self.data = None def load_data(self, low_memory=False) -> None: self.data = pd.read_csv(self.filepath, low_memory=low_memory) diff --git a/model_data/simulation_system/requirements/training.txt b/model_data/simulation_system/requirements/training.txt new file mode 100644 index 00000000..17e4c8da --- /dev/null +++ b/model_data/simulation_system/requirements/training.txt @@ -0,0 +1,3 @@ +autogluon==0.8.2 +pandas==1.5.3 +seaborn==0.12.2 diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index b37e7154..d41e6c56 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -1,16 +1,15 @@ - import argparse # import boto3 -import os +import os from pathlib import Path from datetime import datetime from typing import List -from core.Logger import logger -from core.DataLoader import DataLoader -from core.FeatureProcessor import FeatureProcessor +from model_data.simulation_system.core.Logger import logger +from model_data.simulation_system.core.DataLoader import DataLoader +from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor from MLModel.Models import AutogluonModel import pandas as pd -from core.Settings import ( +from model_data.simulation_system.core.Settings import ( MODEL_DIRECTORY, BASE_REGISTRY_PATH, REGISTRY_FILE, @@ -23,7 +22,8 @@ from core.Settings import ( import seaborn as sns import matplotlib.pyplot as plt -TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") +TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # FOR TESTING # train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet" @@ -52,23 +52,27 @@ def ingest_arguments() -> argparse.Namespace: parser = argparse.ArgumentParser(description='Inputs for training script') - parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True) - parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True) - parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon") - parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE') + parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', + required=True) + parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', + required=True) + parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], + default="autogluon") + parser.add_argument('--target-column', type=str, help='The response variable', + choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE') args = parser.parse_args() return args - + def training( - train_filepath: str, - test_filepath: str, - target_column: str = "RDSAP_CHANGE", - model_type: str = "autogluon", - hyperparameters: dict = None - ) -> None: + train_filepath: str, + test_filepath: str, + target_column: str = "RDSAP_CHANGE", + model_type: str = "autogluon", + hyperparameters: dict = None +) -> None: """ Pipeline to run training on the dataset """ @@ -77,12 +81,12 @@ def training( dataloader = DataLoader() train_df = dataloader.load(filepath=train_filepath) test_df = dataloader.load(filepath=test_filepath) - + logger.info('--- Feature processing ---') feature_processor = FeatureProcessor() - subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR) + subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR) train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount) test_df = feature_processor.process(test_df, target_column=target_column) @@ -98,65 +102,63 @@ def training( if model_type == "autogluon": model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower() - output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root + output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root model = AutogluonModel( - output_filepath = output_base / MODEL_FOLDER - ) - else: - logger.error("No alternative model implemented yet") - exit(1) - - model.train_model( - data=train_df, - target_column=target_column, - hyperparameters=hyperparameters + output_filepath=output_base / MODEL_FOLDER ) - + else: + raise ValueError("No alternative model implemented yet") + + model.train_model( + data=train_df, + target_column=target_column, + hyperparameters=hyperparameters + ) + logger.info("--- Save Model ---") model.save_model(output_filepath=model.output_filepath) logger.info('--- Generate evaluation metrics ---') metrics_df = model.model_evaluation( - validation_data=test_df, + validation_data=test_df, target_column=target_column, - metrics_location = output_base / METRICS_FOLDER - ) - + metrics_location=output_base / METRICS_FOLDER + ) + logger.info("--- Generate metric outputs using predictions ---") # TODO: can have a model.metric_outputs method # FOr not just do it here residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred']) - + # image formatting # TODO: move to settings file , AXIS_FONT, TITLE_FONT - axis_fs = 18 #fontsize - title_fs = 22 #fontsize + axis_fs = 18 # fontsize + title_fs = 22 # fontsize sns.set(style="whitegrid") - ax = sns.scatterplot(x="true", y="pred",data=residual_df) + ax = sns.scatterplot(x="true", y="pred", data=residual_df) ax.set_aspect('equal') - ax.set_xlabel(f'True {target_column}',fontsize = axis_fs) - ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel - ax.set_title('Residuals', fontsize = title_fs) + ax.set_xlabel(f'True {target_column}', fontsize=axis_fs) + ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs) # ylabel + ax.set_title('Residuals', fontsize=title_fs) # Square aspect ratio ax.plot([-100, 100], [-100, 100], 'black', linewidth=1) plt.tight_layout() RESIDUAL_FILE = "residuals.png" - plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120) + plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120) # TODO: for cml, we might want to have class that outputs all data and plots to add to the report # If we want residual plot/ any plots, we will need to self host # plt.savefig(RESIDUAL_FILE, dpi=120) - # TODO: introduce a seperate script for model optimisation, and from there, optimise for deployment # Imagining for now that the model trained here is the best model amongst all models built logger.info("--- Optimising model for deployment ---") - deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER) + deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER) logger.info(f"Optimised version of best model can be found at: {deployment_model_path}") # TODO: Need a model registry - for now have this as a CSV @@ -170,43 +172,49 @@ def training( registry_df = pd.read_csv(registry_path, index_col=None) else: # TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns - registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model']) + registry_df = pd.DataFrame( + columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', + 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model']) model_details_df = pd.DataFrame( [{ - 'model_type': model_type, - 'model_name': model_root, + 'model_type': model_type, + 'model_name': model_root, 'model_location': deployment_model_path }] - ) - + ) + registry_row = pd.concat([model_details_df, metrics_df], axis=1) registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True) - # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics + # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and + # regenerate new metrics # TODO: decide metric to optimise to registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True) - registry_df['best_model'] = [False]*len(registry_df) + registry_df['best_model'] = [False] * len(registry_df) registry_df.loc[0, 'best_model'] = True logger.info("--- Saving new model to registry ---") + # Ensure the directory exists + registry_path.parent.mkdir(parents=True, exist_ok=True) registry_df.to_csv(registry_path, index=False) logger.info("--- Training Pipeline Complete --- ") if __name__ == "__main__": - logger.info('---Begin Pipeline---') logger.info('---Ingest Arguments---') args = ingest_arguments() - # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet + # To run script: python3 training.py --train-filepath + # ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath + # ./model_build_data/change_data/rdsap_full/test_data.parquet # TODO: Ingest hyper parameters from somewhere - currently change at the top of script training( - train_filepath=args.train_filepath, - test_filepath=args.test_filepath, - target_column=args.target_column, + train_filepath=args.train_filepath, + test_filepath=args.test_filepath, + target_column=args.target_column, model_type=args.model_type - ) + )