diff --git a/.gitignore b/.gitignore index cb17846e..2da626a8 100644 --- a/.gitignore +++ b/.gitignore @@ -127,6 +127,7 @@ venv/ ENV/ env.bak/ venv.bak/ +.training_env/ # Spyder project settings .spyderproject @@ -252,6 +253,7 @@ backend/.idea open_uprn/.idea/ conservation_areas/.idea/ model_data/.idea/ +model_data/simulation_system/.idea/ model_data/simulation_system/data* - +model_data/simulation_system/model_directory/ diff --git a/.idea/Model.iml b/.idea/Model.iml index 05b9012b..0ded8e60 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 3b05c6ac..ae87bfde 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py index d5b25e64..fcb25654 100644 --- a/model_data/simulation_system/MLModel/Models.py +++ b/model_data/simulation_system/MLModel/Models.py @@ -129,17 +129,15 @@ class AutogluonModel: return metrics_df - def optimise_model_for_deployment(self, deployment_path: Path = None) -> None: + def optimise_model_for_deployment(self, deployment_path: Path = None) -> str: """ We can optimise the deployment for a autogluon model """ if self.model is None: - logger.error("No model to optimise for deployment") - exit(1) + raise ValueError("No model to optimise for deployment") if deployment_path is None: - logger.error("Deployment path required") - exit(1) + raise ValueError("Deployment path required") # This will return a string path of the location return self.model.clone_for_deployment(deployment_path) diff --git a/model_data/simulation_system/core/DataLoader.py b/model_data/simulation_system/core/DataLoader.py index 1e811f8d..dcd7af16 100644 --- a/model_data/simulation_system/core/DataLoader.py +++ b/model_data/simulation_system/core/DataLoader.py @@ -1,13 +1,18 @@ import pandas as pd -from core.Logger import logger +import os -class DataLoader(): + +class DataLoader: @staticmethod def load(filepath: str, index_col: str = None) -> pd.DataFrame: """ Load different datasets """ + + if not os.path.exists(filepath): + raise FileNotFoundError(f"File not found: {filepath}") + if filepath.endswith('.parquet'): df = pd.read_parquet(filepath) if index_col is not None: @@ -15,7 +20,6 @@ class DataLoader(): elif filepath.endswith('.csv'): df = pd.read_csv(filepath, index_col=index_col) else: - logger.error('Not implemented!') - exit(1) + raise ValueError(f"File format not supported for file: {filepath}") - return df \ No newline at end of file + return df diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py index 1ac53517..7b50f486 100644 --- a/model_data/simulation_system/core/DataProcessor.py +++ b/model_data/simulation_system/core/DataProcessor.py @@ -23,6 +23,7 @@ class DataProcessor: def __init__(self, filepath: Path) -> None: self.filepath = filepath + self.data = None def load_data(self, low_memory=False) -> None: self.data = pd.read_csv(self.filepath, low_memory=low_memory) diff --git a/model_data/simulation_system/core/FeatureProcessor.py b/model_data/simulation_system/core/FeatureProcessor.py index aef9605f..8b53cb14 100644 --- a/model_data/simulation_system/core/FeatureProcessor.py +++ b/model_data/simulation_system/core/FeatureProcessor.py @@ -6,18 +6,21 @@ import pandas as pd from typing import List from core.Logger import logger -RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE'] -HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE'] +RDSAP_CHANGE_DROP_COLUMNS = ["UPRN", "HEAT_DEMAND_CHANGE"] +HEAT_DEMAND_CHANGE_DROP_COLUMNS = ["UPRN", "RDSAP_CHANGE"] + +RANDOM_SEED = 0 + -RANDOM_SEED = 0 - class FeatureProcessor: """ Handle all feature manipulation before modelling """ @staticmethod - def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame: + def drop_unused_columns( + df: pd.DataFrame, target_column: str = "RDSAP_CHANGE" + ) -> pd.DataFrame: """ Remove the unused columns for RDS """ @@ -36,13 +39,13 @@ class FeatureProcessor: features = df.columns else: if not set(features).issubset(df.columns): - logger.error('Features defined is not contained in data') + logger.error("Features defined is not contained in data") exit(1) - + df = df[features] return df - + @staticmethod def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame: """ @@ -53,14 +56,13 @@ class FeatureProcessor: df = df.sample(subsample_amount, random_state=RANDOM_SEED) return df - def process( - self, - df: pd.DataFrame, - target_column: str = "RDSAP_CHANGE", - features: List[str] = None, - subsample_amount: int = None - ) -> pd.DataFrame: + self, + df: pd.DataFrame, + target_column: str = "RDSAP_CHANGE", + features: List[str] = None, + subsample_amount: int = None, + ) -> pd.DataFrame: """ Pipeline to get data ready for building a model """ diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py index 591b85c7..bc85b74a 100644 --- a/model_data/simulation_system/predictions.py +++ b/model_data/simulation_system/predictions.py @@ -4,13 +4,12 @@ Script to load MLModel class and generate predictions import json import argparse -from MLModel.Models import AutogluonModel -from core.Logger import logger -from core.DataLoader import DataLoader -from pathlib import Path import pandas as pd from typing import Optional from datetime import datetime +from MLModel.Models import AutogluonModel +from core.Logger import logger +from core.DataLoader import DataLoader from core.Settings import ( BASE_REGISTRY_PATH, REGISTRY_FILE, @@ -23,7 +22,8 @@ from core.Settings import ( TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT) # FOR TESTING -# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame) +# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to +# DataFrame) # TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet") # DATA = TEST_DATA.sample(1) @@ -110,6 +110,7 @@ def prediction( logger.info("--- Loading Model ---") model = AutogluonModel() + model.load_model(filepath=model_location) logger.info("--- Generating Predictions ---") @@ -143,7 +144,6 @@ def prediction( if __name__ == "__main__": - args = ingest_arguments() # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}' diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index 6a9dae31..c2ed5c21 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -3,11 +3,13 @@ import argparse # import boto3 from pathlib import Path from datetime import datetime +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +from MLModel.Models import AutogluonModel from core.Logger import logger from core.DataLoader import DataLoader from core.FeatureProcessor import FeatureProcessor -from MLModel.Models import AutogluonModel -import pandas as pd from core.Settings import ( MODEL_DIRECTORY, BASE_REGISTRY_PATH, @@ -30,8 +32,6 @@ from core.Settings import ( SEABORN_RESIDUAL_LINE_COLOUR, SEABORN_RESIDUAL_LINE_WIDTH, ) -import seaborn as sns -import matplotlib.pyplot as plt TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT) @@ -137,8 +137,7 @@ def training( model = AutogluonModel(output_filepath=output_base / MODEL_FOLDER) else: - logger.error("No alternative model implemented yet") - exit(1) + raise ValueError("No alternative model implemented yet") model.train_model( data=train_df, target_column=target_column, hyperparameters=hyperparameters @@ -207,7 +206,6 @@ def training( # TODO: Need a model registry - for now have this as a CSV # Save this in the model directory logger.info("--- Append registry with new model ---") - registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE if registry_path.exists(): @@ -244,7 +242,8 @@ def training( registry_row = pd.concat([model_details_df, metrics_df], axis=1) registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True) - # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics + # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and + # regenerate new metrics # TODO: decide metric to optimise to registry_df = registry_df.sort_values( "mean_absolute_error", ascending=False @@ -253,6 +252,8 @@ def training( registry_df.loc[0, "best_model"] = True logger.info("--- Saving new model to registry ---") + # Ensure the directory exists + registry_path.parent.mkdir(parents=True, exist_ok=True) registry_df.to_csv(registry_path, index=False) logger.info("--- Training Pipeline Complete --- ") @@ -265,7 +266,9 @@ if __name__ == "__main__": logger.info("---Ingest Arguments---") args = ingest_arguments() - # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet + # To run script: python3 training.py --train-filepath + # ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath + # ./model_build_data/change_data/rdsap_full/test_data.parquet # TODO: Ingest hyper parameters from somewhere - currently change at the top of script training( train_filepath=args.train_filepath,