From 81d7e6afb7d3cf18c9e3f8750a830f526f9fe81a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Aug 2023 15:21:17 +0100 Subject: [PATCH 1/6] added checking for directory before creation and made some minor style changes --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- __init__.py | 0 .../simulation_system/core/DataLoader.py | 14 +- .../simulation_system/core/DataProcessor.py | 1 + .../requirements/training.txt | 3 + model_data/simulation_system/training.py | 126 ++++++++++-------- 7 files changed, 82 insertions(+), 66 deletions(-) create mode 100644 __init__.py create mode 100644 model_data/simulation_system/requirements/training.txt diff --git a/.idea/Model.iml b/.idea/Model.iml index 05b9012b..03f5e8e2 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 3b05c6ac..daffedc9 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/model_data/simulation_system/core/DataLoader.py b/model_data/simulation_system/core/DataLoader.py index 1e811f8d..dcd7af16 100644 --- a/model_data/simulation_system/core/DataLoader.py +++ b/model_data/simulation_system/core/DataLoader.py @@ -1,13 +1,18 @@ import pandas as pd -from core.Logger import logger +import os -class DataLoader(): + +class DataLoader: @staticmethod def load(filepath: str, index_col: str = None) -> pd.DataFrame: """ Load different datasets """ + + if not os.path.exists(filepath): + raise FileNotFoundError(f"File not found: {filepath}") + if filepath.endswith('.parquet'): df = pd.read_parquet(filepath) if index_col is not None: @@ -15,7 +20,6 @@ class DataLoader(): elif filepath.endswith('.csv'): df = pd.read_csv(filepath, index_col=index_col) else: - logger.error('Not implemented!') - exit(1) + raise ValueError(f"File format not supported for file: {filepath}") - return df \ No newline at end of file + return df diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py index 1ac53517..7b50f486 100644 --- a/model_data/simulation_system/core/DataProcessor.py +++ b/model_data/simulation_system/core/DataProcessor.py @@ -23,6 +23,7 @@ class DataProcessor: def __init__(self, filepath: Path) -> None: self.filepath = filepath + self.data = None def load_data(self, low_memory=False) -> None: self.data = pd.read_csv(self.filepath, low_memory=low_memory) diff --git a/model_data/simulation_system/requirements/training.txt b/model_data/simulation_system/requirements/training.txt new file mode 100644 index 00000000..17e4c8da --- /dev/null +++ b/model_data/simulation_system/requirements/training.txt @@ -0,0 +1,3 @@ +autogluon==0.8.2 +pandas==1.5.3 +seaborn==0.12.2 diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index b37e7154..d41e6c56 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -1,16 +1,15 @@ - import argparse # import boto3 -import os +import os from pathlib import Path from datetime import datetime from typing import List -from core.Logger import logger -from core.DataLoader import DataLoader -from core.FeatureProcessor import FeatureProcessor +from model_data.simulation_system.core.Logger import logger +from model_data.simulation_system.core.DataLoader import DataLoader +from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor from MLModel.Models import AutogluonModel import pandas as pd -from core.Settings import ( +from model_data.simulation_system.core.Settings import ( MODEL_DIRECTORY, BASE_REGISTRY_PATH, REGISTRY_FILE, @@ -23,7 +22,8 @@ from core.Settings import ( import seaborn as sns import matplotlib.pyplot as plt -TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") +TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # FOR TESTING # train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet" @@ -52,23 +52,27 @@ def ingest_arguments() -> argparse.Namespace: parser = argparse.ArgumentParser(description='Inputs for training script') - parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True) - parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True) - parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon") - parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE') + parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', + required=True) + parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', + required=True) + parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], + default="autogluon") + parser.add_argument('--target-column', type=str, help='The response variable', + choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE') args = parser.parse_args() return args - + def training( - train_filepath: str, - test_filepath: str, - target_column: str = "RDSAP_CHANGE", - model_type: str = "autogluon", - hyperparameters: dict = None - ) -> None: + train_filepath: str, + test_filepath: str, + target_column: str = "RDSAP_CHANGE", + model_type: str = "autogluon", + hyperparameters: dict = None +) -> None: """ Pipeline to run training on the dataset """ @@ -77,12 +81,12 @@ def training( dataloader = DataLoader() train_df = dataloader.load(filepath=train_filepath) test_df = dataloader.load(filepath=test_filepath) - + logger.info('--- Feature processing ---') feature_processor = FeatureProcessor() - subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR) + subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR) train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount) test_df = feature_processor.process(test_df, target_column=target_column) @@ -98,65 +102,63 @@ def training( if model_type == "autogluon": model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower() - output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root + output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root model = AutogluonModel( - output_filepath = output_base / MODEL_FOLDER - ) - else: - logger.error("No alternative model implemented yet") - exit(1) - - model.train_model( - data=train_df, - target_column=target_column, - hyperparameters=hyperparameters + output_filepath=output_base / MODEL_FOLDER ) - + else: + raise ValueError("No alternative model implemented yet") + + model.train_model( + data=train_df, + target_column=target_column, + hyperparameters=hyperparameters + ) + logger.info("--- Save Model ---") model.save_model(output_filepath=model.output_filepath) logger.info('--- Generate evaluation metrics ---') metrics_df = model.model_evaluation( - validation_data=test_df, + validation_data=test_df, target_column=target_column, - metrics_location = output_base / METRICS_FOLDER - ) - + metrics_location=output_base / METRICS_FOLDER + ) + logger.info("--- Generate metric outputs using predictions ---") # TODO: can have a model.metric_outputs method # FOr not just do it here residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred']) - + # image formatting # TODO: move to settings file , AXIS_FONT, TITLE_FONT - axis_fs = 18 #fontsize - title_fs = 22 #fontsize + axis_fs = 18 # fontsize + title_fs = 22 # fontsize sns.set(style="whitegrid") - ax = sns.scatterplot(x="true", y="pred",data=residual_df) + ax = sns.scatterplot(x="true", y="pred", data=residual_df) ax.set_aspect('equal') - ax.set_xlabel(f'True {target_column}',fontsize = axis_fs) - ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel - ax.set_title('Residuals', fontsize = title_fs) + ax.set_xlabel(f'True {target_column}', fontsize=axis_fs) + ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs) # ylabel + ax.set_title('Residuals', fontsize=title_fs) # Square aspect ratio ax.plot([-100, 100], [-100, 100], 'black', linewidth=1) plt.tight_layout() RESIDUAL_FILE = "residuals.png" - plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120) + plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120) # TODO: for cml, we might want to have class that outputs all data and plots to add to the report # If we want residual plot/ any plots, we will need to self host # plt.savefig(RESIDUAL_FILE, dpi=120) - # TODO: introduce a seperate script for model optimisation, and from there, optimise for deployment # Imagining for now that the model trained here is the best model amongst all models built logger.info("--- Optimising model for deployment ---") - deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER) + deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER) logger.info(f"Optimised version of best model can be found at: {deployment_model_path}") # TODO: Need a model registry - for now have this as a CSV @@ -170,43 +172,49 @@ def training( registry_df = pd.read_csv(registry_path, index_col=None) else: # TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns - registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model']) + registry_df = pd.DataFrame( + columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', + 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model']) model_details_df = pd.DataFrame( [{ - 'model_type': model_type, - 'model_name': model_root, + 'model_type': model_type, + 'model_name': model_root, 'model_location': deployment_model_path }] - ) - + ) + registry_row = pd.concat([model_details_df, metrics_df], axis=1) registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True) - # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics + # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and + # regenerate new metrics # TODO: decide metric to optimise to registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True) - registry_df['best_model'] = [False]*len(registry_df) + registry_df['best_model'] = [False] * len(registry_df) registry_df.loc[0, 'best_model'] = True logger.info("--- Saving new model to registry ---") + # Ensure the directory exists + registry_path.parent.mkdir(parents=True, exist_ok=True) registry_df.to_csv(registry_path, index=False) logger.info("--- Training Pipeline Complete --- ") if __name__ == "__main__": - logger.info('---Begin Pipeline---') logger.info('---Ingest Arguments---') args = ingest_arguments() - # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet + # To run script: python3 training.py --train-filepath + # ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath + # ./model_build_data/change_data/rdsap_full/test_data.parquet # TODO: Ingest hyper parameters from somewhere - currently change at the top of script training( - train_filepath=args.train_filepath, - test_filepath=args.test_filepath, - target_column=args.target_column, + train_filepath=args.train_filepath, + test_filepath=args.test_filepath, + target_column=args.target_column, model_type=args.model_type - ) + ) From 0e755626ded6a4010ee78ff7ed145a498c3ea333 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Aug 2023 15:22:55 +0100 Subject: [PATCH 2/6] updated import for featureprocessor --- .../core/FeatureProcessor.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/model_data/simulation_system/core/FeatureProcessor.py b/model_data/simulation_system/core/FeatureProcessor.py index aef9605f..cefcee9b 100644 --- a/model_data/simulation_system/core/FeatureProcessor.py +++ b/model_data/simulation_system/core/FeatureProcessor.py @@ -4,13 +4,14 @@ Create additional features from the dataset import pandas as pd from typing import List -from core.Logger import logger +from model_data.simulation_system.core.Logger import logger RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE'] HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE'] -RANDOM_SEED = 0 - +RANDOM_SEED = 0 + + class FeatureProcessor: """ Handle all feature manipulation before modelling @@ -38,11 +39,11 @@ class FeatureProcessor: if not set(features).issubset(df.columns): logger.error('Features defined is not contained in data') exit(1) - + df = df[features] return df - + @staticmethod def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame: """ @@ -53,14 +54,13 @@ class FeatureProcessor: df = df.sample(subsample_amount, random_state=RANDOM_SEED) return df - def process( - self, - df: pd.DataFrame, - target_column: str = "RDSAP_CHANGE", - features: List[str] = None, - subsample_amount: int = None - ) -> pd.DataFrame: + self, + df: pd.DataFrame, + target_column: str = "RDSAP_CHANGE", + features: List[str] = None, + subsample_amount: int = None + ) -> pd.DataFrame: """ Pipeline to get data ready for building a model """ From d6562bfab9b53f3bfb9dfdf2a920109ac768cae7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Aug 2023 15:25:35 +0100 Subject: [PATCH 3/6] updating imports for MlModel --- .gitignore | 1 + .../simulation_system/MLModel/Models.py | 48 +++++++++---------- model_data/simulation_system/training.py | 2 +- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index cb17846e..be9da3aa 100644 --- a/.gitignore +++ b/.gitignore @@ -252,6 +252,7 @@ backend/.idea open_uprn/.idea/ conservation_areas/.idea/ model_data/.idea/ +model_data/simulation_system/.idea/ model_data/simulation_system/data* diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py index 137f2f20..89bbe762 100644 --- a/model_data/simulation_system/MLModel/Models.py +++ b/model_data/simulation_system/MLModel/Models.py @@ -13,15 +13,17 @@ from pathlib import Path import pandas as pd from autogluon.tabular import TabularDataset, TabularPredictor from sklearn.metrics import mean_absolute_percentage_error -from core.Logger import logger +from model_data.simulation_system.core.Logger import logger AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types'] METRIC_FILENAME = "metrics.csv" + class AutogluonModel: """ Autogluon model that implements the MLModel Protocol """ + def __init__(self, output_filepath: Path = None) -> None: self.model = None self.output_filepath = output_filepath @@ -40,10 +42,10 @@ class AutogluonModel: logger.info("Using AutoGluon Model - Model saving already occured") def train_model( - self, - data: pd.DataFrame, - target_column: str, - hyperparameters: dict = None) -> None: + self, + data: pd.DataFrame, + target_column: str, + hyperparameters: dict = None) -> None: """ For the given data and hyperparameters, a model is trained """ @@ -58,17 +60,16 @@ class AutogluonModel: AGdata = TabularDataset(data=data) self.model = TabularPredictor( - label=target_column, - path=self.output_filepath, + label=target_column, + path=self.output_filepath, problem_type=hyperparameters['problem_type'], eval_metric=hyperparameters['eval_metric'] - ).fit( - AGdata, - time_limit=hyperparameters['time_limit'], - presets=hyperparameters['presets'], + ).fit( + AGdata, + time_limit=hyperparameters['time_limit'], + presets=hyperparameters['presets'], excluded_model_types=hyperparameters['excluded_model_types'] - ) - + ) def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame: """ @@ -84,12 +85,12 @@ class AutogluonModel: return predictions def model_evaluation( - self, - validation_data: pd.DataFrame, - target_column: str, - metrics_location: Path = None, - metric_filename: str = METRIC_FILENAME - ) -> pd.DataFrame: + self, + validation_data: pd.DataFrame, + target_column: str, + metrics_location: Path = None, + metric_filename: str = METRIC_FILENAME + ) -> pd.DataFrame: """ For any validation data, a set of predictions and metrics are return """ @@ -105,7 +106,7 @@ class AutogluonModel: logger.info("Prediction used for evaluations are saved in self.prediction") self.predictions = predictions - + # TODO: Can have a custom metric class that defines all different metrics we want metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions) @@ -117,7 +118,7 @@ class AutogluonModel: metrics_df = pd.DataFrame([performance]) metrics_df.to_csv(metrics_location / metric_filename) markdown_filename = metric_filename.split(".")[0] + ".md" - metrics_df.to_markdown(metrics_location/ markdown_filename) + metrics_df.to_markdown(metrics_location / markdown_filename) return metrics_df @@ -135,8 +136,3 @@ class AutogluonModel: # This will return a string path of the location return self.model.clone_for_deployment(deployment_path) - - - - - \ No newline at end of file diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index d41e6c56..561d1e1d 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -7,7 +7,7 @@ from typing import List from model_data.simulation_system.core.Logger import logger from model_data.simulation_system.core.DataLoader import DataLoader from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor -from MLModel.Models import AutogluonModel +from model_data.simulation_system.MLModel.Models import AutogluonModel import pandas as pd from model_data.simulation_system.core.Settings import ( MODEL_DIRECTORY, From 67fd184ac570824a56406d14462e28e37e126f29 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Aug 2023 15:33:29 +0100 Subject: [PATCH 4/6] consolidated location of output storage --- model_data/simulation_system/training.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index 561d1e1d..4d751c9b 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -102,7 +102,7 @@ def training( if model_type == "autogluon": model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower() - output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root + output_base = BASE_REGISTRY_PATH / target_column / model_type / model_root model = AutogluonModel( output_filepath=output_base / MODEL_FOLDER @@ -164,7 +164,6 @@ def training( # TODO: Need a model registry - for now have this as a CSV # Save this in the model directory logger.info("--- Append registry with new model ---") - registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE if registry_path.exists(): From 2ff57a83ede37495c0c35d4b3132c9bdb190d10e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Aug 2023 16:29:24 +0100 Subject: [PATCH 5/6] handling relative paths for autogluon --- .../simulation_system/MLModel/Models.py | 8 ++--- model_data/simulation_system/core/Helpers.py | 17 ++++++++++ model_data/simulation_system/predictions.py | 33 +++++++++++-------- .../requirements/prediction.txt | 0 model_data/simulation_system/training.py | 7 ++-- 5 files changed, 43 insertions(+), 22 deletions(-) create mode 100644 model_data/simulation_system/core/Helpers.py create mode 100644 model_data/simulation_system/requirements/prediction.txt diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py index 89bbe762..ccf6fdf8 100644 --- a/model_data/simulation_system/MLModel/Models.py +++ b/model_data/simulation_system/MLModel/Models.py @@ -122,17 +122,15 @@ class AutogluonModel: return metrics_df - def optimise_model_for_deployment(self, deployment_path: Path = None) -> None: + def optimise_model_for_deployment(self, deployment_path: Path = None) -> str: """ We can optimise the deployment for a autogluon model """ if self.model is None: - logger.error("No model to optimise for deployment") - exit(1) + raise ValueError("No model to optimise for deployment") if deployment_path is None: - logger.error("Deployment path required") - exit(1) + raise ValueError("Deployment path required") # This will return a string path of the location return self.model.clone_for_deployment(deployment_path) diff --git a/model_data/simulation_system/core/Helpers.py b/model_data/simulation_system/core/Helpers.py new file mode 100644 index 00000000..65491c42 --- /dev/null +++ b/model_data/simulation_system/core/Helpers.py @@ -0,0 +1,17 @@ +from pathlib import Path + + +def ensure_relative_path(file_path: str, relative_to: str | Path = None) -> Path: + """ + Convert the given path to a relative path. + + :param file_path: The path to check and possibly convert. + :param relative_to: Optional path to which the given path should be made relative. + If not provided, the current working directory is used. + :return: The relative path. + """ + path = Path(file_path) + if path.is_absolute(): + base_path = Path(relative_to) if relative_to else Path.cwd() + return path.relative_to(base_path) + return path diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py index bc1b113b..aa6c2d0f 100644 --- a/model_data/simulation_system/predictions.py +++ b/model_data/simulation_system/predictions.py @@ -4,14 +4,13 @@ Script to load MLModel class and generate predictions import json import argparse -from MLModel.Models import AutogluonModel -from core.Logger import logger -from core.DataLoader import DataLoader -from pathlib import Path +from model_data.simulation_system.MLModel.Models import AutogluonModel +from model_data.simulation_system.core.Logger import logger +from model_data.simulation_system.core.DataLoader import DataLoader import pandas as pd from typing import Optional from datetime import datetime -from core.Settings import ( +from model_data.simulation_system.core.Settings import ( BASE_REGISTRY_PATH, REGISTRY_FILE, PREDICTION_LOCATION, @@ -19,10 +18,12 @@ from core.Settings import ( METADATA_FILE ) -TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") +TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + # FOR TESTING -# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame) +# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to +# DataFrame) # TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet") # DATA = TEST_DATA.sample(1) @@ -33,18 +34,20 @@ def ingest_arguments() -> argparse.Namespace: """ parser = argparse.ArgumentParser(description='Inputs for training script') - parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE') - parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here') + parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', + choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE') + parser.add_argument('--model-path', type=str, + help='If you wish to use a specific model, specify the model path here') parser.add_argument('--data', type=str, help='Json data for predictions') parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training') args = parser.parse_args() return args - -def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None): +def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, + data_path: Optional[str] = None): """ Main pipeline function """ @@ -93,6 +96,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data logger.info("--- Loading Model ---") model = AutogluonModel() + model.load_model(filepath=model_location) logger.info("--- Generating Predictions ---") @@ -125,10 +129,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data return json_prediction -if __name__ == "__main__": +if __name__ == "__main__": args = ingest_arguments() # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}' - # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet - prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path) \ No newline at end of file + # Data path can be passed as so: python3 predictions.py --data-path + # ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet + prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path) diff --git a/model_data/simulation_system/requirements/prediction.txt b/model_data/simulation_system/requirements/prediction.txt new file mode 100644 index 00000000..e69de29b diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index 4d751c9b..d67a7e58 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -1,16 +1,13 @@ import argparse # import boto3 -import os from pathlib import Path from datetime import datetime -from typing import List from model_data.simulation_system.core.Logger import logger from model_data.simulation_system.core.DataLoader import DataLoader from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor from model_data.simulation_system.MLModel.Models import AutogluonModel import pandas as pd from model_data.simulation_system.core.Settings import ( - MODEL_DIRECTORY, BASE_REGISTRY_PATH, REGISTRY_FILE, MODEL_FOLDER, @@ -19,6 +16,7 @@ from model_data.simulation_system.core.Settings import ( SUBSAMPLE_FACTOR, MODEL_HYPERPARAMETERS ) +from model_data.simulation_system.core.Helpers import ensure_relative_path import seaborn as sns import matplotlib.pyplot as plt @@ -159,6 +157,9 @@ def training( logger.info("--- Optimising model for deployment ---") deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER) + # Autogluon requires models to be stored at relative paths. This will likely eventually be s3 however we + # make sure the path is relative to the location of this script + deployment_model_path = ensure_relative_path(deployment_model_path, Path(__file__).parent) logger.info(f"Optimised version of best model can be found at: {deployment_model_path}") # TODO: Need a model registry - for now have this as a CSV From a5062b24f0bfb53d4fb254a85a7874da37b058d1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 25 Aug 2023 18:08:56 +0100 Subject: [PATCH 6/6] got the predictions working --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- model_data/simulation_system/core/Helpers.py | 17 ----------------- .../requirements/prediction.txt | 2 ++ model_data/simulation_system/training.py | 7 ++----- 5 files changed, 6 insertions(+), 24 deletions(-) delete mode 100644 model_data/simulation_system/core/Helpers.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 03f5e8e2..0ded8e60 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index daffedc9..ae87bfde 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,6 +1,6 @@ - + diff --git a/model_data/simulation_system/core/Helpers.py b/model_data/simulation_system/core/Helpers.py deleted file mode 100644 index 65491c42..00000000 --- a/model_data/simulation_system/core/Helpers.py +++ /dev/null @@ -1,17 +0,0 @@ -from pathlib import Path - - -def ensure_relative_path(file_path: str, relative_to: str | Path = None) -> Path: - """ - Convert the given path to a relative path. - - :param file_path: The path to check and possibly convert. - :param relative_to: Optional path to which the given path should be made relative. - If not provided, the current working directory is used. - :return: The relative path. - """ - path = Path(file_path) - if path.is_absolute(): - base_path = Path(relative_to) if relative_to else Path.cwd() - return path.relative_to(base_path) - return path diff --git a/model_data/simulation_system/requirements/prediction.txt b/model_data/simulation_system/requirements/prediction.txt index e69de29b..f9ce32bf 100644 --- a/model_data/simulation_system/requirements/prediction.txt +++ b/model_data/simulation_system/requirements/prediction.txt @@ -0,0 +1,2 @@ +autogluon==0.8.2 +pandas==1.5.3 \ No newline at end of file diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index d67a7e58..2a1dfcfa 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -8,6 +8,7 @@ from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor from model_data.simulation_system.MLModel.Models import AutogluonModel import pandas as pd from model_data.simulation_system.core.Settings import ( + MODEL_DIRECTORY, BASE_REGISTRY_PATH, REGISTRY_FILE, MODEL_FOLDER, @@ -16,7 +17,6 @@ from model_data.simulation_system.core.Settings import ( SUBSAMPLE_FACTOR, MODEL_HYPERPARAMETERS ) -from model_data.simulation_system.core.Helpers import ensure_relative_path import seaborn as sns import matplotlib.pyplot as plt @@ -100,7 +100,7 @@ def training( if model_type == "autogluon": model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower() - output_base = BASE_REGISTRY_PATH / target_column / model_type / model_root + output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root model = AutogluonModel( output_filepath=output_base / MODEL_FOLDER @@ -157,9 +157,6 @@ def training( logger.info("--- Optimising model for deployment ---") deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER) - # Autogluon requires models to be stored at relative paths. This will likely eventually be s3 however we - # make sure the path is relative to the location of this script - deployment_model_path = ensure_relative_path(deployment_model_path, Path(__file__).parent) logger.info(f"Optimised version of best model can be found at: {deployment_model_path}") # TODO: Need a model registry - for now have this as a CSV