diff --git a/model_data/simulation_system/MLModel/BaseMLModel.py b/model_data/simulation_system/MLModel/BaseMLModel.py index 40ed53df..1b5c525a 100644 --- a/model_data/simulation_system/MLModel/BaseMLModel.py +++ b/model_data/simulation_system/MLModel/BaseMLModel.py @@ -5,7 +5,7 @@ This is the base protocol: Key tasks: - Template Model class for different model types - Save model -- Load Model +- Load Model - Generate Inference """ @@ -15,9 +15,9 @@ import pandas as pd class MLModel(Protocol): - ''' + """ Base ML Model protocol - ''' + """ def load_model(self, filepath: Path) -> None: """ @@ -30,11 +30,8 @@ class MLModel(Protocol): """ def train_model( - self, - data: pd.DataFrame, - target_column: str, - hyperparameter: dict - ) -> None: + self, data: pd.DataFrame, target_column: str, hyperparameter: dict + ) -> None: """ For the given data and hyperparameters (specified to the model), a model is trained """ @@ -44,7 +41,12 @@ class MLModel(Protocol): For the given dataframe, model is loaded and predictions are generated """ - def model_evaluation(self, validation_data: pd.DataFrame, target_column: str, metrics_location: Path = None) -> NamedTuple: + def model_evaluation( + self, + validation_data: pd.DataFrame, + target_column: str, + metrics_location: Path = None, + ) -> NamedTuple: """ For any validation data, a set of predictions and metrics are return """ @@ -53,7 +55,7 @@ class MLModel(Protocol): """ Perfomance post processing on Model to ensure ready for deployment """ - + def model_metadata(self) -> dict: """ Extract out model metadata as dictionary diff --git a/model_data/simulation_system/MLModel/Models.py b/model_data/simulation_system/MLModel/Models.py index 869ff02f..d5b25e64 100644 --- a/model_data/simulation_system/MLModel/Models.py +++ b/model_data/simulation_system/MLModel/Models.py @@ -1,27 +1,34 @@ """ -Different implementations of the MLModel Protocol +Different implementations of the MLModel Protocol Uses the BaseMLModel protocol Key tasks: - Template Model class for different model types - Save model -- Load Model +- Load Model - Generate Inference """ -from typing import NamedTuple from pathlib import Path import pandas as pd from autogluon.tabular import TabularDataset, TabularPredictor from sklearn.metrics import mean_absolute_percentage_error from core.Logger import logger -AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types'] +AUTOGLUON_HYPERPARAMETERS = [ + "problem_type", + "eval_metric", + "time_limit", + "presets", + "excluded_model_types", +] METRIC_FILENAME = "metrics.csv" + class AutogluonModel: """ Autogluon model that implements the MLModel Protocol """ + def __init__(self, output_filepath: Path = None) -> None: self.model = None self.output_filepath = output_filepath @@ -40,10 +47,8 @@ class AutogluonModel: logger.info("Using AutoGluon Model - Model saving already occured") def train_model( - self, - data: pd.DataFrame, - target_column: str, - hyperparameters: dict = None) -> None: + self, data: pd.DataFrame, target_column: str, hyperparameters: dict = None + ) -> None: """ For the given data and hyperparameters, a model is trained """ @@ -52,23 +57,24 @@ class AutogluonModel: exit(1) if set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters.keys()): - print("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required") + print( + "Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required" + ) exit(1) AGdata = TabularDataset(data=data) self.model = TabularPredictor( - label=target_column, - path=self.output_filepath, - problem_type=hyperparameters['problem_type'], - eval_metric=hyperparameters['eval_metric'] - ).fit( - AGdata, - time_limit=hyperparameters['time_limit'], - presets=hyperparameters['presets'], - excluded_model_types=hyperparameters['excluded_model_types'] - ) - + label=target_column, + path=self.output_filepath, + problem_type=hyperparameters["problem_type"], + eval_metric=hyperparameters["eval_metric"], + ).fit( + AGdata, + time_limit=hyperparameters["time_limit"], + presets=hyperparameters["presets"], + excluded_model_types=hyperparameters["excluded_model_types"], + ) def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame: """ @@ -84,12 +90,12 @@ class AutogluonModel: return predictions def model_evaluation( - self, - validation_data: pd.DataFrame, - target_column: str, - metrics_location: Path = None, - metric_filename: str = METRIC_FILENAME - ) -> pd.DataFrame: + self, + validation_data: pd.DataFrame, + target_column: str, + metrics_location: Path = None, + metric_filename: str = METRIC_FILENAME, + ) -> pd.DataFrame: """ For any validation data, a set of predictions and metrics are return """ @@ -105,11 +111,13 @@ class AutogluonModel: logger.info("Prediction used for evaluations are saved in self.prediction") self.predictions = predictions - - # TODO: Can have a custom metric class that defines all different metrics we want - metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions) - performance['mape'] = metric_mape + # TODO: Can have a custom metric class that defines all different metrics we want + metric_mape = mean_absolute_percentage_error( + validation_data[target_column], predictions + ) + + performance["mape"] = metric_mape logger.info("Saving metric file as metric.csv") metrics_location.mkdir(exist_ok=True) @@ -117,7 +125,7 @@ class AutogluonModel: metrics_df = pd.DataFrame([performance]) metrics_df.to_csv(metrics_location / metric_filename) markdown_filename = metric_filename.split(".")[0] + ".md" - metrics_df.to_markdown(metrics_location/ markdown_filename) + metrics_df.to_markdown(metrics_location / markdown_filename) return metrics_df @@ -135,14 +143,9 @@ class AutogluonModel: # This will return a string path of the location return self.model.clone_for_deployment(deployment_path) - + def model_metadata(self) -> dict: """ - For Autogluon model, use the inbuilt model info method + For Autogluon model, use the inbuilt model info method """ return self.model.info() - - - - - \ No newline at end of file diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py index e562a39b..c46a7dc0 100644 --- a/model_data/simulation_system/core/Settings.py +++ b/model_data/simulation_system/core/Settings.py @@ -1,4 +1,4 @@ -# Using a simply python file as settings for now +# Using a simply python file as settings for now # TODO: migrate to dynaconf from pathlib import Path @@ -6,40 +6,42 @@ from pathlib import Path # If anything we might want to have a file that can be loaded and sent to this script MODEL_HYPERPARAMETERS = { "autogluon": { - 'problem_type': 'regression', - 'eval_metric': 'mean_absolute_error', - 'time_limit': 30, - 'presets': 'medium_quality', - 'excluded_model_types': None + "problem_type": "regression", + "eval_metric": "mean_absolute_error", + "time_limit": 30, + "presets": "medium_quality", + "excluded_model_types": None, } } +TIMESTAMP_FORMAT = "%Y-%m-%d_%H-%M-%S" + RANDOM_SEED = 0 SUBSAMPLE_FACTOR = 200 -TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet' -TEST_DATA_NAME = 'test_data.parquet' +TRAIN_AND_VALIDATION_DATA_NAME = "train_validation_data.parquet" +TEST_DATA_NAME = "test_data.parquet" REGISTRY_FILE = "model_registry.csv" MODEL_DIRECTORY = "model_directory" -BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY +BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY PREDICTION_LOCATION = Path("predictions") -PREDICTION_FILE = 'prediction.json' -METADATA_FILE = 'metadata.json' +PREDICTION_FILE = "prediction.json" +METADATA_FILE = "metadata.json" MODEL_FOLDER = "model" METRICS_FOLDER = "metrics" -DEPLOYMENT_FOLDER = "deployment" +DEPLOYMENT_FOLDER = "deployment" TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45 COLUMNS_TO_MERGE_ON = [ - "PROPERTY_TYPE", - "BUILT_FORM", - "CONSTRUCTION_AGE_BAND", + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", - "NUMBER_HEATED_ROOMS" - ] + "NUMBER_HEATED_ROOMS", +] FULLY_GLAZED_DESCRIPTIONS = [ "Fully double glazed", @@ -50,48 +52,45 @@ FULLY_GLAZED_DESCRIPTIONS = [ ] FIXED_FEATURES = [ - 'PROPERTY_TYPE', - 'BUILT_FORM', - 'CONSTRUCTION_AGE_BAND', - 'NUMBER_HABITABLE_ROOMS', - 'CONSTITUENCY', - 'NUMBER_HEATED_ROOMS', - 'FIXED_LIGHTING_OUTLETS_COUNT', - 'FLOOR_HEIGHT', - 'FLOOR_LEVEL', - 'TOTAL_FLOOR_AREA', + "PROPERTY_TYPE", + "BUILT_FORM", + "CONSTRUCTION_AGE_BAND", + "NUMBER_HABITABLE_ROOMS", + "CONSTITUENCY", + "NUMBER_HEATED_ROOMS", + "FIXED_LIGHTING_OUTLETS_COUNT", + "FLOOR_HEIGHT", + "FLOOR_LEVEL", + "TOTAL_FLOOR_AREA", ] COMPONENT_FEATURES = [ - 'TRANSACTION_TYPE', - 'WALLS_DESCRIPTION', - 'FLOOR_DESCRIPTION', - 'LIGHTING_DESCRIPTION', - 'ROOF_DESCRIPTION', - 'MAINHEAT_DESCRIPTION', - 'HOTWATER_DESCRIPTION', - 'MAIN_FUEL', - 'MECHANICAL_VENTILATION', - 'SECONDHEAT_DESCRIPTION', - 'ENERGY_TARIFF', # Not sure if this is relevant - 'SOLAR_WATER_HEATING_FLAG', - 'PHOTO_SUPPLY', - 'WINDOWS_DESCRIPTION', - 'GLAZED_TYPE', - 'MULTI_GLAZE_PROPORTION', - 'LIGHTING_DESCRIPTION', - 'LOW_ENERGY_LIGHTING', - 'NUMBER_OPEN_FIREPLACES', - 'MAINHEATCONT_DESCRIPTION', - 'EXTENSION_COUNT', + "TRANSACTION_TYPE", + "WALLS_DESCRIPTION", + "FLOOR_DESCRIPTION", + "LIGHTING_DESCRIPTION", + "ROOF_DESCRIPTION", + "MAINHEAT_DESCRIPTION", + "HOTWATER_DESCRIPTION", + "MAIN_FUEL", + "MECHANICAL_VENTILATION", + "SECONDHEAT_DESCRIPTION", + "ENERGY_TARIFF", # Not sure if this is relevant + "SOLAR_WATER_HEATING_FLAG", + "PHOTO_SUPPLY", + "WINDOWS_DESCRIPTION", + "GLAZED_TYPE", + "MULTI_GLAZE_PROPORTION", + "LIGHTING_DESCRIPTION", + "LOW_ENERGY_LIGHTING", + "NUMBER_OPEN_FIREPLACES", + "MAINHEATCONT_DESCRIPTION", + "EXTENSION_COUNT", # 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION ] # For these fields, we take an average if we have multiple values -AVERAGE_FIXED_FEATURES = [ - "TOTAL_FLOOR_AREA", - "FLOOR_HEIGHT" -] +AVERAGE_FIXED_FEATURES = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] # For these fields, we take the latest value if we have multiple values # Since more recent EPCs have been conducted with more rigour, we assume that the latest value is @@ -105,11 +104,7 @@ LATEST_FIELD = [ ] # If we see thee features changing, we don't use the EPC, since deem it not to be reliable -MANDATORY_FIXED_FEATURES = [ - "PROPERTY_TYPE", - "BUILT_FORM", - "CONSTITUENCY" -] +MANDATORY_FIXED_FEATURES = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY"] # For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were # conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England @@ -119,14 +114,16 @@ EARLIEST_EPC_DATE = "2014-08-01" RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY" HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT" + def ordinal(n): if 10 <= n % 100 <= 20: - suffix = 'th' + suffix = "th" else: - suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th') + suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th") return str(n) + suffix + FLOOR_LEVEL_MAP = { "Basement": -1, "Ground": 0, @@ -145,8 +142,7 @@ BUILT_FORM_REMAP = { } DATA_PROCESSOR_SETTINGS = { - 'low_memory': False, - 'epc_minimum_count': 1, - 'column_mappings': {'UPRN': [int, str]} + "low_memory": False, + "epc_minimum_count": 1, + "column_mappings": {"UPRN": [int, str]}, } - diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py index ba7db181..591b85c7 100644 --- a/model_data/simulation_system/predictions.py +++ b/model_data/simulation_system/predictions.py @@ -16,10 +16,11 @@ from core.Settings import ( REGISTRY_FILE, PREDICTION_LOCATION, PREDICTION_FILE, - METADATA_FILE + METADATA_FILE, + TIMESTAMP_FORMAT, ) -TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") +TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT) # FOR TESTING # For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame) @@ -32,19 +33,35 @@ def ingest_arguments() -> argparse.Namespace: Helper function to take in arguments from script start """ - parser = argparse.ArgumentParser(description='Inputs for training script') - parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE') - parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here') - parser.add_argument('--data', type=str, help='Json data for predictions') - parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training') + parser = argparse.ArgumentParser(description="Inputs for training script") + parser.add_argument( + "--target-column", + type=str, + help="The response variable you are predicting for", + choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], + default="RDSAP_CHANGE", + ) + parser.add_argument( + "--model-path", + type=str, + help="If you wish to use a specific model, specify the model path here", + ) + parser.add_argument("--data", type=str, help="Json data for predictions") + parser.add_argument( + "--data-path", type=str, help="Location of Parquet dataset to load for training" + ) args = parser.parse_args() return args - -def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None): +def prediction( + target_column: str = "RDSAP_CHANGE", + model_path: str = None, + data: pd.DataFrame = None, + data_path: Optional[str] = None, +): """ Main pipeline function """ @@ -64,11 +81,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data # TODO: Think about where registry will sit/ type logger.info("Loading best model from registry") registry_df = pd.read_csv(registry_path) - best_model_df = registry_df[registry_df['best_model']] + best_model_df = registry_df[registry_df["best_model"]] - model_location = best_model_df['model_location'].values[0] - model_type = best_model_df['model_type'].values[0] - model_name = best_model_df['model_name'].values[0] + model_location = best_model_df["model_location"].values[0] + model_type = best_model_df["model_type"].values[0] + model_name = best_model_df["model_name"].values[0] logger.info("--- Model Info: ---") logger.info(f"Model type: {model_type}") @@ -86,7 +103,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data # TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION data = data.sample(1) else: - logger.info('Using data provided') + logger.info("Using data provided") data = json.loads(data) data = pd.DataFrame([data]) print(data) @@ -117,17 +134,23 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data "model_type": model_type, "model_name": model_name, "model_location": model_location, - "model_settings": model.model_metadata() + "model_settings": model.model_metadata(), } pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE) return json_prediction + if __name__ == "__main__": args = ingest_arguments() # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}' # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet - prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path) \ No newline at end of file + prediction( + target_column=args.target_column, + model_path=args.model_path, + data=args.data, + data_path=args.data_path, + ) diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index 9dc17b2c..11acdf57 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -1,5 +1,5 @@ - import argparse + # import boto3 from pathlib import Path from datetime import datetime @@ -16,12 +16,13 @@ from core.Settings import ( METRICS_FOLDER, DEPLOYMENT_FOLDER, SUBSAMPLE_FACTOR, - MODEL_HYPERPARAMETERS + MODEL_HYPERPARAMETERS, + TIMESTAMP_FORMAT, ) import seaborn as sns import matplotlib.pyplot as plt -TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") +TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT) # FOR TESTING # train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet" @@ -43,119 +44,145 @@ TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") # S3_CLIENT.create_bucket # S3_CLIENT.list_buckets() + def ingest_arguments() -> argparse.Namespace: """ Helper function to take in arguments from script start """ - parser = argparse.ArgumentParser(description='Inputs for training script') + parser = argparse.ArgumentParser(description="Inputs for training script") - parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True) - parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True) - parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon") - parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE') + parser.add_argument( + "--train-filepath", + type=str, + help="Location of Parquet dataset to load for training", + required=True, + ) + parser.add_argument( + "--test-filepath", + type=str, + help="Location of Parquet dataset to load for testing", + required=True, + ) + parser.add_argument( + "--model-type", + type=str, + help="The type of model to train", + choices=["autogluon"], + default="autogluon", + ) + parser.add_argument( + "--target-column", + type=str, + help="The response variable", + choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], + default="RDSAP_CHANGE", + ) args = parser.parse_args() return args - + def training( - train_filepath: str, - test_filepath: str, - target_column: str = "RDSAP_CHANGE", - model_type: str = "autogluon", - hyperparameters: dict = None - ) -> None: + train_filepath: str, + test_filepath: str, + target_column: str = "RDSAP_CHANGE", + model_type: str = "autogluon", + hyperparameters: dict = None, +) -> None: """ Pipeline to run training on the dataset """ - logger.info('--- Loading data ---') + logger.info("--- Loading data ---") dataloader = DataLoader() train_df = dataloader.load(filepath=train_filepath) test_df = dataloader.load(filepath=test_filepath) - - logger.info('--- Feature processing ---') + + logger.info("--- Feature processing ---") feature_processor = FeatureProcessor() - subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR) + subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR) - train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount) + train_df = feature_processor.process( + train_df, target_column=target_column, subsample_amount=subsample_amount + ) test_df = feature_processor.process(test_df, target_column=target_column) - logger.info('--- Build Model ---') + logger.info("--- Build Model ---") logger.info("--- Load Hyperparameters ---") if hyperparameters is None: logger.info("Use base hyperparameters in settings") hyperparameters = MODEL_HYPERPARAMETERS[model_type] - logger.info(f'Hyperparameters are: {hyperparameters}') + logger.info(f"Hyperparameters are: {hyperparameters}") if model_type == "autogluon": model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower() - output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root + output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root - model = AutogluonModel( - output_filepath = output_base / MODEL_FOLDER - ) + model = AutogluonModel(output_filepath=output_base / MODEL_FOLDER) else: logger.error("No alternative model implemented yet") exit(1) - + model.train_model( - data=train_df, - target_column=target_column, - hyperparameters=hyperparameters - ) - + data=train_df, target_column=target_column, hyperparameters=hyperparameters + ) + logger.info("--- Save Model ---") model.save_model(output_filepath=model.output_filepath) - logger.info('--- Generate evaluation metrics ---') + logger.info("--- Generate evaluation metrics ---") metrics_df = model.model_evaluation( - validation_data=test_df, + validation_data=test_df, target_column=target_column, - metrics_location = output_base / METRICS_FOLDER - ) - + metrics_location=output_base / METRICS_FOLDER, + ) + logger.info("--- Generate metric outputs using predictions ---") # TODO: can have a model.metric_outputs method # FOr not just do it here - residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred']) - + residual_df = pd.DataFrame( + list(zip(test_df[target_column], model.predictions)), columns=["true", "pred"] + ) + # image formatting # TODO: move to settings file , AXIS_FONT, TITLE_FONT - axis_fs = 18 #fontsize - title_fs = 22 #fontsize + axis_fs = 18 # fontsize + title_fs = 22 # fontsize sns.set(style="whitegrid") - ax = sns.scatterplot(x="true", y="pred",data=residual_df) - ax.set_aspect('equal') - ax.set_xlabel(f'True {target_column}',fontsize = axis_fs) - ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel - ax.set_title('Residuals', fontsize = title_fs) + ax = sns.scatterplot(x="true", y="pred", data=residual_df) + ax.set_aspect("equal") + ax.set_xlabel(f"True {target_column}", fontsize=axis_fs) + ax.set_ylabel(f"Predicted {target_column}", fontsize=axis_fs) # ylabel + ax.set_title("Residuals", fontsize=title_fs) # Square aspect ratio - ax.plot([-100, 100], [-100, 100], 'black', linewidth=1) + ax.plot([-100, 100], [-100, 100], "black", linewidth=1) plt.tight_layout() RESIDUAL_FILE = "residuals.png" - plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120) + plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120) # TODO: for cml, we might want to have class that outputs all data and plots to add to the report # If we want residual plot/ any plots, we will need to self host - # plt.savefig(RESIDUAL_FILE, dpi=120) + # plt.savefig(RESIDUAL_FILE, dpi=120) - # TODO: introduce a seperate script for model optimisation, and from there, optimise for deployment # Imagining for now that the model trained here is the best model amongst all models built logger.info("--- Optimising model for deployment ---") - deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER) - logger.info(f"Optimised version of best model can be found at: {deployment_model_path}") + deployment_model_path = model.optimise_model_for_deployment( + deployment_path=output_base / DEPLOYMENT_FOLDER + ) + logger.info( + f"Optimised version of best model can be found at: {deployment_model_path}" + ) # TODO: Need a model registry - for now have this as a CSV # Save this in the model directory @@ -167,25 +194,43 @@ def training( logger.info("Registry file found - Loading into Dataframe") registry_df = pd.read_csv(registry_path, index_col=None) else: - # TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns - registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model']) + # TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns + registry_df = pd.DataFrame( + columns=[ + "model_type", + "model_name", + "model_location", + "mean_absolute_error", + "root_mean_squared_error", + "mean_squared_error", + "r2", + "pearsonr", + "median_absolute_error", + "mape", + "best_model", + ] + ) model_details_df = pd.DataFrame( - [{ - 'model_type': model_type, - 'model_name': model_root, - 'model_location': deployment_model_path - }] - ) - + [ + { + "model_type": model_type, + "model_name": model_root, + "model_location": deployment_model_path, + } + ] + ) + registry_row = pd.concat([model_details_df, metrics_df], axis=1) registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True) # TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics # TODO: decide metric to optimise to - registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True) - registry_df['best_model'] = [False]*len(registry_df) - registry_df.loc[0, 'best_model'] = True + registry_df = registry_df.sort_values( + "mean_absolute_error", ascending=False + ).reset_index(drop=True) + registry_df["best_model"] = [False] * len(registry_df) + registry_df.loc[0, "best_model"] = True logger.info("--- Saving new model to registry ---") registry_df.to_csv(registry_path, index=False) @@ -195,16 +240,16 @@ def training( if __name__ == "__main__": - logger.info('---Begin Pipeline---') + logger.info("---Begin Pipeline---") - logger.info('---Ingest Arguments---') + logger.info("---Ingest Arguments---") args = ingest_arguments() # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet # TODO: Ingest hyper parameters from somewhere - currently change at the top of script training( - train_filepath=args.train_filepath, - test_filepath=args.test_filepath, - target_column=args.target_column, - model_type=args.model_type - ) + train_filepath=args.train_filepath, + test_filepath=args.test_filepath, + target_column=args.target_column, + model_type=args.model_type, + )