Model/model_data/simulation_system/training.py
2023-08-25 16:29:24 +01:00

220 lines
8.4 KiB
Python

import argparse
# import boto3
from pathlib import Path
from datetime import datetime
from model_data.simulation_system.core.Logger import logger
from model_data.simulation_system.core.DataLoader import DataLoader
from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
from model_data.simulation_system.MLModel.Models import AutogluonModel
import pandas as pd
from model_data.simulation_system.core.Settings import (
BASE_REGISTRY_PATH,
REGISTRY_FILE,
MODEL_FOLDER,
METRICS_FOLDER,
DEPLOYMENT_FOLDER,
SUBSAMPLE_FACTOR,
MODEL_HYPERPARAMETERS
)
from model_data.simulation_system.core.Helpers import ensure_relative_path
import seaborn as sns
import matplotlib.pyplot as plt
# Evaluated once, when this module is imported. training() embeds it in the model folder
# name, so every training() call made within the same process reuses the same value.
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# FOR TESTING
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
# target_column = "RDSAP_CHANGE"
# model_type = "autogluon"
# hyperparameter = HYPERPARAMETERS
# SUBSAMPLE_FACTOR = 200
# SESSION = boto3.Session()
# S3_CLIENT = SESSION.client(
# service_name="s3",
# aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'),
# aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'),
# endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000")
# )
# S3_CLIENT.create_bucket
# S3_CLIENT.list_buckets()
def ingest_arguments() -> argparse.Namespace:
    """
    Parse the command line options supplied when the script is launched.

    Both dataset locations are mandatory; the model type and the target column
    are optional and restricted to the listed choices.

    :return: Namespace exposing train_filepath, test_filepath, model_type and target_column
    """
    cli = argparse.ArgumentParser(description='Inputs for training script')

    # Mandatory dataset locations
    cli.add_argument(
        '--train-filepath',
        type=str,
        required=True,
        help='Location of Parquet dataset to load for training',
    )
    cli.add_argument(
        '--test-filepath',
        type=str,
        required=True,
        help='Location of Parquet dataset to load for testing',
    )

    # Optional settings, each limited to a fixed set of values
    cli.add_argument(
        '--model-type',
        type=str,
        choices=["autogluon"],
        default="autogluon",
        help='The type of model to train',
    )
    cli.add_argument(
        '--target-column',
        type=str,
        choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"],
        default='RDSAP_CHANGE',
        help='The response variable',
    )

    return cli.parse_args()
def _save_residual_plot(true_values, predictions, target_column: str, output_dir: Path,
                        filename: str = "residuals.png") -> Path:
    """
    Scatter the true values against the predicted values and save the figure as a PNG.

    :param true_values: Iterable of observed target values
    :param predictions: Iterable of predicted values, paired positionally with true_values
    :param target_column: Name of the response variable, used in the axis labels
    :param output_dir: Directory the image is written to (created if it does not exist)
    :param filename: File name of the image inside output_dir
    :return: Full path of the saved image
    """
    # zip() pairs the values positionally and stops at the shorter of the two inputs
    residual_df = pd.DataFrame(list(zip(true_values, predictions)), columns=['true', 'pred'])

    # image formatting
    # TODO: move to settings file, AXIS_FONT, TITLE_FONT
    axis_fs = 18  # fontsize
    title_fs = 22  # fontsize
    sns.set(style="whitegrid")

    # Draw on an explicit figure rather than pyplot's implicit "current" one, so the figure
    # can be closed afterwards and repeated calls never draw on top of an earlier plot
    fig, ax = plt.subplots()
    try:
        sns.scatterplot(x="true", y="pred", data=residual_df, ax=ax)
        # Square aspect ratio
        ax.set_aspect('equal')
        ax.set_xlabel(f'True {target_column}', fontsize=axis_fs)
        ax.set_ylabel(f'Predicted {target_column}', fontsize=axis_fs)
        ax.set_title('Residuals', fontsize=title_fs)
        # Reference diagonal: points on this line are perfect predictions
        ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)
        fig.tight_layout()

        # Do not rely on an earlier pipeline step having created the folder
        output_dir.mkdir(parents=True, exist_ok=True)
        plot_path = output_dir / filename
        fig.savefig(plot_path, dpi=120)
    finally:
        # Release the figure even if plotting or saving fails
        plt.close(fig)
    return plot_path


def _append_to_registry(registry_path: Path, model_type: str, model_name: str, model_location,
                        metrics_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a newly trained model to the CSV registry, re-rank all entries and flag the best one.

    :param registry_path: Location of the registry CSV (created if it does not exist)
    :param model_type: Type of the model that was trained
    :param model_name: Name of the model that was trained
    :param model_location: Location of the deployment-optimised model
    :param metrics_df: Evaluation metrics of the model; concatenated column-wise to the model details
    :return: The registry as written to disk
    """
    if registry_path.exists():
        logger.info("Registry file found - Loading into Dataframe")
        registry_df = pd.read_csv(registry_path, index_col=None)
    else:
        # TODO: Moved columns into settings: MODEL_DETAILS and Metrics class columns
        registry_df = pd.DataFrame(
            columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error',
                     'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])

    model_details_df = pd.DataFrame(
        [{
            'model_type': model_type,
            'model_name': model_name,
            'model_location': model_location
        }]
    )
    # Model details and metrics sit side by side in one registry row
    registry_row = pd.concat([model_details_df, metrics_df], axis=1)
    registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)

    # TODO: will need a rebuild script metric script - i.e. if we add new metrics, we will want to load models and
    # regenerate new metrics
    # TODO: decide metric to optimise to
    # NOTE(review): descending order ranks the LARGEST mean_absolute_error first. That is only the
    # best model if the metrics are reported in a "higher is better" (negated error) form - confirm
    # against model_evaluation before relying on the best_model flag.
    registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
    registry_df['best_model'] = False
    registry_df.loc[0, 'best_model'] = True

    logger.info("--- Saving new model to registry ---")
    # Ensure the directory exists
    registry_path.parent.mkdir(parents=True, exist_ok=True)
    registry_df.to_csv(registry_path, index=False)
    return registry_df


def training(
        train_filepath: str,
        test_filepath: str,
        target_column: str = "RDSAP_CHANGE",
        model_type: str = "autogluon",
        hyperparameters: dict = None
) -> None:
    """
    Pipeline to run training on the dataset.

    Steps: load the train and test datasets, run feature processing, train and save the model,
    generate evaluation metrics and a residual plot, optimise the model for deployment and
    record the model in the CSV registry.

    :param train_filepath: Location of the dataset used for training
    :param test_filepath: Location of the dataset used for evaluation
    :param target_column: The response variable
    :param model_type: The type of model to train; only "autogluon" is implemented
    :param hyperparameters: Model hyperparameters. When None, MODEL_HYPERPARAMETERS[model_type]
        from settings is used. For autogluon the 'presets' and 'time_limit' entries are read
        to build the model name
    :raises ValueError: If model_type is anything other than "autogluon"
    """
    logger.info('--- Loading data ---')
    dataloader = DataLoader()
    train_df = dataloader.load(filepath=train_filepath)
    test_df = dataloader.load(filepath=test_filepath)

    logger.info('--- Feature processing ---')
    feature_processor = FeatureProcessor()
    # Only the training data is given a subsample amount: 1/SUBSAMPLE_FACTOR of its rows
    subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)
    train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
    test_df = feature_processor.process(test_df, target_column=target_column)

    logger.info('--- Build Model ---')
    logger.info("--- Load Hyperparameters ---")
    if hyperparameters is None:
        logger.info("Use base hyperparameters in settings")
        hyperparameters = MODEL_HYPERPARAMETERS[model_type]
    logger.info(f'Hyperparameters are: {hyperparameters}')

    if model_type == "autogluon":
        # Folder name encodes target, preset, time limit and the import-time timestamp
        model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
        output_base = BASE_REGISTRY_PATH / target_column / model_type / model_root
        model = AutogluonModel(
            output_filepath=output_base / MODEL_FOLDER
        )
    else:
        raise ValueError("No alternative model implemented yet")

    model.train_model(
        data=train_df,
        target_column=target_column,
        hyperparameters=hyperparameters
    )

    logger.info("--- Save Model ---")
    model.save_model(output_filepath=model.output_filepath)

    logger.info('--- Generate evaluation metrics ---')
    metrics_df = model.model_evaluation(
        validation_data=test_df,
        target_column=target_column,
        metrics_location=output_base / METRICS_FOLDER
    )

    logger.info("--- Generate metric outputs using predictions ---")
    # TODO: can have a model.metric_outputs method
    # For now just do it here
    # NOTE(review): model.predictions is presumably populated by model_evaluation above - confirm
    _save_residual_plot(
        true_values=test_df[target_column],
        predictions=model.predictions,
        target_column=target_column,
        output_dir=output_base / METRICS_FOLDER
    )
    # TODO: for cml, we might want to have class that outputs all data and plots to add to the report
    # If we want residual plot/ any plots, we will need to self host

    # TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
    # Imagining for now that the model trained here is the best model amongst all models built
    logger.info("--- Optimising model for deployment ---")
    deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
    # Autogluon requires models to be stored at relative paths. This will likely eventually be s3 however we
    # make sure the path is relative to the location of this script
    deployment_model_path = ensure_relative_path(deployment_model_path, Path(__file__).parent)
    logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")

    # TODO: Need a model registry - for now have this as a CSV
    # Save this in the model directory
    logger.info("--- Append registry with new model ---")
    _append_to_registry(
        registry_path=BASE_REGISTRY_PATH / target_column / REGISTRY_FILE,
        model_type=model_type,
        model_name=model_root,
        model_location=deployment_model_path,
        metrics_df=metrics_df
    )
    logger.info("--- Training Pipeline Complete --- ")
if __name__ == "__main__":
    # Example invocation:
    #   python3 training.py \
    #       --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet \
    #       --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
    logger.info('---Begin Pipeline---')

    logger.info('---Ingest Arguments---')
    cli_args = ingest_arguments()

    # TODO: Ingest hyperparameters from somewhere - none are passed here, so training()
    # uses its default for the hyperparameters argument
    training(
        train_filepath=cli_args.train_filepath,
        test_filepath=cli_args.test_filepath,
        target_column=cli_args.target_column,
        model_type=cli_args.model_type,
    )