mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
259 lines
8.3 KiB
Python
259 lines
8.3 KiB
Python
import argparse
|
|
|
|
import os
|
|
import shutil
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import pandas as pd
|
|
from MLModel.Models import model_factory
|
|
from core.Logger import logger
|
|
from core.Metrics import Metrics, sort_by_metric
|
|
from core.DataLoader import dataloader_factory
|
|
from core.FeatureProcessor import FeatureProcessor
|
|
from core.CloudClient import BotoClient
|
|
from core.RegistryHandler import RegistryHandler
|
|
from core.Settings import (
|
|
MODEL_DIRECTORY,
|
|
REGISTRY_FILE,
|
|
MODEL_FOLDER,
|
|
METRICS_FOLDER,
|
|
DEPLOYMENT_FOLDER,
|
|
SUBSAMPLE_FACTOR,
|
|
MODEL_HYPERPARAMETERS,
|
|
TIMESTAMP_FORMAT,
|
|
RESIDUAL_FILE,
|
|
BEST_MODEL_COLUMN_NAME,
|
|
OPTIMISE_METRIC,
|
|
)
|
|
|
|
# Captured once at import time and embedded in the model name built inside
# training(), so every artefact written by one process shares the same stamp.
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)

# Falls back to "local" when the RUNTIME_ENVIRONMENT variable is not set.
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

# Shared client handed to the data loader, model, metrics and registry calls.
CLIENT = BotoClient(runtime_environment=RUNTIME_ENVIRONMENT)
# CLIENT = S3FSClient(runtime_environment=RUNTIME_ENVIRONMENT)
|
|
|
|
|
|
# FOR TESTING
|
|
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
|
|
# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
|
|
# target_column = "RDSAP_CHANGE"
|
|
# model_type = "autogluon"
|
|
# hyperparameter = HYPERPARAMETERS
|
|
# SUBSAMPLE_FACTOR = 200
|
|
|
|
# Mock location
|
|
# train_filepath = "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet"
|
|
# test_filepath = "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data.parquet"
|
|
|
|
|
|
# To run script in local mode:
|
|
# python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
|
|
|
|
# To run script in local-mock mode:
|
|
# python3 training.py --train-filepath s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data.parquet
|
|
|
|
|
|
def ingest_arguments() -> argparse.Namespace:
|
|
"""
|
|
Helper function to take in arguments from script start
|
|
"""
|
|
|
|
parser = argparse.ArgumentParser(description="Inputs for training script")
|
|
|
|
parser.add_argument(
|
|
"--train-filepath",
|
|
type=str,
|
|
help="Location of Parquet dataset to load for training",
|
|
required=True,
|
|
)
|
|
parser.add_argument(
|
|
"--test-filepath",
|
|
type=str,
|
|
help="Location of Parquet dataset to load for testing",
|
|
required=True,
|
|
)
|
|
parser.add_argument(
|
|
"--model-type",
|
|
type=str,
|
|
help="The type of model to train",
|
|
choices=["autogluon"],
|
|
default="autogluon",
|
|
)
|
|
parser.add_argument(
|
|
"--target-column",
|
|
type=str,
|
|
help="The response variable",
|
|
choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"],
|
|
default="RDSAP_CHANGE",
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
return args
|
|
|
|
|
|
def training(
    train_filepath: str,
    test_filepath: str,
    target_column: str = "RDSAP_CHANGE",
    model_type: str = "autogluon",
    hyperparameters: dict | None = None,
) -> None:
    """
    Pipeline to run training on the dataset

    Steps, in order: load the train/test data, run feature processing, train
    and save the model, evaluate it and upload the metrics plus a residual
    plot, optimise the model for deployment and save that version, append the
    model to the registry CSV, then clean up local files when not running
    locally.

    Args:
        train_filepath: Location of the dataset to load for training.
        test_filepath: Location of the dataset to load for testing.
        target_column: The response variable.
        model_type: The type of model to train; also the key used to look up
            default hyperparameters in MODEL_HYPERPARAMETERS.
        hyperparameters: Hyperparameters passed to the model. When None, the
            entry for ``model_type`` in MODEL_HYPERPARAMETERS is used.

    Raises:
        ValueError: if either dataset comes back from the loader as None.
    """
    logger.info("--- Loading data ---")
    dataloader = dataloader_factory(runtime_environment=RUNTIME_ENVIRONMENT)
    train_df = dataloader.load(client=CLIENT, filepath=train_filepath)
    test_df = dataloader.load(client=CLIENT, filepath=test_filepath)

    if train_df is None or test_df is None:
        raise ValueError("No data Loaded - cancelling pipeline")

    logger.info("--- Feature processing ---")

    feature_processor = FeatureProcessor()

    # This is for convenience for now
    subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)

    # Only the training set is subsampled; the test set is processed in full.
    train_df = feature_processor.process(
        train_df, target_column=target_column, subsample_amount=subsample_amount
    )
    test_df = feature_processor.process(test_df, target_column=target_column)

    logger.info("--- Build Model ---")

    logger.info("--- Load Hyperparameters ---")

    if hyperparameters is None:
        logger.info("Use base hyperparameters in settings")
        hyperparameters = MODEL_HYPERPARAMETERS[model_type]
    # Logged for both caller-supplied and default hyperparameters.
    logger.info(f"Hyperparameters are: {hyperparameters}")

    logger.info(
        "--- Loading model configuration (Model type and Naming convention) ---"
    )
    # We might want to have hyperparameters in the names to make models more recognisable
    model_toolkit = model_factory(
        model_type=model_type, hyperparameters=hyperparameters
    )

    model_root = (
        f"{target_column}-{model_toolkit['naming_attributes']}-{TIMESTAMP}".lower()
    )
    output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root

    # Will need to pass output path to model (for saving purposes)
    model = model_toolkit["model"](output_filepath=output_base / MODEL_FOLDER)

    model.train_model(
        data=train_df, target_column=target_column, hyperparameters=hyperparameters
    )

    logger.info("--- Save Model ---")
    model.save_model(output_filepath=model.output_filepath, client=CLIENT)

    logger.info("--- Generate evaluation metrics ---")
    metrics = Metrics()

    metric_output_path = output_base / METRICS_FOLDER
    metrics_df = model.model_evaluation(
        validation_data=test_df,
        target_column=target_column,
        metrics_location=metric_output_path,
        metrics=metrics,
    )

    logger.info("--- Generate metric outputs using predictions ---")
    # metrics.generate_plot_suite()

    plot_output_path = output_base / METRICS_FOLDER / RESIDUAL_FILE
    metrics.generate_residual_plot(
        actuals=test_df[target_column],
        predictions=model.predictions,
        target_column=target_column,
        output_filepath=plot_output_path,
    )

    metrics.upload_metrics(output_filepath=metric_output_path, client=CLIENT)

    # TODO: for cml, we might want to have class that outputs all data and plots to add to the report
    # If we want residual plot/ any plots, we will need to self host

    # TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
    # Imagining for now that the model trained here is the best model amongst all models built

    logger.info("--- Optimising model for deployment ---")

    deployment_model_path = output_base / DEPLOYMENT_FOLDER

    model.optimise_model_for_deployment(deployment_path=deployment_model_path)
    logger.info(
        f"Optimised version of best model can be found at: {deployment_model_path}"
    )

    model.save_model(output_filepath=deployment_model_path, client=CLIENT)

    # TODO: Need a model registry - for now have this as a CSV
    # Save this in the model directory
    # Loading registry from s3
    logger.info("--- Append registry with new model ---")

    registry_handler = RegistryHandler()

    registry_path = Path(MODEL_DIRECTORY) / target_column / REGISTRY_FILE

    registry_df = registry_handler.load_registry(
        registry_path=registry_path, client=CLIENT, metrics=metrics
    )

    model_details_df = pd.DataFrame(
        [
            {
                "model_type": model_type,
                "model_name": model_root,
                "model_location": deployment_model_path,
            }
        ]
    )

    # pd.concat(axis=1) aligns on the index; model_details_df is indexed from 0,
    # so normalise metrics_df to the same positional index. Without this, a
    # non-default metrics index would split the entry into mismatched rows.
    registry_row = pd.concat(
        [model_details_df, metrics_df.reset_index(drop=True)], axis=1
    )
    registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)

    # NOTE: "optimse_metric" is the keyword as spelled at this call site; it must
    # match sort_by_metric's own parameter name, so it is left untouched.
    registry_df = sort_by_metric(
        registry_df,
        optimse_metric=OPTIMISE_METRIC,
        best_model_column_name=BEST_MODEL_COLUMN_NAME,
    )

    logger.info("--- Saving new model to registry ---")
    # Ensure the directory exists
    registry_path.parent.mkdir(parents=True, exist_ok=True)
    registry_df.to_csv(registry_path, index=False)

    registry_handler.save_registry(output_filepath=registry_path, client=CLIENT)

    logger.info("--- Clean up ---")
    if RUNTIME_ENVIRONMENT != "local" and Path(MODEL_DIRECTORY).exists():
        logger.info("Removing local development files not in s3")
        shutil.rmtree(Path(MODEL_DIRECTORY))

    logger.info("--- Training Pipeline Complete --- ")
|
|
|
|
|
|
# Script entry point: parse the CLI arguments and run the training pipeline.
if __name__ == "__main__":
    logger.info("---Begin Pipeline---")

    logger.info("---Ingest Arguments---")
    args = ingest_arguments()

    # hyperparameters is not passed, so training() uses the defaults from settings.
    training(
        train_filepath=args.train_filepath,
        test_filepath=args.test_filepath,
        target_column=args.target_column,
        model_type=args.model_type,
    )
|