# Model/model_data/simulation_system/MLModel/Models.py

"""
Different implementations of the MLModel Protocol
Uses the BaseMLModel protocol
Key tasks:
- Template Model class for different model types
- Save model
- Load Model
- Generate Inference
"""

from typing import Any
from pathlib import Path
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from core.Logger import logger
from core.Metrics import Metrics
from core.Settings import METRIC_FILENAME
from core.CloudClient import S3FSClient

# Exact set of keys that AutogluonModel.train_model requires in its
# `hyperparameters` dict (the key sets are compared for equality, so neither
# missing nor extra keys are accepted).
# "problem_type" and "eval_metric" are passed to the TabularPredictor
# constructor; "time_limit", "presets" and "excluded_model_types" are passed
# to its fit() call.
AUTOGLUON_HYPERPARAMETERS = [
    "problem_type",
    "eval_metric",
    "time_limit",
    "presets",
    "excluded_model_types",
]


def model_factory(model_type: str, hyperparameters: dict) -> dict:
    """
    Use factory pattern to look up a registered ML implementation.

    Args:
        model_type: Registry key of the implementation. Only "autogluon" is
            registered here.
        hyperparameters: Hyperparameters of the selected model. For
            "autogluon" the "presets" and "time_limit" entries are read to
            build the naming string.

    Returns:
        A dict with two entries: "model" (the model class itself, not an
        instance) and "naming_attributes" (for "autogluon", the string
        "<presets>-<time_limit>").

    Raises:
        KeyError: If ``model_type`` is not registered, or if a hyperparameter
            needed for the naming string is missing.
    """

    # Entries are built lazily so that only the selected implementation reads
    # its own hyperparameters; an unknown model_type is then reported as such
    # instead of surfacing as a missing-hyperparameter error.
    builders = {
        "autogluon": lambda: {
            "model": AutogluonModel,
            "naming_attributes": f"{hyperparameters['presets']}-{hyperparameters['time_limit']}",
        },
    }

    try:
        build = builders[model_type]
    except KeyError:
        # Same exception type as the plain dict lookup, but with a message
        # that names the supported model types.
        raise KeyError(
            f"Unknown model_type {model_type!r}; supported types: {sorted(builders)}"
        ) from None

    return build()


class AutogluonModel:
    """
    Autogluon model that implements the MLModel Protocol.

    Wraps an AutoGluon ``TabularPredictor`` and exposes loading, saving,
    training, inference, evaluation and deployment helpers.

    Attributes:
        model: The loaded/trained predictor; ``None`` until ``load_model`` or
            ``train_model`` has been called.
        output_filepath: Path handed to ``TabularPredictor`` as its ``path``
            when training. Required by ``train_model``.
        predictions: Predictions produced by the most recent
            ``model_evaluation`` call; ``None`` before that.

    Note:
        Several methods terminate the process with ``SystemExit(1)`` when a
        precondition is not met (no model, no output path, bad
        hyperparameters). ``raise SystemExit(1)`` is used rather than the
        ``exit()`` helper, which is injected by the ``site`` module for
        interactive use and is not guaranteed to exist.
    """

    def __init__(self, output_filepath: Path | None = None) -> None:
        self.model = None
        self.output_filepath = output_filepath
        self.predictions = None

    def load_model(
        self,
        filepath: str | Path,
        s3_client: S3FSClient,
        model_folder: str = "local_model",
    ) -> None:
        """
        Load a predictor from ``filepath`` into ``self.model``.

        Args:
            filepath: Location of the saved predictor. Used directly as a
                local path when ``s3_client.client`` is ``None``; otherwise
                passed to ``s3_client.download_model``.
            s3_client: Client wrapper; its ``client`` attribute being ``None``
                selects local mode.
            model_folder: Local folder the model is downloaded into (and then
                loaded from) when the s3 client is in use.
        """
        filepath = str(filepath)
        if s3_client.client is None:
            logger.info("In local development mode - no need for s3 client")
            self.model = TabularPredictor.load(path=filepath)
        else:
            logger.info("Loading model from s3")
            s3_client.download_model(filepath=filepath, model_folder=model_folder)
            self.model = TabularPredictor.load(path=model_folder)

    def save_model(self, output_filepath: Path, s3fs_client: S3FSClient) -> None:
        """
        Upload the saved model at ``output_filepath`` when an s3 client is set.

        In local mode (``s3fs_client.client`` is ``None``) nothing is written
        here; only log messages are emitted.

        Args:
            output_filepath: Local path of the saved model; also used as the
                key below ``s3fs_client.model_bucket`` for the upload.
            s3fs_client: Client wrapper providing ``client`` and
                ``model_bucket``.
        """
        if s3fs_client.client is None:
            logger.info("In local development mode - no need for s3 client")
            logger.info("Using AutoGluon Model - Model saving already occured")
        else:
            logger.info("Saving model into s3")
            s3_location = s3fs_client.model_bucket + "/" + str(output_filepath)
            s3fs_client.client.put(str(output_filepath), s3_location, recursive=True)
            logger.info("Save complete")

    def train_model(
        self, data: pd.DataFrame, target_column: str, hyperparameters: dict
    ) -> None:
        """
        Train a predictor on ``data`` and store it in ``self.model``.

        Args:
            data: Training data, including the target column.
            target_column: Name of the label column.
            hyperparameters: Must contain exactly the keys listed in
                ``AUTOGLUON_HYPERPARAMETERS``.

        Raises:
            SystemExit: With code 1 if ``self.output_filepath`` is ``None`` or
                the hyperparameter keys do not match exactly.
        """
        if self.output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            raise SystemExit(1)

        expected_keys = set(AUTOGLUON_HYPERPARAMETERS)
        provided_keys = set(hyperparameters)
        if expected_keys != provided_keys:
            # Name the offending keys so the caller does not have to diff them.
            missing = sorted(str(key) for key in expected_keys - provided_keys)
            unexpected = sorted(str(key) for key in provided_keys - expected_keys)
            logger.error(
                "Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required"
                f" (missing: {missing}, unexpected: {unexpected})"
            )
            raise SystemExit(1)

        AGdata = TabularDataset(data=data)

        self.model = TabularPredictor(
            label=target_column,
            path=self.output_filepath,
            problem_type=hyperparameters["problem_type"],
            eval_metric=hyperparameters["eval_metric"],
        ).fit(
            AGdata,
            time_limit=hyperparameters["time_limit"],
            presets=hyperparameters["presets"],
            excluded_model_types=hyperparameters["excluded_model_types"],
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.Series:
        """
        Generate predictions for ``data`` with the loaded/trained model.

        Args:
            data: Rows to predict on.

        Returns:
            The model's predictions wrapped in a ``pd.Series``.

        Raises:
            SystemExit: With code 1 if no model has been loaded or trained.
        """

        if self.model is None:
            logger.error("No model loaded/ trained")
            raise SystemExit(1)

        predictions = pd.Series(self.model.predict(data))

        return predictions

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics: Metrics,
        metrics_location: Path | None = None,
        metric_filename: str = METRIC_FILENAME,
    ) -> pd.DataFrame:
        """
        Predict on validation data, compute metrics and write them to disk.

        The predictions are kept in ``self.predictions``. The metrics are
        written twice below ``metrics_location``: as CSV under
        ``metric_filename`` and as markdown under the same name with a
        ``.md`` suffix.

        Args:
            validation_data: Data to evaluate on, including ``target_column``.
            target_column: Name of the column holding the actual values.
            metrics: Metric suite; its ``generate_metric_suite`` is called
                with the actuals and the predictions.
            metrics_location: Output directory; defaults to the current
                folder. Created (with parents) if it does not exist.
            metric_filename: File name of the CSV output.

        Returns:
            A one-row DataFrame built from the metric suite's result.

        Raises:
            SystemExit: With code 1 if no model has been loaded or trained.
        """
        if metrics_location is None:
            logger.warning("Metrics will be outputted to current folder")
            metrics_location = Path()

        if self.model is None:
            logger.error("No model loaded/ trained - Unable to generate evaluation")
            raise SystemExit(1)

        # Generate prediction, load metrics suite, generate metrics between the two
        predictions = self.generate_predictions(validation_data)

        performance = metrics.generate_metric_suite(
            actuals=validation_data[target_column], predictions=predictions
        )

        logger.info("Prediction used for evaluations are saved in self.predictions")
        self.predictions = predictions

        logger.info(f"Saving metric file as {metric_filename}")
        # parents=True so a nested, not-yet-existing output directory works too.
        metrics_location.mkdir(parents=True, exist_ok=True)

        metrics_df = pd.DataFrame([performance])
        metrics_df.to_csv(metrics_location / metric_filename)
        # with_suffix swaps only the final extension, so names containing
        # additional dots (e.g. "metrics.v1.csv") keep their stem intact.
        markdown_filename = Path(metric_filename).with_suffix(".md")
        metrics_df.to_markdown(metrics_location / markdown_filename)

        return metrics_df

    def optimise_model_for_deployment(
        self, deployment_path: Path | str | None = None
    ) -> Any:
        """
        Clone the model for deployment via ``clone_for_deployment``.

        Args:
            deployment_path: Destination of the clone. Required despite the
                ``None`` default.

        Returns:
            Whatever ``self.model.clone_for_deployment`` returns.

        Raises:
            ValueError: If no model is loaded/trained or ``deployment_path``
                is ``None``.
        """
        if self.model is None:
            raise ValueError("No model to optimise for deployment")

        if deployment_path is None:
            raise ValueError("Deployment path required")

        deployment_path = str(deployment_path)

        # This will return a string path of the location
        return self.model.clone_for_deployment(deployment_path)

    def model_metadata(self) -> dict[str, Any]:
        """
        Return the model's metadata using the predictor's inbuilt ``info`` method.

        Raises:
            SystemExit: With code 1 if no model has been loaded or trained.
        """

        if self.model is None:
            logger.error("No Model loaded/ trained")
            raise SystemExit(1)

        return self.model.info()