"""
Different implementations of the MLModel Protocol
Uses the BaseMLModel protocol

Key tasks:
    - Template Model class for different model types
    - Save model
    - Load Model
    - Generate Inference
"""
from typing import Any
from pathlib import Path

import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import mean_absolute_percentage_error

from core.Logger import logger
from core.Metrics import Metrics
from core.Settings import METRIC_FILENAME
from MLModel.BaseMLModel import MLModel

# Exact set of keys that `AutogluonModel.train_model` requires in its
# `hyperparameters` dict (no more, no fewer).
AUTOGLUON_HYPERPARAMETERS = [
    "problem_type",
    "eval_metric",
    "time_limit",
    "presets",
    "excluded_model_types",
]


def model_factory(model_type: str, hyperparameters: dict) -> dict:
    """
    Use factory pattern to register the different ML implementations

    Args:
        model_type: Key of the registered implementation (currently only
            "autogluon").
        hyperparameters: Hyperparameter dict; its "presets" and "time_limit"
            entries are used to build the naming attributes.

    Returns:
        A dict with the keys "model" (the model class, not an instance) and
        "naming_attributes" (a "<presets>-<time_limit>" string).

    Raises:
        KeyError: If `model_type` is not registered, or if `hyperparameters`
            lacks "presets" or "time_limit".
    """
    model_types = {
        "autogluon": {
            "model": AutogluonModel,
            "naming_attributes": f"{hyperparameters['presets']}-{hyperparameters['time_limit']}",
        },
    }
    try:
        return model_types[model_type]
    except KeyError:
        # Same exception type as a plain lookup, but tells the caller what is valid.
        raise KeyError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(model_types)}"
        ) from None


class AutogluonModel:
    """
    Autogluon model that implements the MLModel Protocol

    Attributes are set in `__init__`:
        model: The loaded/trained TabularPredictor, or None until
            `load_model` or `train_model` has run.
        output_filepath: Location handed to TabularPredictor when training.
        predictions: Predictions from the most recent `model_evaluation` call.
    """

    def __init__(self, output_filepath: Path | None = None) -> None:
        self.model = None
        self.output_filepath = output_filepath
        self.predictions = None

    def load_model(self, filepath: str | Path) -> None:
        """
        Providing a path, this function will load the model to be used.
        Will load to internal variable
        """
        filepath = str(filepath)
        self.model = TabularPredictor.load(path=filepath)

    def save_model(self, output_filepath: Path | None = None) -> None:
        """
        Providing a path, this function will save the model to be used.

        For this implementation nothing is written here: the method only logs
        that saving has already happened, and `output_filepath` is ignored.
        """
        logger.info("Using AutoGluon Model - Model saving already occured")

    def train_model(
        self, data: pd.DataFrame, target_column: str, hyperparameters: dict
    ) -> None:
        """
        For the given data and hyperparameters, a model is trained

        Args:
            data: Training data including the target column.
            target_column: Name of the label column.
            hyperparameters: Must contain exactly the keys listed in
                AUTOGLUON_HYPERPARAMETERS.

        Raises:
            SystemExit: If no `output_filepath` was set on the instance, or if
                the hyperparameter keys do not match exactly.
        """
        if self.output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            # `raise SystemExit` instead of the site-provided `exit()` helper,
            # which is not guaranteed to exist outside interactive sessions.
            raise SystemExit(1)

        expected_keys = set(AUTOGLUON_HYPERPARAMETERS)
        provided_keys = set(hyperparameters)
        if expected_keys != provided_keys:
            logger.error(
                "Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required"
            )
            missing = sorted(expected_keys - provided_keys, key=str)
            unexpected = sorted(provided_keys - expected_keys, key=str)
            logger.error(
                f"Missing hyperparameters: {missing}; unexpected hyperparameters: {unexpected}"
            )
            raise SystemExit(1)

        training_dataset = TabularDataset(data=data)
        self.model = TabularPredictor(
            label=target_column,
            # Passed as str, consistent with load_model / optimise_model_for_deployment.
            path=str(self.output_filepath),
            problem_type=hyperparameters["problem_type"],
            eval_metric=hyperparameters["eval_metric"],
        ).fit(
            training_dataset,
            time_limit=hyperparameters["time_limit"],
            presets=hyperparameters["presets"],
            excluded_model_types=hyperparameters["excluded_model_types"],
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.Series:
        """
        For the given dataframe, predictions are generated with the model
        currently held by this instance

        Raises:
            SystemExit: If no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No model loaded/ trained")
            raise SystemExit(1)

        return pd.Series(self.model.predict(data))

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics: Metrics,
        metrics_location: Path | None = None,
        metric_filename: str = METRIC_FILENAME,
    ) -> pd.DataFrame:
        """
        For any validation data, a set of predictions and metrics are returned

        The predictions are stored on `self.predictions`; the metrics are
        written to `metrics_location` as CSV (`metric_filename`) and as
        markdown (same name with a `.md` suffix).

        Args:
            validation_data: Data to predict on; must contain `target_column`.
            target_column: Name of the column holding the actual values.
            metrics: Metrics suite used to compare actuals and predictions.
            metrics_location: Output folder; defaults to the current folder.
            metric_filename: File name of the CSV output.

        Returns:
            A one-row DataFrame built from the generated metric suite.

        Raises:
            SystemExit: If no model has been loaded or trained.
        """
        if metrics_location is None:
            logger.warning("Metrics will be outputted to current folder")
            metrics_location = Path()
        # Tolerate a plain string even though the annotation asks for a Path.
        metrics_location = Path(metrics_location)

        if self.model is None:
            logger.error("No model loaded/ trained - Unable to generate evaluation")
            raise SystemExit(1)

        # Generate prediction, load metrics suite, generate metrics between the two
        predictions = self.generate_predictions(validation_data)
        performance = metrics.generate_metric_suite(
            actuals=validation_data[target_column], predictions=predictions
        )

        logger.info("Prediction used for evaluations are saved in self.predictions")
        self.predictions = predictions

        logger.info(f"Saving metric file as {metric_filename}")
        # parents=True so a nested output folder does not fail on first use.
        metrics_location.mkdir(parents=True, exist_ok=True)
        metrics_df = pd.DataFrame([performance])
        metrics_df.to_csv(metrics_location / metric_filename)

        # Swap only the final extension; splitting on the first "." would
        # truncate names such as "metrics.v2.csv".
        markdown_filename = Path(metric_filename).with_suffix(".md")
        metrics_df.to_markdown(metrics_location / markdown_filename)

        return metrics_df

    def optimise_model_for_deployment(
        self, deployment_path: Path | str | None = None
    ) -> Any:
        """
        We can optimise the deployment for a autogluon model

        Args:
            deployment_path: Target location handed to the predictor's
                `clone_for_deployment`.

        Returns:
            Whatever `clone_for_deployment` returns.

        Raises:
            ValueError: If no model is loaded/trained or no path is given.
        """
        if self.model is None:
            raise ValueError("No model to optimise for deployment")

        if deployment_path is None:
            raise ValueError("Deployment path required")

        deployment_path = str(deployment_path)

        # This will return a string path of the location
        return self.model.clone_for_deployment(deployment_path)

    def model_metadata(self) -> dict[str, Any]:
        """
        For Autogluon model, use the inbuilt model info method

        Raises:
            SystemExit: If no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No Model loaded/ trained")
            raise SystemExit(1)

        return self.model.info()