"""
Different implementations of the MLModel Protocol.

Uses the BaseMLModel protocol.

Key tasks:
    - Template Model class for different model types
    - Save model
    - Load Model
    - Generate Inference
"""
import sys
from typing import Any
from pathlib import Path

import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor

from core.Logger import logger
from core.Metrics import Metrics
from core.Settings import METRIC_FILENAME
from core.CloudClient import S3FSClient

# Exact set of keys that `AutogluonModel.train_model` requires in its
# `hyperparameters` dict (no more, no fewer).
AUTOGLUON_HYPERPARAMETERS = [
    "problem_type",
    "eval_metric",
    "time_limit",
    "presets",
    "excluded_model_types",
]


def model_factory(model_type: str, hyperparameters: dict) -> dict:
    """
    Use factory pattern to register the different ML implementations.

    Args:
        model_type: Key of the registered implementation (currently only
            "autogluon").
        hyperparameters: Hyperparameter dict; "presets" and "time_limit" are
            read to build the naming string.

    Returns:
        A dict with two entries: "model" (the model class, not an instance)
        and "naming_attributes" (the string "<presets>-<time_limit>").

    Raises:
        KeyError: If `model_type` is not registered, or if `hyperparameters`
            lacks "presets" or "time_limit".
    """
    model_types = {
        "autogluon": {
            "model": AutogluonModel,
            "naming_attributes": f"{hyperparameters['presets']}-{hyperparameters['time_limit']}",
        },
    }
    return model_types[model_type]


class AutogluonModel:
    """
    Autogluon model that implements the MLModel Protocol.

    Attributes are set in `__init__`:
        model: The loaded/trained predictor, or None until one is available.
        output_filepath: Path handed to the predictor when training.
        predictions: Predictions produced by the last `model_evaluation` call.
    """

    def __init__(self, output_filepath: Path | None = None) -> None:
        """Create an empty wrapper; no model is loaded or trained yet."""
        self.model = None
        self.output_filepath = output_filepath
        self.predictions = None

    def load_model(
        self, filepath: str | Path, s3_client: S3FSClient | None = None
    ) -> None:
        """
        Load a model from `filepath` into `self.model`.

        Only local loading (`s3_client` is None) is implemented. When an
        `s3_client` is supplied nothing is loaded and `self.model` is left
        unchanged; a warning is logged so the no-op is not silent.
        """
        if s3_client is None:
            logger.info("In local development mode - no need for s3 client")
            self.model = TabularPredictor.load(path=str(filepath))
            return

        # TODO: download the model from s3 to a local path and load it from there.
        logger.warning(
            "Loading a model from s3 is not implemented yet - no model was loaded"
        )

    def save_model(self, output_filepath: Path, s3fs_client: S3FSClient) -> None:
        """
        Save the model found at `output_filepath`.

        When `s3fs_client.client` is None nothing is copied (only log lines
        are emitted). Otherwise the local `output_filepath` is uploaded
        recursively to "<model_bucket>/<output_filepath>" via the client's
        `put` method.
        """
        if s3fs_client.client is None:
            logger.info("In local development mode - no need for s3 client")
            logger.info("Using AutoGluon Model - Model saving already occured")
            return

        logger.info("Saving model into s3")
        local_path = str(output_filepath)
        s3_location = s3fs_client.model_bucket + "/" + local_path
        s3fs_client.client.put(local_path, s3_location, recursive=True)
        logger.info("Save complete")

    def train_model(
        self, data: pd.DataFrame, target_column: str, hyperparameters: dict
    ) -> None:
        """
        Train a model on `data` and store it in `self.model`.

        Args:
            data: Training data, wrapped in a `TabularDataset` before fitting.
            target_column: Name of the label column.
            hyperparameters: Must contain exactly the keys listed in
                `AUTOGLUON_HYPERPARAMETERS`.

        Raises:
            SystemExit: With code 1 if `self.output_filepath` is None or the
                hyperparameter keys do not match the required set.
        """
        if self.output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            sys.exit(1)

        expected = set(AUTOGLUON_HYPERPARAMETERS)
        provided = set(hyperparameters)
        if expected != provided:
            # Name the offending keys so the caller does not have to diff by hand.
            missing = sorted(expected - provided, key=str)
            unexpected = sorted(provided - expected, key=str)
            logger.error(
                "Hyperparameters (dict) is incorrectly defined - please check "
                "what hyperparameters are required "
                f"(missing: {missing}, unexpected: {unexpected})"
            )
            sys.exit(1)

        training_data = TabularDataset(data=data)
        predictor = TabularPredictor(
            label=target_column,
            path=self.output_filepath,
            problem_type=hyperparameters["problem_type"],
            eval_metric=hyperparameters["eval_metric"],
        )
        self.model = predictor.fit(
            training_data,
            time_limit=hyperparameters["time_limit"],
            presets=hyperparameters["presets"],
            excluded_model_types=hyperparameters["excluded_model_types"],
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.Series:
        """
        Generate predictions for `data` using the model held in `self.model`.

        Returns:
            The output of `self.model.predict(data)` wrapped in a `pd.Series`.

        Raises:
            SystemExit: With code 1 if no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No model loaded/ trained")
            sys.exit(1)
        return pd.Series(self.model.predict(data))

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics: Metrics,
        metrics_location: Path | None = None,
        metric_filename: str = METRIC_FILENAME,
    ) -> pd.DataFrame:
        """
        Predict on `validation_data`, compute metrics and write them to disk.

        The predictions are kept on `self.predictions`. The metrics are
        written twice into `metrics_location`: as CSV under `metric_filename`
        and as markdown under the same name with a ".md" suffix.

        Args:
            validation_data: Data to predict on; must contain `target_column`.
            target_column: Column holding the actual values.
            metrics: Object whose `generate_metric_suite(actuals, predictions)`
                result becomes the single row of the returned frame.
            metrics_location: Output directory; defaults to the current folder.
            metric_filename: File name of the CSV output.

        Returns:
            A one-row DataFrame built from the metric suite result.

        Raises:
            SystemExit: With code 1 if no model has been loaded or trained.
        """
        if metrics_location is None:
            logger.warning("Metrics will be outputted to current folder")
            metrics_location = Path()

        if self.model is None:
            logger.error("No model loaded/ trained - Unable to generate evaluation")
            sys.exit(1)

        # Generate predictions, then compute the metric suite between the
        # actual values and those predictions.
        predictions = self.generate_predictions(validation_data)
        performance = metrics.generate_metric_suite(
            actuals=validation_data[target_column], predictions=predictions
        )

        logger.info("Prediction used for evaluations are saved in self.prediction")
        self.predictions = predictions

        # parents=True so a nested, not-yet-existing output folder works too.
        metrics_location.mkdir(parents=True, exist_ok=True)
        csv_path = metrics_location / metric_filename
        # with_suffix swaps only the final extension, so a name such as
        # "metrics.v1.csv" becomes "metrics.v1.md" rather than "metrics.md".
        markdown_path = metrics_location / Path(metric_filename).with_suffix(".md")
        logger.info(f"Saving metric file as {csv_path}")

        metrics_df = pd.DataFrame([performance])
        metrics_df.to_csv(csv_path)
        metrics_df.to_markdown(markdown_path)

        return metrics_df

    def optimise_model_for_deployment(
        self, deployment_path: Path | str | None = None
    ) -> Any:
        """
        Optimise the autogluon model for deployment.

        Returns:
            Whatever `self.model.clone_for_deployment` returns for the given
            path (passed as a string).

        Raises:
            ValueError: If no model is available or `deployment_path` is None.
        """
        if self.model is None:
            raise ValueError("No model to optimise for deployment")

        if deployment_path is None:
            raise ValueError("Deployment path required")

        # The clone call is handed the location as a plain string.
        return self.model.clone_for_deployment(str(deployment_path))

    def model_metadata(self) -> dict[str, Any]:
        """
        Return the model's metadata via its inbuilt `info()` method.

        Raises:
            SystemExit: With code 1 if no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No Model loaded/ trained")
            sys.exit(1)
        return self.model.info()