"""
Different implementations of the MLModel Protocol

Uses the BaseMLModel protocol

Key tasks:
    - Template Model class for different model types
    - Save model
    - Load Model
    - Generate Inference
"""
from typing import NamedTuple, Optional
from pathlib import Path

import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import mean_absolute_percentage_error

from core.Logger import logger

# Exact set of keys `AutogluonModel.train_model` requires in `hyperparameters`.
AUTOGLUON_HYPERPARAMETERS = [
    'problem_type',
    'eval_metric',
    'time_limit',
    'presets',
    'excluded_model_types',
]
# Default file name used by `AutogluonModel.model_evaluation` for the metrics CSV.
METRIC_FILENAME = "metrics.csv"


class AutogluonModel:
    """
    Autogluon model that implements the MLModel Protocol

    Fatal conditions (no model available, missing paths, malformed
    hyperparameters) are logged and then terminate via ``SystemExit(1)``.
    """

    def __init__(self, output_filepath: Optional[Path] = None) -> None:
        """
        Create an empty wrapper; no model is loaded or trained yet.

        Args:
            output_filepath: Passed as ``path`` to ``TabularPredictor`` when
                training. Required by ``train_model``.
        """
        self.model = None
        self.output_filepath = output_filepath
        # Filled in by `model_evaluation` with the predictions it scored.
        self.predictions = None

    def load_model(self, filepath: Path) -> None:
        """
        Load a predictor from ``filepath`` and store it in ``self.model``.

        Args:
            filepath: Location handed to ``TabularPredictor.load``.
        """
        self.model = TabularPredictor.load(path=filepath)

    def save_model(self, output_filepath: Optional[Path] = None) -> None:
        """
        Protocol hook for saving the model; only logs for AutoGluon.

        Nothing is written here and ``output_filepath`` is ignored; the
        method just logs that saving has already taken place.

        Args:
            output_filepath: Unused. Kept so the signature matches the
                protocol.
        """
        logger.info("Using AutoGluon Model - Model saving already occured")

    def train_model(
            self,
            data: pd.DataFrame,
            target_column: str,
            hyperparameters: Optional[dict] = None) -> None:
        """
        Train a ``TabularPredictor`` on ``data`` and store it in ``self.model``.

        Args:
            data: Training data, including the target column.
            target_column: Name of the label column.
            hyperparameters: Dict whose keys are exactly
                ``AUTOGLUON_HYPERPARAMETERS``.

        Raises:
            SystemExit: With code 1 if ``self.output_filepath`` is unset, or
                if ``hyperparameters`` is missing or has the wrong keys.
        """
        if self.output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            raise SystemExit(1)

        # A missing dict is treated as an empty key set so it takes the same
        # validation path instead of failing on `None.keys()`.
        required_keys = set(AUTOGLUON_HYPERPARAMETERS)
        given_keys = set(hyperparameters) if hyperparameters is not None else set()
        if given_keys != required_keys:
            logger.error(
                "Hyperparameters (dict) is incorrectly defined - "
                "please check what hyperparameters are required"
            )
            missing = sorted(required_keys - given_keys, key=str)
            unexpected = sorted(given_keys - required_keys, key=str)
            logger.error(f"Missing keys: {missing}; unexpected keys: {unexpected}")
            raise SystemExit(1)

        ag_data = TabularDataset(data=data)
        predictor = TabularPredictor(
            label=target_column,
            path=self.output_filepath,
            problem_type=hyperparameters['problem_type'],
            eval_metric=hyperparameters['eval_metric'],
        )
        self.model = predictor.fit(
            ag_data,
            time_limit=hyperparameters['time_limit'],
            presets=hyperparameters['presets'],
            excluded_model_types=hyperparameters['excluded_model_types'],
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Generate predictions for ``data`` with the loaded or trained model.

        Args:
            data: Rows to predict on.

        Returns:
            Whatever ``self.model.predict(data)`` returns.

        Raises:
            SystemExit: With code 1 if no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No model loaded/ trained")
            raise SystemExit(1)

        return self.model.predict(data)

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics_location: Optional[Path] = None,
        metric_filename: str = METRIC_FILENAME
    ) -> pd.DataFrame:
        """
        Evaluate the model on ``validation_data`` and write the metrics to CSV.

        The predictions used for scoring are kept in ``self.predictions``.
        The metrics are the result of ``self.model.evaluate`` plus a
        ``'mape'`` entry.

        Args:
            validation_data: Data to evaluate on, including ``target_column``.
            target_column: Name of the label column.
            metrics_location: Directory for the metrics file. Created if
                missing; defaults to the current working directory.
            metric_filename: File name of the metrics CSV.

        Returns:
            A one-row DataFrame containing the metrics.

        Raises:
            SystemExit: With code 1 if no model has been loaded or trained.
        """
        if metrics_location is None:
            logger.warning("Metrics will be outputted to current folder")
            # Make the warning true rather than failing later on `None.mkdir`.
            metrics_location = Path.cwd()
        else:
            metrics_location = Path(metrics_location)

        if self.model is None:
            logger.error("No model loaded/ trained - Unable to generate evaluation")
            raise SystemExit(1)

        performance = self.model.evaluate(validation_data)
        predictions = self.generate_predictions(validation_data)

        logger.info("Prediction used for evaluations are saved in self.predictions")
        self.predictions = predictions

        # TODO: Can have a custom metric class that defines all different metrics we want
        performance['mape'] = mean_absolute_percentage_error(
            validation_data[target_column], predictions
        )

        logger.info(f"Saving metric file as {metric_filename}")
        # parents=True so a nested metrics directory can be created in one go.
        metrics_location.mkdir(parents=True, exist_ok=True)

        metrics_df = pd.DataFrame([performance])
        metrics_df.to_csv(metrics_location / metric_filename)

        return metrics_df

    def optimise_model_for_deployment(self, deployment_path: Optional[Path] = None) -> str:
        """
        Clone the model to ``deployment_path`` in its deployment form.

        Args:
            deployment_path: Destination for the clone. Required, despite the
                ``None`` default.

        Returns:
            The value returned by ``self.model.clone_for_deployment``: a
            string path of the clone's location.

        Raises:
            SystemExit: With code 1 if there is no model or no
                ``deployment_path``.
        """
        if self.model is None:
            logger.error("No model to optimise for deployment")
            raise SystemExit(1)

        if deployment_path is None:
            logger.error("Deployment path required")
            raise SystemExit(1)

        # This will return a string path of the location
        return self.model.clone_for_deployment(deployment_path)