From f0809966b732af5903ae9f4bee45b1d7d76ea3cc Mon Sep 17 00:00:00 2001
From: Michael Duong
Date: Tue, 15 Aug 2023 18:18:55 +0100
Subject: [PATCH] add protocol for ml class

---
Review notes:
- File contents were revised during review, so the abbreviated blob ids on
  the "index" lines below no longer match the new file contents.
- Blank context lines in the training.py hunks were restored by count; check
  their position against the target file before applying.
- training.py and predictions.py import DataLoader from a DataLoader module
  that is not created by this patch.

 model_data/simulation_system/MLModel.py     | 135 ++++++++++++++++++++
 model_data/simulation_system/predictions.py |  74 +++++++++++
 model_data/simulation_system/training.py    |  19 +--
 3 files changed, 210 insertions(+), 18 deletions(-)
 create mode 100644 model_data/simulation_system/MLModel.py
 create mode 100644 model_data/simulation_system/predictions.py

diff --git a/model_data/simulation_system/MLModel.py b/model_data/simulation_system/MLModel.py
new file mode 100644
index 00000000..b5fdeb25
--- /dev/null
+++ b/model_data/simulation_system/MLModel.py
@@ -0,0 +1,135 @@
+"""
+MLModel class
+Key tasks:
+- Template Model class for different model types
+- Save model
+- Load Model
+- Generate Inference
+"""
+
+from pathlib import Path
+from typing import NamedTuple, Optional, Protocol
+import pandas as pd
+from autogluon.tabular import TabularPredictor
+
+# Every key that AutogluonModel.train_model reads from its hyperparameters dict;
+# 'output_path' is listed because it is passed on as TabularPredictor(path=...).
+AUTOGLUON_HYPERPARAMETERS = [
+    'problem_type',
+    'output_path',
+    'eval_metric',
+    'time_limit',
+    'presets',
+    'excluded_model_types',
+]
+
+
+class MLModel(Protocol):
+    '''
+    Base ML Model protocol
+    '''
+
+    def load_model(self, filepath: Path) -> None:
+        """
+        Providing a path, this function will load the model to be used. Will load to internal variable
+        """
+
+    def save_model(self, output_filepath: Path) -> None:
+        """
+        Providing a path, this function will save the model to be used.
+        """
+
+    def train_model(
+            self,
+            data: pd.DataFrame,
+            target_column: str,
+            hyperparameters: Optional[dict] = None) -> None:
+        """
+        For the given data and hyperparameters (specified to the model), a model is trained
+        """
+
+    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        For the given dataframe, model is loaded and predictions are generated
+        """
+
+    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
+        """
+        For any validation data, a set of predictions and metrics are returned
+        """
+
+
+class AutogluonModel(MLModel):
+    """
+    Autogluon model that implements the MLModel Protocol
+    """
+
+    def __init__(self) -> None:
+        self.model = None
+
+    def load_model(self, filepath: Path) -> None:
+        """
+        Providing a path, this function will load the model to be used. Will load to internal variable
+        """
+        self.model = TabularPredictor.load(path=str(filepath))
+
+    def save_model(self, output_filepath: Path) -> None:
+        """
+        Providing a path, this function will save the model to be used.
+
+        Not implemented yet: raises NotImplementedError instead of silently doing nothing.
+        """
+        # TODO: persist self.model to output_filepath
+        raise NotImplementedError("AutogluonModel.save_model is not implemented yet")
+
+    def train_model(
+            self,
+            data: pd.DataFrame,
+            target_column: str,
+            hyperparameters: Optional[dict] = None) -> None:
+        """
+        For the given data and hyperparameters, a model is trained
+
+        hyperparameters must contain exactly the keys in AUTOGLUON_HYPERPARAMETERS;
+        a ValueError naming the missing/unexpected keys is raised otherwise.
+        """
+        provided = set(hyperparameters or {})
+        required = set(AUTOGLUON_HYPERPARAMETERS)
+        if provided != required:
+            raise ValueError(
+                "Hyperparameters (dict) is incorrectly defined - "
+                f"missing: {sorted(required - provided)}, "
+                f"unexpected: {sorted(provided - required)}"
+            )
+
+        self.model = TabularPredictor(
+            label=target_column,
+            path=hyperparameters['output_path'],
+            problem_type=hyperparameters['problem_type'],
+            eval_metric=hyperparameters['eval_metric']
+        ).fit(
+            data,
+            time_limit=hyperparameters['time_limit'],
+            presets=hyperparameters['presets'],
+            excluded_model_types=hyperparameters['excluded_model_types']
+        )
+
+    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        For the given dataframe, predictions are generated with the loaded/trained model
+
+        Raises RuntimeError when no model has been loaded or trained.
+        """
+        if self.model is None:
+            raise RuntimeError("No model loaded/ trained")
+
+        return self.model.predict(data)
+
+    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
+        """
+        For any validation data, a set of predictions and metrics are returned
+
+        Not implemented yet: raises NotImplementedError instead of returning None.
+        """
+        # TODO: compute predictions and metrics for validation_data
+        raise NotImplementedError("AutogluonModel.model_evaluation is not implemented yet")
diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py
new file mode 100644
index 00000000..30584240
--- /dev/null
+++ b/model_data/simulation_system/predictions.py
@@ -0,0 +1,74 @@
+"""
+Script to load MLModel class and generate predictions
+"""
+
+import sys
+from Logger import logger
+from MLModel import AutogluonModel
+from DataLoader import DataLoader
+from pathlib import Path
+import pandas as pd
+from typing import Optional
+
+# These will be provided in some configuration setup
+HYPERPARAMETERS = {
+    'problem_type': 'regression',
+    'output_path': 'agModels-predictRDSAP',
+    'eval_metric': 'mean_absolute_error',
+    'time_limit': 8000,
+    'presets': 'best_quality',
+    'excluded_model_types': ['KNN']
+}
+
+
+def main(
+        model_path: Optional[str] = None,
+        data: Optional[pd.DataFrame] = None,
+        data_path: Optional[str] = None) -> None:
+    """
+    Main pipeline function
+
+    Loads the model stored at model_path and generates predictions for data,
+    or for the dataset loaded from data_path when data is not supplied.
+    Exits with status 1 when model_path is missing or when neither data nor
+    data_path is given.
+    """
+
+    if model_path is None:
+        logger.error("No model path provided")
+        sys.exit(1)
+
+    if data is None and data_path is None:
+        logger.error("No Data/Data Path passed")
+        sys.exit(1)
+
+    if data is None:
+        logger.info("--- Loading Data ---")
+        data = DataLoader.load(filepath=data_path)
+    elif data_path is not None:
+        # Both were supplied: the in-memory data takes precedence
+        logger.warning('Ignoring data_path and loading data provided')
+
+    logger.info("--- Loading Model ---")
+    model = AutogluonModel()
+    model.load_model(filepath=model_path)
+
+    # model.train_model(
+    #     data=data,
+    #     target_column='RDSAP_CHANGE',
+    #     hyperparameters=HYPERPARAMETERS
+    # )
+
+    logger.info("--- Generating Predictions ---")
+    prediction = model.generate_predictions(data=data)
+
+    # Save prediction some where?
+    prediction.to_csv("s3?")
+
+
+if __name__ == "__main__":
+    # For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
+    data = DataLoader.load(filepath="../simulation_system/preprocessed_data/dataset.parquet")
+    data_for_prediction = data.sample(1)
+
+    main(model_path="", data=data_for_prediction)
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index cde310a3..fec61a73 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -3,6 +3,7 @@ import pandas as pd
 import argparse
 from typing import List
 from Logger import logger
+from DataLoader import DataLoader
 
 from autogluon.tabular import TabularDataset, TabularPredictor
 
@@ -28,24 +29,6 @@ def ingest_arguments() -> argparse.Namespace:
 
     args = parser.parse_args()
     return args
-
-
-class DataLoader():
-
-    @staticmethod
-    def load(filepath: str) -> pd.DataFrame:
-        """
-        Load different datasets
-        """
-        if filepath.endswith('.parquet'):
-            df = pd.read_parquet(filepath)
-        elif filepath.endswith('.csv.'):
-            df = pd.read_csv(filepath)
-        else:
-            logger.error('Not implemented!')
-            exit(1)
-
-        return df
 
 class FeatureProcessor:
     """