From f0809966b732af5903ae9f4bee45b1d7d76ea3cc Mon Sep 17 00:00:00 2001
From: Michael Duong <michaelduong@Michaels-MacBook-Pro.local>
Date: Tue, 15 Aug 2023 18:18:55 +0100
Subject: [PATCH] add protocol for ml class

---
 model_data/simulation_system/MLModel.py     | 113 ++++++++++++++++++++
 model_data/simulation_system/predictions.py |  64 +++++++++++
 model_data/simulation_system/training.py    |  19 +---
 3 files changed, 178 insertions(+), 18 deletions(-)
 create mode 100644 model_data/simulation_system/MLModel.py
 create mode 100644 model_data/simulation_system/predictions.py

diff --git a/model_data/simulation_system/MLModel.py b/model_data/simulation_system/MLModel.py
new file mode 100644
index 00000000..b5fdeb25
--- /dev/null
+++ b/model_data/simulation_system/MLModel.py
@@ -0,0 +1,113 @@
+"""
+MLModel class
+Key tasks:
+- Template model class for different model types
+- Save model
+- Load model
+- Generate inference
+"""
+
+from pathlib import Path
+from typing import Protocol, NamedTuple
+import pandas as pd
+from autogluon.tabular import TabularPredictor
+
+AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'output_path', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']  # every key train_model reads
+
+class MLModel(Protocol):
+    """
+    Base ML model protocol: the set of methods every model wrapper must provide
+    """
+
+    def load_model(self, filepath: Path) -> None:
+        """
+        Load the model stored at `filepath` into the instance so it can be used afterwards
+        """
+
+    def save_model(self, output_filepath: Path) -> None:
+        """
+        Save the currently held model to `output_filepath`
+        """
+
+    def train_model(
+            self,
+            data: pd.DataFrame,
+            target_column: str,
+            hyperparameters: dict) -> None:
+        """
+        Train a model on `data` to predict `target_column`, using model-specific `hyperparameters`
+        """
+
+    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        Generate predictions for the rows of `data` with the loaded/trained model
+        """
+
+    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
+        """
+        Return predictions and metrics for `validation_data`
+        """
+
+class AutogluonModel(MLModel):
+    """
+    MLModel implementation that keeps an AutoGluon TabularPredictor in `self.model`
+    """
+    def __init__(self) -> None:
+        self.model = None
+
+    def load_model(self, filepath: Path) -> None:
+        """
+        Load the TabularPredictor saved at `filepath` and keep it in `self.model`
+        """
+        self.model = TabularPredictor.load(path=filepath)
+
+    def save_model(self, output_filepath: Path) -> None:
+        """
+        Not implemented yet: the body is empty, so calling this currently saves nothing
+        """
+
+    def train_model(
+            self,
+            data: pd.DataFrame,
+            target_column: str,
+            hyperparameters: dict = None) -> None:
+        """
+        Fit a TabularPredictor on `data` with `target_column` as label and keep it in `self.model`.
+        `hyperparameters` must hold 'output_path' plus every key in AUTOGLUON_HYPERPARAMETERS;
+        otherwise SystemExit is raised with a message listing the missing keys.
+        """
+        # 'output_path' is read below, so it is required too; `or {}` covers the None default
+        required_keys = set(AUTOGLUON_HYPERPARAMETERS) | {'output_path'}
+        missing_keys = required_keys - set(hyperparameters or {})
+        if missing_keys:
+            raise SystemExit(f"Hyperparameters (dict) is incorrectly defined - missing: {sorted(missing_keys)}")
+
+        self.model = TabularPredictor(
+            label=target_column,
+            path=hyperparameters['output_path'],
+            problem_type=hyperparameters['problem_type'],
+            eval_metric=hyperparameters['eval_metric']
+            ).fit(
+            data,
+            time_limit=hyperparameters['time_limit'],
+            presets=hyperparameters['presets'],
+            excluded_model_types=hyperparameters['excluded_model_types']
+            )
+
+    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        Return `self.model.predict(data)`; raises SystemExit when no model was loaded or trained
+        """
+        if self.model is None:
+            # raise instead of calling the site-provided `exit` helper; the process still exits non-zero
+            raise SystemExit("No model loaded/ trained")
+
+        predictions = self.model.predict(data)
+
+        return predictions
+
+    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
+        """
+        Not implemented yet: the body is empty, so this currently returns None
+        """
+
diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py
new file mode 100644
index 00000000..30584240
--- /dev/null
+++ b/model_data/simulation_system/predictions.py
@@ -0,0 +1,64 @@
+"""
+Script to load MLModel class and generate predictions
+"""
+
+from Logger import logger
+from MLModel import AutogluonModel
+from DataLoader import DataLoader
+from pathlib import Path
+import pandas as pd
+from typing import Optional
+
+# Hard-coded training hyperparameters (to be supplied by configuration later); keys mirror what AutogluonModel.train_model reads
+HYPERPARAMETERS = {
+    'problem_type': 'regression',
+    'output_path': 'agModels-predictRDSAP',
+    'eval_metric': 'mean_absolute_error', 
+    'time_limit': 8000,  # presumably seconds (forwarded to TabularPredictor.fit) - confirm
+    'presets': 'best_quality', 
+    'excluded_model_types': ['KNN']
+
+}
+
+def main(model_path: Optional[str] = None, data: Optional[pd.DataFrame] = None, data_path: Optional[str] = None):
+    """
+    Load the model at `model_path`, generate predictions for the input data and write them to CSV.
+
+    `data` takes precedence: `data_path` is only read (via DataLoader.load) when `data` is None.
+    Logs an error and exits with status 1 when `model_path` or both data sources are missing.
+    """
+
+    if model_path is None:
+        logger.error("No model path provided")
+        raise SystemExit(1)
+
+    if data is None and data_path is None:
+        logger.error("No Data/Data Path passed")
+        raise SystemExit(1)
+
+    if data is None:
+        logger.info("--- Loading Data ---")
+        # the loader needs the path; it was previously called with no arguments
+        data = DataLoader.load(filepath=data_path)
+    elif data_path is not None:
+        # warn only when a path was actually supplied alongside the data
+        logger.warning('Ignoring data_path and loading data provided')
+
+    logger.info("--- Loading Model ---")
+    model = AutogluonModel()
+    model.load_model(filepath=model_path)
+
+    logger.info("--- Generating Predictions ---")
+    prediction = model.generate_predictions(data=data)
+
+    # TODO: output destination is undecided; "s3?" is a placeholder and is written
+    # as a local file with that literal name
+    prediction.to_csv("s3?")
+
+
+if __name__ == "__main__":
+    # For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
+    data = DataLoader.load(filepath="../simulation_system/preprocessed_data/dataset.parquet")
+    data_for_prediction = data.sample(1)
+    # main() takes `model_path`, not `filepath`; the empty string is still a placeholder path
+    main(model_path="", data=data_for_prediction)
\ No newline at end of file
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index cde310a3..fec61a73 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -3,6 +3,7 @@ import pandas as pd
 import argparse
 from typing import List
 from Logger import logger
+from DataLoader import DataLoader
 from autogluon.tabular import TabularDataset, TabularPredictor
 
 
@@ -28,24 +29,6 @@ def ingest_arguments() -> argparse.Namespace:
     args = parser.parse_args()
 
     return args
-
-
-class DataLoader():
-
-    @staticmethod
-    def load(filepath: str) -> pd.DataFrame:
-        """
-        Load different datasets
-        """
-        if filepath.endswith('.parquet'):
-            df = pd.read_parquet(filepath)
-        elif filepath.endswith('.csv.'):
-            df = pd.read_csv(filepath)
-        else:
-            logger.error('Not implemented!')
-            exit(1)
-
-        return df
     
 class FeatureProcessor:
     """