add protocol for ml class

2026-06-08 11:17:27 +00:00 · 2023-08-15 18:18:55 +01:00 · 2023-08-15 18:18:55 +01:00 · f0809966b7
commit f0809966b7
parent 18673e3147
3 changed files with 178 additions and 18 deletions
--- a/model_data/simulation_system/MLModel.py
+++ b/model_data/simulation_system/MLModel.py
@ -0,0 +1,113 @@
 """
 MLModel class
 Key tasks:
 - Template Model class for different model types
 - Save model
 - Load Model 
 - Generate Inference
 """
 from pathlib import Path
 from typing import Protocol, NamedTuple
 import pandas as pd
 from autogluon import TabularPredictor
 # Keys that AutogluonModel.train_model expects in its `hyperparameters` dict.
 # 'output_path' is listed because train_model reads it for the predictor's save path.
 AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types', 'output_path']
 class MLModel(Protocol):
    """
    Base ML Model protocol.

    Describes the operations a model wrapper exposes: loading, saving,
    training, prediction and evaluation. The methods here carry no
    behaviour; implementing classes provide it.
    """
    def load_model(self, filepath: Path) -> None:
        """
        Load a model from `filepath` and keep it on the instance for later use.
        """
        ...

    def save_model(self, output_filepath: Path) -> None:
        """
        Save the currently held model to `output_filepath`.
        """
        ...

    def train_model(
            self,
            data: pd.DataFrame,
            target_column: str,
            hyperparameters: dict) -> None:
        """
        Train a model on `data`, predicting the column named `target_column`.

        `hyperparameters` is a model-specific dict of settings. Parameter
        names match the implementing class so keyword calls work against
        either the protocol or the implementation.
        """
        ...

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Generate predictions for the rows of `data` using the held model.
        """
        ...

    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
        """
        Return predictions and metrics computed on `validation_data`.
        """
        ...
 class AutogluonModel(MLModel):
    """
    Autogluon model that implements the MLModel Protocol.

    Holds a `TabularPredictor` in `self.model`; the attribute is None until
    `load_model` or `train_model` has been called successfully.
    """
    # NOTE(review): this module imports the predictor with
    # `from autogluon import TabularPredictor`; AutoGluon documents the import as
    # `from autogluon.tabular import TabularPredictor` - confirm the import resolves.

    def __init__(self) -> None:
        # No predictor is available until one is loaded or trained.
        self.model = None

    def load_model(self, filepath: Path) -> None:
        """
        Load a saved predictor from `filepath` into `self.model`.
        """
        # str() lets callers pass either a str or a Path.
        self.model = TabularPredictor.load(path=str(filepath))

    def save_model(self, output_filepath: Path) -> None:
        """
        Placeholder - not implemented yet; calling it currently does nothing.
        """

    def train_model(
            self,
            data: pd.DataFrame,
            target_column: str,
            hyperparameters: dict = None) -> None:
        """
        Fit a TabularPredictor on `data` and store it in `self.model`.

        `hyperparameters` must contain every key in AUTOGLUON_HYPERPARAMETERS.
        'output_path' (where the predictor is written) is also accepted; when
        it is absent, None is passed as the predictor path.

        Prints a message and exits with status 1 (SystemExit) when
        `hyperparameters` is None, misses a required key, or has an unknown key.
        """
        supplied = set(hyperparameters or {})
        required = set(AUTOGLUON_HYPERPARAMETERS)
        # 'output_path' is read below, so it is always a legal key even if the
        # required list does not name it.
        permitted = required | {'output_path'}
        missing = required - supplied
        unexpected = supplied - permitted
        if missing or unexpected:
            print("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
            print(f"Missing keys: {sorted(missing)}; unexpected keys: {sorted(unexpected)}")
            raise SystemExit(1)
        self.model = TabularPredictor(
            label=target_column,
            path=hyperparameters.get('output_path'),
            problem_type=hyperparameters['problem_type'],
            eval_metric=hyperparameters['eval_metric']
            ).fit(
            data,
            time_limit=hyperparameters['time_limit'],
            presets=hyperparameters['presets'],
            excluded_model_types=hyperparameters['excluded_model_types']
            )

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Return the predictions of `self.model` for `data`.

        Prints a message and exits with status 1 (SystemExit) when no model
        has been loaded or trained.
        """
        if self.model is None:
            print("No model loaded/ trained")
            raise SystemExit(1)
        return self.model.predict(data)

    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
        """
        Placeholder - not implemented yet; currently returns None.
        """
--- a/model_data/simulation_system/predictions.py
+++ b/model_data/simulation_system/predictions.py
@ -0,0 +1,64 @@
 """
 Script to load MLModel class and generate predictions
 """
 from Logger import logger
 from MLModel import AutogluonModel
 from DataLoader import DataLoader
 from pathlib import Path
 import pandas as pd
 from typing import Optional
 # Stand-in values; the intention is for a configuration setup to supply these.
 HYPERPARAMETERS = dict(
    problem_type='regression',
    output_path='agModels-predictRDSAP',
    eval_metric='mean_absolute_error',
    time_limit=8000,
    presets='best_quality',
    excluded_model_types=['KNN'],
 )
 def main(model_path: Optional[str] = None, data: Optional[pd.DataFrame] = None, data_path: Optional[str] = None):
    """
    Main pipeline function: load a model, obtain data, write predictions.

    model_path: location of the saved model to load (required).
    data:       rows to predict on; takes precedence over `data_path`.
    data_path:  file to load with DataLoader when `data` is not supplied.

    Logs an error and exits with status 1 (SystemExit) when `model_path` is
    missing/empty or when neither `data` nor `data_path` is supplied.
    """
    if not model_path:
        logger.error("No model path provided")
        raise SystemExit(1)
    if data is None and not data_path:
        logger.error("No Data/Data Path passed")
        raise SystemExit(1)
    if data is None:
        logger.info("--- Loading Data ---")
        # DataLoader.load is a static method that needs the file location.
        data = DataLoader.load(filepath=data_path)
    elif data_path:
        # Only warn when a path was actually given and is being overridden.
        logger.warning('Ignoring data_path and loading data provided')
    logger.info("--- Loading Model ---")
    model = AutogluonModel()
    model.load_model(filepath=model_path)
    # To train here instead of loading, call
    # model.train_model(data=data, target_column='RDSAP_CHANGE', hyperparameters=HYPERPARAMETERS)
    logger.info("--- Generating Predictions ---")
    prediction = model.generate_predictions(data=data)
    # TODO: decide where predictions are stored; "s3?" is a placeholder file name.
    prediction.to_csv("s3?")
 if __name__ == "__main__":
    # For now the data is loaded first and then passed into main, i.e. as if
    # JSON data had been received and converted to a DataFrame.
    data = DataLoader.load(filepath="../simulation_system/preprocessed_data/dataset.parquet")
    data_for_prediction = data.sample(1)
    # main() takes the model location as `model_path`; it has no `filepath` parameter.
    # TODO: supply the real model directory - an empty path cannot be loaded.
    main(model_path="", data=data_for_prediction)
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@ -3,6 +3,7 @@ import pandas as pd
 import argparse
 from typing import List
 from Logger import logger
 from DataLoader import DataLoader
 from autogluon.tabular import TabularDataset, TabularPredictor
@ -28,24 +29,6 @@ def ingest_arguments() -> argparse.Namespace:
    args = parser.parse_args()
    return args
 class DataLoader():
    """
    Loads a dataset file into a pandas DataFrame based on its extension.
    """
    @staticmethod
    def load(filepath: str) -> pd.DataFrame:
        """
        Load different datasets.

        Files ending in '.parquet' are read with pandas.read_parquet and files
        ending in '.csv' with pandas.read_csv. Any other extension logs an
        error and exits with status 1 (SystemExit).
        """
        # Accept path-like objects as well as plain strings.
        location = str(filepath)
        if location.endswith('.parquet'):
            return pd.read_parquet(location)
        if location.endswith('.csv'):
            return pd.read_csv(location)
        logger.error('Not implemented!')
        raise SystemExit(1)
 class FeatureProcessor:
    """