add protocol for ml class

This commit is contained in:
Michael Duong 2023-08-15 18:18:55 +01:00
parent 18673e3147
commit f0809966b7
3 changed files with 178 additions and 18 deletions

View file

@ -0,0 +1,113 @@
"""
MLModel class
Key tasks:
- Template Model class for different model types
- Save model
- Load Model
- Generate Inference
"""
from pathlib import Path
from typing import Protocol, NamedTuple
import pandas as pd
from autogluon import TabularPredictor
# Hyperparameter keys that AutogluonModel.train_model validates against.
# NOTE(review): train_model also reads 'output_path', which is not listed here.
AUTOGLUON_HYPERPARAMETERS = [
    'problem_type',
    'eval_metric',
    'time_limit',
    'presets',
    'excluded_model_types',
]
class MLModel(Protocol):
    """
    Structural interface that every concrete model wrapper follows.

    All methods here are declarations only: their bodies are empty, so a
    subclass that does not override one inherits a method returning None.
    """

    def load_model(self, filepath: Path) -> None:
        """Load the model stored at ``filepath`` into the instance."""
        ...

    def save_model(self, output_filepath: Path) -> None:
        """Persist the current model to ``output_filepath``."""
        ...

    # NOTE(review): AutogluonModel.train_model names these parameters
    # ``target_column`` / ``hyperparameters``; the names here differ.
    def train_model(
        self,
        data: pd.DataFrame,
        target: str,
        hyperparameter: dict,
    ) -> None:
        """Train a model on ``data`` using model-specific hyperparameters."""
        ...

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """Produce predictions for the rows of ``data``."""
        ...

    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
        """Return predictions and metrics computed on ``validation_data``."""
        ...
class AutogluonModel(MLModel):
    """
    AutoGluon-backed model that implements the MLModel protocol.

    The loaded or freshly trained ``TabularPredictor`` is held in
    ``self.model``; it stays ``None`` until ``load_model`` or
    ``train_model`` has run.
    """

    def __init__(self) -> None:
        self.model = None

    def load_model(self, filepath: Path) -> None:
        """
        Load a saved predictor from ``filepath`` into ``self.model``.

        Args:
            filepath: Location of the saved predictor (``str`` or ``Path``).
        """
        # The annotation says Path, but callers in this project pass plain
        # strings too; hand AutoGluon one predictable type.
        self.model = TabularPredictor.load(path=str(filepath))

    def save_model(self, output_filepath: Path) -> None:
        """
        Save the model to ``output_filepath``.

        NOTE(review): not implemented yet - this is currently a no-op and
        nothing is written.
        """

    def train_model(
            self,
            data: pd.DataFrame,
            target_column: str,
            hyperparameters: dict = None) -> None:
        """
        Fit a ``TabularPredictor`` on ``data`` and keep it in ``self.model``.

        Args:
            data: Training data, including the target column.
            target_column: Name of the label column in ``data``.
            hyperparameters: Must contain every key listed in
                ``AUTOGLUON_HYPERPARAMETERS``. ``'output_path'`` is read
                with ``.get`` and forwarded as the predictor's ``path``
                (``None`` if absent). Other extra keys are ignored.

        Raises:
            SystemExit: With status 1 when ``hyperparameters`` is missing
                or lacks a required key.
        """
        # The default is None; treat it as "nothing supplied" so the check
        # below reports it instead of crashing on None.keys().
        supplied = {} if hyperparameters is None else hyperparameters

        # Only *missing* keys are an error. The previous exact-equality check
        # rejected any dict carrying 'output_path', while the constructor call
        # below indexed that very key, so training could never succeed.
        missing = set(AUTOGLUON_HYPERPARAMETERS) - set(supplied)
        if missing:
            print("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
            print(f"Missing hyperparameters: {sorted(missing)}")
            # Same exit status as before, without relying on the `exit`
            # helper that the `site` module may not have installed.
            raise SystemExit(1)

        predictor = TabularPredictor(
            label=target_column,
            path=supplied.get('output_path'),
            problem_type=supplied['problem_type'],
            eval_metric=supplied['eval_metric'],
        )
        self.model = predictor.fit(
            data,
            time_limit=supplied['time_limit'],
            presets=supplied['presets'],
            excluded_model_types=supplied['excluded_model_types'],
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Run the held model over ``data`` and return its predictions.

        Raises:
            SystemExit: With status 1 when no model has been loaded or
                trained yet.
        """
        # NOTE(review): the return annotation is DataFrame; confirm against
        # what TabularPredictor.predict actually returns.
        if self.model is None:
            print("No model loaded/ trained")
            raise SystemExit(1)
        return self.model.predict(data)

    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
        """
        Return predictions and metrics for ``validation_data``.

        NOTE(review): not implemented yet - currently returns ``None``.
        """

View file

@ -0,0 +1,64 @@
"""
Script to load MLModel class and generate predictions
"""
from Logger import logger
from MLModel import AutogluonModel
from DataLoader import DataLoader
from pathlib import Path
import pandas as pd
from typing import Optional
# Placeholder settings; these are meant to come from configuration later.
# The keys match what AutogluonModel.train_model reads from its dict.
HYPERPARAMETERS = {
    'problem_type': 'regression',
    'output_path': 'agModels-predictRDSAP',
    'eval_metric': 'mean_absolute_error',
    'time_limit': 8000,
    'presets': 'best_quality',
    'excluded_model_types': ['KNN'],
}
def main(
        model_path: Optional[str] = None,
        data: Optional[pd.DataFrame] = None,
        data_path: Optional[str] = None):
    """
    Load a saved model and generate predictions for the given data.

    Args:
        model_path: Location of the saved model to load. Required.
        data: Data to predict on. Takes precedence over ``data_path``.
        data_path: File to load the data from when ``data`` is not given.

    Raises:
        SystemExit: With status 1 when ``model_path`` is missing, or when
            neither ``data`` nor ``data_path`` is supplied.
    """
    if model_path is None:
        logger.error("No model path provided")
        raise SystemExit(1)

    if data is None and data_path is None:
        logger.error("No Data/Data Path passed")
        raise SystemExit(1)

    if data is None:
        logger.info("--- Loading Data ---")
        # DataLoader.load is called with an explicit filepath elsewhere in
        # this script; the earlier call here passed none, so data_path was
        # never actually used.
        data = DataLoader.load(filepath=data_path)
    elif data_path:
        # Only warn when a path was really supplied alongside the data.
        logger.warning('Ignoring data_path and loading data provided')

    logger.info("--- Loading Model ---")
    model = AutogluonModel()
    model.load_model(filepath=model_path)

    # model.train_model(
    # data=data,
    # target_column='RDSAP_CHANGE',
    # hyperparameters=HYPERPARAMETERS
    # )

    logger.info("--- Generating Predictions ---")
    prediction = model.generate_predictions(data=data)

    # Save prediction some where?
    # TODO: "s3?" is a placeholder destination - as written this creates a
    # local file literally named "s3?".
    prediction.to_csv("s3?")
if __name__ == "__main__":
    # For now just loading data first and then passing into function
    # (i.e. as if we receive json data and convert to DataFrame)
    data = DataLoader.load(filepath="../simulation_system/preprocessed_data/dataset.parquet")
    data_for_prediction = data.sample(1)
    # main() has no `filepath` parameter; the model location is `model_path`.
    # TODO: the empty string is a placeholder - point it at a saved model.
    main(model_path="", data=data_for_prediction)

View file

@ -3,6 +3,7 @@ import pandas as pd
import argparse
from typing import List
from Logger import logger
from DataLoader import DataLoader
from autogluon.tabular import TabularDataset, TabularPredictor
@ -28,24 +29,6 @@ def ingest_arguments() -> argparse.Namespace:
args = parser.parse_args()
return args
class DataLoader():
    """Load tabular datasets from disk into pandas DataFrames."""

    @staticmethod
    def load(filepath: str) -> pd.DataFrame:
        """
        Read the dataset at ``filepath`` into a DataFrame.

        The reader is chosen from the file extension: ``.parquet`` goes to
        ``pd.read_parquet`` and ``.csv`` to ``pd.read_csv``.

        Args:
            filepath: Path of the file to read.

        Returns:
            The loaded DataFrame.

        Raises:
            SystemExit: With status 1 (after logging an error) when the
                extension is neither ``.parquet`` nor ``.csv``.
        """
        # Accept path-like objects as well as plain strings.
        path = str(filepath)

        if path.endswith('.parquet'):
            return pd.read_parquet(path)

        # The suffix used to be spelled '.csv.' (stray trailing dot), so
        # ordinary CSV files were rejected as "Not implemented!".
        if path.endswith('.csv'):
            return pd.read_csv(path)

        logger.error('Not implemented!')
        raise SystemExit(1)
class FeatureProcessor:
"""