mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
193 lines
6.2 KiB
Python
193 lines
6.2 KiB
Python
"""
Different implementations of the MLModel Protocol
Uses the BaseMLModel protocol
Key tasks:
- Template Model class for different model types
- Save model
- Load model
- Generate inference
"""
|
|
|
|
from typing import Any
|
|
from pathlib import Path
|
|
import pandas as pd
|
|
from autogluon.tabular import TabularDataset, TabularPredictor
|
|
from core.Logger import logger
|
|
from core.Metrics import Metrics
|
|
from core.Settings import METRIC_FILENAME
|
|
from core.CloudClient import S3FSClient
|
|
|
|
# Keys that AutogluonModel.train_model expects in its ``hyperparameters`` dict;
# the provided key set is compared against exactly this collection.
AUTOGLUON_HYPERPARAMETERS = [
    "problem_type", "eval_metric", "time_limit", "presets", "excluded_model_types",
]
|
|
|
|
|
|
def model_factory(model_type: str, hyperparameters: dict) -> dict:
    """
    Factory lookup for the registered ML implementations.

    Returns the registry entry for ``model_type``: a dict holding the model
    class under "model" and a "<presets>-<time_limit>" label under
    "naming_attributes".

    Raises KeyError when ``hyperparameters`` lacks "presets" or "time_limit",
    or when ``model_type`` is not a registered key.
    """
    # Read in the same order the label is built: presets first, then time_limit.
    presets = hyperparameters["presets"]
    time_limit = hyperparameters["time_limit"]

    registry = {
        "autogluon": {
            "model": AutogluonModel,
            "naming_attributes": f"{presets}-{time_limit}",
        },
    }
    return registry[model_type]
|
|
|
|
|
|
class AutogluonModel:
    """
    Autogluon model that implements the MLModel Protocol

    Instance state:
    - model: the TabularPredictor set by load_model / train_model (None until then)
    - output_filepath: path handed to TabularPredictor when training
    - predictions: predictions from the most recent model_evaluation call
    """

    def __init__(self, output_filepath: Path | None = None) -> None:
        self.model = None
        self.output_filepath = output_filepath
        self.predictions = None

    def load_model(
        self,
        filepath: str | Path,
        s3_client: S3FSClient,
        model_folder: str = "local_model",
    ) -> None:
        """
        Providing a path, this function will load the model to be used. Will load to internal variable

        When ``s3_client.client`` is None the predictor is loaded directly from
        ``filepath``. Otherwise ``s3_client.download_model`` is called first and
        the predictor is loaded from ``model_folder``.
        """
        filepath = str(filepath)
        if s3_client.client is None:
            logger.info("In local development mode - no need for s3 client")
            self.model = TabularPredictor.load(path=filepath)
        else:
            logger.info("Loading model from s3")
            # Presumably copies the remote model into model_folder - confirm in S3FSClient.
            s3_client.download_model(filepath=filepath, model_folder=model_folder)
            self.model = TabularPredictor.load(path=model_folder)

    def save_model(self, output_filepath: Path, s3fs_client: S3FSClient) -> None:
        """
        Providing a path, this function will save the model to be used.

        When ``s3fs_client.client`` is None nothing is copied (only log lines are
        emitted). Otherwise ``output_filepath`` is uploaded recursively to
        "<model_bucket>/<output_filepath>".
        """
        if s3fs_client.client is None:
            logger.info("In local development mode - no need for s3 client")
            logger.info("Using AutoGluon Model - Model saving already occured")
        else:
            logger.info("Saving model into s3")
            local_path = str(output_filepath)
            s3_location = s3fs_client.model_bucket + "/" + local_path
            s3fs_client.client.put(local_path, s3_location, recursive=True)
            logger.info("Save complete")

    def train_model(
        self, data: pd.DataFrame, target_column: str, hyperparameters: dict
    ) -> None:
        """
        For the given data and hyperparameters, a model is trained

        ``hyperparameters`` must contain exactly the keys listed in
        AUTOGLUON_HYPERPARAMETERS. The fitted predictor is stored on ``self.model``.

        Raises SystemExit(1) when ``self.output_filepath`` is None or when the
        hyperparameter keys do not match.
        """
        if self.output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            # SystemExit raised directly: same outcome as exit(1) without relying
            # on the site-provided helper.
            raise SystemExit(1)

        expected_keys = set(AUTOGLUON_HYPERPARAMETERS)
        provided_keys = set(hyperparameters)
        if provided_keys != expected_keys:
            missing = sorted(expected_keys - provided_keys, key=str)
            unexpected = sorted(provided_keys - expected_keys, key=str)
            logger.error(
                "Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required"
                f" (missing: {missing}, unexpected: {unexpected})"
            )
            raise SystemExit(1)

        AGdata = TabularDataset(data=data)

        self.model = TabularPredictor(
            label=target_column,
            path=self.output_filepath,
            problem_type=hyperparameters["problem_type"],
            eval_metric=hyperparameters["eval_metric"],
        ).fit(
            AGdata,
            time_limit=hyperparameters["time_limit"],
            presets=hyperparameters["presets"],
            excluded_model_types=hyperparameters["excluded_model_types"],
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.Series:
        """
        For the given dataframe, predictions are generated with the loaded/trained model

        Raises SystemExit(1) when no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No model loaded/ trained")
            raise SystemExit(1)

        return pd.Series(self.model.predict(data))

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics: Metrics,
        metrics_location: Path | None = None,
        metric_filename: str = METRIC_FILENAME,
    ) -> pd.DataFrame:
        """
        For any validation data, a set of predictions and metrics are returned

        Predictions are kept on ``self.predictions``. The metrics are written to
        ``metrics_location`` (current folder when None) as ``metric_filename``
        plus a markdown copy with the suffix replaced by ".md", and returned as
        a one-row DataFrame.

        Raises SystemExit(1) when no model has been loaded or trained.
        """
        if metrics_location is None:
            logger.warning("Metrics will be outputted to current folder")
            metrics_location = Path()

        if self.model is None:
            logger.error("No model loaded/ trained - Unable to generate evaluation")
            raise SystemExit(1)

        # Generate prediction, load metrics suite, generate metrics between the two
        predictions = self.generate_predictions(validation_data)

        performance = metrics.generate_metric_suite(
            actuals=validation_data[target_column], predictions=predictions
        )

        logger.info("Prediction used for evaluations are saved in self.predictions")
        self.predictions = predictions

        csv_path = metrics_location / metric_filename
        # with_suffix swaps only the final extension, so names containing extra
        # dots (or a leading "./") keep the rest of their stem.
        markdown_path = csv_path.with_suffix(".md")

        logger.info(f"Saving metric file as {csv_path}")
        # parents=True so a nested, not-yet-existing output folder is created too.
        metrics_location.mkdir(parents=True, exist_ok=True)

        metrics_df = pd.DataFrame([performance])
        metrics_df.to_csv(csv_path)
        metrics_df.to_markdown(markdown_path)

        return metrics_df

    def optimise_model_for_deployment(
        self, deployment_path: Path | str | None = None
    ) -> Any:
        """
        We can optimise the deployment for a autogluon model

        Returns whatever ``clone_for_deployment`` returns for ``deployment_path``.

        Raises ValueError when no model is available or no deployment path is given.
        """
        if self.model is None:
            raise ValueError("No model to optimise for deployment")

        if deployment_path is None:
            raise ValueError("Deployment path required")

        # This will return a string path of the location
        return self.model.clone_for_deployment(str(deployment_path))

    def model_metadata(self) -> dict[str, Any]:
        """
        For Autogluon model, use the inbuilt model info method

        Raises SystemExit(1) when no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No Model loaded/ trained")
            raise SystemExit(1)

        return self.model.info()
|