handling relative paths for autogluon

This commit is contained in:
Khalim Conn-Kowlessar 2023-08-25 16:29:24 +01:00
parent 67fd184ac5
commit 2ff57a83ed
5 changed files with 43 additions and 22 deletions

View file

@ -122,17 +122,15 @@ class AutogluonModel:
return metrics_df
def optimise_model_for_deployment(self, deployment_path: Path = None) -> None:
def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
"""
We can optimise the deployment for an autogluon model
"""
if self.model is None:
logger.error("No model to optimise for deployment")
exit(1)
raise ValueError("No model to optimise for deployment")
if deployment_path is None:
logger.error("Deployment path required")
exit(1)
raise ValueError("Deployment path required")
# This will return a string path of the location
return self.model.clone_for_deployment(deployment_path)

View file

@ -0,0 +1,17 @@
from pathlib import Path
def ensure_relative_path(file_path: str, relative_to: str | Path = None) -> Path:
"""
Convert the given path to a relative path.
:param file_path: The path to check and possibly convert.
:param relative_to: Optional path to which the given path should be made relative.
If not provided, the current working directory is used.
:return: The relative path.
"""
path = Path(file_path)
if path.is_absolute():
base_path = Path(relative_to) if relative_to else Path.cwd()
return path.relative_to(base_path)
return path

View file

@ -4,14 +4,13 @@ Script to load MLModel class and generate predictions
import json
import argparse
from MLModel.Models import AutogluonModel
from core.Logger import logger
from core.DataLoader import DataLoader
from pathlib import Path
from model_data.simulation_system.MLModel.Models import AutogluonModel
from model_data.simulation_system.core.Logger import logger
from model_data.simulation_system.core.DataLoader import DataLoader
import pandas as pd
from typing import Optional
from datetime import datetime
from core.Settings import (
from model_data.simulation_system.core.Settings import (
BASE_REGISTRY_PATH,
REGISTRY_FILE,
PREDICTION_LOCATION,
@ -19,10 +18,12 @@ from core.Settings import (
METADATA_FILE
)
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# FOR TESTING
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
# DataFrame)
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
# DATA = TEST_DATA.sample(1)
@ -33,18 +34,20 @@ def ingest_arguments() -> argparse.Namespace:
"""
parser = argparse.ArgumentParser(description='Inputs for training script')
parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
parser.add_argument('--target-column', type=str, help='The response variable you are predicting for',
choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
parser.add_argument('--model-path', type=str,
help='If you wish to use a specific model, specify the model path here')
parser.add_argument('--data', type=str, help='Json data for predictions')
parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
args = parser.parse_args()
return args
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None,
data_path: Optional[str] = None):
"""
Main pipeline function
"""
@ -93,6 +96,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
logger.info("--- Loading Model ---")
model = AutogluonModel()
model.load_model(filepath=model_location)
logger.info("--- Generating Predictions ---")
@ -125,10 +129,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
return json_prediction
if __name__ == "__main__":
if __name__ == "__main__":
args = ingest_arguments()
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
# Data path can be passed as so: python3 predictions.py --data-path
# ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)

View file

@ -1,16 +1,13 @@
import argparse
# import boto3
import os
from pathlib import Path
from datetime import datetime
from typing import List
from model_data.simulation_system.core.Logger import logger
from model_data.simulation_system.core.DataLoader import DataLoader
from model_data.simulation_system.core.FeatureProcessor import FeatureProcessor
from model_data.simulation_system.MLModel.Models import AutogluonModel
import pandas as pd
from model_data.simulation_system.core.Settings import (
MODEL_DIRECTORY,
BASE_REGISTRY_PATH,
REGISTRY_FILE,
MODEL_FOLDER,
@ -19,6 +16,7 @@ from model_data.simulation_system.core.Settings import (
SUBSAMPLE_FACTOR,
MODEL_HYPERPARAMETERS
)
from model_data.simulation_system.core.Helpers import ensure_relative_path
import seaborn as sns
import matplotlib.pyplot as plt
@ -159,6 +157,9 @@ def training(
logger.info("--- Optimising model for deployment ---")
deployment_model_path = model.optimise_model_for_deployment(deployment_path=output_base / DEPLOYMENT_FOLDER)
# Autogluon requires models to be stored at relative paths. This will likely eventually be S3; for now, we
# make sure the path is relative to the location of this script.
deployment_model_path = ensure_relative_path(deployment_model_path, Path(__file__).parent)
logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
# TODO: Need a model registry - for now have this as a CSV