added mlmodel, prediction and training files

This commit is contained in:
Michael Duong 2023-08-17 16:07:22 +01:00
parent a90a1278a8
commit 2a18180c53
15 changed files with 529 additions and 237 deletions

View file

@ -1,113 +0,0 @@
"""
MLModel class
Key tasks:
- Template Model class for different model types
- Save model
- Load Model
- Generate Inference
"""
from pathlib import Path
from typing import Protocol, NamedTuple
import pandas as pd
from autogluon import TabularPredictor
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
class MLModel(Protocol):
    """
    Base ML Model protocol.

    Structural interface for model wrappers: load, save, train, predict and
    evaluate. The method bodies are intentionally empty; concrete classes
    supply the behaviour.
    """

    def load_model(self, filepath: Path) -> None:
        """
        Providing a path, this function will load the model to be used. Will load to internal variable
        """

    def save_model(self, output_filepath: Path) -> None:
        """
        Providing a path, this function will save the model to be used.
        """

    def train_model(
            self,
            data: pd.DataFrame,
            target_column: str,
            hyperparameters: dict) -> None:
        """
        For the given data and hyperparameters (specific to the model), a model is trained.

        The parameter names match the concrete implementation in this module
        (``target_column`` / ``hyperparameters``) so keyword calls made
        through the protocol resolve against implementers.
        """

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        For the given dataframe, predictions are generated with the loaded/trained model
        """

    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
        """
        For any validation data, a set of predictions and metrics are returned
        """
class AutogluonModel(MLModel):
    """
    Autogluon model that implements the MLModel Protocol
    """

    def __init__(self) -> None:
        # Holds the TabularPredictor once a model is loaded or trained.
        self.model = None

    def load_model(self, filepath: Path) -> None:
        """
        Load a previously saved predictor from ``filepath`` into ``self.model``.
        """
        self.model = TabularPredictor.load(path=filepath)

    def save_model(self, output_filepath: Path) -> None:
        """
        Providing a path, this function will save the model to be used.

        Currently a no-op: nothing is written by this method.
        """

    def train_model(
            self,
            data: pd.DataFrame,
            target_column: str,
            hyperparameters: dict = None) -> None:
        """
        Train a TabularPredictor on ``data`` and keep it in ``self.model``.

        ``hyperparameters`` must contain exactly the keys listed in
        AUTOGLUON_HYPERPARAMETERS; ``output_path`` may be supplied in addition
        and is forwarded as the predictor's ``path``.

        Raises:
            SystemExit: with code 1 when the hyperparameters are missing or
                do not have the expected keys.
        """
        supplied_keys = set(hyperparameters or {})
        # 'output_path' is read below but is not part of the required list, so
        # it has to be excluded from the comparison or training can never start.
        if supplied_keys - {'output_path'} != set(AUTOGLUON_HYPERPARAMETERS):
            print("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
            raise SystemExit(1)

        predictor = TabularPredictor(
            label=target_column,
            path=hyperparameters.get('output_path'),
            problem_type=hyperparameters['problem_type'],
            eval_metric=hyperparameters['eval_metric']
        )
        self.model = predictor.fit(
            data,
            time_limit=hyperparameters['time_limit'],
            presets=hyperparameters['presets'],
            excluded_model_types=hyperparameters['excluded_model_types']
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Return the predictions of ``self.model`` for ``data``.

        Raises:
            SystemExit: with code 1 when no model has been loaded or trained.
        """
        if self.model is None:
            print("No model loaded/ trained")
            raise SystemExit(1)
        return self.model.predict(data)

    def model_evaluation(self, validation_data: pd.DataFrame) -> NamedTuple:
        """
        For any validation data, a set of predictions and metrics are returned.

        Not implemented yet: the method currently returns None.
        """

View file

@ -0,0 +1,56 @@
"""
BaseMLModel class
This is the base protocol:
- Any implementation will be its own separate file
Key tasks:
- Template Model class for different model types
- Save model
- Load Model
- Generate Inference
"""
from pathlib import Path
from typing import Protocol, NamedTuple
import pandas as pd
class MLModel(Protocol):
    """
    Base ML Model protocol.

    Structural interface for model wrappers. The method bodies are
    intentionally empty; concrete classes supply the behaviour.
    """

    def load_model(self, filepath: Path) -> None:
        """
        Providing a path, this function will load the model to be used. Will load to internal variable
        """

    def save_model(self, output_filepath: Path) -> None:
        """
        Providing a path, this function will save the model to be used.
        """

    def train_model(
        self,
        data: pd.DataFrame,
        target_column: str,
        hyperparameters: dict
    ) -> None:
        """
        For the given data and hyperparameters (specific to the model), a model is trained.

        The keyword is ``hyperparameters`` (plural) because that is the name
        the training pipeline passes.
        """

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        For the given dataframe, predictions are generated with the loaded/trained model
        """

    # NOTE(review): the AutoGluon implementation is annotated as returning a
    # pandas DataFrame rather than a NamedTuple - confirm which is intended.
    def model_evaluation(self, validation_data: pd.DataFrame, target_column: str, metrics_location: Path = None) -> NamedTuple:
        """
        For any validation data, a set of predictions and metrics are returned
        """

    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
        """
        Performance post-processing on the model to ensure it is ready for deployment.

        ``deployment_path`` is where the optimised copy should be placed; the
        return value is the location of that copy.
        """

View file

@ -0,0 +1,140 @@
"""
Different implementations of the MLModel Protocol
Uses the BaseMLModel protocol
Key tasks:
- Template Model class for different model types
- Save model
- Load Model
- Generate Inference
"""
from typing import NamedTuple
from pathlib import Path
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.metrics import mean_absolute_percentage_error
from core.Logger import logger
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
METRIC_FILENAME = "metrics.csv"
class AutogluonModel:
    """
    Autogluon model that implements the MLModel Protocol
    """

    def __init__(self, output_filepath: Path = None) -> None:
        # TabularPredictor instance once a model is loaded or trained.
        self.model = None
        # Directory handed to AutoGluon as the predictor path during training.
        self.output_filepath = output_filepath
        # Predictions produced by the most recent model_evaluation() call.
        self.predictions = None

    def load_model(self, filepath: Path) -> None:
        """
        Load a previously saved predictor from ``filepath`` into ``self.model``.
        """
        self.model = TabularPredictor.load(path=filepath)

    def save_model(self, output_filepath: Path = None) -> None:
        """
        Present for protocol compatibility; only logs a message.

        The predictor is created with ``path=self.output_filepath`` during
        training, so nothing is written here and ``output_filepath`` is unused.
        """
        logger.info("Using AutoGluon Model - Model saving already occured")

    def train_model(
            self,
            data: pd.DataFrame,
            target_column: str,
            hyperparameters: dict = None) -> None:
        """
        Train a TabularPredictor on ``data`` and keep it in ``self.model``.

        ``hyperparameters`` must contain exactly the keys listed in
        AUTOGLUON_HYPERPARAMETERS.

        Raises:
            SystemExit: with code 1 when no output path was configured or the
                hyperparameters are missing / have unexpected keys.
        """
        if self.output_filepath is None:
            logger.error("Please specify a output_filepath in order to train a model")
            raise SystemExit(1)

        # Guard against the default of None before inspecting the keys.
        if hyperparameters is None or set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters):
            logger.error("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
            raise SystemExit(1)

        training_data = TabularDataset(data=data)
        predictor = TabularPredictor(
            label=target_column,
            path=self.output_filepath,
            problem_type=hyperparameters['problem_type'],
            eval_metric=hyperparameters['eval_metric']
        )
        self.model = predictor.fit(
            training_data,
            time_limit=hyperparameters['time_limit'],
            presets=hyperparameters['presets'],
            excluded_model_types=hyperparameters['excluded_model_types']
        )

    def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Return the predictions of ``self.model`` for ``data``.

        NOTE(review): the return annotation says DataFrame; the object is
        whatever ``TabularPredictor.predict`` returns - confirm the type.

        Raises:
            SystemExit: with code 1 when no model has been loaded or trained.
        """
        if self.model is None:
            logger.error("No model loaded/ trained")
            raise SystemExit(1)
        return self.model.predict(data)

    def model_evaluation(
        self,
        validation_data: pd.DataFrame,
        target_column: str,
        metrics_location: Path = None,
        metric_filename: str = METRIC_FILENAME
    ) -> pd.DataFrame:
        """
        Evaluate the model on ``validation_data`` and persist the metrics.

        The predictor's own evaluation result is extended with a ``mape``
        entry, written as a one-row CSV to ``metrics_location / metric_filename``
        and returned as a DataFrame. The predictions used are kept on
        ``self.predictions``.

        Args:
            validation_data: data to evaluate on; must contain ``target_column``.
            target_column: name of the column holding the true values.
            metrics_location: folder for the metrics file; the current working
                directory is used when omitted.
            metric_filename: name of the CSV file to write.

        Raises:
            SystemExit: with code 1 when no model has been loaded or trained.
        """
        if metrics_location is None:
            logger.warning("Metrics will be outputted to current folder")
            # Make the warning true: previously None was dereferenced below.
            metrics_location = Path.cwd()
        else:
            metrics_location = Path(metrics_location)

        if self.model is None:
            logger.error("No model loaded/ trained - Unable to generate evaluation")
            raise SystemExit(1)

        performance = self.model.evaluate(validation_data)
        predictions = self.generate_predictions(validation_data)
        logger.info("Prediction used for evaluations are saved in self.predictions")
        self.predictions = predictions

        # TODO: Can have a custom metric class that defines all different metrics we want
        performance['mape'] = mean_absolute_percentage_error(validation_data[target_column], predictions)

        metrics_file = metrics_location / metric_filename
        logger.info(f"Saving metric file as {metrics_file}")
        # parents=True so a missing intermediate folder does not abort the run.
        metrics_location.mkdir(parents=True, exist_ok=True)
        metrics_df = pd.DataFrame([performance])
        metrics_df.to_csv(metrics_file)
        return metrics_df

    def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
        """
        Create a deployment copy of the trained predictor.

        Returns:
            The value returned by ``clone_for_deployment`` (a string path of
            the location of the copy).

        Raises:
            SystemExit: with code 1 when there is no model or no deployment path.
        """
        if self.model is None:
            logger.error("No model to optimise for deployment")
            raise SystemExit(1)

        if deployment_path is None:
            logger.error("Deployment path required")
            raise SystemExit(1)

        # This will return a string path of the location
        return self.model.clone_for_deployment(deployment_path)

View file

@ -0,0 +1,21 @@
import pandas as pd
from core.Logger import logger
class DataLoader:
    """
    Load tabular datasets from disk into pandas DataFrames.
    """

    @staticmethod
    def load(filepath: str, index_col: str = None) -> pd.DataFrame:
        """
        Load a dataset, choosing the reader from the file extension.

        Args:
            filepath: location of a ``.parquet`` or ``.csv`` file. Path-like
                objects are accepted as well as strings, and the extension is
                matched case-insensitively.
            index_col: optional column to use as the index of the result.

        Returns:
            The loaded DataFrame.

        Raises:
            SystemExit: with code 1 when the extension is not supported.
        """
        # str() lets callers pass pathlib.Path objects, which have no .endswith
        name = str(filepath).lower()

        if name.endswith('.parquet'):
            df = pd.read_parquet(filepath)
            if index_col is not None:
                df = df.set_index(index_col)
            return df

        if name.endswith('.csv'):
            return pd.read_csv(filepath, index_col=index_col)

        logger.error('Not implemented!')
        raise SystemExit(1)

View file

@ -2,7 +2,7 @@ from pathlib import Path
import numpy as np
import pandas as pd
from model_data.BaseUtility import BaseUtility
from simulation_system.Settings import (
from simulation_system.core.Settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS,

View file

@ -0,0 +1,70 @@
"""
Create additional features from the dataset
"""
import pandas as pd
from typing import List
from core.Logger import logger
# Columns removed before modelling, keyed by which response is being predicted:
# the identifier column plus the response that is NOT the current target.
RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
# Fixed seed so subsampling is reproducible between runs.
RANDOM_SEED = 0


class FeatureProcessor:
    """
    Handle all feature manipulation before modelling
    """

    @staticmethod
    def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame:
        """
        Remove the columns that must not be used when modelling ``target_column``.

        Any other ``target_column`` value leaves the frame untouched.
        """
        if target_column == "RDSAP_CHANGE":
            return df.drop(columns=RDSAP_CHANGE_DROP_COLUMNS)
        if target_column == "HEAT_DEMAND_CHANGE":
            return df.drop(columns=HEAT_DEMAND_CHANGE_DROP_COLUMNS)
        return df

    @staticmethod
    def retain_features(df: pd.DataFrame, features: List[str] = None) -> pd.DataFrame:
        """
        Determine which columns to keep for modelling.

        With ``features=None`` every column is kept.

        Raises:
            SystemExit: with code 1 when a requested feature is not a column
                of ``df``.
        """
        if features is None:
            features = df.columns
        elif not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            raise SystemExit(1)
        return df[features]

    @staticmethod
    def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
        """
        Sample data to reduce number of rows for model building if needed.

        A falsy ``subsample_amount`` (None or 0) returns ``df`` unchanged. A
        request for more rows than are available is capped at ``len(df)``
        instead of raising, since no reduction is needed in that case.
        """
        if not subsample_amount:
            return df
        sample_size = min(subsample_amount, len(df))
        return df.sample(sample_size, random_state=RANDOM_SEED)

    def process(
        self,
        df: pd.DataFrame,
        target_column: str = "RDSAP_CHANGE",
        features: List[str] = None,
        subsample_amount: int = None
    ) -> pd.DataFrame:
        """
        Pipeline to get data ready for building a model:
        subsample, drop unused columns, then keep the requested features.
        """
        df = self.subsample_data(df, subsample_amount=subsample_amount)
        df = self.drop_unused_columns(df, target_column=target_column)
        df = self.retain_features(df, features=features)
        return df

View file

@ -1,3 +1,7 @@
"""
Logger that will be used throughout the application
"""
import logging
def setup_logger():

View file

@ -1,5 +1,18 @@
# Using a simple Python file as settings for now
# TODO: migrate to dynaconf
from pathlib import Path
# Seed value imported by the data-splitting script
# (presumably passed as a random_state there - verify against that script).
RANDOM_SEED = 0
# File names the data-splitting script uses for its two output datasets.
TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
TEST_DATA_NAME = 'test_data.parquet'
# Model registry: a CSV file that lives inside the model directory.
REGISTRY_FILE = "model_registry.csv"
MODEL_DIRECTORY = "model_directory"
# Anchored to this settings file: the parent of the directory containing it,
# then MODEL_DIRECTORY / REGISTRY_FILE.
REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY / REGISTRY_FILE
# Relative output folder and file names used by the prediction script.
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = 'prediction.json'
METADATA_FILE = 'metadata.json'
# National-average constants; units are not stated here
# (presumably square metres and metres respectively - TODO confirm).
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45

View file

@ -1,5 +1,5 @@
from pathlib import Path
from Settings import (
from core.SettingsSettings import (
RDSAP_RESPONSE,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP,

View file

@ -1,23 +1,22 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import BaseUtility
from pathlib import Path
from model_data.simulation_system.Settings import (
from core.Settings import (
MANDATORY_FIXED_FEATURES,
AVERAGE_FIXED_FEATURES,
LATEST_FIELD,
COMPONENT_FEATURES,
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP
COLUMNS_TO_MERGE_ON
)
from DataProcessor import DataProcessor
from core.DataProcessor import DataProcessor
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
# TODO: Have a look at temporal features
def app():
# Get all the files in the directory
@ -77,9 +76,6 @@ def app():
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
# Take the more recent value since it's likely to be more accurate
vals = [vals[-1]]
if len(vals) == 0:
wrong_var
fixed_data[field] = np.mean(vals)

View file

@ -2,63 +2,127 @@
Script to load MLModel class and generate predictions
"""
from Logger import logger
from MLModel import AutogluonModel
from DataLoader import DataLoader
import json
import argparse
from MLModel.Models import AutogluonModel
from core.Logger import logger
from core.DataLoader import DataLoader
from pathlib import Path
import pandas as pd
from typing import Optional
from datetime import datetime
from core.Settings import (
REGISTRY_PATH,
PREDICTION_LOCATION,
PREDICTION_FILE,
METADATA_FILE
)
# These will be provided in some configuration setup
HYPERPARAMETERS = {
'problem_type': 'regression',
'output_path': 'agModels-predictRDSAP',
'eval_metric': 'mean_absolute_error',
'time_limit': 8000,
'presets': 'best_quality',
'excluded_model_types': ['KNN']
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
}
# FOR TESTING
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
# DATA = TEST_DATA.sample(1)
def main(model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
def ingest_arguments() -> argparse.Namespace:
    """
    Parse the command-line arguments of the prediction script.

    Returns:
        Namespace with ``model_path``, ``data`` and ``data_path``; each is
        None when the corresponding option was not supplied.
    """
    parser = argparse.ArgumentParser(description='Inputs for prediction script')
    parser.add_argument(
        '--model-path',
        type=str,
        help='If you wish to use a specific model, specify the model path here'
    )
    # The value is decoded with json.loads further down the pipeline.
    parser.add_argument(
        '--data',
        type=str,
        help='JSON string describing a single record to generate a prediction for'
    )
    parser.add_argument(
        '--data-path',
        type=str,
        help='Location of a Parquet/CSV dataset to load records for prediction from'
    )
    return parser.parse_args()
def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
"""
Main pipeline function

Selects a model (the explicit model_path if given, otherwise the registry
row flagged best_model), obtains the input data, generates predictions and
writes prediction and metadata JSON files under
PREDICTION_LOCATION / <uprn> / TIMESTAMP.

NOTE(review): this span is rendered from a diff, so lines of the previous
revision appear interleaved with the new ones; the notes below flag the
lines that look stale.
"""
# NOTE(review): logs without exiting and contradicts the registry fallback
# below - looks like a line pair from the previous revision; confirm.
if model_path is None:
logger.error("No model path provided")
if registry_path is None:
logger.error("No registry path provided")
exit(1)
if model_path is not None:
logger.info("User specified a model to load - ignoring registry")
# With an explicit path there is no registry row, so the path doubles as type and name.
model_location = model_path
model_type = model_path
model_name = model_path
else:
# TODO: Think about where registry will sit/ type
logger.info("Loading best model from registry")
registry_df = pd.read_csv(registry_path)
# Boolean-mask on the 'best_model' column, then take the first matching row.
best_model_df = registry_df[registry_df['best_model']]
model_location = best_model_df['model_location'].values[0]
model_type = best_model_df['model_type'].values[0]
model_name = best_model_df['model_name'].values[0]
logger.info("--- Model Info: ---")
logger.info(f"Model type: {model_type}")
logger.info(f"Model name: {model_name}")
logger.info(f"Model location: {model_location}")
logger.info("--- Loading Data ---")
if data is None and data_path is None:
logger.error("No Data/Data Path passed")
exit(1)
if data_path and data is None:
logger.info("--- Loading Data ---")
# NOTE(review): load() is called without a filepath here - presumably a
# line from the previous revision; confirm.
data = DataLoader().load()
logger.info("Loading data from provided path")
data = DataLoader().load(filepath=data_path, index_col="UPRN")
# TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION
data = data.sample(1)
else:
logger.warning('Ignoring data_path and loading data provided')
logger.info('Using data provided')
# NOTE(review): data is decoded as a JSON string here although the
# signature annotates it as a DataFrame.
data = json.loads(data)
data = pd.DataFrame([data])
print(data)
logger.info("--- Loading Model ---")
model = AutogluonModel()
model.load_model(filepath=model_path)
# model.train_model(
# data=data,
# target_column='RDSAP_CHANGE',
# hyperparameters=HYPERPARAMETERS
# )
model.load_model(filepath=model_location)
logger.info("--- Generating Predictions ---")
prediction = model.generate_predictions(data=data)
# Save prediction some where?
prediction.to_csv("s3?")
# prediction.to_csv("s3?")
# TODO: Check how we want to structure outputs
# For now, just categorise by uprn and timestamp
# Assume one uprn coming in for now
# NOTE(review): for JSON input the frame built above has a default integer
# index, so this is not a UPRN on that path; a non-string value would also
# be rejected by the Path "/" operator two lines below - confirm and convert.
uprn = data.index.values[0]
# Saving prediction local for now
logger.info("--- Outputting prediction and metadata --- ")
output_base = PREDICTION_LOCATION / uprn / TIMESTAMP
output_base.mkdir(parents=True, exist_ok=True)
# NOTE(review): pandas to_json returns None when given a target path, so
# json_prediction (the return value) is presumably always None - confirm.
json_prediction = prediction.to_json(output_base / PREDICTION_FILE)
prediction_metadata = {
"model_type": model_type,
"model_name": model_name,
"model_location": model_location,
"model_settings": model.model.info()
}
pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE)
return json_prediction
if __name__ == "__main__":
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
data = DataLoader.load(filepath="../simulation_system/preprocessed_data/dataset.parquet")
data_for_prediction = data.sample(1)
main(filepath="", data=data_for_prediction)
args = ingest_arguments()
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
prediction(registry_path=REGISTRY_PATH, model_path=args.model_path, data=args.data, data_path=args.data_path)

View file

@ -1,9 +1,12 @@
from Logger import logger
from core.Logger import logger
import argparse
import pandas as pd
from pathlib import Path
RANDOM_SEED = 0
from core.Settings import (
RANDOM_SEED,
TRAIN_AND_VALIDATION_DATA_NAME,
TEST_DATA_NAME
)
def ingest_arguments() -> argparse.Namespace:
"""
@ -56,8 +59,8 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
logger.info('--- Saving data ---')
train_validation_data.to_parquet(Path(output_folder)/'train_validation_data.parquet')
test_data.to_parquet(Path(output_folder)/'test_data.parquet')
train_validation_data.to_parquet(Path(output_folder)/ TRAIN_AND_VALIDATION_DATA_NAME)
test_data.to_parquet(Path(output_folder)/ TEST_DATA_NAME)
logger.info(' ---Pipeline complete---')

View file

@ -1,19 +1,37 @@
import os
import pandas as pd
import argparse
from pathlib import Path
from datetime import datetime
from typing import List
from Logger import logger
from DataLoader import DataLoader
from autogluon.tabular import TabularDataset, TabularPredictor
from core.Logger import logger
from core.DataLoader import DataLoader
from core.FeatureProcessor import FeatureProcessor
from MLModel.Models import AutogluonModel
import pandas as pd
from core.Settings import (
MODEL_DIRECTORY,
REGISTRY_PATH
)
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
FEATURE_COLUMNS = None
RANDOM_SEED = 0
# Can move to a hyperparameters file
# If anything we might want to have a file that can be loaded and sent to this script
HYPERPARAMETERS = {
'problem_type': 'regression',
'eval_metric': 'mean_absolute_error',
'time_limit': 60,
'presets': 'medium_quality',
'excluded_model_types': None
}
# FOR TESTING
train_filepath = "./model_build_data/train_validation_data.parquet"
test_filepath = "./model_build_data/test_data.parquet"
train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
target_column = "RDSAP_CHANGE"
model_type = "autogluon"
hyperparameter = HYPERPARAMETERS
subsample_factor = 200
def ingest_arguments() -> argparse.Namespace:
@ -23,98 +41,112 @@ def ingest_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Inputs for training script')
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE"], default='RDSAP_CHANGE')
args = parser.parse_args()
return args
class FeatureProcessor:
    """
    Handle all feature manipulation before modelling
    """

    @staticmethod
    def drop_columns(df: pd.DataFrame, drop_columns: List[str] = DROP_COLUMNS) -> pd.DataFrame:
        """
        Return ``df`` without the given columns.

        ``drop_columns`` may be a single column name or a list of names.
        """
        # The default is already a list, so it must not be wrapped in another list.
        columns = [drop_columns] if isinstance(drop_columns, str) else list(drop_columns)
        return df.drop(columns=columns)

    @staticmethod
    def retain_features(df: pd.DataFrame, features: List[str] = None) -> pd.DataFrame:
        """
        Determine which columns to keep for modelling.

        With ``features=None`` every column is kept.

        Raises:
            SystemExit: with code 1 when a requested feature is not a column
                of ``df``.
        """
        # Declared static because process() calls it through self with df as
        # the first argument.
        if features is None:
            features = df.columns
        elif not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            raise SystemExit(1)
        return df[features]

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop the module-level DROP_COLUMNS, then keep FEATURE_COLUMNS.
        """
        df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
        df = self.retain_features(df, features=FEATURE_COLUMNS)
        return df
# NOTE(review): this span is rendered from a diff, so lines of the previous
# revision appear interleaved with the new ones (two def headers, duplicated
# log calls, references to names such as top_features / predictor_RDSAP that
# are not assigned in the new code). The notes below apply to the new lines.
def training(train_filepath: str, test_filepath: str) -> None:
def training(
train_filepath: str,
test_filepath: str,
target_column: str = "RDSAP_CHANGE",
model_type: str = "autogluon",
hyperparameter: dict = HYPERPARAMETERS
) -> None:
"""
Pipeline to run training on the dataset

Steps visible in the new lines: load train/test data, run feature
processing, train and evaluate a model, create a deployment copy, then
append a row to the registry CSV and re-flag the best model.
"""
logger.info('Loading data')
logger.info('--- Loading data ---')
dataloader = DataLoader()
train_df = dataloader.load(filepath=train_filepath)
test_df = dataloader.load(filepath=test_filepath)
# df = pd.read_parquet(train_filepath).drop(columns=['HEAT_DEMAND_CHANGE'])
logger.info('Feature processing')
logger.info('--- Feature processing ---')
feature_processor = FeatureProcessor()
train_df = feature_processor.process(train_df)
test_df = feature_processor.process(test_df)
# logger.info('Split data into train and validation')
# subsample_factor is a module-level value, not a parameter of this function.
subsample_amount = round(len(train_df)/subsample_factor)
logger.info('Build Model')
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
test_df = feature_processor.process(test_df, target_column=target_column)
logger.info('--- Build Model ---')
if model_type == "autogluon":
# NOTE(review): the folder name is built from the module-level
# HYPERPARAMETERS, not from the hyperparameter argument that is used for
# training below - the two can differ; confirm which is intended.
model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower()
output_base = Path(MODEL_DIRECTORY) / model_type / model_root
model_folder = "model"
metrics_folder = "metrics"
model = AutogluonModel(
output_filepath = output_base / model_folder
)
else:
logger.error("No alternative model implemented yet")
exit(1)
data = TabularDataset(data=train_filepath)
data = data.drop(columns=['UPRN', 'HEAT_DEMAND_CHANGE'])
TOP_FEATURES = ['MAINHEAT', 'ROOF', 'WALLS', 'MAINHEATCONT', 'PHOTO', 'HOTWATER', 'SECONDHEAT']
# top_features = data.columns[data.columns.str.startswith(tuple(TOP_FEATURES))]
model.train_model(
data=train_df,
target_column=target_column,
hyperparameters=hyperparameter
)
logger.info("--- Save Model ---")
model.save_model(output_filepath=model.output_filepath)
data = data[['RDSAP_CHANGE'] + top_features.to_list()]
# data = TabularDataset(data=train_df)
# data['RDSAP_CHANGE'] = data['RDSAP_CHANGE'].astype(float)
subsample_size = round(len(data)/20)
data = data.sample(subsample_size, random_state=RANDOM_SEED)
logger.info('--- Generate evaluation metrics ---')
metrics_df = model.model_evaluation(
validation_data=test_df,
target_column=target_column,
metrics_location = output_base / metrics_folder
)
# TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
# Imagining for now that the model trained here is the best model amongst all models built
# Add custom metric class MAPE
# Have a look at temporal features
logger.info("--- Optimising model for deployment ---")
optimised_folder = "deployment"
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / optimised_folder)
# NOTE(review): the string below has no f prefix, so the placeholder text is
# logged literally instead of the path.
logger.info("Optimised version of best model can be found at: {deployment_model_path}")
target_column = 'RDSAP_CHANGE'
predictor_RDSAP = TabularPredictor(
label=target_column,
path="agModels-predictRDSAP",
problem_type="regression",
eval_metric='mean_absolute_error'
).fit(data, time_limit=200, presets='best_quality', excluded_model_types=['KNN'])
# TODO: Need a model registry - for now have this as a CSV
# Save this in the model directory
logger.info("--- Append registry with new model ---")
if REGISTRY_PATH.exists():
logger.info("Registry file found - Loading into Dataframe")
registry_df = pd.read_csv(REGISTRY_PATH, index_col=None)
else:
registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
model_details_df = pd.DataFrame(
[{
'model_type': model_type,
'model_name': model_root,
'model_location': deployment_model_path
}]
)
# Side-by-side join of the single details row with the metrics frame.
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
# TODO: decide metric to optimise to
# NOTE(review): descending sort means the largest mean_absolute_error value is
# flagged as best; that is only right if the stored value is "higher is
# better" (e.g. sign-flipped) - confirm against what evaluate() returns.
registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
registry_df['best_model'] = [False]*len(registry_df)
registry_df.loc[0, 'best_model'] = True
logger.info('Evaluate matrics')
logger.info("--- Saving new model to registry ---")
registry_df.to_csv(REGISTRY_PATH, index=False)
test_data = TabularDataset('./model_build_data/test_data.parquet')
performance = predictor_RDSAP.evaluate(test_data)
predictions = predictor_RDSAP.predict(test_data)
logger.info("--- Training Pipeline Complete --- ")
test_data['predictions'] = predictions
test_data['diff'] = abs(test_data['RDSAP_CHANGE'] - test_data['predictions'])
if __name__ == "__main__":
@ -123,4 +155,10 @@ if __name__ == "__main__":
logger.info('---Ingest Arguments---')
args = ingest_arguments()
training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
training(
train_filepath=args.train_filepath,
test_filepath=args.test_filepath,
target_column=args.target_column,
model_type=args.model_type
)