mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
add pre-commit hook, no customisation on black
This commit is contained in:
parent
6956a80707
commit
1e1cf60543
5 changed files with 267 additions and 198 deletions
|
|
@ -5,7 +5,7 @@ This is the base protocol:
|
|||
Key tasks:
|
||||
- Template Model class for different model types
|
||||
- Save model
|
||||
- Load Model
|
||||
- Load Model
|
||||
- Generate Inference
|
||||
"""
|
||||
|
||||
|
|
@ -15,9 +15,9 @@ import pandas as pd
|
|||
|
||||
|
||||
class MLModel(Protocol):
|
||||
'''
|
||||
"""
|
||||
Base ML Model protocol
|
||||
'''
|
||||
"""
|
||||
|
||||
def load_model(self, filepath: Path) -> None:
|
||||
"""
|
||||
|
|
@ -30,11 +30,8 @@ class MLModel(Protocol):
|
|||
"""
|
||||
|
||||
def train_model(
|
||||
self,
|
||||
data: pd.DataFrame,
|
||||
target_column: str,
|
||||
hyperparameter: dict
|
||||
) -> None:
|
||||
self, data: pd.DataFrame, target_column: str, hyperparameter: dict
|
||||
) -> None:
|
||||
"""
|
||||
For the given data and hyperparameters (specified to the model), a model is trained
|
||||
"""
|
||||
|
|
@ -44,7 +41,12 @@ class MLModel(Protocol):
|
|||
For the given dataframe, model is loaded and predictions are generated
|
||||
"""
|
||||
|
||||
def model_evaluation(self, validation_data: pd.DataFrame, target_column: str, metrics_location: Path = None) -> NamedTuple:
|
||||
def model_evaluation(
|
||||
self,
|
||||
validation_data: pd.DataFrame,
|
||||
target_column: str,
|
||||
metrics_location: Path = None,
|
||||
) -> NamedTuple:
|
||||
"""
|
||||
For any validation data, a set of predictions and metrics are returned
|
||||
"""
|
||||
|
|
@ -53,7 +55,7 @@ class MLModel(Protocol):
|
|||
"""
|
||||
Performance post-processing on the model to ensure it is ready for deployment
|
||||
"""
|
||||
|
||||
|
||||
def model_metadata(self) -> dict:
|
||||
"""
|
||||
Extract out model metadata as dictionary
|
||||
|
|
|
|||
|
|
@ -1,27 +1,34 @@
|
|||
"""
|
||||
Different implementations of the MLModel Protocol
|
||||
Different implementations of the MLModel Protocol
|
||||
Uses the BaseMLModel protocol
|
||||
Key tasks:
|
||||
- Template Model class for different model types
|
||||
- Save model
|
||||
- Load Model
|
||||
- Load Model
|
||||
- Generate Inference
|
||||
"""
|
||||
|
||||
from typing import NamedTuple
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
from sklearn.metrics import mean_absolute_percentage_error
|
||||
from core.Logger import logger
|
||||
|
||||
AUTOGLUON_HYPERPARAMETERS = ['problem_type', 'eval_metric', 'time_limit', 'presets', 'excluded_model_types']
|
||||
AUTOGLUON_HYPERPARAMETERS = [
|
||||
"problem_type",
|
||||
"eval_metric",
|
||||
"time_limit",
|
||||
"presets",
|
||||
"excluded_model_types",
|
||||
]
|
||||
METRIC_FILENAME = "metrics.csv"
|
||||
|
||||
|
||||
class AutogluonModel:
|
||||
"""
|
||||
Autogluon model that implements the MLModel Protocol
|
||||
"""
|
||||
|
||||
def __init__(self, output_filepath: Path = None) -> None:
|
||||
self.model = None
|
||||
self.output_filepath = output_filepath
|
||||
|
|
@ -40,10 +47,8 @@ class AutogluonModel:
|
|||
logger.info("Using AutoGluon Model - Model saving already occured")
|
||||
|
||||
def train_model(
|
||||
self,
|
||||
data: pd.DataFrame,
|
||||
target_column: str,
|
||||
hyperparameters: dict = None) -> None:
|
||||
self, data: pd.DataFrame, target_column: str, hyperparameters: dict = None
|
||||
) -> None:
|
||||
"""
|
||||
For the given data and hyperparameters, a model is trained
|
||||
"""
|
||||
|
|
@ -52,23 +57,24 @@ class AutogluonModel:
|
|||
exit(1)
|
||||
|
||||
if set(AUTOGLUON_HYPERPARAMETERS) != set(hyperparameters.keys()):
|
||||
print("Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required")
|
||||
print(
|
||||
"Hyperparameters (dict) is incorrectly defined - please check what hyperparameters are required"
|
||||
)
|
||||
exit(1)
|
||||
|
||||
AGdata = TabularDataset(data=data)
|
||||
|
||||
self.model = TabularPredictor(
|
||||
label=target_column,
|
||||
path=self.output_filepath,
|
||||
problem_type=hyperparameters['problem_type'],
|
||||
eval_metric=hyperparameters['eval_metric']
|
||||
).fit(
|
||||
AGdata,
|
||||
time_limit=hyperparameters['time_limit'],
|
||||
presets=hyperparameters['presets'],
|
||||
excluded_model_types=hyperparameters['excluded_model_types']
|
||||
)
|
||||
|
||||
label=target_column,
|
||||
path=self.output_filepath,
|
||||
problem_type=hyperparameters["problem_type"],
|
||||
eval_metric=hyperparameters["eval_metric"],
|
||||
).fit(
|
||||
AGdata,
|
||||
time_limit=hyperparameters["time_limit"],
|
||||
presets=hyperparameters["presets"],
|
||||
excluded_model_types=hyperparameters["excluded_model_types"],
|
||||
)
|
||||
|
||||
def generate_predictions(self, data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
|
|
@ -84,12 +90,12 @@ class AutogluonModel:
|
|||
return predictions
|
||||
|
||||
def model_evaluation(
|
||||
self,
|
||||
validation_data: pd.DataFrame,
|
||||
target_column: str,
|
||||
metrics_location: Path = None,
|
||||
metric_filename: str = METRIC_FILENAME
|
||||
) -> pd.DataFrame:
|
||||
self,
|
||||
validation_data: pd.DataFrame,
|
||||
target_column: str,
|
||||
metrics_location: Path = None,
|
||||
metric_filename: str = METRIC_FILENAME,
|
||||
) -> pd.DataFrame:
|
||||
"""
|
||||
For any validation data, a set of predictions and metrics are returned
|
||||
"""
|
||||
|
|
@ -105,11 +111,13 @@ class AutogluonModel:
|
|||
|
||||
logger.info("Prediction used for evaluations are saved in self.prediction")
|
||||
self.predictions = predictions
|
||||
|
||||
# TODO: Can have a custom metric class that defines all different metrics we want
|
||||
metric_mape = mean_absolute_percentage_error(validation_data[target_column], predictions)
|
||||
|
||||
performance['mape'] = metric_mape
|
||||
# TODO: Can have a custom metric class that defines all different metrics we want
|
||||
metric_mape = mean_absolute_percentage_error(
|
||||
validation_data[target_column], predictions
|
||||
)
|
||||
|
||||
performance["mape"] = metric_mape
|
||||
|
||||
logger.info("Saving metric file as metric.csv")
|
||||
metrics_location.mkdir(exist_ok=True)
|
||||
|
|
@ -117,7 +125,7 @@ class AutogluonModel:
|
|||
metrics_df = pd.DataFrame([performance])
|
||||
metrics_df.to_csv(metrics_location / metric_filename)
|
||||
markdown_filename = metric_filename.split(".")[0] + ".md"
|
||||
metrics_df.to_markdown(metrics_location/ markdown_filename)
|
||||
metrics_df.to_markdown(metrics_location / markdown_filename)
|
||||
|
||||
return metrics_df
|
||||
|
||||
|
|
@ -135,14 +143,9 @@ class AutogluonModel:
|
|||
|
||||
# This will return a string path of the location
|
||||
return self.model.clone_for_deployment(deployment_path)
|
||||
|
||||
|
||||
def model_metadata(self) -> dict:
|
||||
"""
|
||||
For Autogluon model, use the inbuilt model info method
|
||||
For Autogluon model, use the inbuilt model info method
|
||||
"""
|
||||
return self.model.info()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
# Using a simple Python file as settings for now
|
||||
# Using a simple Python file as settings for now
|
||||
# TODO: migrate to dynaconf
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -6,40 +6,42 @@ from pathlib import Path
|
|||
# If anything we might want to have a file that can be loaded and sent to this script
|
||||
MODEL_HYPERPARAMETERS = {
|
||||
"autogluon": {
|
||||
'problem_type': 'regression',
|
||||
'eval_metric': 'mean_absolute_error',
|
||||
'time_limit': 30,
|
||||
'presets': 'medium_quality',
|
||||
'excluded_model_types': None
|
||||
"problem_type": "regression",
|
||||
"eval_metric": "mean_absolute_error",
|
||||
"time_limit": 30,
|
||||
"presets": "medium_quality",
|
||||
"excluded_model_types": None,
|
||||
}
|
||||
}
|
||||
|
||||
TIMESTAMP_FORMAT = "%Y-%m-%d_%H-%M-%S"
|
||||
|
||||
RANDOM_SEED = 0
|
||||
SUBSAMPLE_FACTOR = 200
|
||||
|
||||
TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
|
||||
TEST_DATA_NAME = 'test_data.parquet'
|
||||
TRAIN_AND_VALIDATION_DATA_NAME = "train_validation_data.parquet"
|
||||
TEST_DATA_NAME = "test_data.parquet"
|
||||
|
||||
REGISTRY_FILE = "model_registry.csv"
|
||||
MODEL_DIRECTORY = "model_directory"
|
||||
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
|
||||
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
|
||||
PREDICTION_LOCATION = Path("predictions")
|
||||
PREDICTION_FILE = 'prediction.json'
|
||||
METADATA_FILE = 'metadata.json'
|
||||
PREDICTION_FILE = "prediction.json"
|
||||
METADATA_FILE = "metadata.json"
|
||||
MODEL_FOLDER = "model"
|
||||
METRICS_FOLDER = "metrics"
|
||||
DEPLOYMENT_FOLDER = "deployment"
|
||||
DEPLOYMENT_FOLDER = "deployment"
|
||||
|
||||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
|
||||
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
|
||||
|
||||
COLUMNS_TO_MERGE_ON = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
"CONSTRUCTION_AGE_BAND",
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
"CONSTRUCTION_AGE_BAND",
|
||||
"NUMBER_HABITABLE_ROOMS",
|
||||
"NUMBER_HEATED_ROOMS"
|
||||
]
|
||||
"NUMBER_HEATED_ROOMS",
|
||||
]
|
||||
|
||||
FULLY_GLAZED_DESCRIPTIONS = [
|
||||
"Fully double glazed",
|
||||
|
|
@ -50,48 +52,45 @@ FULLY_GLAZED_DESCRIPTIONS = [
|
|||
]
|
||||
|
||||
FIXED_FEATURES = [
|
||||
'PROPERTY_TYPE',
|
||||
'BUILT_FORM',
|
||||
'CONSTRUCTION_AGE_BAND',
|
||||
'NUMBER_HABITABLE_ROOMS',
|
||||
'CONSTITUENCY',
|
||||
'NUMBER_HEATED_ROOMS',
|
||||
'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
'FLOOR_HEIGHT',
|
||||
'FLOOR_LEVEL',
|
||||
'TOTAL_FLOOR_AREA',
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
"CONSTRUCTION_AGE_BAND",
|
||||
"NUMBER_HABITABLE_ROOMS",
|
||||
"CONSTITUENCY",
|
||||
"NUMBER_HEATED_ROOMS",
|
||||
"FIXED_LIGHTING_OUTLETS_COUNT",
|
||||
"FLOOR_HEIGHT",
|
||||
"FLOOR_LEVEL",
|
||||
"TOTAL_FLOOR_AREA",
|
||||
]
|
||||
|
||||
COMPONENT_FEATURES = [
|
||||
'TRANSACTION_TYPE',
|
||||
'WALLS_DESCRIPTION',
|
||||
'FLOOR_DESCRIPTION',
|
||||
'LIGHTING_DESCRIPTION',
|
||||
'ROOF_DESCRIPTION',
|
||||
'MAINHEAT_DESCRIPTION',
|
||||
'HOTWATER_DESCRIPTION',
|
||||
'MAIN_FUEL',
|
||||
'MECHANICAL_VENTILATION',
|
||||
'SECONDHEAT_DESCRIPTION',
|
||||
'ENERGY_TARIFF', # Not sure if this is relevant
|
||||
'SOLAR_WATER_HEATING_FLAG',
|
||||
'PHOTO_SUPPLY',
|
||||
'WINDOWS_DESCRIPTION',
|
||||
'GLAZED_TYPE',
|
||||
'MULTI_GLAZE_PROPORTION',
|
||||
'LIGHTING_DESCRIPTION',
|
||||
'LOW_ENERGY_LIGHTING',
|
||||
'NUMBER_OPEN_FIREPLACES',
|
||||
'MAINHEATCONT_DESCRIPTION',
|
||||
'EXTENSION_COUNT',
|
||||
"TRANSACTION_TYPE",
|
||||
"WALLS_DESCRIPTION",
|
||||
"FLOOR_DESCRIPTION",
|
||||
"LIGHTING_DESCRIPTION",
|
||||
"ROOF_DESCRIPTION",
|
||||
"MAINHEAT_DESCRIPTION",
|
||||
"HOTWATER_DESCRIPTION",
|
||||
"MAIN_FUEL",
|
||||
"MECHANICAL_VENTILATION",
|
||||
"SECONDHEAT_DESCRIPTION",
|
||||
"ENERGY_TARIFF", # Not sure if this is relevant
|
||||
"SOLAR_WATER_HEATING_FLAG",
|
||||
"PHOTO_SUPPLY",
|
||||
"WINDOWS_DESCRIPTION",
|
||||
"GLAZED_TYPE",
|
||||
"MULTI_GLAZE_PROPORTION",
|
||||
"LIGHTING_DESCRIPTION",
|
||||
"LOW_ENERGY_LIGHTING",
|
||||
"NUMBER_OPEN_FIREPLACES",
|
||||
"MAINHEATCONT_DESCRIPTION",
|
||||
"EXTENSION_COUNT",
|
||||
# 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION
|
||||
]
|
||||
|
||||
# For these fields, we take an average if we have multiple values
|
||||
AVERAGE_FIXED_FEATURES = [
|
||||
"TOTAL_FLOOR_AREA",
|
||||
"FLOOR_HEIGHT"
|
||||
]
|
||||
AVERAGE_FIXED_FEATURES = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
|
||||
|
||||
# For these fields, we take the latest value if we have multiple values
|
||||
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
|
||||
|
|
@ -105,11 +104,7 @@ LATEST_FIELD = [
|
|||
]
|
||||
|
||||
# If we see these features changing, we don't use the EPC, since we deem it not to be reliable
|
||||
MANDATORY_FIXED_FEATURES = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
"CONSTITUENCY"
|
||||
]
|
||||
MANDATORY_FIXED_FEATURES = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY"]
|
||||
|
||||
# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
|
||||
# conducted after 2010, since SAP09 was introduced in 2009 and later SAP12 was introduced in England
|
||||
|
|
@ -119,14 +114,16 @@ EARLIEST_EPC_DATE = "2014-08-01"
|
|||
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
|
||||
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
||||
|
||||
|
||||
def ordinal(n):
|
||||
if 10 <= n % 100 <= 20:
|
||||
suffix = 'th'
|
||||
suffix = "th"
|
||||
else:
|
||||
suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
|
||||
suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
|
||||
|
||||
return str(n) + suffix
|
||||
|
||||
|
||||
FLOOR_LEVEL_MAP = {
|
||||
"Basement": -1,
|
||||
"Ground": 0,
|
||||
|
|
@ -145,8 +142,7 @@ BUILT_FORM_REMAP = {
|
|||
}
|
||||
|
||||
DATA_PROCESSOR_SETTINGS = {
|
||||
'low_memory': False,
|
||||
'epc_minimum_count': 1,
|
||||
'column_mappings': {'UPRN': [int, str]}
|
||||
"low_memory": False,
|
||||
"epc_minimum_count": 1,
|
||||
"column_mappings": {"UPRN": [int, str]},
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -16,10 +16,11 @@ from core.Settings import (
|
|||
REGISTRY_FILE,
|
||||
PREDICTION_LOCATION,
|
||||
PREDICTION_FILE,
|
||||
METADATA_FILE
|
||||
METADATA_FILE,
|
||||
TIMESTAMP_FORMAT,
|
||||
)
|
||||
|
||||
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
||||
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
|
||||
|
||||
# FOR TESTING
|
||||
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
|
||||
|
|
@ -32,19 +33,35 @@ def ingest_arguments() -> argparse.Namespace:
|
|||
Helper function to take in arguments from script start
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
|
||||
parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
|
||||
parser.add_argument('--data', type=str, help='Json data for predictions')
|
||||
parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
|
||||
parser = argparse.ArgumentParser(description="Inputs for training script")
|
||||
parser.add_argument(
|
||||
"--target-column",
|
||||
type=str,
|
||||
help="The response variable you are predicting for",
|
||||
choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"],
|
||||
default="RDSAP_CHANGE",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-path",
|
||||
type=str,
|
||||
help="If you wish to use a specific model, specify the model path here",
|
||||
)
|
||||
parser.add_argument("--data", type=str, help="Json data for predictions")
|
||||
parser.add_argument(
|
||||
"--data-path", type=str, help="Location of Parquet dataset to load for training"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
||||
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
|
||||
def prediction(
|
||||
target_column: str = "RDSAP_CHANGE",
|
||||
model_path: str = None,
|
||||
data: pd.DataFrame = None,
|
||||
data_path: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Main pipeline function
|
||||
"""
|
||||
|
|
@ -64,11 +81,11 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
|
|||
# TODO: Think about where registry will sit/ type
|
||||
logger.info("Loading best model from registry")
|
||||
registry_df = pd.read_csv(registry_path)
|
||||
best_model_df = registry_df[registry_df['best_model']]
|
||||
best_model_df = registry_df[registry_df["best_model"]]
|
||||
|
||||
model_location = best_model_df['model_location'].values[0]
|
||||
model_type = best_model_df['model_type'].values[0]
|
||||
model_name = best_model_df['model_name'].values[0]
|
||||
model_location = best_model_df["model_location"].values[0]
|
||||
model_type = best_model_df["model_type"].values[0]
|
||||
model_name = best_model_df["model_name"].values[0]
|
||||
|
||||
logger.info("--- Model Info: ---")
|
||||
logger.info(f"Model type: {model_type}")
|
||||
|
|
@ -86,7 +103,7 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
|
|||
# TODO: DOWNSAMPLING DOWN TO JUST USE ONE FOR PREDICTION
|
||||
data = data.sample(1)
|
||||
else:
|
||||
logger.info('Using data provided')
|
||||
logger.info("Using data provided")
|
||||
data = json.loads(data)
|
||||
data = pd.DataFrame([data])
|
||||
print(data)
|
||||
|
|
@ -117,17 +134,23 @@ def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data
|
|||
"model_type": model_type,
|
||||
"model_name": model_name,
|
||||
"model_location": model_location,
|
||||
"model_settings": model.model_metadata()
|
||||
"model_settings": model.model_metadata(),
|
||||
}
|
||||
|
||||
pd.DataFrame([prediction_metadata]).to_json(output_base / METADATA_FILE)
|
||||
|
||||
return json_prediction
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
args = ingest_arguments()
|
||||
|
||||
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
|
||||
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
|
||||
prediction(
|
||||
target_column=args.target_column,
|
||||
model_path=args.model_path,
|
||||
data=args.data,
|
||||
data_path=args.data_path,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
import argparse
|
||||
|
||||
# import boto3
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
|
@ -16,12 +16,13 @@ from core.Settings import (
|
|||
METRICS_FOLDER,
|
||||
DEPLOYMENT_FOLDER,
|
||||
SUBSAMPLE_FACTOR,
|
||||
MODEL_HYPERPARAMETERS
|
||||
MODEL_HYPERPARAMETERS,
|
||||
TIMESTAMP_FORMAT,
|
||||
)
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
||||
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
|
||||
|
||||
# FOR TESTING
|
||||
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
|
||||
|
|
@ -43,119 +44,145 @@ TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
|||
# S3_CLIENT.create_bucket
|
||||
# S3_CLIENT.list_buckets()
|
||||
|
||||
|
||||
def ingest_arguments() -> argparse.Namespace:
|
||||
"""
|
||||
Helper function to take in arguments from script start
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
parser = argparse.ArgumentParser(description="Inputs for training script")
|
||||
|
||||
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
|
||||
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
|
||||
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
|
||||
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
|
||||
parser.add_argument(
|
||||
"--train-filepath",
|
||||
type=str,
|
||||
help="Location of Parquet dataset to load for training",
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--test-filepath",
|
||||
type=str,
|
||||
help="Location of Parquet dataset to load for testing",
|
||||
required=True,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model-type",
|
||||
type=str,
|
||||
help="The type of model to train",
|
||||
choices=["autogluon"],
|
||||
default="autogluon",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--target-column",
|
||||
type=str,
|
||||
help="The response variable",
|
||||
choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"],
|
||||
default="RDSAP_CHANGE",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
|
||||
def training(
|
||||
train_filepath: str,
|
||||
test_filepath: str,
|
||||
target_column: str = "RDSAP_CHANGE",
|
||||
model_type: str = "autogluon",
|
||||
hyperparameters: dict = None
|
||||
) -> None:
|
||||
train_filepath: str,
|
||||
test_filepath: str,
|
||||
target_column: str = "RDSAP_CHANGE",
|
||||
model_type: str = "autogluon",
|
||||
hyperparameters: dict = None,
|
||||
) -> None:
|
||||
"""
|
||||
Pipeline to run training on the dataset
|
||||
"""
|
||||
|
||||
logger.info('--- Loading data ---')
|
||||
logger.info("--- Loading data ---")
|
||||
dataloader = DataLoader()
|
||||
train_df = dataloader.load(filepath=train_filepath)
|
||||
test_df = dataloader.load(filepath=test_filepath)
|
||||
|
||||
logger.info('--- Feature processing ---')
|
||||
|
||||
logger.info("--- Feature processing ---")
|
||||
|
||||
feature_processor = FeatureProcessor()
|
||||
|
||||
subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)
|
||||
subsample_amount = round(len(train_df) / SUBSAMPLE_FACTOR)
|
||||
|
||||
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
|
||||
train_df = feature_processor.process(
|
||||
train_df, target_column=target_column, subsample_amount=subsample_amount
|
||||
)
|
||||
test_df = feature_processor.process(test_df, target_column=target_column)
|
||||
|
||||
logger.info('--- Build Model ---')
|
||||
logger.info("--- Build Model ---")
|
||||
|
||||
logger.info("--- Load Hyperparameters ---")
|
||||
|
||||
if hyperparameters is None:
|
||||
logger.info("Use base hyperparameters in settings")
|
||||
hyperparameters = MODEL_HYPERPARAMETERS[model_type]
|
||||
logger.info(f'Hyperparameters are: {hyperparameters}')
|
||||
logger.info(f"Hyperparameters are: {hyperparameters}")
|
||||
|
||||
if model_type == "autogluon":
|
||||
model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
|
||||
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
|
||||
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
|
||||
|
||||
model = AutogluonModel(
|
||||
output_filepath = output_base / MODEL_FOLDER
|
||||
)
|
||||
model = AutogluonModel(output_filepath=output_base / MODEL_FOLDER)
|
||||
else:
|
||||
logger.error("No alternative model implemented yet")
|
||||
exit(1)
|
||||
|
||||
|
||||
model.train_model(
|
||||
data=train_df,
|
||||
target_column=target_column,
|
||||
hyperparameters=hyperparameters
|
||||
)
|
||||
|
||||
data=train_df, target_column=target_column, hyperparameters=hyperparameters
|
||||
)
|
||||
|
||||
logger.info("--- Save Model ---")
|
||||
model.save_model(output_filepath=model.output_filepath)
|
||||
|
||||
logger.info('--- Generate evaluation metrics ---')
|
||||
logger.info("--- Generate evaluation metrics ---")
|
||||
metrics_df = model.model_evaluation(
|
||||
validation_data=test_df,
|
||||
validation_data=test_df,
|
||||
target_column=target_column,
|
||||
metrics_location = output_base / METRICS_FOLDER
|
||||
)
|
||||
|
||||
metrics_location=output_base / METRICS_FOLDER,
|
||||
)
|
||||
|
||||
logger.info("--- Generate metric outputs using predictions ---")
|
||||
# TODO: can have a model.metric_outputs method
|
||||
# For now, just do it here
|
||||
residual_df = pd.DataFrame(list(zip(test_df[target_column], model.predictions)), columns=['true', 'pred'])
|
||||
|
||||
residual_df = pd.DataFrame(
|
||||
list(zip(test_df[target_column], model.predictions)), columns=["true", "pred"]
|
||||
)
|
||||
|
||||
# image formatting
|
||||
# TODO: move to settings file , AXIS_FONT, TITLE_FONT
|
||||
axis_fs = 18 #fontsize
|
||||
title_fs = 22 #fontsize
|
||||
axis_fs = 18 # fontsize
|
||||
title_fs = 22 # fontsize
|
||||
sns.set(style="whitegrid")
|
||||
ax = sns.scatterplot(x="true", y="pred",data=residual_df)
|
||||
ax.set_aspect('equal')
|
||||
ax.set_xlabel(f'True {target_column}',fontsize = axis_fs)
|
||||
ax.set_ylabel(f'Predicted {target_column}', fontsize = axis_fs)#ylabel
|
||||
ax.set_title('Residuals', fontsize = title_fs)
|
||||
ax = sns.scatterplot(x="true", y="pred", data=residual_df)
|
||||
ax.set_aspect("equal")
|
||||
ax.set_xlabel(f"True {target_column}", fontsize=axis_fs)
|
||||
ax.set_ylabel(f"Predicted {target_column}", fontsize=axis_fs) # ylabel
|
||||
ax.set_title("Residuals", fontsize=title_fs)
|
||||
|
||||
# Square aspect ratio
|
||||
ax.plot([-100, 100], [-100, 100], 'black', linewidth=1)
|
||||
ax.plot([-100, 100], [-100, 100], "black", linewidth=1)
|
||||
|
||||
plt.tight_layout()
|
||||
RESIDUAL_FILE = "residuals.png"
|
||||
plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
|
||||
plt.savefig(output_base / METRICS_FOLDER / RESIDUAL_FILE, dpi=120)
|
||||
|
||||
# TODO: for cml, we might want to have class that outputs all data and plots to add to the report
|
||||
# If we want residual plot/ any plots, we will need to self host
|
||||
# plt.savefig(RESIDUAL_FILE, dpi=120)
|
||||
# plt.savefig(RESIDUAL_FILE, dpi=120)
|
||||
|
||||
|
||||
# TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
|
||||
# Imagining for now that the model trained here is the best model amongst all models built
|
||||
|
||||
logger.info("--- Optimising model for deployment ---")
|
||||
|
||||
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
|
||||
logger.info(f"Optimised version of best model can be found at: {deployment_model_path}")
|
||||
deployment_model_path = model.optimise_model_for_deployment(
|
||||
deployment_path=output_base / DEPLOYMENT_FOLDER
|
||||
)
|
||||
logger.info(
|
||||
f"Optimised version of best model can be found at: {deployment_model_path}"
|
||||
)
|
||||
|
||||
# TODO: Need a model registry - for now have this as a CSV
|
||||
# Save this in the model directory
|
||||
|
|
@ -167,25 +194,43 @@ def training(
|
|||
logger.info("Registry file found - Loading into Dataframe")
|
||||
registry_df = pd.read_csv(registry_path, index_col=None)
|
||||
else:
|
||||
# TODO: Move columns into settings: MODEL_DETAILS and Metrics class columns
|
||||
registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
|
||||
# TODO: Move columns into settings: MODEL_DETAILS and Metrics class columns
|
||||
registry_df = pd.DataFrame(
|
||||
columns=[
|
||||
"model_type",
|
||||
"model_name",
|
||||
"model_location",
|
||||
"mean_absolute_error",
|
||||
"root_mean_squared_error",
|
||||
"mean_squared_error",
|
||||
"r2",
|
||||
"pearsonr",
|
||||
"median_absolute_error",
|
||||
"mape",
|
||||
"best_model",
|
||||
]
|
||||
)
|
||||
|
||||
model_details_df = pd.DataFrame(
|
||||
[{
|
||||
'model_type': model_type,
|
||||
'model_name': model_root,
|
||||
'model_location': deployment_model_path
|
||||
}]
|
||||
)
|
||||
|
||||
[
|
||||
{
|
||||
"model_type": model_type,
|
||||
"model_name": model_root,
|
||||
"model_location": deployment_model_path,
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
|
||||
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
|
||||
|
||||
# TODO: will need a metric rebuild script, i.e. if we add new metrics, we will want to load existing models and regenerate their metrics
|
||||
# TODO: decide metric to optimise to
|
||||
registry_df = registry_df.sort_values("mean_absolute_error", ascending=False).reset_index(drop=True)
|
||||
registry_df['best_model'] = [False]*len(registry_df)
|
||||
registry_df.loc[0, 'best_model'] = True
|
||||
registry_df = registry_df.sort_values(
|
||||
"mean_absolute_error", ascending=False
|
||||
).reset_index(drop=True)
|
||||
registry_df["best_model"] = [False] * len(registry_df)
|
||||
registry_df.loc[0, "best_model"] = True
|
||||
|
||||
logger.info("--- Saving new model to registry ---")
|
||||
registry_df.to_csv(registry_path, index=False)
|
||||
|
|
@ -195,16 +240,16 @@ def training(
|
|||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info('---Begin Pipeline---')
|
||||
logger.info("---Begin Pipeline---")
|
||||
|
||||
logger.info('---Ingest Arguments---')
|
||||
logger.info("---Ingest Arguments---")
|
||||
args = ingest_arguments()
|
||||
|
||||
# To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
|
||||
training(
|
||||
train_filepath=args.train_filepath,
|
||||
test_filepath=args.test_filepath,
|
||||
target_column=args.target_column,
|
||||
model_type=args.model_type
|
||||
)
|
||||
train_filepath=args.train_filepath,
|
||||
test_filepath=args.test_filepath,
|
||||
target_column=args.target_column,
|
||||
model_type=args.model_type,
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue