cleaned up settings

This commit is contained in:
Michael Duong 2023-08-18 11:09:21 +01:00
parent 2e5c423562
commit 3e23240afe
4 changed files with 53 additions and 32 deletions

View file

@ -44,6 +44,10 @@ Steps for pipeline:
- a `metadata.json`
- This is all the metadata from the model (can change this if needed)
- NOTE: If you wish to change any settings, they currently all live in the `Settings.py` file
- They will be separated out eventually, but for now this keeps everything we might want to respecify in one place.
- E.g. the model hyperparameters are in here but will move into a separate configuration file
# TODO:
- Structure/ MLOps:
@ -54,6 +58,7 @@ Steps for pipeline:
- Sort out Model Registry
- Sort out Data version control
- Data Science:
- Implement a metrics class to hold all metrics
- Rebuild metrics script (Could be a one off but good to have)
- Determine metrics
- Implement and test custom model (Tensorflow Decision Trees etc)

View file

@ -2,7 +2,20 @@
# TODO: migrate to dynaconf
from pathlib import Path
# Can move to a hyperparameters file
# If anything, we might want to have a file that can be loaded and sent to this script
MODEL_HYPERPARAMETERS = {
"autogluon": {
'problem_type': 'regression',
'eval_metric': 'mean_absolute_error',
'time_limit': 75,
'presets': 'medium_quality',
'excluded_model_types': None
}
}
RANDOM_SEED = 0
SUBSAMPLE_FACTOR = 200
TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
TEST_DATA_NAME = 'test_data.parquet'
@ -13,6 +26,9 @@ BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = 'prediction.json'
METADATA_FILE = 'metadata.json'
MODEL_FOLDER = "model"
METRICS_FOLDER = "metrics"
DEPLOYMENT_FOLDER = "deployment"
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45

View file

@ -33,8 +33,9 @@ def ingest_arguments() -> argparse.Namespace:
"""
parser = argparse.ArgumentParser(description='Inputs for training script')
parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
parser.add_argument('--data', type=str, help='Location of Parquet dataset to load for testing')
parser.add_argument('--data', type=str, help='Json data for predictions')
parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')
args = parser.parse_args()
@ -128,4 +129,4 @@ if __name__ == "__main__":
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
prediction(model_path=args.model_path, data=args.data, data_path=args.data_path)
prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)

View file

@ -13,28 +13,23 @@ import pandas as pd
from core.Settings import (
MODEL_DIRECTORY,
BASE_REGISTRY_PATH,
REGISTRY_FILE
REGISTRY_FILE,
MODEL_FOLDER,
METRICS_FOLDER,
DEPLOYMENT_FOLDER,
SUBSAMPLE_FACTOR,
MODEL_HYPERPARAMETERS
)
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
# Can move to a hyperparameters file
# If anything, we might want to have a file that can be loaded and sent to this script
HYPERPARAMETERS = {
'problem_type': 'regression',
'eval_metric': 'mean_absolute_error',
'time_limit': 60,
'presets': 'medium_quality',
'excluded_model_types': None
}
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
# FOR TESTING
train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
target_column = "RDSAP_CHANGE"
model_type = "autogluon"
hyperparameter = HYPERPARAMETERS
subsample_factor = 200
# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
# target_column = "RDSAP_CHANGE"
# model_type = "autogluon"
# hyperparameter = HYPERPARAMETERS
# SUBSAMPLE_FACTOR = 200
# SESSION = boto3.Session()
@ -58,7 +53,7 @@ def ingest_arguments() -> argparse.Namespace:
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE"], default='RDSAP_CHANGE')
parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')
args = parser.parse_args()
@ -70,7 +65,7 @@ def training(
test_filepath: str,
target_column: str = "RDSAP_CHANGE",
model_type: str = "autogluon",
hyperparameter: dict = HYPERPARAMETERS
hyperparameters: dict = None
) -> None:
"""
Pipeline to run training on the dataset
@ -85,22 +80,26 @@ def training(
feature_processor = FeatureProcessor()
subsample_amount = round(len(train_df)/subsample_factor)
subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)
train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
test_df = feature_processor.process(test_df, target_column=target_column)
logger.info('--- Build Model ---')
model_folder = "model"
metrics_folder = "metrics"
logger.info("--- Load Hyperparameters ---")
if hyperparameters is None:
logger.info("Use base hyperparameters in settings")
hyperparameters = MODEL_HYPERPARAMETERS[model_type]
logger.info(f'Hyperparameters are: {hyperparameters}')
if model_type == "autogluon":
model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower()
model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
model = AutogluonModel(
output_filepath = output_base / model_folder
output_filepath = output_base / MODEL_FOLDER
)
else:
logger.error("No alternative model implemented yet")
@ -109,7 +108,7 @@ def training(
model.train_model(
data=train_df,
target_column=target_column,
hyperparameters=hyperparameter
hyperparameters=hyperparameters
)
logger.info("--- Save Model ---")
@ -119,15 +118,15 @@ def training(
metrics_df = model.model_evaluation(
validation_data=test_df,
target_column=target_column,
metrics_location = output_base / metrics_folder
metrics_location = output_base / METRICS_FOLDER
)
# TODO: introduce a separate script for model optimisation, and from there, optimise for deployment
# Imagining for now that the model trained here is the best model amongst all models built
logger.info("--- Optimising model for deployment ---")
optimised_folder = "deployment"
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / optimised_folder)
deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
logger.info("Optimised version of best model can be found at: {deployment_model_path}")
# TODO: Need a model registry - for now have this as a CSV
@ -135,7 +134,7 @@ def training(
logger.info("--- Append registry with new model ---")
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
if registry_path.exists():
logger.info("Registry file found - Loading into Dataframe")
registry_df = pd.read_csv(registry_path, index_col=None)