diff --git a/model_data/simulation_system/README.md b/model_data/simulation_system/README.md index 281ced31..b6fe8327 100644 --- a/model_data/simulation_system/README.md +++ b/model_data/simulation_system/README.md @@ -44,6 +44,10 @@ Steps for pipeline: - a `metadata.json` - This is all the metadata from the model (can change this if needed) +- NOTE: If you wish to change any settings, these are currently all in the `Settings.py` file + - It will be separated out eventually but for now, it works to keep track of anything that we might want to respecify. + - E.g. the hyperparameters for models are in here but will move into a separate configuration file + # TODO: - Structure/ MLOps: @@ -54,6 +58,7 @@ Steps for pipeline: - Sort out Model Registry - Sort out Data version control - Data Science: + - Implement a metrics class, to hold all metrics - Rebuild metrics script (Could be a one off but good to have) - Determine metrics - Implement and test custom model (Tensorflow Decision Trees etc) diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py index 0728e68d..f590934d 100644 --- a/model_data/simulation_system/core/Settings.py +++ b/model_data/simulation_system/core/Settings.py @@ -2,7 +2,20 @@ # TODO: migrate to dynaconf from pathlib import Path +# Can move to a hyperparameters file +# If anything we might want to have a file that can be loaded and sent to this script +MODEL_HYPERPARAMETERS = { + "autogluon": { + 'problem_type': 'regression', + 'eval_metric': 'mean_absolute_error', + 'time_limit': 75, + 'presets': 'medium_quality', + 'excluded_model_types': None + } +} + RANDOM_SEED = 0 +SUBSAMPLE_FACTOR = 200 TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet' TEST_DATA_NAME = 'test_data.parquet' @@ -13,6 +26,9 @@ BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY PREDICTION_LOCATION = Path("predictions") PREDICTION_FILE = 'prediction.json' METADATA_FILE = 'metadata.json' +MODEL_FOLDER = 
"model" +METRICS_FOLDER = "metrics" +DEPLOYMENT_FOLDER = "deployment" TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45 diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py index 7931ecb4..b650444f 100644 --- a/model_data/simulation_system/predictions.py +++ b/model_data/simulation_system/predictions.py @@ -33,8 +33,9 @@ def ingest_arguments() -> argparse.Namespace: """ parser = argparse.ArgumentParser(description='Inputs for training script') + parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE') parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here') - parser.add_argument('--data', type=str, help='Location of Parquet dataset to load for testing') + parser.add_argument('--data', type=str, help='Json data for predictions') parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training') args = parser.parse_args() @@ -128,4 +129,4 @@ if __name__ == "__main__": # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}' # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet - prediction(model_path=args.model_path, data=args.data, data_path=args.data_path) \ No newline at end of file + prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path) \ No newline at end of file diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index cc2a3939..358abb41 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -13,28 +13,23 @@ import pandas as pd from core.Settings import ( MODEL_DIRECTORY, BASE_REGISTRY_PATH, - 
REGISTRY_FILE + REGISTRY_FILE, + MODEL_FOLDER, + METRICS_FOLDER, + DEPLOYMENT_FOLDER, + SUBSAMPLE_FACTOR, + MODEL_HYPERPARAMETERS ) -TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") - -# Can move to a hyperparmeters file -# If anything we might want to have a file that can be loaded and sent to this script -HYPERPARAMETERS = { - 'problem_type': 'regression', - 'eval_metric': 'mean_absolute_error', - 'time_limit': 60, - 'presets': 'medium_quality', - 'excluded_model_types': None -} +TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") # FOR TESTING -train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet" -test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet" -target_column = "RDSAP_CHANGE" -model_type = "autogluon" -hyperparameter = HYPERPARAMETERS -subsample_factor = 200 +# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet" +# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet" +# target_column = "RDSAP_CHANGE" +# model_type = "autogluon" +# hyperparameter = HYPERPARAMETERS +# SUBSAMPLE_FACTOR = 200 # SESSION = boto3.Session() @@ -58,7 +53,7 @@ def ingest_arguments() -> argparse.Namespace: parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True) parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True) parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon") - parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE"], default='RDSAP_CHANGE') + parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE') args = parser.parse_args() @@ -70,7 +65,7 @@ def training( test_filepath: str, 
target_column: str = "RDSAP_CHANGE", model_type: str = "autogluon", - hyperparameter: dict = HYPERPARAMETERS + hyperparameters: dict = None ) -> None: """ Pipeline to run training on the dataset @@ -85,22 +80,26 @@ def training( feature_processor = FeatureProcessor() - subsample_amount = round(len(train_df)/subsample_factor) + subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR) train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount) test_df = feature_processor.process(test_df, target_column=target_column) logger.info('--- Build Model ---') - model_folder = "model" - metrics_folder = "metrics" + logger.info("--- Load Hyperparameters ---") + + if hyperparameters is None: + logger.info("Use base hyperparameters in settings") + hyperparameters = MODEL_HYPERPARAMETERS[model_type] + logger.info(f'Hyperparameters are: {hyperparameters}') if model_type == "autogluon": - model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower() + model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower() output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root model = AutogluonModel( - output_filepath = output_base / model_folder + output_filepath = output_base / MODEL_FOLDER ) else: logger.error("No alternative model implemented yet") @@ -109,7 +108,7 @@ def training( model.train_model( data=train_df, target_column=target_column, - hyperparameters=hyperparameter + hyperparameters=hyperparameters ) logger.info("--- Save Model ---") @@ -119,15 +118,15 @@ def training( metrics_df = model.model_evaluation( validation_data=test_df, target_column=target_column, - metrics_location = output_base / metrics_folder + metrics_location = output_base / METRICS_FOLDER ) # TODO: introduce a seperate script for model optimisation, and from there, optimise for deployment # Imagining for now that the model trained here is 
the best model amongst all models built logger.info("--- Optimising model for deployment ---") - optimised_folder = "deployment" - deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / optimised_folder) + + deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER) logger.info("Optimised version of best model can be found at: {deployment_model_path}") # TODO: Need a model registry - for now have this as a CSV @@ -135,7 +134,7 @@ def training( logger.info("--- Append registry with new model ---") registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE - + if registry_path.exists(): logger.info("Registry file found - Loading into Dataframe") registry_df = pd.read_csv(registry_path, index_col=None)