cleaned up settings

2026-07-27 23:35:01 +00:00 · 2023-08-18 11:09:21 +01:00 · 2023-08-18 11:09:21 +01:00 · 3e23240afe
commit 3e23240afe
parent 2e5c423562
4 changed files with 53 additions and 32 deletions
--- a/model_data/simulation_system/README.md
+++ b/model_data/simulation_system/README.md
@ -44,6 +44,10 @@ Steps for pipeline:
        - a `metadata.json`
            - This is all the metadata from the model (can change this if needed)

+- NOTE: If you wish to change any settings, these are currently all in the `Settings.py` file
+    - It will be separated out eventually but for now, it works to keep track of anything that we might want to respecify.
+        - E.g. the model hyperparameters currently live here but will move into a separate configuration file
+

 # TODO:
 - Structure/ MLOps:
@ -54,6 +58,7 @@ Steps for pipeline:
    - Sort out Model Registry 
    - Sort out Data version control
 - Data Science:
+    - Implement a metrics class to hold all metrics
    - Rebuild metrics script (Could be a one off but good to have)
    - Determine metrics 
    - Implement and test custom model (Tensorflow Decision Trees etc)
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@ -2,7 +2,20 @@
 # TODO: migrate to dynaconf
 from pathlib import Path

+# Can move to a hyperparameters file
+# If anything we might want to have a file that can be loaded and sent to this script
+MODEL_HYPERPARAMETERS = {
+    "autogluon": {
+        'problem_type': 'regression',
+        'eval_metric': 'mean_absolute_error', 
+        'time_limit': 75, 
+        'presets': 'medium_quality', 
+        'excluded_model_types': None
+    }
+}
+
 RANDOM_SEED = 0
+SUBSAMPLE_FACTOR = 200

 TRAIN_AND_VALIDATION_DATA_NAME = 'train_validation_data.parquet'
 TEST_DATA_NAME = 'test_data.parquet'
@ -13,6 +26,9 @@ BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
 PREDICTION_LOCATION = Path("predictions")
 PREDICTION_FILE = 'prediction.json'
 METADATA_FILE = 'metadata.json'
+MODEL_FOLDER = "model"
+METRICS_FOLDER = "metrics"
+DEPLOYMENT_FOLDER = "deployment"   

 TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
--- a/model_data/simulation_system/predictions.py
+++ b/model_data/simulation_system/predictions.py
@ -33,8 +33,9 @@ def ingest_arguments() -> argparse.Namespace:
    """

    parser = argparse.ArgumentParser(description='Inputs for training script')
+    parser.add_argument('--target-column', type=str, help='The response variable you are predicting for', choices=['RDSAP_CHANGE', 'HEAT_DEMAND_CHANGE'], default='RDSAP_CHANGE')
    parser.add_argument('--model-path', type=str, help='If you wish to use a specific model, specify the model path here')
-    parser.add_argument('--data', type=str, help='Location of Parquet dataset to load for testing')
+    parser.add_argument('--data', type=str, help='Json data for predictions')
    parser.add_argument('--data-path', type=str, help='Location of Parquet dataset to load for training')

    args = parser.parse_args()
@ -128,4 +129,4 @@ if __name__ == "__main__":

    # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
    # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
-    prediction(model_path=args.model_path, data=args.data, data_path=args.data_path)
+    prediction(target_column=args.target_column, model_path=args.model_path, data=args.data, data_path=args.data_path)
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@ -13,28 +13,23 @@ import pandas as pd
 from core.Settings import (
    MODEL_DIRECTORY,
    BASE_REGISTRY_PATH,
-    REGISTRY_FILE
+    REGISTRY_FILE,
+    MODEL_FOLDER,
+    METRICS_FOLDER,
+    DEPLOYMENT_FOLDER,
+    SUBSAMPLE_FACTOR,
+    MODEL_HYPERPARAMETERS
 )

-TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
-
-# Can move to a hyperparmeters file
-# If anything we might want to have a file that can be loaded and sent to this script
-HYPERPARAMETERS = {
-    'problem_type': 'regression',
-    'eval_metric': 'mean_absolute_error', 
-    'time_limit': 60, 
-    'presets': 'medium_quality', 
-    'excluded_model_types': None
-}
+TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")     

 # FOR TESTING
-train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
-test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
-target_column = "RDSAP_CHANGE"
-model_type = "autogluon"
-hyperparameter = HYPERPARAMETERS
-subsample_factor = 200
+# train_filepath = "./model_build_data/change_data/rdsap_full/train_validation_data.parquet"
+# test_filepath = "./model_build_data/change_data/rdsap_full/test_data.parquet"
+# target_column = "RDSAP_CHANGE"
+# model_type = "autogluon"
+# hyperparameter = HYPERPARAMETERS
+# SUBSAMPLE_FACTOR = 200

 # SESSION = boto3.Session()

@ -58,7 +53,7 @@ def ingest_arguments() -> argparse.Namespace:
    parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training', required=True)
    parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing', required=True)
    parser.add_argument('--model-type', type=str, help='The type of model to train', choices=["autogluon"], default="autogluon")
-    parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE"], default='RDSAP_CHANGE')
+    parser.add_argument('--target-column', type=str, help='The response variable', choices=["RDSAP_CHANGE", "HEAT_DEMAND_CHANGE"], default='RDSAP_CHANGE')

    args = parser.parse_args()

@ -70,7 +65,7 @@ def training(
        test_filepath: str, 
        target_column: str = "RDSAP_CHANGE", 
        model_type: str = "autogluon", 
-        hyperparameter: dict = HYPERPARAMETERS
+        hyperparameters: dict = None
        ) -> None:
    """
    Pipeline to run training on the dataset
@ -85,22 +80,26 @@ def training(

    feature_processor = FeatureProcessor()

-    subsample_amount = round(len(train_df)/subsample_factor)
+    subsample_amount = round(len(train_df)/SUBSAMPLE_FACTOR)

    train_df = feature_processor.process(train_df, target_column=target_column, subsample_amount=subsample_amount)
    test_df = feature_processor.process(test_df, target_column=target_column)

    logger.info('--- Build Model ---')

-    model_folder = "model"
-    metrics_folder = "metrics"
+    logger.info("--- Load Hyperparameters ---")
+
+    if hyperparameters is None:
+        logger.info("Use base hyperparameters in settings")
+        hyperparameters = MODEL_HYPERPARAMETERS[model_type]
+        logger.info(f'Hyperparameters are: {hyperparameters}')

    if model_type == "autogluon":
-        model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower()
+        model_root = f"{target_column}-{hyperparameters['presets']}-{hyperparameters['time_limit']}-{TIMESTAMP}".lower()
        output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root 

        model = AutogluonModel(
-            output_filepath = output_base / model_folder
+            output_filepath = output_base / MODEL_FOLDER
            )
    else:
        logger.error("No alternative model implemented yet")
@ -109,7 +108,7 @@ def training(
    model.train_model(
        data=train_df, 
        target_column=target_column, 
-        hyperparameters=hyperparameter
+        hyperparameters=hyperparameters
        )
    
    logger.info("--- Save Model ---")
@ -119,15 +118,15 @@ def training(
    metrics_df = model.model_evaluation(
        validation_data=test_df, 
        target_column=target_column,
-        metrics_location = output_base / metrics_folder
+        metrics_location = output_base / METRICS_FOLDER
        )
    
    # TODO: introduce a seperate script for model optimisation, and from there, optimise for deployment
    # Imagining for now that the model trained here is the best model amongst all models built

    logger.info("--- Optimising model for deployment ---")
-    optimised_folder = "deployment"        
-    deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / optimised_folder)
+
+    deployment_model_path = model.optimise_model_for_deployment(deployment_path= output_base / DEPLOYMENT_FOLDER)
    logger.info("Optimised version of best model can be found at: {deployment_model_path}")

    # TODO: Need a model registry - for now have this as a CSV
@ -135,7 +134,7 @@ def training(
    logger.info("--- Append registry with new model ---")

    registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
-    
+
    if registry_path.exists():
        logger.info("Registry file found - Loading into Dataframe")
        registry_df = pd.read_csv(registry_path, index_col=None)