Add readme and a better folder output structure. Starter code for mock up s3

2026-07-27 23:35:01 +00:00 · 2023-08-18 10:18:17 +01:00 · 2023-08-18 10:18:17 +01:00 · 2e5c423562
commit 2e5c423562
parent 2a18180c53
7 changed files with 129 additions and 16 deletions
--- a/model_data/simulation_system/Makefile
+++ b/model_data/simulation_system/Makefile
@ -0,0 +1,14 @@
+.PHONY: init
+init: build docker
+
+.PHONY: build
+build: 
+	docker-compose build
+
+.PHONY: docker
+docker: 
+	docker-compose up -d
+
+.PHONY: down
+down: 
+	docker compose down
--- a/model_data/simulation_system/README.md
+++ b/model_data/simulation_system/README.md
@ -0,0 +1,61 @@
+# Simulation System
+
+Starter Readme:
+Steps for pipeline:
+
+- (WIP) Use Makefile to start up mock up s3 service
+    - Running `make init` runs `docker-compose build` followed by `docker-compose up -d`, which spins up an S3 service
+    - Docker Compose runs in detached mode (`-d`), so it will not output anything to the terminal
+
+- Once the Minio service is running, you can run the `training.py` file to start a model build process
+    - This will output a model, for a given target column, and add model name composed of some of the hyperparameters
+    - An example of running this file is:
+        - `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet`
+    - Outputs of the pipeline are:
+        - A model directory bucket
+        - A target variable prefix (e.g. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
+        - A model type prefix (e.g. autogluon, tensorflow etc)
+        - A model name prefix (e.g. rdsap_change_medium_quality_60_TIMESTAMP)
+            - This model name is made up of target variable, quality, time spent training and timestamp
+            - Within this prefix, there are three folders:
+                - model
+                    - The model path that can be loaded in the codebase
+                - deployment
+                    - The optimised model that can be deployed (may or may not need this)
+                - metrics
+                    - The metrics generated from the model (may or may not need this as this can be contained in the registry)
+
+- Once the model build is finished, you can run the `predictions.py` file to generate predictions
+    - By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry
+        - This can be overridden by specifying a `model_path`, which will load an alternative model
+    - There are two ways of getting data into the pipeline:
+        - Using the `--data` argument:
+            - This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'`
+                - Note the single and double quotation marks, as this affects the ingestion
+        - Using the `--data-path` argument:
+            - This can be a filepath (Can imagine that we might want to pull data from S3/ DB)
+    - An example of running the file is:
+        - `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet`
+    - Outputs of the pipeline are:
+        - prediction bucket
+        - a target variable prefix (e.g. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
+        - a uprn prefix (e.g. 0123456789)
+        - a `prediction.json`
+        - a `metadata.json`
+            - This is all the metadata from the model (can change this if needed)
+
+
+# TODO:
+- Structure/ MLOps:
+    - Add configuration files (dev, staging, prod), including hyperparameters
+    - Add precommit hooks (linters, branch names, etc)
+    - Sphinx documentation
+    - Sort out local mock up services
+    - Sort out Model Registry 
+    - Sort out Data version control
+- Data Science:
+    - Rebuild metrics script (Could be a one off but good to have)
+    - Determine metrics 
+    - Implement and test custom model (Tensorflow Decision Trees etc)
+- Orchestration:
+    - Lambda handler for the pipeline
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@ -9,7 +9,7 @@ TEST_DATA_NAME = 'test_data.parquet'

 REGISTRY_FILE = "model_registry.csv"
 MODEL_DIRECTORY = "model_directory"
-REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY / REGISTRY_FILE
+BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY 
 PREDICTION_LOCATION = Path("predictions")
 PREDICTION_FILE = 'prediction.json'
 METADATA_FILE = 'metadata.json'
--- a/model_data/simulation_system/docker-compose.yml
+++ b/model_data/simulation_system/docker-compose.yml
@ -0,0 +1,17 @@
+version: '3'
+
+services:
+  minio:
+    image: minio/minio
+    ports:
+      - "9000:9000"
+      - "9001:9001"
+    volumes:
+      - ./data:/data
+    environment:
+      MINIO_ROOT_USER: &MINIO_USER admin
+      MINIO_ROOT_PASSWORD: &MINIO_PASS password
+    command: server --console-address ":9001" /data
+
+# volumes:
+#   minio_storage: {}
--- a/model_data/simulation_system/energy_predictor.py
+++ b/model_data/simulation_system/energy_predictor.py
@ -1,5 +1,5 @@
 from pathlib import Path
-from core.SettingsSettings import (
+from core.Settings import (
    RDSAP_RESPONSE, 
    FLOOR_LEVEL_MAP, 
    BUILT_FORM_REMAP,
--- a/model_data/simulation_system/predictions.py
+++ b/model_data/simulation_system/predictions.py
@ -12,7 +12,8 @@ import pandas as pd
 from typing import Optional
 from datetime import datetime
 from core.Settings import (
-    REGISTRY_PATH,
+    BASE_REGISTRY_PATH,
+    REGISTRY_FILE,
    PREDICTION_LOCATION,
    PREDICTION_FILE,
    METADATA_FILE
@ -42,13 +43,15 @@ def ingest_arguments() -> argparse.Namespace:
            


-def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
+def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
    """
    Main pipeline function
    """

-    if registry_path is None:
-        logger.error("No registry path provided")
+    registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
+
+    if registry_path is None or not registry_path.exists():
+        logger.error("No registry path provided or registry doesn't exist")
        exit(1)

    if model_path is not None:
@ -104,7 +107,7 @@ def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame =

    # Saving prediction local for now
    logger.info("--- Outputting prediction and metadata --- ")
-    output_base = PREDICTION_LOCATION / uprn / TIMESTAMP
+    output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP
    output_base.mkdir(parents=True, exist_ok=True)

    json_prediction = prediction.to_json(output_base / PREDICTION_FILE)
@ -125,4 +128,4 @@ if __name__ == "__main__":

    # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
    # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
-    prediction(registry_path=REGISTRY_PATH, model_path=args.model_path, data=args.data, data_path=args.data_path)
+    prediction(model_path=args.model_path, data=args.data, data_path=args.data_path)
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@ -1,5 +1,7 @@

 import argparse
+import boto3
+import os 
 from pathlib import Path
 from datetime import datetime
 from typing import List
@ -10,7 +12,8 @@ from MLModel.Models import AutogluonModel
 import pandas as pd
 from core.Settings import (
    MODEL_DIRECTORY,
-    REGISTRY_PATH
+    BASE_REGISTRY_PATH,
+    REGISTRY_FILE
 )

 TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
@ -33,6 +36,17 @@ model_type = "autogluon"
 hyperparameter = HYPERPARAMETERS
 subsample_factor = 200

+# SESSION = boto3.Session()
+
+# S3_CLIENT = SESSION.client(
+#     service_name="s3",
+#     aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'),
+#     aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'),
+#     endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000")
+# )
+
+# S3_CLIENT.create_bucket
+# S3_CLIENT.list_buckets()

 def ingest_arguments() -> argparse.Namespace:
    """
@ -77,12 +91,13 @@ def training(
    test_df = feature_processor.process(test_df, target_column=target_column)

    logger.info('--- Build Model ---')
+
+    model_folder = "model"
+    metrics_folder = "metrics"
+
    if model_type == "autogluon":
        model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower()
-        output_base = Path(MODEL_DIRECTORY) / model_type / model_root 
-
-        model_folder = "model"
-        metrics_folder = "metrics"
+        output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root 

        model = AutogluonModel(
            output_filepath = output_base / model_folder
@ -118,10 +133,12 @@ def training(
    # TODO: Need a model registry - for now have this as a CSV
    # Save this in the model directory
    logger.info("--- Append registry with new model ---")
+
+    registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
    
-    if REGISTRY_PATH.exists():
+    if registry_path.exists():
        logger.info("Registry file found - Loading into Dataframe")
-        registry_df = pd.read_csv(REGISTRY_PATH, index_col=None)
+        registry_df = pd.read_csv(registry_path, index_col=None)
    else:
        registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])

@ -143,7 +160,7 @@ def training(
    registry_df.loc[0, 'best_model'] = True

    logger.info("--- Saving new model to registry ---")
-    registry_df.to_csv(REGISTRY_PATH, index=False)
+    registry_df.to_csv(registry_path, index=False)

    logger.info("--- Training Pipeline Complete --- ")

@ -155,6 +172,7 @@ if __name__ == "__main__":
    logger.info('---Ingest Arguments---')
    args = ingest_arguments()

+    # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
    # TODO: Ingest hyper parameters from somewhere - currently change at the top of script
    training(
        train_filepath=args.train_filepath,