From 2e5c42356203a721ac46abb628687080cb9febea Mon Sep 17 00:00:00 2001
From: Michael Duong <michaelduong@Michaels-MacBook-Pro.local>
Date: Fri, 18 Aug 2023 10:18:17 +0100
Subject: [PATCH] Add readme and a better folder output structure. Starter code
 for mock up s3

---
 model_data/simulation_system/Makefile         | 14 +++++
 model_data/simulation_system/README.md        | 61 +++++++++++++++++++
 model_data/simulation_system/core/Settings.py |  2 +-
 .../simulation_system/docker-compose.yml      | 17 ++++++
 .../simulation_system/energy_predictor.py     |  2 +-
 model_data/simulation_system/predictions.py   | 15 +++--
 model_data/simulation_system/training.py      | 34 ++++++++---
 7 files changed, 129 insertions(+), 16 deletions(-)
 create mode 100644 model_data/simulation_system/Makefile
 create mode 100644 model_data/simulation_system/README.md
 create mode 100644 model_data/simulation_system/docker-compose.yml

diff --git a/model_data/simulation_system/Makefile b/model_data/simulation_system/Makefile
new file mode 100644
index 00000000..97df8d3e
--- /dev/null
+++ b/model_data/simulation_system/Makefile
@@ -0,0 +1,14 @@
+.PHONY: init
+init: build docker
+
+.PHONY: build
+build: 
+	docker-compose build
+
+.PHONY: docker
+docker: 
+	docker-compose up -d
+
+.PHONY: down
+down: # Stop and remove the containers started by the `docker` target; uses the same docker-compose binary as `build`/`docker`
+	docker-compose down
\ No newline at end of file
diff --git a/model_data/simulation_system/README.md b/model_data/simulation_system/README.md
new file mode 100644
index 00000000..281ced31
--- /dev/null
+++ b/model_data/simulation_system/README.md
@@ -0,0 +1,61 @@
+# Simulation System
+
+Starter Readme:
+Steps for pipeline:
+
+- (WIP) Use Makefile to start up mock up s3 service
+    - Running `make init` runs `docker-compose build` and then `docker-compose up -d`, which spins up a local S3-compatible (MinIO) service
+    - Docker Compose runs in detached mode (`-d`), so it will not output anything to the terminal
+
+- Once the MinIO service is running, you can run the `training.py` file to start a model build process
+    - This will output a model for a given target column, with a model name composed of some of the hyperparameters
+    - An example of running this file is:
+        - `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet`
+    - Outputs of the pipeline are:
+        - A model directory bucket
+        - A target variable prefix (e.g. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
+        - A model type prefix (e.g. autogluon, tensorflow, etc.)
+        - A model name prefix (e.g. rdsap_change-medium_quality-60-TIMESTAMP)
+            - This model name is made up of target variable, quality, time spent training and timestamp
+            - Within this prefix, there are three folders:
+                - model
+                    - The model path that can be loaded in the codebase
+                - deployment
+                    - The optimised model that can be deployed (may or may not need this)
+                - metrics
+                    - The metrics generated from the model (may or may not need this as this can be contained in the registry)
+
+- Once the model build is finished, you can run the `predictions.py` file to generate predictions
+    - By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry
+        - This can be overridden by specifying a model_path, which will load an alternative model
+    - There are two ways of getting data into the pipeline:
+        - Using the `--data` argument:
+            - This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'`
+                - Note the single and double quotation marks, as this affects the ingestion
+        - Using the `--data-path` argument:
+            - This can be a filepath (in future we may want to pull data from S3 or a database instead)
+    - An example of running the file is:
+        - `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet`
+    - Outputs of the pipeline are:
+        - prediction bucket
+        - a target variable prefix (e.g. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
+        - a uprn prefix (e.g. 0123456789)
+        - a `prediction.json`
+        - a `metadata.json`
+            - This is all the metadata from the model (can change this if needed)
+
+
+# TODO:
+- Structure/ MLOps:
+    - Add configuration files (dev, staging, prod), including hyperparameters
+    - Add precommit hooks (linters, branch names, etc)
+    - Sphinx documentation
+    - Sort out local mock up services
+    - Sort out Model Registry 
+    - Sort out Data version control
+- Data Science:
+    - Rebuild metrics script (Could be a one off but good to have)
+    - Determine metrics 
+    - Implement and test custom model (Tensorflow Decision Trees etc)
+- Orchestration:
+    - Lambda handler for the pipeline
\ No newline at end of file
diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py
index ac9643fd..0728e68d 100644
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@@ -9,7 +9,7 @@ TEST_DATA_NAME = 'test_data.parquet'
 
 REGISTRY_FILE = "model_registry.csv"
 MODEL_DIRECTORY = "model_directory"
-REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY / REGISTRY_FILE
+BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY 
 PREDICTION_LOCATION = Path("predictions")
 PREDICTION_FILE = 'prediction.json'
 METADATA_FILE = 'metadata.json'
diff --git a/model_data/simulation_system/docker-compose.yml b/model_data/simulation_system/docker-compose.yml
new file mode 100644
index 00000000..55f181bc
--- /dev/null
+++ b/model_data/simulation_system/docker-compose.yml
@@ -0,0 +1,17 @@
+version: '3'
+
+services:
+  minio:
+    image: minio/minio
+    ports:
+      - "9000:9000"
+      - "9001:9001"
+    volumes:
+      - ./data:/data
+    environment:
+      MINIO_ROOT_USER: &MINIO_USER admin
+      MINIO_ROOT_PASSWORD: &MINIO_PASS password
+    command: server --console-address ":9001" /data
+
+# volumes:
+#   minio_storage: {}
\ No newline at end of file
diff --git a/model_data/simulation_system/energy_predictor.py b/model_data/simulation_system/energy_predictor.py
index d195241e..7b90ded2 100644
--- a/model_data/simulation_system/energy_predictor.py
+++ b/model_data/simulation_system/energy_predictor.py
@@ -1,5 +1,5 @@
 from pathlib import Path
-from core.SettingsSettings import (
+from core.Settings import (
     RDSAP_RESPONSE, 
     FLOOR_LEVEL_MAP, 
     BUILT_FORM_REMAP,
diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py
index 7fc6bcc2..7931ecb4 100644
--- a/model_data/simulation_system/predictions.py
+++ b/model_data/simulation_system/predictions.py
@@ -12,7 +12,8 @@ import pandas as pd
 from typing import Optional
 from datetime import datetime
 from core.Settings import (
-    REGISTRY_PATH,
+    BASE_REGISTRY_PATH,
+    REGISTRY_FILE,
     PREDICTION_LOCATION,
     PREDICTION_FILE,
     METADATA_FILE
@@ -42,13 +43,15 @@ def ingest_arguments() -> argparse.Namespace:
             
 
 
-def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
+def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
     """
     Main pipeline function
     """
 
-    if registry_path is None:
-        logger.error("No registry path provided")
+    registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
+
+    if registry_path is None or not registry_path.exists():
+        logger.error("No registry path provided or registry doesn't exist")
         exit(1)
 
     if model_path is not None:
@@ -104,7 +107,7 @@ def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame =
 
     # Saving prediction local for now
     logger.info("--- Outputting prediction and metadata --- ")
-    output_base = PREDICTION_LOCATION / uprn / TIMESTAMP
+    output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP
     output_base.mkdir(parents=True, exist_ok=True)
 
     json_prediction = prediction.to_json(output_base / PREDICTION_FILE)
@@ -125,4 +128,4 @@ if __name__ == "__main__":
 
     # Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
     # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
-    prediction(registry_path=REGISTRY_PATH, model_path=args.model_path, data=args.data, data_path=args.data_path)
\ No newline at end of file
+    prediction(model_path=args.model_path, data=args.data, data_path=args.data_path)
\ No newline at end of file
diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py
index ecad367f..cc2a3939 100644
--- a/model_data/simulation_system/training.py
+++ b/model_data/simulation_system/training.py
@@ -1,5 +1,7 @@
 
 import argparse
+import boto3
+import os 
 from pathlib import Path
 from datetime import datetime
 from typing import List
@@ -10,7 +12,8 @@ from MLModel.Models import AutogluonModel
 import pandas as pd
 from core.Settings import (
     MODEL_DIRECTORY,
-    REGISTRY_PATH
+    BASE_REGISTRY_PATH,
+    REGISTRY_FILE
 )
 
 TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
@@ -33,6 +36,17 @@ model_type = "autogluon"
 hyperparameter = HYPERPARAMETERS
 subsample_factor = 200
 
+# SESSION = boto3.Session()
+
+# S3_CLIENT = SESSION.client(
+#     service_name="s3",
+#     aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'),
+#     aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'),
+#     endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000")
+# )
+
+# S3_CLIENT.create_bucket
+# S3_CLIENT.list_buckets()
 
 def ingest_arguments() -> argparse.Namespace:
     """
@@ -77,12 +91,13 @@ def training(
     test_df = feature_processor.process(test_df, target_column=target_column)
 
     logger.info('--- Build Model ---')
+
+    model_folder = "model"
+    metrics_folder = "metrics"
+
     if model_type == "autogluon":
         model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower()
-        output_base = Path(MODEL_DIRECTORY) / model_type / model_root 
-
-        model_folder = "model"
-        metrics_folder = "metrics"
+        output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root 
 
         model = AutogluonModel(
             output_filepath = output_base / model_folder
@@ -118,10 +133,12 @@ def training(
     # TODO: Need a model registry - for now have this as a CSV
     # Save this in the model directory
     logger.info("--- Append registry with new model ---")
+
+    registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
     
-    if REGISTRY_PATH.exists():
+    if registry_path.exists():
         logger.info("Registry file found - Loading into Dataframe")
-        registry_df = pd.read_csv(REGISTRY_PATH, index_col=None)
+        registry_df = pd.read_csv(registry_path, index_col=None)
     else:
         registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
 
@@ -143,7 +160,7 @@ def training(
     registry_df.loc[0, 'best_model'] = True
 
     logger.info("--- Saving new model to registry ---")
-    registry_df.to_csv(REGISTRY_PATH, index=False)
+    registry_df.to_csv(registry_path, index=False)
 
     logger.info("--- Training Pipeline Complete --- ")
 
@@ -155,6 +172,7 @@ if __name__ == "__main__":
     logger.info('---Ingest Arguments---')
     args = ingest_arguments()
 
+    # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
     # TODO: Ingest hyper parameters from somewhere - currently change at the top of script
     training(
         train_filepath=args.train_filepath,