From 2e5c42356203a721ac46abb628687080cb9febea Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 18 Aug 2023 10:18:17 +0100 Subject: [PATCH] Add readme and a better folder output structure. Starter code for mock up s3 --- model_data/simulation_system/Makefile | 14 +++++ model_data/simulation_system/README.md | 61 +++++++++++++++++++ model_data/simulation_system/core/Settings.py | 2 +- .../simulation_system/docker-compose.yml | 17 ++++++ .../simulation_system/energy_predictor.py | 2 +- model_data/simulation_system/predictions.py | 15 +++-- model_data/simulation_system/training.py | 34 ++++++++--- 7 files changed, 129 insertions(+), 16 deletions(-) create mode 100644 model_data/simulation_system/Makefile create mode 100644 model_data/simulation_system/README.md create mode 100644 model_data/simulation_system/docker-compose.yml diff --git a/model_data/simulation_system/Makefile b/model_data/simulation_system/Makefile new file mode 100644 index 00000000..97df8d3e --- /dev/null +++ b/model_data/simulation_system/Makefile @@ -0,0 +1,14 @@ +.PHONY: init +init: build docker + +.PHONY: build +build: + docker-compose build + +.PHONY: docker +docker: + docker-compose up -d + +.PHONY: down +down: + docker compose down \ No newline at end of file diff --git a/model_data/simulation_system/README.md b/model_data/simulation_system/README.md new file mode 100644 index 00000000..281ced31 --- /dev/null +++ b/model_data/simulation_system/README.md @@ -0,0 +1,61 @@ +# Simulation System + +Starter Readme: +Steps for pipeline: + +- (WIP) Use Makefile to start up mock up s3 service + - By running `make init`, this will run the `docker-compose build` and `docker-compose up -d`, which spins up an S3 service + - This docker compose is running in detached mode `-d`, so will not output anything to the terminal + +- Once the Minio service is run, you can run the `training.py` file to start a model build process + - This will output a model, for a given target column, and add model name 
composed of some of the hyperparameters + - An example of running this file is: + - `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet` + - Outputs of the pipeline are: + - A model directory bucket + - A target variable prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE) + - A model type prefix (i.e. autogluon, tensorflow etc) + - A model name prefix (i.e. rdsap_change_medium_quality_60_TIMESTAMP) + - This model name is made up of target variable, quality, time spent training and timestamp + - Within this prefix, there are three folders: + - model + - The model path that can be loaded in the codebase + - deployment + - The optimised model that can be deployed (may or may not need this) + - metrics + - The metrics generated from the model (may or may not need this as this can be contained in the registry) + +- Once model build is finished, you can run the `predictions.py` file to generate predictions + - By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry + - This can be overridden by specifying a model_path, which will load an alternative model + - There are two ways of getting data into the pipeline: + - Using the `--data` argument: + - This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'` + - Note the single and double quotation marks, as this affects the ingestion + - Using the `--data-path` argument: + - This can be a filepath (Can imagine that we might want to pull data from S3/ DB) + - An example of running the file is: + - `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet` + - Outputs of the pipeline are: + - prediction bucket + - a Target variables prefix (i.e. 
RDSAP_CHANGE or HEAT_DEMAND_CHANGE) + - a uprn prefix (i.e. 0123456789) + - a `prediction.json` + - a `metadata.json` + - This is all the metadata from the model (can change this if needed) + + +# TODO: +- Structure/ MLOps: + - Add configuration files (dev, staging, prod), including hyperparameters + - Add precommit hooks (linters, branch names, etc) + - Sphinx documentation + - Sort out local mock up services + - Sort out Model Registry + - Sort out Data version control +- Data Science: + - Rebuild metrics script (Could be a one off but good to have) + - Determine metrics + - Implement and test custom model (Tensorflow Decision Trees etc) +- Orchestration: + - Lambda handler for the pipeline \ No newline at end of file diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py index ac9643fd..0728e68d 100644 --- a/model_data/simulation_system/core/Settings.py +++ b/model_data/simulation_system/core/Settings.py @@ -9,7 +9,7 @@ TEST_DATA_NAME = 'test_data.parquet' REGISTRY_FILE = "model_registry.csv" MODEL_DIRECTORY = "model_directory" -REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY / REGISTRY_FILE +BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY PREDICTION_LOCATION = Path("predictions") PREDICTION_FILE = 'prediction.json' METADATA_FILE = 'metadata.json' diff --git a/model_data/simulation_system/docker-compose.yml b/model_data/simulation_system/docker-compose.yml new file mode 100644 index 00000000..55f181bc --- /dev/null +++ b/model_data/simulation_system/docker-compose.yml @@ -0,0 +1,17 @@ +version: '3' + +services: + minio: + image: minio/minio + ports: + - "9000:9000" + - "9001:9001" + volumes: + - ./data:/data + environment: + MINIO_ROOT_USER: &MINIO_USER admin + MINIO_ROOT_PASSWORD: &MINIO_PASS password + command: server --console-address ":9001" /data + +# volumes: +# minio_storage: {} \ No newline at end of file diff --git a/model_data/simulation_system/energy_predictor.py 
b/model_data/simulation_system/energy_predictor.py index d195241e..7b90ded2 100644 --- a/model_data/simulation_system/energy_predictor.py +++ b/model_data/simulation_system/energy_predictor.py @@ -1,5 +1,5 @@ from pathlib import Path -from core.SettingsSettings import ( +from core.Settings import ( RDSAP_RESPONSE, FLOOR_LEVEL_MAP, BUILT_FORM_REMAP, diff --git a/model_data/simulation_system/predictions.py b/model_data/simulation_system/predictions.py index 7fc6bcc2..7931ecb4 100644 --- a/model_data/simulation_system/predictions.py +++ b/model_data/simulation_system/predictions.py @@ -12,7 +12,8 @@ import pandas as pd from typing import Optional from datetime import datetime from core.Settings import ( - REGISTRY_PATH, + BASE_REGISTRY_PATH, + REGISTRY_FILE, PREDICTION_LOCATION, PREDICTION_FILE, METADATA_FILE @@ -42,13 +43,15 @@ def ingest_arguments() -> argparse.Namespace: -def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None): +def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None): """ Main pipeline function """ - if registry_path is None: - logger.error("No registry path provided") + registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE + + if registry_path is None or not registry_path.exists(): + logger.error("No registry path provided or registry doesn't exist") exit(1) if model_path is not None: @@ -104,7 +107,7 @@ def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame = # Saving prediction local for now logger.info("--- Outputting prediction and metadata --- ") - output_base = PREDICTION_LOCATION / uprn / TIMESTAMP + output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP output_base.mkdir(parents=True, exist_ok=True) json_prediction = prediction.to_json(output_base / PREDICTION_FILE) @@ -125,4 +128,4 @@ if __name__ == "__main__": # Data can be passed in as JSON 
string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}' # Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet - prediction(registry_path=REGISTRY_PATH, model_path=args.model_path, data=args.data, data_path=args.data_path) \ No newline at end of file + prediction(model_path=args.model_path, data=args.data, data_path=args.data_path) \ No newline at end of file diff --git a/model_data/simulation_system/training.py b/model_data/simulation_system/training.py index ecad367f..cc2a3939 100644 --- a/model_data/simulation_system/training.py +++ b/model_data/simulation_system/training.py @@ -1,5 +1,7 @@ import argparse +import boto3 +import os from pathlib import Path from datetime import datetime from typing import List @@ -10,7 +12,8 @@ from MLModel.Models import AutogluonModel import pandas as pd from core.Settings import ( MODEL_DIRECTORY, - REGISTRY_PATH + BASE_REGISTRY_PATH, + REGISTRY_FILE ) TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S") @@ -33,6 +36,17 @@ model_type = "autogluon" hyperparameter = HYPERPARAMETERS subsample_factor = 200 +# SESSION = boto3.Session() + +# S3_CLIENT = SESSION.client( +# service_name="s3", +# aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'), +# aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'), +# endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000") +# ) + +# S3_CLIENT.create_bucket +# S3_CLIENT.list_buckets() def ingest_arguments() -> argparse.Namespace: """ @@ -77,12 +91,13 @@ def training( test_df = feature_processor.process(test_df, target_column=target_column) logger.info('--- Build Model ---') + + model_folder = "model" + metrics_folder = "metrics" + if model_type == "autogluon": model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower() - output_base = Path(MODEL_DIRECTORY) / model_type / model_root - - 
model_folder = "model" - metrics_folder = "metrics" + output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root model = AutogluonModel( output_filepath = output_base / model_folder @@ -118,10 +133,12 @@ def training( # TODO: Need a model registry - for now have this as a CSV # Save this in the model directory logger.info("--- Append registry with new model ---") + + registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE - if REGISTRY_PATH.exists(): + if registry_path.exists(): logger.info("Registry file found - Loading into Dataframe") - registry_df = pd.read_csv(REGISTRY_PATH, index_col=None) + registry_df = pd.read_csv(registry_path, index_col=None) else: registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model']) @@ -143,7 +160,7 @@ def training( registry_df.loc[0, 'best_model'] = True logger.info("--- Saving new model to registry ---") - registry_df.to_csv(REGISTRY_PATH, index=False) + registry_df.to_csv(registry_path, index=False) logger.info("--- Training Pipeline Complete --- ") @@ -155,6 +172,7 @@ if __name__ == "__main__": logger.info('---Ingest Arguments---') args = ingest_arguments() + # To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet # TODO: Ingest hyper parameters from somewhere - currently change at the top of script training( train_filepath=args.train_filepath,