Add readme and a better folder output structure. Starter code for mock up s3

This commit is contained in:
Michael Duong 2023-08-18 10:18:17 +01:00
parent 2a18180c53
commit 2e5c423562
7 changed files with 129 additions and 16 deletions

View file

@ -0,0 +1,14 @@
# Convenience targets for the local mock-up S3 (MinIO) service.
# All targets use the same `docker-compose` binary so the workflow does not
# depend on both the standalone tool and the `docker compose` plugin being
# installed.

# Build the images, then start the services.
.PHONY: init
init: build docker

# Build the images declared in the compose file.
.PHONY: build
build:
	docker-compose build

# Start the services in detached mode (no output is attached to the terminal).
.PHONY: docker
docker:
	docker-compose up -d

# Stop and remove the services started by `docker`.
.PHONY: down
down:
	docker-compose down

View file

@ -0,0 +1,61 @@
# Simulation System
Starter Readme:
Steps for pipeline:
- (WIP) Use the Makefile to start up the mock S3 service
- Running `make init` runs `docker-compose build` and `docker-compose up -d`, which spins up an S3 service
- Docker Compose runs in detached mode (`-d`), so it will not output anything to the terminal
- Once the MinIO service is running, you can run the `training.py` file to start a model build process
- This will output a model for a given target column, with a model name composed of some of the hyperparameters
- An example of running this file is:
- `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet`
- Outputs of the pipeline are:
- A model directory bucket
- A target variable prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
- A model type prefix (i.e. autogluon, tensorflow etc)
- A model name prefix (i.e. rdsap_change_medium_quality_60_TIMESTAMP)
- This model name is made up of target variable, quality, time spent training and timestamp
- Within this prefix, there are three folders:
- model
- The model path that can be loaded in the codebase
- deployment
- The optimised model that can be deployed (may or may not need this)
- metrics
- The metrics generated from the model (may or may not need this as this can be contained in the registry)
- Once the model build is finished, you can run the `predictions.py` file to generate predictions
- By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry
- This can be overridden by specifying a model_path, which will load an alternative model
- There are two ways of getting data into the pipeline:
- Using the `--data` argument:
- This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'`
- Note the single and double quotation marks, as this affects the ingestion
- Using the `--data-path` argument:
- This can be a filepath (Can imagine that we might want to pull data from S3/ DB)
- An example of running the file is:
- `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet`
- Outputs of the pipeline are:
- prediction bucket
- a target variable prefix (i.e. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
- a uprn prefix (i.e. 0123456789)
- a `prediction.json`
- a `metadata.json`
- This is all the metadata from the model (can change this if needed)
# TODO:
- Structure/ MLOps:
- Add configuration files (dev, staging, prod), including hyperparameters
- Add precommit hooks (linters, branch names, etc)
- Sphinx documentation
- Sort out local mock up services
- Sort out Model Registry
- Sort out Data version control
- Data Science:
- Rebuild metrics script (Could be a one off but good to have)
- Determine metrics
- Implement and test custom model (Tensorflow Decision Trees etc)
- Orchestration:
- Lambda handler for the pipeline

View file

@ -9,7 +9,7 @@ TEST_DATA_NAME = 'test_data.parquet'
REGISTRY_FILE = "model_registry.csv"
MODEL_DIRECTORY = "model_directory"
REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY / REGISTRY_FILE
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = 'prediction.json'
METADATA_FILE = 'metadata.json'

View file

@ -0,0 +1,17 @@
# Compose file for the local mock-up storage service: a single MinIO container.
version: '3'
services:
minio:
image: minio/minio
ports:
# Published next to the console port below (presumably the S3 API endpoint — confirm).
- "9000:9000"
# Matches the --console-address passed to the server in `command`.
- "9001:9001"
volumes:
# Bind-mounts ./data on the host to /data, the directory the server is started on.
- ./data:/data
environment:
# NOTE(review): hard-coded credentials, intended for local use only — confirm.
# The YAML anchors (&MINIO_USER, &MINIO_PASS) are not referenced anywhere in this file.
MINIO_ROOT_USER: &MINIO_USER admin
MINIO_ROOT_PASSWORD: &MINIO_PASS password
command: server --console-address ":9001" /data
# Named-volume alternative to the bind mount above, currently disabled.
# volumes:
# minio_storage: {}

View file

@ -1,5 +1,5 @@
from pathlib import Path
from core.SettingsSettings import (
from core.Settings import (
RDSAP_RESPONSE,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP,

View file

@ -12,7 +12,8 @@ import pandas as pd
from typing import Optional
from datetime import datetime
from core.Settings import (
REGISTRY_PATH,
BASE_REGISTRY_PATH,
REGISTRY_FILE,
PREDICTION_LOCATION,
PREDICTION_FILE,
METADATA_FILE
@ -42,13 +43,15 @@ def ingest_arguments() -> argparse.Namespace:
def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
"""
Main pipeline function
"""
if registry_path is None:
logger.error("No registry path provided")
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
if registry_path is None or not registry_path.exists():
logger.error("No registry path provided or registry doesn't exist")
exit(1)
if model_path is not None:
@ -104,7 +107,7 @@ def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame =
# Saving prediction local for now
logger.info("--- Outputting prediction and metadata --- ")
output_base = PREDICTION_LOCATION / uprn / TIMESTAMP
output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP
output_base.mkdir(parents=True, exist_ok=True)
json_prediction = prediction.to_json(output_base / PREDICTION_FILE)
@ -125,4 +128,4 @@ if __name__ == "__main__":
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
prediction(registry_path=REGISTRY_PATH, model_path=args.model_path, data=args.data, data_path=args.data_path)
prediction(model_path=args.model_path, data=args.data, data_path=args.data_path)

View file

@ -1,5 +1,7 @@
import argparse
import boto3
import os
from pathlib import Path
from datetime import datetime
from typing import List
@ -10,7 +12,8 @@ from MLModel.Models import AutogluonModel
import pandas as pd
from core.Settings import (
MODEL_DIRECTORY,
REGISTRY_PATH
BASE_REGISTRY_PATH,
REGISTRY_FILE
)
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
@ -33,6 +36,17 @@ model_type = "autogluon"
hyperparameter = HYPERPARAMETERS
subsample_factor = 200
# SESSION = boto3.Session()
# S3_CLIENT = SESSION.client(
# service_name="s3",
# aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'),
# aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'),
# endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000")
# )
# S3_CLIENT.create_bucket
# S3_CLIENT.list_buckets()
def ingest_arguments() -> argparse.Namespace:
"""
@ -77,12 +91,13 @@ def training(
test_df = feature_processor.process(test_df, target_column=target_column)
logger.info('--- Build Model ---')
model_folder = "model"
metrics_folder = "metrics"
if model_type == "autogluon":
model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower()
output_base = Path(MODEL_DIRECTORY) / model_type / model_root
model_folder = "model"
metrics_folder = "metrics"
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
model = AutogluonModel(
output_filepath = output_base / model_folder
@ -118,10 +133,12 @@ def training(
# TODO: Need a model registry - for now have this as a CSV
# Save this in the model directory
logger.info("--- Append registry with new model ---")
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
if REGISTRY_PATH.exists():
if registry_path.exists():
logger.info("Registry file found - Loading into Dataframe")
registry_df = pd.read_csv(REGISTRY_PATH, index_col=None)
registry_df = pd.read_csv(registry_path, index_col=None)
else:
registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
@ -143,7 +160,7 @@ def training(
registry_df.loc[0, 'best_model'] = True
logger.info("--- Saving new model to registry ---")
registry_df.to_csv(REGISTRY_PATH, index=False)
registry_df.to_csv(registry_path, index=False)
logger.info("--- Training Pipeline Complete --- ")
@ -155,6 +172,7 @@ if __name__ == "__main__":
logger.info('---Ingest Arguments---')
args = ingest_arguments()
# To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
training(
train_filepath=args.train_filepath,