mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Add readme and a better folder output structure. Starter code for mock up s3
This commit is contained in:
parent
2a18180c53
commit
2e5c423562
7 changed files with 129 additions and 16 deletions
14
model_data/simulation_system/Makefile
Normal file
14
model_data/simulation_system/Makefile
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
.PHONY: init
|
||||
init: build docker
|
||||
|
||||
.PHONY: build
|
||||
build:
|
||||
docker-compose build
|
||||
|
||||
.PHONY: docker
|
||||
docker:
|
||||
docker-compose up -d
|
||||
|
||||
.PHONY: down
|
||||
down:
|
||||
docker-compose down
|
||||
61
model_data/simulation_system/README.md
Normal file
61
model_data/simulation_system/README.md
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
# Simulation System
|
||||
|
||||
Starter Readme:
|
||||
Steps for pipeline:
|
||||
|
||||
- (WIP) Use the Makefile to start up the mock S3 service
|
||||
- Running `make init` runs `docker-compose build` and then `docker-compose up -d`, which spins up an S3 service
|
||||
- Docker Compose runs in detached mode (`-d`), so it will not output anything to the terminal
|
||||
|
||||
- Once the MinIO service is running, you can run the `training.py` file to start a model build process
|
||||
- This will output a model for a given target column, with a model name composed of some of the hyperparameters
|
||||
- An example of running this file is:
|
||||
- `python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet`
|
||||
- Outputs of the pipeline are:
|
||||
- A model directory bucket
|
||||
- A target variable prefix (e.g. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
|
||||
- A model type prefix (e.g. autogluon, tensorflow, etc.)
|
||||
- A model name prefix (i.e. rdsap_change_medium_quality_60_TIMESTAMP)
|
||||
- This model name is made up of target variable, quality, time spent training and timestamp
|
||||
- Within this prefix, there are three folders:
|
||||
- model
|
||||
- The model path that can be loaded in the codebase
|
||||
- deployment
|
||||
- The optimised model that can be deployed (may or may not need this)
|
||||
- metrics
|
||||
- The metrics generated from the model (may or may not need this as this can be contained in the registry)
|
||||
|
||||
- Once the model build is finished, you can run the `predictions.py` file to generate predictions
|
||||
- By default, the prediction pipeline will select the best model based on **mean absolute error** from the model registry
|
||||
- This can be overridden by specifying a `model_path`, which will load an alternative model
|
||||
- There are two ways of getting data into the pipeline:
|
||||
- Using the `--data` argument:
|
||||
- This is a JSON string which can be passed as `python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'`
|
||||
- Note the single and double quotation marks, as this affects the ingestion
|
||||
- Using the `--data-path` argument:
|
||||
- This can be a filepath (Can imagine that we might want to pull data from S3/ DB)
|
||||
- An example of running the file is:
|
||||
- `python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet`
|
||||
- Outputs of the pipeline are:
|
||||
- prediction bucket
|
||||
- a target variable prefix (e.g. RDSAP_CHANGE or HEAT_DEMAND_CHANGE)
|
||||
- a UPRN prefix (e.g. 0123456789)
|
||||
- a `prediction.json`
|
||||
- a `metadata.json`
|
||||
- This is all the metadata from the model (can change this if needed)
|
||||
|
||||
|
||||
# TODO:
|
||||
- Structure/ MLOps:
|
||||
- Add configuration files (dev, staging, prod), including hyperparameters
|
||||
- Add precommit hooks (linters, branch names, etc)
|
||||
- Sphinx documentation
|
||||
- Sort out local mock up services
|
||||
- Sort out Model Registry
|
||||
- Sort out Data version control
|
||||
- Data Science:
|
||||
- Rebuild metrics script (Could be a one off but good to have)
|
||||
- Determine metrics
|
||||
- Implement and test custom model (Tensorflow Decision Trees etc)
|
||||
- Orchestration:
|
||||
- Lambda handler for the pipeline
|
||||
|
|
@ -9,7 +9,7 @@ TEST_DATA_NAME = 'test_data.parquet'
|
|||
|
||||
REGISTRY_FILE = "model_registry.csv"
|
||||
MODEL_DIRECTORY = "model_directory"
|
||||
REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY / REGISTRY_FILE
|
||||
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
|
||||
PREDICTION_LOCATION = Path("predictions")
|
||||
PREDICTION_FILE = 'prediction.json'
|
||||
METADATA_FILE = 'metadata.json'
|
||||
|
|
|
|||
17
model_data/simulation_system/docker-compose.yml
Normal file
17
model_data/simulation_system/docker-compose.yml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
version: '3'
|
||||
|
||||
services:
|
||||
minio:
|
||||
image: minio/minio
|
||||
ports:
|
||||
- "9000:9000"
|
||||
- "9001:9001"
|
||||
volumes:
|
||||
- ./data:/data
|
||||
environment:
|
||||
MINIO_ROOT_USER: &MINIO_USER admin
|
||||
MINIO_ROOT_PASSWORD: &MINIO_PASS password
|
||||
command: server --console-address ":9001" /data
|
||||
|
||||
# volumes:
|
||||
# minio_storage: {}
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
from pathlib import Path
|
||||
from core.SettingsSettings import (
|
||||
from core.Settings import (
|
||||
RDSAP_RESPONSE,
|
||||
FLOOR_LEVEL_MAP,
|
||||
BUILT_FORM_REMAP,
|
||||
|
|
|
|||
|
|
@ -12,7 +12,8 @@ import pandas as pd
|
|||
from typing import Optional
|
||||
from datetime import datetime
|
||||
from core.Settings import (
|
||||
REGISTRY_PATH,
|
||||
BASE_REGISTRY_PATH,
|
||||
REGISTRY_FILE,
|
||||
PREDICTION_LOCATION,
|
||||
PREDICTION_FILE,
|
||||
METADATA_FILE
|
||||
|
|
@ -42,13 +43,15 @@ def ingest_arguments() -> argparse.Namespace:
|
|||
|
||||
|
||||
|
||||
def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
|
||||
def prediction(target_column: str = "RDSAP_CHANGE", model_path: str = None, data: pd.DataFrame = None, data_path: Optional[str] = None):
|
||||
"""
|
||||
Main pipeline function
|
||||
"""
|
||||
|
||||
if registry_path is None:
|
||||
logger.error("No registry path provided")
|
||||
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
|
||||
|
||||
if registry_path is None or not registry_path.exists():
|
||||
logger.error("No registry path provided or registry doesn't exist")
|
||||
exit(1)
|
||||
|
||||
if model_path is not None:
|
||||
|
|
@ -104,7 +107,7 @@ def prediction(registry_path: Path, model_path: str = None, data: pd.DataFrame =
|
|||
|
||||
# Saving prediction local for now
|
||||
logger.info("--- Outputting prediction and metadata --- ")
|
||||
output_base = PREDICTION_LOCATION / uprn / TIMESTAMP
|
||||
output_base = PREDICTION_LOCATION / target_column / uprn / TIMESTAMP
|
||||
output_base.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
json_prediction = prediction.to_json(output_base / PREDICTION_FILE)
|
||||
|
|
@ -125,4 +128,4 @@ if __name__ == "__main__":
|
|||
|
||||
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
|
||||
# Data path can be passed as so: python3 predictions.py --data-path ../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
prediction(registry_path=REGISTRY_PATH, model_path=args.model_path, data=args.data, data_path=args.data_path)
|
||||
prediction(model_path=args.model_path, data=args.data, data_path=args.data_path)
|
||||
|
|
@ -1,5 +1,7 @@
|
|||
|
||||
import argparse
|
||||
import boto3
|
||||
import os
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List
|
||||
|
|
@ -10,7 +12,8 @@ from MLModel.Models import AutogluonModel
|
|||
import pandas as pd
|
||||
from core.Settings import (
|
||||
MODEL_DIRECTORY,
|
||||
REGISTRY_PATH
|
||||
BASE_REGISTRY_PATH,
|
||||
REGISTRY_FILE
|
||||
)
|
||||
|
||||
TIMESTAMP = datetime.now().strftime(format="%Y-%m-%d_%H-%M-%S")
|
||||
|
|
@ -33,6 +36,17 @@ model_type = "autogluon"
|
|||
hyperparameter = HYPERPARAMETERS
|
||||
subsample_factor = 200
|
||||
|
||||
# SESSION = boto3.Session()
|
||||
|
||||
# S3_CLIENT = SESSION.client(
|
||||
# service_name="s3",
|
||||
# aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", 'admin'),
|
||||
# aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", 'password'),
|
||||
# endpoint_url=os.environ.get("ENDPOINT_URL", "http://localhost:9000")
|
||||
# )
|
||||
|
||||
# S3_CLIENT.create_bucket
|
||||
# S3_CLIENT.list_buckets()
|
||||
|
||||
def ingest_arguments() -> argparse.Namespace:
|
||||
"""
|
||||
|
|
@ -77,12 +91,13 @@ def training(
|
|||
test_df = feature_processor.process(test_df, target_column=target_column)
|
||||
|
||||
logger.info('--- Build Model ---')
|
||||
|
||||
model_folder = "model"
|
||||
metrics_folder = "metrics"
|
||||
|
||||
if model_type == "autogluon":
|
||||
model_root = f"{target_column}-{HYPERPARAMETERS['presets']}-{HYPERPARAMETERS['time_limit']}-{TIMESTAMP}".lower()
|
||||
output_base = Path(MODEL_DIRECTORY) / model_type / model_root
|
||||
|
||||
model_folder = "model"
|
||||
metrics_folder = "metrics"
|
||||
output_base = Path(MODEL_DIRECTORY) / target_column / model_type / model_root
|
||||
|
||||
model = AutogluonModel(
|
||||
output_filepath = output_base / model_folder
|
||||
|
|
@ -118,10 +133,12 @@ def training(
|
|||
# TODO: Need a model registry - for now have this as a CSV
|
||||
# Save this in the model directory
|
||||
logger.info("--- Append registry with new model ---")
|
||||
|
||||
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
|
||||
|
||||
if REGISTRY_PATH.exists():
|
||||
if registry_path.exists():
|
||||
logger.info("Registry file found - Loading into Dataframe")
|
||||
registry_df = pd.read_csv(REGISTRY_PATH, index_col=None)
|
||||
registry_df = pd.read_csv(registry_path, index_col=None)
|
||||
else:
|
||||
registry_df = pd.DataFrame(columns=['model_type', 'model_name', 'model_location', 'mean_absolute_error', 'root_mean_squared_error', 'mean_squared_error', 'r2', 'pearsonr', 'median_absolute_error', 'mape', 'best_model'])
|
||||
|
||||
|
|
@ -143,7 +160,7 @@ def training(
|
|||
registry_df.loc[0, 'best_model'] = True
|
||||
|
||||
logger.info("--- Saving new model to registry ---")
|
||||
registry_df.to_csv(REGISTRY_PATH, index=False)
|
||||
registry_df.to_csv(registry_path, index=False)
|
||||
|
||||
logger.info("--- Training Pipeline Complete --- ")
|
||||
|
||||
|
|
@ -155,6 +172,7 @@ if __name__ == "__main__":
|
|||
logger.info('---Ingest Arguments---')
|
||||
args = ingest_arguments()
|
||||
|
||||
# To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
|
||||
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
|
||||
training(
|
||||
train_filepath=args.train_filepath,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue