Merge pull request #36 from Hestia-Homes/readme

Readme
This commit is contained in:
quandanrepo 2023-09-18 20:16:42 +01:00 committed by GitHub
commit b6e155828b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
55 changed files with 129 additions and 125 deletions

View file

@ -16,19 +16,19 @@ jobs:
- name: Install packages to retrieve artifacts
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Retrieve artifacts (dvc.lock)
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
cd modules/ml-pipeline/src/pipeline/src
cd modules/ml-pipeline/src/pipeline
dvc pull -r dev
- name: Build Prediction docker image (TODO - NEED LAMBDA IMAGE, need to add version from gto registry)
run: |
cd modules/ml-pipeline/src/pipeline/
cd modules/ml-pipeline/src/
REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}')
docker build . --file Prediction.Dockerfile --tag ${REGISTER_MODEL_NAME}

View file

@ -26,7 +26,7 @@ jobs:
- name: Install packages to register model
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Register Model
run: |
@ -64,7 +64,7 @@ jobs:
- name: Install packages to register model
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Register Model
run: |
@ -102,7 +102,7 @@ jobs:
- name: Install packages to register model
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Register Model
run: |
@ -138,14 +138,14 @@ jobs:
- name: Install packages to retrieve artifacts
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Retrieve artifacts (dvc.lock)
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
cd modules/ml-pipeline/src/pipeline/src
cd modules/ml-pipeline/src/pipeline
dvc pull -r experiments
- name: Push artifacts to Dev
@ -153,7 +153,7 @@ jobs:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
cd modules/ml-pipeline/src/pipeline/src
cd modules/ml-pipeline/src/pipeline
dvc push -r dev
Register-New-Model-Dev:
@ -173,7 +173,7 @@ jobs:
- name: Install packages to register model
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Register Model
run: |
@ -211,43 +211,3 @@ jobs:
git add .
git commit -m "Update Registry"
git push origin master
Register-Prediction-Image-Dev:
needs: [Promote-Artefacts-To-Dev, Register-New-Model-Dev]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Install packages to retrieve artifacts
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt
- name: Retrieve artifacts (dvc.lock)
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
cd modules/ml-pipeline/src/pipeline/src
dvc pull -r dev
- name: Build Prediction docker image (TODO - NEED LAMBDA IMAGE, need to add version from gto registry)
run: |
cd modules/ml-pipeline/src/pipeline/
REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}')
docker build . --file Prediction.Dockerfile --tag ${REGISTER_MODEL_NAME}
- name: ECR Login - Dev
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
echo "LOGIN TO ECR"
- name: Push Prediction image to ECR - Dev
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
echo "PUSH TO ECR"

View file

@ -44,19 +44,19 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Retrieve artifacts (dvc.lock)
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
cd modules/ml-pipeline/src/pipeline/src
cd modules/ml-pipeline/src/pipeline
dvc pull -r experiments
- name: Build Prediction docker Image
run: |
cd modules/ml-pipeline/src/pipeline/
cd modules/ml-pipeline/src/
docker build . --file Prediction.Dockerfile --tag prediction_test
- name: Run Prediction docker container
@ -72,14 +72,14 @@ jobs:
- name: Install packages to retrieve artifacts
run: |
pip install --upgrade pip
pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt
pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt
- name: Retrieve artifacts (dvc.lock)
env:
AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
run: |
cd modules/ml-pipeline/src/pipeline/src
cd modules/ml-pipeline/src/pipeline
dvc pull -r experiments
- uses: actions/setup-python@v4
@ -90,7 +90,7 @@ jobs:
AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
cd modules/ml-pipeline/src/pipeline/src
cd modules/ml-pipeline/src/pipeline
echo "## Model metrics" > report.md
# Compare metrics to master

View file

@ -10,3 +10,10 @@ repos:
rev: 22.10.0
hooks:
- id: black
- repo: local
hooks:
- id: dvc-push-experiment
name: DVC - Push to experiment to remote location (experiments)
entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"'
language: system
verbose: true

View file

@ -3,9 +3,21 @@
Creating a ML-toolkit that can be reused:
- ML pipeline:
- A dummy pipeline that has data version control, experiment
- A generic pipeline that has data version control, experiment
tracking and a model registry
- ML monitoring:
- A bolt-on service that can implement model monitoring
There are multiple protected branches which adapt the generic pipeline to produce different models:
- sap_change-**
- heat_change-**
- carbon_change-**
These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline
- There can be different additional logic for each branch but the pipeline will be the same.
# Deployment
TBD

View file

@ -20,6 +20,6 @@ repos:
hooks:
- id: dvc-push-experiment
name: DVC - Push to experiment to remote location (experiments)
entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"'
entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"'
language: system
verbose: true

View file

@ -11,7 +11,7 @@ dev-pyenv:
pyenv install ${PYTHON_VERSION} || echo "Python version already installed"
pyenv global ${PYTHON_VERSION}
python3 -m venv .dev_env_pipeline
. .dev_env_pipeline/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/src/requirements/training/requirements-dev.txt && pip install -r src/pipeline/src/requirements/version_control/requirements.txt && pre-commit install
. .dev_env_pipeline/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/requirements/training/requirements-dev.txt && pip install -r src/pipeline/requirements/version_control/requirements.txt && pre-commit install
echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
echo "source .dev_env_pipeline/bin/activate"

View file

@ -1,16 +1,30 @@
# ML-pipeline
This is a dummy ML-pipeline, consisting of:
This is a generic ML-pipeline, consisting of:
- dvc tracking for version control (data and models)
- gto for model registry
- docs, created via sphinx (in pre-commit hooks)
- tests for unit, integration and end to end testing
Within `src` folder, the structure is as follows:
- multiple pipelines can be defined
- i.e. for a product, we might require multuple pipelines do deliver a result
- i.e. multiple models
- these models can be all tracked within the same gto model registry
- `pipeline` folder, which contains all the codebase for the generic pipeline
- The pipeline can track multiple models through dvc and gto model registry
- Deployment files:
- Prediction.Dockerfile - code to create the prediction deployment image
- Training.Dockerfile - code to create the training image (i.e. for remote training on EC2/Fargate)
- Docker development environment:
- If you wish to develop within a Docker container.
# How to develop using this pipeline:
Run `make init`, which will:
- Download pyenv (Python version management)
- Download Python 3.X.X as defined in the `make` file - currently 3.10.12
- Create a virtual environment with this version of python
- Install packages in the training and version control directories in the pipeline folder (dev version if applicable)
- Install pre-commit to enable pre-commit hooks
To use the environment, run `source .dev_env_pipeline/bin/activate`.
To enable the created virtual environment in VS Code:
- Open settings

View file

@ -1,14 +1,14 @@
# Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow)
FROM python:3.10.12-slim
COPY src/requirements/predictions/requirements.txt requirements.txt
COPY pipeline/requirements/predictions/requirements.txt requirements.txt
RUN pip install --upgrade pip
RUN pip install -r requirements.txt
# Assuming in the CI/CD step, there will be a dvc pull step to get data and model, so will just need to run a single script
COPY src/ /home/src/
COPY pipeline/ /home/pipeline/
WORKDIR /home/src/
WORKDIR /home/pipeline/
CMD [ "python", "generate_predictions.py"]

View file

@ -0,0 +1,3 @@
# The generic reproducible ML-pipeline
Pipeline required to build a model to produce an output

View file

@ -1,3 +1,39 @@
# Pipeline 1
# Training
Pipeline required to build a model to produce an output
This folder contains the code base for training experimentation.
To understand the pipeline, run `dvc dag`
There are 4 main steps:
- Preparing data
- This is loading data (locally or from s3)
- Splitting the data into train and validation
- Creating additional features (if needed)
- **Data is cached**
- This will be down to the dvc remote location
- Build model
- For the prepared data, we build a model using our configurations
- Model is saved (locally or s3)
- **Model and fit metrics are cached**
- This will be down to the dvc remote location
- Generate Predictions
- For the given model, we generate predictions on validation/test data
- **Predictions are cached**
- This will be down to the dvc remote location
- Generate Metrics
- For the given model, we generate metrics on validation data/test data
- **Metrics are cached**
- This will be down to the dvc remote location
Workflow:
- Use `dvc metrics show` to view current metrics score
- Adjust parameters/ codebase
- When happy with changes, use `dvc exp run` to trigger an experiment
- Due to caching, only the needed stages are re-run
- Use `dvc metrics diff` to check the change in metrics
- Use `dvc exp show` to view all experiments
- NOTE: the last experiment will always be applied to the workspace!
- After running experiments, you can apply the best model to the workspace using `dvc exp apply [EXPERIMENT_NAME]`
- This experiment will have the corresponding .dvc files for the hashed model and data
- Use version control as normal
- git add, git commit etc

View file

@ -53,12 +53,12 @@ def build_model(
if train_data is None:
if train_filepath is None:
raise ValueError(f"Need {train_filepath} if no data supplied")
train_data = dataclient.load_data(location=train_filepath)
train_data = dataclient.load_data(location=train_filepath, load_config=None)
if test_data is None:
if test_filepath is None:
raise ValueError(f"Need {test_filepath} if no data supplied")
test_data = dataclient.load_data(location=test_filepath)
test_data = dataclient.load_data(location=test_filepath, load_config=None)
logger.info("----------------------")
logger.info("--- Training model ---")
@ -95,7 +95,9 @@ def build_model(
logger.info("--- Saving fit metrics ---")
logger.info("--------------------------")
dataclient.save_data(obj=metrics_output, location=fit_metrics_filepath)
dataclient.save_data(
obj=metrics_output, location=fit_metrics_filepath, save_config=None
)
if __name__ == "__main__":

View file

@ -8,7 +8,7 @@ import boto3
import pandas as pd
from pathlib import Path
from io import BytesIO
from typing import List, Union
from typing import List, Union, Any
from core.interface.InterfaceDataClient import DataClient
from core.Logger import logger
@ -105,7 +105,7 @@ class AWSS3Client:
def save_data(
self,
obj: object,
obj: Any,
location: str,
save_config: Union[dict, None] = None,
) -> None:
@ -134,7 +134,7 @@ class AWSS3Client:
obj=obj, location=location, save_config=save_config
)
def _save_parquet(self, obj: object, location: str, save_config: dict):
def _save_parquet(self, obj: pd.DataFrame, location: str, save_config: dict):
"""
Save object as parquet
"""

View file

@ -9,7 +9,6 @@ Create additional features from the dataset
import pandas as pd
from typing import List, Callable, Union
from core.interface.InterfaceFeatureProcessor import FeatureProcessor
from core.Logger import logger
def feature_processor_factory(feature_processor_type: str) -> FeatureProcessor:

View file

@ -3,8 +3,7 @@ Interface for all DataClient i.e. s3, database, local etc
"""
import pandas as pd
from io import BytesIO
from typing import Protocol, Union
from typing import Protocol, Union, Any
class DataClient(Protocol):
@ -22,9 +21,10 @@ class DataClient(Protocol):
"""
Generic to load data
"""
...
def save_data(
self, obj: object, location: str, save_config: Union[dict, None]
self, obj: Any, location: str, save_config: Union[dict, None]
) -> None:
"""
Generic to save data

View file

@ -59,14 +59,16 @@ def generate_metrics(
logger.info("-------------------------")
test_data = input_dataclient.load_data(
location=test_data_filepath,
location=test_data_filepath, load_config=None
)
logger.info("---------------------------")
logger.info("--- Loading predictions ---")
logger.info("---------------------------")
predictions = input_dataclient.load_data(location=predictions_output_filepath)
predictions = input_dataclient.load_data(
location=predictions_output_filepath, load_config=None
)
logger.info("--------------------------")
logger.info("--- Generating metrics ---")
@ -81,7 +83,9 @@ def generate_metrics(
logger.info("--- Saving metrics ---")
logger.info("----------------------")
output_dataclient.save_data(obj=metrics_output, location=metrics_output_filepath)
output_dataclient.save_data(
obj=metrics_output, location=metrics_output_filepath, save_config=None
)
if __name__ == "__main__":

View file

@ -52,7 +52,9 @@ def generate_predictions(
logger.info("--- Loading test data ---")
logger.info("-------------------------")
test_data = input_dataclient.load_data(location=test_data_filepath)
test_data = input_dataclient.load_data(
location=test_data_filepath, load_config=None
)
logger.info("---------------------")
logger.info("--- Loading model ---")
@ -78,7 +80,7 @@ def generate_predictions(
predictions_df.columns = [predictions_column_name]
output_dataclient.save_data(
obj=predictions_df, location=predictions_output_filepath
obj=predictions_df, location=predictions_output_filepath, save_config=None
)

View file

@ -79,10 +79,14 @@ def prepare_data(
logger.info("--- Outputting data ---")
logger.info("-----------------------")
output_dataclient.save_data(obj=train, location=output_train_filepath)
output_dataclient.save_data(
obj=train, location=output_train_filepath, save_config=None
)
if test is not None:
output_dataclient.save_data(obj=test, location=output_test_filepath)
output_dataclient.save_data(
obj=test, location=output_test_filepath, save_config=None
)
return train, test

Binary file not shown.

View file

@ -1 +0,0 @@
{}

View file

@ -1,35 +0,0 @@
# Training
This folder contains the code base for training experimentation.
To understand the pipeline, run `dvc dag`
There are 3 main steps:
- Preparing data
- This is loading data (locally or from s3)
- Splitting the data into train and validation
- Creating additional features (if needed)
- **Data is cached**
- This will be down to the dvc remote location
- Build model
- For the prepared data, we build a model using our configurations
- Model is saved (locally or s3)
- **Model is cached**
- This will be down to the dvc remote location
- Generate Metrics
- For the given model, we generate metrics on validation data/test data
- **Metrics are cached**
- This will be down to the dvc remote location
Workflow:
- Use `dvc metrics show` to view current metrics score
- Adjust parameters/ codebase
- When happy with changes, use `dvc exp run` to trigger an experiment
- Due to cache, only need stages are re-run
- Use `dvc metrics diff` to check the change in metrics
- Use `dvc exp show` to view all experiments
- NOTE: the last experiment will always be applied to the workspace!
- After running experiments, you can apply the the best model to workspace using `dvc exp apply [EXPERIMENT_NAME]`
- This experiment will have the corresponding .dvc files for the hashed model and data
- Use version control as normal
- git add, git commit etc

View file

@ -1,3 +0,0 @@
# Pipeline2
PLACEHOLDER PIPELINE IF NEEDED