From 0bf5fdd6d84e7f278e106bc5638e4d8cfad48f96 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 18 Sep 2023 08:13:37 +0100 Subject: [PATCH 1/7] fix type hints --- modules/ml-pipeline/src/pipeline/src/build_model.py | 8 +++++--- .../ml-pipeline/src/pipeline/src/core/DataClient.py | 6 +++--- .../pipeline/src/core/interface/InterfaceDataClient.py | 6 +++--- .../ml-pipeline/src/pipeline/src/generate_metrics.py | 10 +++++++--- .../src/pipeline/src/generate_predictions.py | 6 ++++-- modules/ml-pipeline/src/pipeline/src/prepare_data.py | 8 ++++++-- 6 files changed, 28 insertions(+), 16 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/src/build_model.py b/modules/ml-pipeline/src/pipeline/src/build_model.py index 5dfc71a..a07e9cf 100644 --- a/modules/ml-pipeline/src/pipeline/src/build_model.py +++ b/modules/ml-pipeline/src/pipeline/src/build_model.py @@ -53,12 +53,12 @@ def build_model( if train_data is None: if train_filepath is None: raise ValueError(f"Need {train_filepath} if no data supplied") - train_data = dataclient.load_data(location=train_filepath) + train_data = dataclient.load_data(location=train_filepath, load_config=None) if test_data is None: if test_filepath is None: raise ValueError(f"Need {test_filepath} if no data supplied") - test_data = dataclient.load_data(location=test_filepath) + test_data = dataclient.load_data(location=test_filepath, load_config=None) logger.info("----------------------") logger.info("--- Training model ---") @@ -95,7 +95,9 @@ def build_model( logger.info("--- Saving fit metrics ---") logger.info("--------------------------") - dataclient.save_data(obj=metrics_output, location=fit_metrics_filepath) + dataclient.save_data( + obj=metrics_output, location=fit_metrics_filepath, save_config=None + ) if __name__ == "__main__": diff --git a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py b/modules/ml-pipeline/src/pipeline/src/core/DataClient.py index c8c9f2c..28ffff7 100644 --- 
a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/src/core/DataClient.py @@ -8,7 +8,7 @@ import boto3 import pandas as pd from pathlib import Path from io import BytesIO -from typing import List, Union +from typing import List, Union, Any from core.interface.InterfaceDataClient import DataClient from core.Logger import logger @@ -105,7 +105,7 @@ class AWSS3Client: def save_data( self, - obj: object, + obj: Any, location: str, save_config: Union[dict, None] = None, ) -> None: @@ -134,7 +134,7 @@ class AWSS3Client: obj=obj, location=location, save_config=save_config ) - def _save_parquet(self, obj: object, location: str, save_config: dict): + def _save_parquet(self, obj: pd.DataFrame, location: str, save_config: dict): """ Save object as parquet """ diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py b/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py index d572c2b..5e51a99 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py +++ b/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py @@ -3,8 +3,7 @@ Interface for all DataClient i.e. s3, database, local etc """ import pandas as pd -from io import BytesIO -from typing import Protocol, Union +from typing import Protocol, Union, Any class DataClient(Protocol): @@ -22,9 +21,10 @@ class DataClient(Protocol): """ Generic to load data """ + ... 
def save_data( - self, obj: object, location: str, save_config: Union[dict, None] + self, obj: Any, location: str, save_config: Union[dict, None] ) -> None: """ Generic to save data diff --git a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py index 7efeda9..58244bc 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py @@ -59,14 +59,16 @@ def generate_metrics( logger.info("-------------------------") test_data = input_dataclient.load_data( - location=test_data_filepath, + location=test_data_filepath, load_config=None ) logger.info("---------------------------") logger.info("--- Loading predictions ---") logger.info("---------------------------") - predictions = input_dataclient.load_data(location=predictions_output_filepath) + predictions = input_dataclient.load_data( + location=predictions_output_filepath, load_config=None + ) logger.info("--------------------------") logger.info("--- Generating metrics ---") @@ -81,7 +83,9 @@ def generate_metrics( logger.info("--- Saving metrics ---") logger.info("----------------------") - output_dataclient.save_data(obj=metrics_output, location=metrics_output_filepath) + output_dataclient.save_data( + obj=metrics_output, location=metrics_output_filepath, save_config=None + ) if __name__ == "__main__": diff --git a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py index f80ec18..490d7e9 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py @@ -52,7 +52,9 @@ def generate_predictions( logger.info("--- Loading test data ---") logger.info("-------------------------") - test_data = input_dataclient.load_data(location=test_data_filepath) + test_data = input_dataclient.load_data( + location=test_data_filepath, load_config=None + ) 
logger.info("---------------------") logger.info("--- Loading model ---") @@ -78,7 +80,7 @@ def generate_predictions( predictions_df.columns = [predictions_column_name] output_dataclient.save_data( - obj=predictions_df, location=predictions_output_filepath + obj=predictions_df, location=predictions_output_filepath, save_config=None ) diff --git a/modules/ml-pipeline/src/pipeline/src/prepare_data.py b/modules/ml-pipeline/src/pipeline/src/prepare_data.py index 851be48..8caa101 100644 --- a/modules/ml-pipeline/src/pipeline/src/prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/src/prepare_data.py @@ -79,10 +79,14 @@ def prepare_data( logger.info("--- Outputting data ---") logger.info("-----------------------") - output_dataclient.save_data(obj=train, location=output_train_filepath) + output_dataclient.save_data( + obj=train, location=output_train_filepath, save_config=None + ) if test is not None: - output_dataclient.save_data(obj=test, location=output_test_filepath) + output_dataclient.save_data( + obj=test, location=output_test_filepath, save_config=None + ) return train, test From 18276f76163e3e252a510b33db83c2b737ff1c1a Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 18 Sep 2023 08:15:55 +0100 Subject: [PATCH 2/7] remove import --- modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py b/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py index 7f14e03..03ec4a9 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py +++ b/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py @@ -9,7 +9,6 @@ Create additional features from the dataset import pandas as pd from typing import List, Callable, Union from core.interface.InterfaceFeatureProcessor import FeatureProcessor -from core.Logger import logger def feature_processor_factory(feature_processor_type: str) -> FeatureProcessor: From 
3223b002c958eeb3bb9aee702500da94184039da Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 18 Sep 2023 08:17:12 +0100 Subject: [PATCH 3/7] change base pre commit: --- .pre-commit-config.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 34a9b78..d59b9e8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,3 +10,10 @@ repos: rev: 22.10.0 hooks: - id: black +- repo: local + hooks: + - id: dvc-push-experiment + name: DVC - Push to experiment to remote location (experiments) + entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"' + language: system + verbose: true From 40bb9d6f0afefc3b365bd7efb77bcc509ccf29fe Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 18 Sep 2023 19:51:16 +0100 Subject: [PATCH 4/7] change layout --- modules/ml-pipeline/.pre-commit-config.yaml | 2 +- modules/ml-pipeline/Makefile | 2 +- modules/ml-pipeline/README.MD | 24 ++++++++--- .../.devcontainer/devcontainer.json | 0 .../src/{pipeline => }/Development.Dockerfile | 0 .../src/{pipeline => }/Prediction.Dockerfile | 0 modules/ml-pipeline/src/README.md | 3 ++ .../src/pipeline/{src => }/.dvc/.gitignore | 0 .../src/pipeline/{src => }/.dvc/config | 0 .../src/pipeline/{src => }/.dvcignore | 0 modules/ml-pipeline/src/pipeline/README.md | 40 +++++++++++++++++- .../src/pipeline/{src => }/__init__.py | 0 .../src/pipeline/{src => }/build_model.py | 0 .../{src => }/configs/build_model.yaml | 0 .../pipeline/{src => }/configs/client.yaml | 0 .../src/pipeline/{src => }/configs/configs.py | 0 .../{src => }/configs/feature_processor.yaml | 0 .../configs/feature_processor_logic.py | 0 .../{src => }/configs/generate_metrics.yaml | 0 .../configs/generate_predictions.yaml | 0 .../{src => }/configs/prepare_data.yaml | 0 .../{src => }/configs/startup_cleanup.yaml | 0 .../src/pipeline/{src => }/core/DataClient.py | 0 .../{src => }/core/FeatureProcessor.py | 0 
.../src/pipeline/{src => }/core/Logger.py | 0 .../src/pipeline/{src => }/core/MLMetrics.py | 0 .../src/pipeline/{src => }/core/MLModels.py | 0 .../src/pipeline/{src => }/core/__init__.py | 0 .../core/interface/InterfaceDataClient.py | 0 .../interface/InterfaceFeatureProcessor.py | 0 .../core/interface/InterfaceMetrics.py | 0 .../core/interface/InterfaceModels.py | 0 .../{src => }/core/interface/__init__.py | 0 .../src/pipeline/{src => }/data/.gitignore | 0 .../src/pipeline/{src => }/dvc.lock | 0 .../src/pipeline/{src => }/dvc.yaml | 0 .../pipeline/{src => }/generate_metrics.py | 0 .../{src => }/generate_predictions.py | 0 .../src/pipeline/{src => }/metrics/.gitignore | 0 .../src/pipeline/{src => }/prepare_data.py | 0 .../predictions/requirements-dev.txt | 0 .../requirements/predictions/requirements.txt | 0 .../training/requirements-dev.txt | 0 .../requirements/training/requirements.txt | 0 .../version_control/requirements.txt | 0 .../ml-pipeline/src/pipeline/src/.DS_Store | Bin 6148 -> 0 bytes .../src/pipeline/src/.vscode/settings.json | 1 - .../ml-pipeline/src/pipeline/src/README.md | 35 --------------- .../src/pipeline/{src => }/startup_cleanup.py | 0 modules/ml-pipeline/src/pipeline2/README.md | 3 -- 50 files changed, 62 insertions(+), 48 deletions(-) rename modules/ml-pipeline/src/{pipeline => }/.devcontainer/devcontainer.json (100%) rename modules/ml-pipeline/src/{pipeline => }/Development.Dockerfile (100%) rename modules/ml-pipeline/src/{pipeline => }/Prediction.Dockerfile (100%) create mode 100644 modules/ml-pipeline/src/README.md rename modules/ml-pipeline/src/pipeline/{src => }/.dvc/.gitignore (100%) rename modules/ml-pipeline/src/pipeline/{src => }/.dvc/config (100%) rename modules/ml-pipeline/src/pipeline/{src => }/.dvcignore (100%) rename modules/ml-pipeline/src/pipeline/{src => }/__init__.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/build_model.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/configs/build_model.yaml (100%) 
rename modules/ml-pipeline/src/pipeline/{src => }/configs/client.yaml (100%) rename modules/ml-pipeline/src/pipeline/{src => }/configs/configs.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/configs/feature_processor.yaml (100%) rename modules/ml-pipeline/src/pipeline/{src => }/configs/feature_processor_logic.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/configs/generate_metrics.yaml (100%) rename modules/ml-pipeline/src/pipeline/{src => }/configs/generate_predictions.yaml (100%) rename modules/ml-pipeline/src/pipeline/{src => }/configs/prepare_data.yaml (100%) rename modules/ml-pipeline/src/pipeline/{src => }/configs/startup_cleanup.yaml (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/DataClient.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/FeatureProcessor.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/Logger.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/MLMetrics.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/MLModels.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/__init__.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/interface/InterfaceDataClient.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/interface/InterfaceFeatureProcessor.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/interface/InterfaceMetrics.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/interface/InterfaceModels.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/core/interface/__init__.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/data/.gitignore (100%) rename modules/ml-pipeline/src/pipeline/{src => }/dvc.lock (100%) rename modules/ml-pipeline/src/pipeline/{src => }/dvc.yaml (100%) rename modules/ml-pipeline/src/pipeline/{src => }/generate_metrics.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/generate_predictions.py (100%) rename 
modules/ml-pipeline/src/pipeline/{src => }/metrics/.gitignore (100%) rename modules/ml-pipeline/src/pipeline/{src => }/prepare_data.py (100%) rename modules/ml-pipeline/src/pipeline/{src => }/requirements/predictions/requirements-dev.txt (100%) rename modules/ml-pipeline/src/pipeline/{src => }/requirements/predictions/requirements.txt (100%) rename modules/ml-pipeline/src/pipeline/{src => }/requirements/training/requirements-dev.txt (100%) rename modules/ml-pipeline/src/pipeline/{src => }/requirements/training/requirements.txt (100%) rename modules/ml-pipeline/src/pipeline/{src => }/requirements/version_control/requirements.txt (100%) delete mode 100644 modules/ml-pipeline/src/pipeline/src/.DS_Store delete mode 100644 modules/ml-pipeline/src/pipeline/src/.vscode/settings.json delete mode 100644 modules/ml-pipeline/src/pipeline/src/README.md rename modules/ml-pipeline/src/pipeline/{src => }/startup_cleanup.py (100%) delete mode 100644 modules/ml-pipeline/src/pipeline2/README.md diff --git a/modules/ml-pipeline/.pre-commit-config.yaml b/modules/ml-pipeline/.pre-commit-config.yaml index 79ed459..a80ad6e 100644 --- a/modules/ml-pipeline/.pre-commit-config.yaml +++ b/modules/ml-pipeline/.pre-commit-config.yaml @@ -20,6 +20,6 @@ repos: hooks: - id: dvc-push-experiment name: DVC - Push to experiment to remote location (experiments) - entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"' + entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"' language: system verbose: true diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index d4d6fb7..a46b68d 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -11,7 +11,7 @@ dev-pyenv: pyenv install ${PYTHON_VERSION} || echo "Python version already installed" pyenv global ${PYTHON_VERSION} python3 -m venv .dev_env_pipeline - . 
.dev_env_pipeline/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/src/requirements/training/requirements-dev.txt && pip install -r src/pipeline/src/requirements/version_control/requirements.txt && pre-commit install + . .dev_env_pipeline/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/requirements/training/requirements-dev.txt && pip install -r src/pipeline/requirements/version_control/requirements.txt && pre-commit install echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "source .dev_env_pipeline/bin/activate" diff --git a/modules/ml-pipeline/README.MD b/modules/ml-pipeline/README.MD index 7dae279..2711569 100644 --- a/modules/ml-pipeline/README.MD +++ b/modules/ml-pipeline/README.MD @@ -1,16 +1,30 @@ # ML-pipeline -This is a dummy ML-pipeline, consisting of: +This is a generic ML-pipeline, consisting of: - dvc tracking for version control (data and models) - gto for model registry - docs, created via sphinx (in pre-commit hooks) - tests for unit, integration and end to end testing Within `src` folder, the structure is as follows: -- multiple pipelines can be defined - - i.e. for a product, we might require multuple pipelines do deliver a result - - i.e. multiple models -- these models can be all tracked within the same gto model registry +- `pipeline` folder, which contains all the codebase for the generic pipeline + - The pipeline can track multiple models through dvc and gto model registry +- Deployment files: + - Prediction.Dockerfile - code to create the prediction deployment image + - Training.Dockerfile - code to create the training image (i.e. for remote training on EC2/ Fargate) +- Docker development environment: + - If you wish to develop within a docker.
+ +# How to develop using this pipeline: + +Run `make init`, which will: +- Download pyenv (Python version management) +- Download Python 3.X.X as defined in the `make` file - current 3.10.12 +- Create a virtual environment with this version of python +- Install packages in the training and version control directories in the pipeline folder (dev version if applicable) +- Install pre-commit to enable pre-commit hooks + +To use the environment, run `source .dev_env_pipeline/bin/activate`. To enable the virtual envrionemnt created in vscode: - Open settings diff --git a/modules/ml-pipeline/src/pipeline/.devcontainer/devcontainer.json b/modules/ml-pipeline/src/.devcontainer/devcontainer.json similarity index 100% rename from modules/ml-pipeline/src/pipeline/.devcontainer/devcontainer.json rename to modules/ml-pipeline/src/.devcontainer/devcontainer.json diff --git a/modules/ml-pipeline/src/pipeline/Development.Dockerfile b/modules/ml-pipeline/src/Development.Dockerfile similarity index 100% rename from modules/ml-pipeline/src/pipeline/Development.Dockerfile rename to modules/ml-pipeline/src/Development.Dockerfile diff --git a/modules/ml-pipeline/src/pipeline/Prediction.Dockerfile b/modules/ml-pipeline/src/Prediction.Dockerfile similarity index 100% rename from modules/ml-pipeline/src/pipeline/Prediction.Dockerfile rename to modules/ml-pipeline/src/Prediction.Dockerfile diff --git a/modules/ml-pipeline/src/README.md b/modules/ml-pipeline/src/README.md new file mode 100644 index 0000000..377206f --- /dev/null +++ b/modules/ml-pipeline/src/README.md @@ -0,0 +1,3 @@ +# The generic reproducible ML-pipeline + +Pipeline required to build a model to produce an output diff --git a/modules/ml-pipeline/src/pipeline/src/.dvc/.gitignore b/modules/ml-pipeline/src/pipeline/.dvc/.gitignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/.dvc/.gitignore rename to modules/ml-pipeline/src/pipeline/.dvc/.gitignore diff --git 
a/modules/ml-pipeline/src/pipeline/src/.dvc/config b/modules/ml-pipeline/src/pipeline/.dvc/config similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/.dvc/config rename to modules/ml-pipeline/src/pipeline/.dvc/config diff --git a/modules/ml-pipeline/src/pipeline/src/.dvcignore b/modules/ml-pipeline/src/pipeline/.dvcignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/.dvcignore rename to modules/ml-pipeline/src/pipeline/.dvcignore diff --git a/modules/ml-pipeline/src/pipeline/README.md b/modules/ml-pipeline/src/pipeline/README.md index faef685..d47f864 100644 --- a/modules/ml-pipeline/src/pipeline/README.md +++ b/modules/ml-pipeline/src/pipeline/README.md @@ -1,3 +1,39 @@ -# Pipeline 1 +# Training -Pipeline required to build a model to produce an output +This folder contains the code base for training experimentation. + +To understand the pipeline, run `dvc dag` + +There are 4 main steps: +- Preparing data + - This is loading data (locally or from s3) + - Splitting the data into train and validation + - Creating additional features (if needed) + - **Data is cached** + - This will be down to the dvc remote location +- Build model + - For the prepared data, we build a model using our configurations + - Model is saved (locally or s3) + - **Model and fit metrics are cached** + - This will be down to the dvc remote location +- Generate Predictions + - For the given model, we generate predictions on validation test data + - **Predictions are cached** + - This will be down to the dvc remote location +- Generate Metrics + - For the given model, we generate metrics on validation data/test data + - **Metrics are cached** + - This will be down to the dvc remote location + +Workflow: +- Use `dvc metrics show` to view current metrics score +- Adjust parameters/ codebase +- When happy with changes, use `dvc exp run` to trigger an experiment +- Due to cache, only the needed stages are re-run +- Use `dvc metrics diff` to check the change in
metrics +- Use `dvc exp show` to view all experiments + - NOTE: the last experiment will always be applied to the workspace! +- After running experiments, you can apply the best model to workspace using `dvc exp apply [EXPERIMENT_NAME]` + - This experiment will have the corresponding .dvc files for the hashed model and data +- Use version control as normal + - git add, git commit etc diff --git a/modules/ml-pipeline/src/pipeline/src/__init__.py b/modules/ml-pipeline/src/pipeline/__init__.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/__init__.py rename to modules/ml-pipeline/src/pipeline/__init__.py diff --git a/modules/ml-pipeline/src/pipeline/src/build_model.py b/modules/ml-pipeline/src/pipeline/build_model.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/build_model.py rename to modules/ml-pipeline/src/pipeline/build_model.py diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml rename to modules/ml-pipeline/src/pipeline/configs/build_model.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/client.yaml b/modules/ml-pipeline/src/pipeline/configs/client.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/client.yaml rename to modules/ml-pipeline/src/pipeline/configs/client.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/configs.py b/modules/ml-pipeline/src/pipeline/configs/configs.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/configs.py rename to modules/ml-pipeline/src/pipeline/configs/configs.py diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml
rename to modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/feature_processor_logic.py rename to modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py diff --git a/modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml rename to modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml rename to modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml rename to modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml b/modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml rename to modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py b/modules/ml-pipeline/src/pipeline/core/DataClient.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/DataClient.py rename to modules/ml-pipeline/src/pipeline/core/DataClient.py diff --git 
a/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py b/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py rename to modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/Logger.py b/modules/ml-pipeline/src/pipeline/core/Logger.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/Logger.py rename to modules/ml-pipeline/src/pipeline/core/Logger.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/MLMetrics.py b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/MLMetrics.py rename to modules/ml-pipeline/src/pipeline/core/MLMetrics.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/MLModels.py rename to modules/ml-pipeline/src/pipeline/core/MLModels.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/__init__.py b/modules/ml-pipeline/src/pipeline/core/__init__.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/__init__.py rename to modules/ml-pipeline/src/pipeline/core/__init__.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceDataClient.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py rename to modules/ml-pipeline/src/pipeline/core/interface/InterfaceDataClient.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceFeatureProcessor.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceFeatureProcessor.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceFeatureProcessor.py rename 
to modules/ml-pipeline/src/pipeline/core/interface/InterfaceFeatureProcessor.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceMetrics.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceMetrics.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceMetrics.py rename to modules/ml-pipeline/src/pipeline/core/interface/InterfaceMetrics.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceModels.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceModels.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceModels.py rename to modules/ml-pipeline/src/pipeline/core/interface/InterfaceModels.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/__init__.py b/modules/ml-pipeline/src/pipeline/core/interface/__init__.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/__init__.py rename to modules/ml-pipeline/src/pipeline/core/interface/__init__.py diff --git a/modules/ml-pipeline/src/pipeline/src/data/.gitignore b/modules/ml-pipeline/src/pipeline/data/.gitignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/data/.gitignore rename to modules/ml-pipeline/src/pipeline/data/.gitignore diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/dvc.lock rename to modules/ml-pipeline/src/pipeline/dvc.lock diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/dvc.yaml rename to modules/ml-pipeline/src/pipeline/dvc.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py b/modules/ml-pipeline/src/pipeline/generate_metrics.py similarity index 100% rename from 
modules/ml-pipeline/src/pipeline/src/generate_metrics.py rename to modules/ml-pipeline/src/pipeline/generate_metrics.py diff --git a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/generate_predictions.py rename to modules/ml-pipeline/src/pipeline/generate_predictions.py diff --git a/modules/ml-pipeline/src/pipeline/src/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/metrics/.gitignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/metrics/.gitignore rename to modules/ml-pipeline/src/pipeline/metrics/.gitignore diff --git a/modules/ml-pipeline/src/pipeline/src/prepare_data.py b/modules/ml-pipeline/src/pipeline/prepare_data.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/prepare_data.py rename to modules/ml-pipeline/src/pipeline/prepare_data.py diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt rename to modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements.txt rename to modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt rename to 
modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt rename to modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt rename to modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/src/.DS_Store b/modules/ml-pipeline/src/pipeline/src/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 Date: Mon, 18 Sep 2023 19:58:24 +0100 Subject: [PATCH 5/7] change workflow --- ...Pipeline.yml => MLPipelineDevPipeline.yml} | 6 +-- .github/workflows/MLPipelinePostMerge.yml | 54 +++---------------- .github/workflows/MLPipelinePullRequest.yml | 12 ++--- README.md | 14 ++++- 4 files changed, 29 insertions(+), 57 deletions(-) rename .github/workflows/{MLMonitoringDevPipeline.yml => MLPipelineDevPipeline.yml} (86%) diff --git a/.github/workflows/MLMonitoringDevPipeline.yml b/.github/workflows/MLPipelineDevPipeline.yml similarity index 86% rename from .github/workflows/MLMonitoringDevPipeline.yml rename to .github/workflows/MLPipelineDevPipeline.yml index db46d5f..dacfcca 100644 --- a/.github/workflows/MLMonitoringDevPipeline.yml +++ 
b/.github/workflows/MLPipelineDevPipeline.yml @@ -16,19 +16,19 @@ jobs: - name: Install packages to retrieve artifacts run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc pull -r dev - name: Build Prediction docker image (TODO - NEED LAMBDA IMAGE, need to add version from gto registry) run: | - cd modules/ml-pipeline/src/pipeline/ + cd modules/ml-pipeline/src/ REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') docker build . --file Prediction.Dockerfile --tag ${REGISTER_MODEL_NAME} diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index 657d995..14f2f02 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -26,7 +26,7 @@ jobs: - name: Install packages to register model run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Register Model run: | @@ -64,7 +64,7 @@ jobs: - name: Install packages to register model run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Register Model run: | @@ -102,7 +102,7 @@ jobs: - name: Install packages to register model run: | pip install --upgrade pip - pip install -r 
modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Register Model run: | @@ -138,14 +138,14 @@ jobs: - name: Install packages to retrieve artifacts run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc pull -r experiments - name: Push artifacts to Dev @@ -153,7 +153,7 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc push -r dev Register-New-Model-Dev: @@ -173,7 +173,7 @@ jobs: - name: Install packages to register model run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Register Model run: | @@ -211,43 +211,3 @@ jobs: git add . 
git commit -m "Update Registry" git push origin master - - - Register-Prediction-Image-Dev: - needs: [Promote-Artefacts-To-Dev, Register-New-Model-Dev] - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Install packages to retrieve artifacts - run: | - pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt - - - name: Retrieve artifacts (dvc.lock) - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | - cd modules/ml-pipeline/src/pipeline/src - dvc pull -r dev - - - name: Build Prediction docker image (TODO - NEED LAMBDA IMAGE, need to add version from gto registry) - run: | - cd modules/ml-pipeline/src/pipeline/ - REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') - docker build . --file Prediction.Dockerfile --tag ${REGISTER_MODEL_NAME} - - - name: ECR Login - Dev - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | - echo "LOGIN TO ECR" - - - name: Push Prediction image to ECR - Dev - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | - echo "PUSH TO ECR" diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index e1ebd5d..3d5b24e 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -44,19 +44,19 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: AWS_ACCESS_KEY_ID: 
${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc pull -r experiments - name: Build Prediction docker Image run: | - cd modules/ml-pipeline/src/pipeline/ + cd modules/ml-pipeline/src/ docker build . --file Prediction.Dockerfile --tag prediction_test - name: Run Prediction docker container @@ -72,14 +72,14 @@ jobs: - name: Install packages to retrieve artifacts run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc pull -r experiments - uses: actions/setup-python@v4 @@ -90,7 +90,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline echo "## Model metrics" > report.md # Compare metrics to master diff --git a/README.md b/README.md index 2bf78c8..35242a0 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,21 @@ Creating a ML-toolkit that can be reused: - ML pipeline: - - A dummy pipeline that has data version control, experiment + - A generic pipeline that has data version control, experiment tracking and a model registry - ML monitoring: - A bolt-on service that can implement model monitoring +There are multiple protected branches which adapt the generic pipeline to produce different models: +- sap_change-** +- heat_change-** +- carbon_change-** + +These branches will differ by the configuration files that define the data used and the outputs of the 
ML-pipeline +- There can be different additional logic for each branch but the pipeline will be the same. + +# Deployment + +TBD + From e6b5d5900bf0caba89ff6f9b90dac2d03e726c7d Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 18 Sep 2023 20:09:16 +0100 Subject: [PATCH 6/7] fixed docker --- modules/ml-pipeline/src/Prediction.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/ml-pipeline/src/Prediction.Dockerfile b/modules/ml-pipeline/src/Prediction.Dockerfile index 096b51b..ada1b63 100644 --- a/modules/ml-pipeline/src/Prediction.Dockerfile +++ b/modules/ml-pipeline/src/Prediction.Dockerfile @@ -1,7 +1,7 @@ # Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow) FROM python:3.10.12-slim -COPY src/requirements/predictions/requirements.txt requirements.txt +COPY pipeline/requirements/predictions/requirements.txt requirements.txt RUN pip install --upgrade pip RUN pip install -r requirements.txt From dd86f103e92f7fad8022fe4fa9c6b8777550feec Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Mon, 18 Sep 2023 20:10:06 +0100 Subject: [PATCH 7/7] fixed docker --- modules/ml-pipeline/src/Prediction.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/ml-pipeline/src/Prediction.Dockerfile b/modules/ml-pipeline/src/Prediction.Dockerfile index ada1b63..09b775b 100644 --- a/modules/ml-pipeline/src/Prediction.Dockerfile +++ b/modules/ml-pipeline/src/Prediction.Dockerfile @@ -7,8 +7,8 @@ RUN pip install --upgrade pip RUN pip install -r requirements.txt # Assuming in the CI/CD step, there will be a dvc pull step to get data and model, so will just need to run a single script -COPY src/ /home/src/ +COPY pipeline/ /home/pipeline/ -WORKDIR /home/src/ +WORKDIR /home/pipeline/ CMD [ "python", "generate_predictions.py"]