diff --git a/modules/ml-pipeline/.pre-commit-config.yaml b/modules/ml-pipeline/.pre-commit-config.yaml index 79ed459..a80ad6e 100644 --- a/modules/ml-pipeline/.pre-commit-config.yaml +++ b/modules/ml-pipeline/.pre-commit-config.yaml @@ -20,6 +20,6 @@ repos: hooks: - id: dvc-push-experiment name: DVC - Push to experiment to remote location (experiments) - entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"' + entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"' language: system verbose: true diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index d4d6fb7..a46b68d 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -11,7 +11,7 @@ dev-pyenv: pyenv install ${PYTHON_VERSION} || echo "Python version already installed" pyenv global ${PYTHON_VERSION} python3 -m venv .dev_env_pipeline - . .dev_env_pipeline/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/src/requirements/training/requirements-dev.txt && pip install -r src/pipeline/src/requirements/version_control/requirements.txt && pre-commit install + . 
.dev_env_pipeline/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/requirements/training/requirements-dev.txt && pip install -r src/pipeline/requirements/version_control/requirements.txt && pre-commit install echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "source .dev_env_pipeline/bin/activate" diff --git a/modules/ml-pipeline/README.MD b/modules/ml-pipeline/README.MD index 7dae279..2711569 100644 --- a/modules/ml-pipeline/README.MD +++ b/modules/ml-pipeline/README.MD @@ -1,16 +1,30 @@ # ML-pipeline -This is a dummy ML-pipeline, consisting of: +This is a generic ML-pipeline, consisting of: - dvc tracking for version control (data and models) - gto for model registry - docs, created via sphinx (in pre-commit hooks) - tests for unit, integration and end to end testing Within `src` folder, the structure is as follows: -- multiple pipelines can be defined - - i.e. for a product, we might require multuple pipelines do deliver a result - - i.e. multiple models -- these models can be all tracked within the same gto model registry +- `pipeline` folder, which contains all the codebase for the generic pipeline + - The pipeline can track multiple models through dvc and gto model registry +- Deployment files: + - Prediction.Dockerfile - code to create the prediction deployment image + - Training.Dockerfile - code to create the training image (e.g. for remote training on EC2/Fargate) +- Docker development environment: + - If you wish to develop within a Docker container, use `Development.Dockerfile` together with `.devcontainer/devcontainer.json`. 
+ +# How to develop using this pipeline: + +Run `make init`, which will: +- Download pyenv (Python version management) +- Download Python 3.X.X as defined in the `Makefile` (currently 3.10.12) +- Create a virtual environment with this version of Python +- Install packages in the training and version control directories in the pipeline folder (dev version if applicable) +- Install pre-commit to enable pre-commit hooks + +To use the environment, run `source .dev_env_pipeline/bin/activate`. To enable the virtual envrionemnt created in vscode: - Open settings diff --git a/modules/ml-pipeline/src/pipeline/.devcontainer/devcontainer.json b/modules/ml-pipeline/src/.devcontainer/devcontainer.json similarity index 100% rename from modules/ml-pipeline/src/pipeline/.devcontainer/devcontainer.json rename to modules/ml-pipeline/src/.devcontainer/devcontainer.json diff --git a/modules/ml-pipeline/src/pipeline/Development.Dockerfile b/modules/ml-pipeline/src/Development.Dockerfile similarity index 100% rename from modules/ml-pipeline/src/pipeline/Development.Dockerfile rename to modules/ml-pipeline/src/Development.Dockerfile diff --git a/modules/ml-pipeline/src/pipeline/Prediction.Dockerfile b/modules/ml-pipeline/src/Prediction.Dockerfile similarity index 100% rename from modules/ml-pipeline/src/pipeline/Prediction.Dockerfile rename to modules/ml-pipeline/src/Prediction.Dockerfile diff --git a/modules/ml-pipeline/src/README.md b/modules/ml-pipeline/src/README.md new file mode 100644 index 0000000..377206f --- /dev/null +++ b/modules/ml-pipeline/src/README.md @@ -0,0 +1,3 @@ +# The generic reproducible ML-pipeline + +Pipeline required to build a model to produce an output diff --git a/modules/ml-pipeline/src/pipeline/src/.dvc/.gitignore b/modules/ml-pipeline/src/pipeline/.dvc/.gitignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/.dvc/.gitignore rename to modules/ml-pipeline/src/pipeline/.dvc/.gitignore diff --git 
a/modules/ml-pipeline/src/pipeline/src/.dvc/config b/modules/ml-pipeline/src/pipeline/.dvc/config similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/.dvc/config rename to modules/ml-pipeline/src/pipeline/.dvc/config diff --git a/modules/ml-pipeline/src/pipeline/src/.dvcignore b/modules/ml-pipeline/src/pipeline/.dvcignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/.dvcignore rename to modules/ml-pipeline/src/pipeline/.dvcignore diff --git a/modules/ml-pipeline/src/pipeline/README.md b/modules/ml-pipeline/src/pipeline/README.md index faef685..d47f864 100644 --- a/modules/ml-pipeline/src/pipeline/README.md +++ b/modules/ml-pipeline/src/pipeline/README.md @@ -1,3 +1,39 @@ -# Pipeline 1 +# Training -Pipeline required to build a model to produce an output +This folder contains the code base for training experimentation. + +To understand the pipeline, run `dvc dag` + +There are 4 main steps: +- Preparing data + - This is loading data (locally or from s3) + - Splitting the data into train and validation + - Creating additional features (if needed) + - **Data is cached** + - This will be down to the dvc remote location +- Build model + - For the prepared data, we build a model using our configurations + - Model is saved (locally or s3) + - **Model and fit metrics are cached** + - This will be down to the dvc remote location +- Generate Predictions + - For the given model, we generate predictions on validation/test data + - **Predictions are cached** + - This will be down to the dvc remote location +- Generate Metrics + - For the given model, we generate metrics on validation data/test data + - **Metrics are cached** + - This will be down to the dvc remote location + +Workflow: +- Use `dvc metrics show` to view current metrics score +- Adjust parameters/ codebase +- When happy with changes, use `dvc exp run` to trigger an experiment +- Due to cache, only the needed stages are re-run +- Use `dvc metrics diff` to check the change in 
metrics +- Use `dvc exp show` to view all experiments + - NOTE: the last experiment will always be applied to the workspace! +- After running experiments, you can apply the best model to workspace using `dvc exp apply [EXPERIMENT_NAME]` + - This experiment will have the corresponding .dvc files for the hashed model and data +- Use version control as normal + - git add, git commit etc diff --git a/modules/ml-pipeline/src/pipeline/src/__init__.py b/modules/ml-pipeline/src/pipeline/__init__.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/__init__.py rename to modules/ml-pipeline/src/pipeline/__init__.py diff --git a/modules/ml-pipeline/src/pipeline/src/build_model.py b/modules/ml-pipeline/src/pipeline/build_model.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/build_model.py rename to modules/ml-pipeline/src/pipeline/build_model.py diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml rename to modules/ml-pipeline/src/pipeline/configs/build_model.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/client.yaml b/modules/ml-pipeline/src/pipeline/configs/client.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/client.yaml rename to modules/ml-pipeline/src/pipeline/configs/client.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/configs.py b/modules/ml-pipeline/src/pipeline/configs/configs.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/configs.py rename to modules/ml-pipeline/src/pipeline/configs/configs.py diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml 
rename to modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/feature_processor_logic.py rename to modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py diff --git a/modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml rename to modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml rename to modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml rename to modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml b/modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml rename to modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py b/modules/ml-pipeline/src/pipeline/core/DataClient.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/DataClient.py rename to modules/ml-pipeline/src/pipeline/core/DataClient.py diff --git 
a/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py b/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py rename to modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/Logger.py b/modules/ml-pipeline/src/pipeline/core/Logger.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/Logger.py rename to modules/ml-pipeline/src/pipeline/core/Logger.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/MLMetrics.py b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/MLMetrics.py rename to modules/ml-pipeline/src/pipeline/core/MLMetrics.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/MLModels.py rename to modules/ml-pipeline/src/pipeline/core/MLModels.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/__init__.py b/modules/ml-pipeline/src/pipeline/core/__init__.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/__init__.py rename to modules/ml-pipeline/src/pipeline/core/__init__.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceDataClient.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py rename to modules/ml-pipeline/src/pipeline/core/interface/InterfaceDataClient.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceFeatureProcessor.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceFeatureProcessor.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceFeatureProcessor.py rename 
to modules/ml-pipeline/src/pipeline/core/interface/InterfaceFeatureProcessor.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceMetrics.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceMetrics.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceMetrics.py rename to modules/ml-pipeline/src/pipeline/core/interface/InterfaceMetrics.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceModels.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceModels.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceModels.py rename to modules/ml-pipeline/src/pipeline/core/interface/InterfaceModels.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/__init__.py b/modules/ml-pipeline/src/pipeline/core/interface/__init__.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/__init__.py rename to modules/ml-pipeline/src/pipeline/core/interface/__init__.py diff --git a/modules/ml-pipeline/src/pipeline/src/data/.gitignore b/modules/ml-pipeline/src/pipeline/data/.gitignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/data/.gitignore rename to modules/ml-pipeline/src/pipeline/data/.gitignore diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/dvc.lock rename to modules/ml-pipeline/src/pipeline/dvc.lock diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/dvc.yaml rename to modules/ml-pipeline/src/pipeline/dvc.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py b/modules/ml-pipeline/src/pipeline/generate_metrics.py similarity index 100% rename from 
modules/ml-pipeline/src/pipeline/src/generate_metrics.py rename to modules/ml-pipeline/src/pipeline/generate_metrics.py diff --git a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/generate_predictions.py rename to modules/ml-pipeline/src/pipeline/generate_predictions.py diff --git a/modules/ml-pipeline/src/pipeline/src/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/metrics/.gitignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/metrics/.gitignore rename to modules/ml-pipeline/src/pipeline/metrics/.gitignore diff --git a/modules/ml-pipeline/src/pipeline/src/prepare_data.py b/modules/ml-pipeline/src/pipeline/prepare_data.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/prepare_data.py rename to modules/ml-pipeline/src/pipeline/prepare_data.py diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt rename to modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements.txt rename to modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt rename to 
modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt rename to modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt rename to modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/src/.DS_Store b/modules/ml-pipeline/src/pipeline/src/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/modules/ml-pipeline/src/pipeline/src/.DS_Store and /dev/null differ diff --git a/modules/ml-pipeline/src/pipeline/src/.vscode/settings.json b/modules/ml-pipeline/src/pipeline/src/.vscode/settings.json deleted file mode 100644 index 0967ef4..0000000 --- a/modules/ml-pipeline/src/pipeline/src/.vscode/settings.json +++ /dev/null @@ -1 +0,0 @@ -{} diff --git a/modules/ml-pipeline/src/pipeline/src/README.md b/modules/ml-pipeline/src/pipeline/src/README.md deleted file mode 100644 index ca5e98e..0000000 --- a/modules/ml-pipeline/src/pipeline/src/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Training - -This folder contains the code base for training experimentation. 
- -To understand the pipeline, run `dvc dag` - -There are 3 main steps: -- Preparing data - - This is loading data (locally or from s3) - - Splitting the data into train and validation - - Creating additional features (if needed) - - **Data is cached** - - This will be down to the dvc remote location -- Build model - - For the prepared data, we build a model using our configurations - - Model is saved (locally or s3) - - **Model is cached** - - This will be down to the dvc remote location -- Generate Metrics - - For the given model, we generate metrics on validation data/test data - - **Metrics are cached** - - This will be down to the dvc remote location - -Workflow: -- Use `dvc metrics show` to view current metrics score -- Adjust parameters/ codebase -- When happy with changes, use `dvc exp run` to trigger an experiment -- Due to cache, only need stages are re-run -- Use `dvc metrics diff` to check the change in metrics -- Use `dvc exp show` to view all experiments - - NOTE: the last experiment will always be applied to the workspace! -- After running experiments, you can apply the the best model to workspace using `dvc exp apply [EXPERIMENT_NAME]` - - This experiment will have the corresponding .dvc files for the hashed model and data -- Use version control as normal - - git add, git commit etc diff --git a/modules/ml-pipeline/src/pipeline/src/startup_cleanup.py b/modules/ml-pipeline/src/pipeline/startup_cleanup.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/startup_cleanup.py rename to modules/ml-pipeline/src/pipeline/startup_cleanup.py diff --git a/modules/ml-pipeline/src/pipeline2/README.md b/modules/ml-pipeline/src/pipeline2/README.md deleted file mode 100644 index dbcce65..0000000 --- a/modules/ml-pipeline/src/pipeline2/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Pipeline2 - -PLACEHOLDER PIPELINE IF NEEDED