diff --git a/.github/workflows/MLMonitoringDevPipeline.yml b/.github/workflows/MLPipelineDevPipeline.yml similarity index 86% rename from .github/workflows/MLMonitoringDevPipeline.yml rename to .github/workflows/MLPipelineDevPipeline.yml index db46d5f..dacfcca 100644 --- a/.github/workflows/MLMonitoringDevPipeline.yml +++ b/.github/workflows/MLPipelineDevPipeline.yml @@ -16,19 +16,19 @@ jobs: - name: Install packages to retrieve artifacts run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc pull -r dev - name: Build Prediction docker image (TODO - NEED LAMBDA IMAGE, need to add version from gto registry) run: | - cd modules/ml-pipeline/src/pipeline/ + cd modules/ml-pipeline/src/ REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') docker build . 
--file Prediction.Dockerfile --tag ${REGISTER_MODEL_NAME} diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index 657d995..14f2f02 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -26,7 +26,7 @@ jobs: - name: Install packages to register model run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Register Model run: | @@ -64,7 +64,7 @@ jobs: - name: Install packages to register model run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Register Model run: | @@ -102,7 +102,7 @@ jobs: - name: Install packages to register model run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Register Model run: | @@ -138,14 +138,14 @@ jobs: - name: Install packages to retrieve artifacts run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc pull -r experiments - name: Push artifacts to Dev @@ -153,7 +153,7 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ 
secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc push -r dev Register-New-Model-Dev: @@ -173,7 +173,7 @@ jobs: - name: Install packages to register model run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Register Model run: | @@ -211,43 +211,3 @@ jobs: git add . git commit -m "Update Registry" git push origin master - - - Register-Prediction-Image-Dev: - needs: [Promote-Artefacts-To-Dev, Register-New-Model-Dev] - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Install packages to retrieve artifacts - run: | - pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt - - - name: Retrieve artifacts (dvc.lock) - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | - cd modules/ml-pipeline/src/pipeline/src - dvc pull -r dev - - - name: Build Prediction docker image (TODO - NEED LAMBDA IMAGE, need to add version from gto registry) - run: | - cd modules/ml-pipeline/src/pipeline/ - REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') - docker build . 
--file Prediction.Dockerfile --tag ${REGISTER_MODEL_NAME} - - - name: ECR Login - Dev - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | - echo "LOGIN TO ECR" - - - name: Push Prediction image to ECR - Dev - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | - echo "PUSH TO ECR" diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index e1ebd5d..3d5b24e 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -44,19 +44,19 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc pull -r experiments - name: Build Prediction docker Image run: | - cd modules/ml-pipeline/src/pipeline/ + cd modules/ml-pipeline/src/ docker build . 
--file Prediction.Dockerfile --tag prediction_test - name: Run Prediction docker container @@ -72,14 +72,14 @@ jobs: - name: Install packages to retrieve artifacts run: | pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + pip install -r modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline dvc pull -r experiments - uses: actions/setup-python@v4 @@ -90,7 +90,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - cd modules/ml-pipeline/src/pipeline/src + cd modules/ml-pipeline/src/pipeline echo "## Model metrics" > report.md # Compare metrics to master diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 34a9b78..d59b9e8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,3 +10,10 @@ repos: rev: 22.10.0 hooks: - id: black +- repo: local + hooks: + - id: dvc-push-experiment + name: DVC - Push to experiment to remote location (experiments) + entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"' + language: system + verbose: true diff --git a/README.md b/README.md index 2bf78c8..35242a0 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,21 @@ Creating a ML-toolkit that can be reused: - ML pipeline: - - A dummy pipeline that has data version control, experiment + - A generic pipeline that has data version control, experiment tracking and a model registry - ML monitoring: - A bolt-on service that can implement model monitoring +There are multiple protected branches which adapt the generic pipeline to produce different models: +- sap_change-** +- heat_change-** +-
carbon_change-** + +These branches will differ by the configuration files that define the data used and the outputs of the ML-pipeline +- There can be different additional logic for each branch but the pipeline will be the same. + +# Deployment + +TBD + diff --git a/modules/ml-pipeline/.pre-commit-config.yaml b/modules/ml-pipeline/.pre-commit-config.yaml index 79ed459..a80ad6e 100644 --- a/modules/ml-pipeline/.pre-commit-config.yaml +++ b/modules/ml-pipeline/.pre-commit-config.yaml @@ -20,6 +20,6 @@ repos: hooks: - id: dvc-push-experiment name: DVC - Push to experiment to remote location (experiments) - entry: bash -c 'cd modules/ml-pipeline/src/pipeline/src && dvc push -r experiments || echo "Up to date!"' + entry: bash -c 'cd modules/ml-pipeline/src/pipeline && dvc push -r experiments || echo "Up to date!"' language: system verbose: true diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index d4d6fb7..a46b68d 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -11,7 +11,7 @@ dev-pyenv: pyenv install ${PYTHON_VERSION} || echo "Python version already installed" pyenv global ${PYTHON_VERSION} python3 -m venv .dev_env_pipeline - . .dev_env_pipeline/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/src/requirements/training/requirements-dev.txt && pip install -r src/pipeline/src/requirements/version_control/requirements.txt && pre-commit install + . 
.dev_env_pipeline/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/requirements/training/requirements-dev.txt && pip install -r src/pipeline/requirements/version_control/requirements.txt && pre-commit install echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "source .dev_env_pipeline/bin/activate" diff --git a/modules/ml-pipeline/README.MD b/modules/ml-pipeline/README.MD index 7dae279..2711569 100644 --- a/modules/ml-pipeline/README.MD +++ b/modules/ml-pipeline/README.MD @@ -1,16 +1,30 @@ # ML-pipeline -This is a dummy ML-pipeline, consisting of: +This is a generic ML-pipeline, consisting of: - dvc tracking for version control (data and models) - gto for model registry - docs, created via sphinx (in pre-commit hooks) - tests for unit, integration and end to end testing Within `src` folder, the structure is as follows: -- multiple pipelines can be defined - - i.e. for a product, we might require multuple pipelines do deliver a result - - i.e. multiple models -- these models can be all tracked within the same gto model registry +- `pipeline` folder, which contains all the codebase for the generic pipeline + - The pipeline can track multiple models through dvc and gto model registry +- Deployment files: + - Prediction.Dockerfile - code to create the prediction deployment image + - Training.Dockerfile - code to create the training image (i.e. for remote training on EC2/ Fargate) +- Docker development environment: + - If you wish to develop within a Docker container.
+ +# How to develop using this pipeline: + +Run `make init`, which will: +- Download pyenv (Python version management) +- Download Python 3.X.X as defined in the `make` file - current 3.10.12 +- Create a virtual environment with this version of python +- Install packages in the training and version control directories in the pipeline folder (dev version if applicable) +- Install pre-commit to enable pre-commit hooks + +To use the environment, run `source .dev_env_pipeline/bin/activate`. To enable the virtual envrionemnt created in vscode: - Open settings diff --git a/modules/ml-pipeline/src/pipeline/.devcontainer/devcontainer.json b/modules/ml-pipeline/src/.devcontainer/devcontainer.json similarity index 100% rename from modules/ml-pipeline/src/pipeline/.devcontainer/devcontainer.json rename to modules/ml-pipeline/src/.devcontainer/devcontainer.json diff --git a/modules/ml-pipeline/src/pipeline/Development.Dockerfile b/modules/ml-pipeline/src/Development.Dockerfile similarity index 100% rename from modules/ml-pipeline/src/pipeline/Development.Dockerfile rename to modules/ml-pipeline/src/Development.Dockerfile diff --git a/modules/ml-pipeline/src/pipeline/Prediction.Dockerfile b/modules/ml-pipeline/src/Prediction.Dockerfile similarity index 73% rename from modules/ml-pipeline/src/pipeline/Prediction.Dockerfile rename to modules/ml-pipeline/src/Prediction.Dockerfile index 096b51b..09b775b 100644 --- a/modules/ml-pipeline/src/pipeline/Prediction.Dockerfile +++ b/modules/ml-pipeline/src/Prediction.Dockerfile @@ -1,14 +1,14 @@ # Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow) FROM python:3.10.12-slim -COPY src/requirements/predictions/requirements.txt requirements.txt +COPY pipeline/requirements/predictions/requirements.txt requirements.txt RUN pip install --upgrade pip RUN pip install -r requirements.txt # Assuming in the CI/CD step, there will be a dvc pull step to get data and model, so will just need to run a single 
script -COPY src/ /home/src/ +COPY pipeline/ /home/pipeline/ -WORKDIR /home/src/ +WORKDIR /home/pipeline/ CMD [ "python", "generate_predictions.py"] diff --git a/modules/ml-pipeline/src/README.md b/modules/ml-pipeline/src/README.md new file mode 100644 index 0000000..377206f --- /dev/null +++ b/modules/ml-pipeline/src/README.md @@ -0,0 +1,3 @@ +# The generic reproducible ML-pipeline + +Pipeline required to build a model to produce an output diff --git a/modules/ml-pipeline/src/pipeline/src/.dvc/.gitignore b/modules/ml-pipeline/src/pipeline/.dvc/.gitignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/.dvc/.gitignore rename to modules/ml-pipeline/src/pipeline/.dvc/.gitignore diff --git a/modules/ml-pipeline/src/pipeline/src/.dvc/config b/modules/ml-pipeline/src/pipeline/.dvc/config similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/.dvc/config rename to modules/ml-pipeline/src/pipeline/.dvc/config diff --git a/modules/ml-pipeline/src/pipeline/src/.dvcignore b/modules/ml-pipeline/src/pipeline/.dvcignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/.dvcignore rename to modules/ml-pipeline/src/pipeline/.dvcignore diff --git a/modules/ml-pipeline/src/pipeline/README.md b/modules/ml-pipeline/src/pipeline/README.md index faef685..d47f864 100644 --- a/modules/ml-pipeline/src/pipeline/README.md +++ b/modules/ml-pipeline/src/pipeline/README.md @@ -1,3 +1,39 @@ -# Pipeline 1 +# Training -Pipeline required to build a model to produce an output +This folder contains the code base for training experimentation. 
+ +To understand the pipeline, run `dvc dag` + +There are 4 main steps: +- Preparing data + - This is loading data (locally or from s3) + - Splitting the data into train and validation + - Creating additional features (if needed) + - **Data is cached** + - This will be down to the dvc remote location +- Build model + - For the prepared data, we build a model using our configurations + - Model is saved (locally or s3) + - **Model and fit metrics are cached** + - This will be down to the dvc remote location +- Generate Predictions + - For the given model, we generate predictions on validation/test data + - **Predictions are cached** + - This will be down to the dvc remote location +- Generate Metrics + - For the given model, we generate metrics on validation data/test data + - **Metrics are cached** + - This will be down to the dvc remote location + +Workflow: +- Use `dvc metrics show` to view current metrics score +- Adjust parameters/ codebase +- When happy with changes, use `dvc exp run` to trigger an experiment +- Due to cache, only the needed stages are re-run +- Use `dvc metrics diff` to check the change in metrics +- Use `dvc exp show` to view all experiments + - NOTE: the last experiment will always be applied to the workspace!
+- After running experiments, you can apply the best model to workspace using `dvc exp apply [EXPERIMENT_NAME]` + - This experiment will have the corresponding .dvc files for the hashed model and data +- Use version control as normal + - git add, git commit etc diff --git a/modules/ml-pipeline/src/pipeline/src/__init__.py b/modules/ml-pipeline/src/pipeline/__init__.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/__init__.py rename to modules/ml-pipeline/src/pipeline/__init__.py diff --git a/modules/ml-pipeline/src/pipeline/src/build_model.py b/modules/ml-pipeline/src/pipeline/build_model.py similarity index 95% rename from modules/ml-pipeline/src/pipeline/src/build_model.py rename to modules/ml-pipeline/src/pipeline/build_model.py index 5dfc71a..a07e9cf 100644 --- a/modules/ml-pipeline/src/pipeline/src/build_model.py +++ b/modules/ml-pipeline/src/pipeline/build_model.py @@ -53,12 +53,12 @@ def build_model( if train_data is None: if train_filepath is None: raise ValueError(f"Need {train_filepath} if no data supplied") - train_data = dataclient.load_data(location=train_filepath) + train_data = dataclient.load_data(location=train_filepath, load_config=None) if test_data is None: if test_filepath is None: raise ValueError(f"Need {test_filepath} if no data supplied") - test_data = dataclient.load_data(location=test_filepath) + test_data = dataclient.load_data(location=test_filepath, load_config=None) logger.info("----------------------") logger.info("--- Training model ---") @@ -95,7 +95,9 @@ def build_model( logger.info("--- Saving fit metrics ---") logger.info("--------------------------") - dataclient.save_data(obj=metrics_output, location=fit_metrics_filepath) + dataclient.save_data( + obj=metrics_output, location=fit_metrics_filepath, save_config=None + ) if __name__ == "__main__": diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml similarity index 100%
rename from modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml rename to modules/ml-pipeline/src/pipeline/configs/build_model.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/client.yaml b/modules/ml-pipeline/src/pipeline/configs/client.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/client.yaml rename to modules/ml-pipeline/src/pipeline/configs/client.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/configs.py b/modules/ml-pipeline/src/pipeline/configs/configs.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/configs.py rename to modules/ml-pipeline/src/pipeline/configs/configs.py diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml rename to modules/ml-pipeline/src/pipeline/configs/feature_processor.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor_logic.py b/modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/feature_processor_logic.py rename to modules/ml-pipeline/src/pipeline/configs/feature_processor_logic.py diff --git a/modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/generate_metrics.yaml rename to modules/ml-pipeline/src/pipeline/configs/generate_metrics.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml b/modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/generate_predictions.yaml rename to 
modules/ml-pipeline/src/pipeline/configs/generate_predictions.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml rename to modules/ml-pipeline/src/pipeline/configs/prepare_data.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml b/modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/configs/startup_cleanup.yaml rename to modules/ml-pipeline/src/pipeline/configs/startup_cleanup.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py b/modules/ml-pipeline/src/pipeline/core/DataClient.py similarity index 98% rename from modules/ml-pipeline/src/pipeline/src/core/DataClient.py rename to modules/ml-pipeline/src/pipeline/core/DataClient.py index c8c9f2c..28ffff7 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/DataClient.py +++ b/modules/ml-pipeline/src/pipeline/core/DataClient.py @@ -8,7 +8,7 @@ import boto3 import pandas as pd from pathlib import Path from io import BytesIO -from typing import List, Union +from typing import List, Union, Any from core.interface.InterfaceDataClient import DataClient from core.Logger import logger @@ -105,7 +105,7 @@ class AWSS3Client: def save_data( self, - obj: object, + obj: Any, location: str, save_config: Union[dict, None] = None, ) -> None: @@ -134,7 +134,7 @@ class AWSS3Client: obj=obj, location=location, save_config=save_config ) - def _save_parquet(self, obj: object, location: str, save_config: dict): + def _save_parquet(self, obj: pd.DataFrame, location: str, save_config: dict): """ Save object as parquet """ diff --git a/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py b/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py similarity index 99% rename from 
modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py rename to modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py index 7f14e03..03ec4a9 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/FeatureProcessor.py +++ b/modules/ml-pipeline/src/pipeline/core/FeatureProcessor.py @@ -9,7 +9,6 @@ Create additional features from the dataset import pandas as pd from typing import List, Callable, Union from core.interface.InterfaceFeatureProcessor import FeatureProcessor -from core.Logger import logger def feature_processor_factory(feature_processor_type: str) -> FeatureProcessor: diff --git a/modules/ml-pipeline/src/pipeline/src/core/Logger.py b/modules/ml-pipeline/src/pipeline/core/Logger.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/Logger.py rename to modules/ml-pipeline/src/pipeline/core/Logger.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/MLMetrics.py b/modules/ml-pipeline/src/pipeline/core/MLMetrics.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/MLMetrics.py rename to modules/ml-pipeline/src/pipeline/core/MLMetrics.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/MLModels.py b/modules/ml-pipeline/src/pipeline/core/MLModels.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/MLModels.py rename to modules/ml-pipeline/src/pipeline/core/MLModels.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/__init__.py b/modules/ml-pipeline/src/pipeline/core/__init__.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/__init__.py rename to modules/ml-pipeline/src/pipeline/core/__init__.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceDataClient.py similarity index 81% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py rename to 
modules/ml-pipeline/src/pipeline/core/interface/InterfaceDataClient.py index d572c2b..5e51a99 100644 --- a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceDataClient.py +++ b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceDataClient.py @@ -3,8 +3,7 @@ Interface for all DataClient i.e. s3, database, local etc """ import pandas as pd -from io import BytesIO -from typing import Protocol, Union +from typing import Protocol, Union, Any class DataClient(Protocol): @@ -22,9 +21,10 @@ class DataClient(Protocol): """ Generic to load data """ + ... def save_data( - self, obj: object, location: str, save_config: Union[dict, None] + self, obj: Any, location: str, save_config: Union[dict, None] ) -> None: """ Generic to save data diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceFeatureProcessor.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceFeatureProcessor.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceFeatureProcessor.py rename to modules/ml-pipeline/src/pipeline/core/interface/InterfaceFeatureProcessor.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceMetrics.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceMetrics.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceMetrics.py rename to modules/ml-pipeline/src/pipeline/core/interface/InterfaceMetrics.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceModels.py b/modules/ml-pipeline/src/pipeline/core/interface/InterfaceModels.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/core/interface/InterfaceModels.py rename to modules/ml-pipeline/src/pipeline/core/interface/InterfaceModels.py diff --git a/modules/ml-pipeline/src/pipeline/src/core/interface/__init__.py b/modules/ml-pipeline/src/pipeline/core/interface/__init__.py similarity index 100% rename from 
modules/ml-pipeline/src/pipeline/src/core/interface/__init__.py rename to modules/ml-pipeline/src/pipeline/core/interface/__init__.py diff --git a/modules/ml-pipeline/src/pipeline/src/data/.gitignore b/modules/ml-pipeline/src/pipeline/data/.gitignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/data/.gitignore rename to modules/ml-pipeline/src/pipeline/data/.gitignore diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/dvc.lock rename to modules/ml-pipeline/src/pipeline/dvc.lock diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/dvc.yaml rename to modules/ml-pipeline/src/pipeline/dvc.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py b/modules/ml-pipeline/src/pipeline/generate_metrics.py similarity index 93% rename from modules/ml-pipeline/src/pipeline/src/generate_metrics.py rename to modules/ml-pipeline/src/pipeline/generate_metrics.py index 7efeda9..58244bc 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py +++ b/modules/ml-pipeline/src/pipeline/generate_metrics.py @@ -59,14 +59,16 @@ def generate_metrics( logger.info("-------------------------") test_data = input_dataclient.load_data( - location=test_data_filepath, + location=test_data_filepath, load_config=None ) logger.info("---------------------------") logger.info("--- Loading predictions ---") logger.info("---------------------------") - predictions = input_dataclient.load_data(location=predictions_output_filepath) + predictions = input_dataclient.load_data( + location=predictions_output_filepath, load_config=None + ) logger.info("--------------------------") logger.info("--- Generating metrics ---") @@ -81,7 +83,9 @@ def generate_metrics( logger.info("--- Saving metrics ---") 
logger.info("----------------------") - output_dataclient.save_data(obj=metrics_output, location=metrics_output_filepath) + output_dataclient.save_data( + obj=metrics_output, location=metrics_output_filepath, save_config=None + ) if __name__ == "__main__": diff --git a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py b/modules/ml-pipeline/src/pipeline/generate_predictions.py similarity index 97% rename from modules/ml-pipeline/src/pipeline/src/generate_predictions.py rename to modules/ml-pipeline/src/pipeline/generate_predictions.py index f80ec18..490d7e9 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/generate_predictions.py @@ -52,7 +52,9 @@ def generate_predictions( logger.info("--- Loading test data ---") logger.info("-------------------------") - test_data = input_dataclient.load_data(location=test_data_filepath) + test_data = input_dataclient.load_data( + location=test_data_filepath, load_config=None + ) logger.info("---------------------") logger.info("--- Loading model ---") @@ -78,7 +80,7 @@ def generate_predictions( predictions_df.columns = [predictions_column_name] output_dataclient.save_data( - obj=predictions_df, location=predictions_output_filepath + obj=predictions_df, location=predictions_output_filepath, save_config=None ) diff --git a/modules/ml-pipeline/src/pipeline/src/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/metrics/.gitignore similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/metrics/.gitignore rename to modules/ml-pipeline/src/pipeline/metrics/.gitignore diff --git a/modules/ml-pipeline/src/pipeline/src/prepare_data.py b/modules/ml-pipeline/src/pipeline/prepare_data.py similarity index 95% rename from modules/ml-pipeline/src/pipeline/src/prepare_data.py rename to modules/ml-pipeline/src/pipeline/prepare_data.py index 851be48..8caa101 100644 --- a/modules/ml-pipeline/src/pipeline/src/prepare_data.py +++ 
b/modules/ml-pipeline/src/pipeline/prepare_data.py @@ -79,10 +79,14 @@ def prepare_data( logger.info("--- Outputting data ---") logger.info("-----------------------") - output_dataclient.save_data(obj=train, location=output_train_filepath) + output_dataclient.save_data( + obj=train, location=output_train_filepath, save_config=None + ) if test is not None: - output_dataclient.save_data(obj=test, location=output_test_filepath) + output_dataclient.save_data( + obj=test, location=output_test_filepath, save_config=None + ) return train, test diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt rename to modules/ml-pipeline/src/pipeline/requirements/predictions/requirements-dev.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements.txt rename to modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt rename to modules/ml-pipeline/src/pipeline/requirements/training/requirements-dev.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt rename to 
modules/ml-pipeline/src/pipeline/requirements/training/requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt rename to modules/ml-pipeline/src/pipeline/requirements/version_control/requirements.txt diff --git a/modules/ml-pipeline/src/pipeline/src/.DS_Store b/modules/ml-pipeline/src/pipeline/src/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/modules/ml-pipeline/src/pipeline/src/.DS_Store and /dev/null differ diff --git a/modules/ml-pipeline/src/pipeline/src/.vscode/settings.json b/modules/ml-pipeline/src/pipeline/src/.vscode/settings.json deleted file mode 100644 index 0967ef4..0000000 --- a/modules/ml-pipeline/src/pipeline/src/.vscode/settings.json +++ /dev/null @@ -1 +0,0 @@ -{} diff --git a/modules/ml-pipeline/src/pipeline/src/README.md b/modules/ml-pipeline/src/pipeline/src/README.md deleted file mode 100644 index ca5e98e..0000000 --- a/modules/ml-pipeline/src/pipeline/src/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Training - -This folder contains the code base for training experimentation. 
- -To understand the pipeline, run `dvc dag` - -There are 3 main steps: -- Preparing data - - This is loading data (locally or from s3) - - Splitting the data into train and validation - - Creating additional features (if needed) - - **Data is cached** - - This will be down to the dvc remote location -- Build model - - For the prepared data, we build a model using our configurations - - Model is saved (locally or s3) - - **Model is cached** - - This will be down to the dvc remote location -- Generate Metrics - - For the given model, we generate metrics on validation data/test data - - **Metrics are cached** - - This will be down to the dvc remote location - -Workflow: -- Use `dvc metrics show` to view current metrics score -- Adjust parameters/ codebase -- When happy with changes, use `dvc exp run` to trigger an experiment -- Due to cache, only need stages are re-run -- Use `dvc metrics diff` to check the change in metrics -- Use `dvc exp show` to view all experiments - - NOTE: the last experiment will always be applied to the workspace! -- After running experiments, you can apply the the best model to workspace using `dvc exp apply [EXPERIMENT_NAME]` - - This experiment will have the corresponding .dvc files for the hashed model and data -- Use version control as normal - - git add, git commit etc diff --git a/modules/ml-pipeline/src/pipeline/src/startup_cleanup.py b/modules/ml-pipeline/src/pipeline/startup_cleanup.py similarity index 100% rename from modules/ml-pipeline/src/pipeline/src/startup_cleanup.py rename to modules/ml-pipeline/src/pipeline/startup_cleanup.py diff --git a/modules/ml-pipeline/src/pipeline2/README.md b/modules/ml-pipeline/src/pipeline2/README.md deleted file mode 100644 index dbcce65..0000000 --- a/modules/ml-pipeline/src/pipeline2/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Pipeline2 - -PLACEHOLDER PIPELINE IF NEEDED