From 0b69afa17133b919db66ff44a02f579981dfdec0 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 10 Sep 2023 19:44:24 +0100 Subject: [PATCH 1/5] add a bad model --- .../pipeline/src/configs/prepare_data.yaml | 2 +- modules/ml-pipeline/src/pipeline/src/dvc.lock | 34 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml index 9a0c3bd..27bedaa 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml @@ -1,6 +1,6 @@ dataclient_type: minio data_location: s3://dev_bucket -train_proportion: 0.8 +train_proportion: 0.85 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/src/dvc.lock index ed7c57c..220ef2a 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock @@ -11,12 +11,12 @@ stages: configs/prepare_data.yaml: output_test_filepath: ./data/prepared_data/test.parquet output_train_filepath: ./data/prepared_data/train.parquet - train_proportion: 0.8 + train_proportion: 0.85 outs: - path: data/prepared_data/ hash: md5 - md5: f0d462fe6b1a856a827409a745539285.dir - size: 36169 + md5: e933c262266b270aa53bf63dc8bf8a64.dir + size: 35580 nfiles: 2 build_model: cmd: python build_model.py @@ -27,8 +27,8 @@ stages: size: 3576 - path: data/prepared_data hash: md5 - md5: f0d462fe6b1a856a827409a745539285.dir - size: 36169 + md5: e933c262266b270aa53bf63dc8bf8a64.dir + size: 35580 nfiles: 2 params: configs/build_model.yaml: @@ -41,7 +41,7 @@ stages: outs: - path: data/model/ hash: md5 - md5: fb7ae4137b445dc91e840b794d72e940.dir + md5: ee01cc8135569d30b42ef4b7d181548f.dir size: 1096 nfiles: 1 generate_predictions: @@ -49,13 +49,13 @@ stages: deps: - path: data/model hash: md5 - md5: fb7ae4137b445dc91e840b794d72e940.dir + md5: ee01cc8135569d30b42ef4b7d181548f.dir size: 1096 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f0d462fe6b1a856a827409a745539285.dir - size: 36169 + md5: e933c262266b270aa53bf63dc8bf8a64.dir + size: 35580 nfiles: 2 - path: generate_predictions.py hash: md5 @@ -69,21 +69,21 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 4d5854903b25bdae15d99c934ebcfb99.dir - size: 2531 + md5: b847bb0e13ae3af76c0d0b66d2bc0bfb.dir + size: 2334 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: 4d5854903b25bdae15d99c934ebcfb99.dir - size: 2531 + md5: b847bb0e13ae3af76c0d0b66d2bc0bfb.dir + size: 2334 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f0d462fe6b1a856a827409a745539285.dir - size: 36169 + md5: e933c262266b270aa53bf63dc8bf8a64.dir + size: 35580 nfiles: 2 - path: generate_metrics.py hash: md5 @@ -96,5 +96,5 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 3c9306e992b07491ff7e642949d6bc47 - size: 182 + md5: b3a6747855e3da77946292edddb0bf9e + size: 181 From 8f166beffedeb3fcc81222d94523b3a14d0a83be Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 10 Sep 2023 19:52:02 +0100 Subject: [PATCH 2/5] test getting master --- .github/workflows/MLPipelinePullRequest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 588e239..2c45fdb 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -76,7 +76,7 @@ jobs: echo "## Model metrics" > report.md # Compare metrics to master - git fetch --depth=1 origin master:master + git fetch origin master dvc metrics diff --md master >> report.md cml comment create report.md From 29fde8359fff77b646bdfb5b3d397e177c1c74a2 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 10 Sep 2023 19:54:18 +0100 Subject: [PATCH 3/5] test getting master --- .github/workflows/MLPipelinePullRequest.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 2c45fdb..314870b 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -50,9 +50,6 @@ jobs: steps: - uses: actions/checkout@v3 - name: Install packages to retrieve artifacts - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | pip install --upgrade pip pip install dvc dvc_s3 @@ -70,6 +67,8 @@ jobs: - uses: iterative/setup-cml@v1 - name: Generate report env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | cd modules/ml-pipeline/src/pipeline/src From 4fc4406df9dcc90e3f587ce00d65e4d4a44be8f5 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 10 Sep 2023 19:56:52 +0100 Subject: [PATCH 4/5] test getting master - final --- .github/workflows/MLPipelinePullRequest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 314870b..d6af551 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -75,7 +75,7 @@ jobs: echo "## Model metrics" > report.md # Compare metrics to master - git fetch origin master + git fetch --depth=1 origin master:master dvc metrics diff --md master >> report.md cml comment create report.md From b0f12db16d48972870574fbeadf4ddc395e58941 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 10 Sep 2023 20:04:42 +0100 Subject: [PATCH 5/5] split out version control system requirements --- .github/workflows/MLPipelinePullRequest.yml | 6 ++-- modules/ml-pipeline/Makefile | 2 +- .../pipeline/src/configs/prepare_data.yaml | 2 +- modules/ml-pipeline/src/pipeline/src/dvc.lock | 34 +++++++++---------- .../predictions/requirements-dev.txt | 3 -- .../training/requirements-dev.txt | 3 -- .../requirements/training/requirements.txt | 2 -- .../version_control/requirements.txt | 3 ++ 8 files changed, 24 insertions(+), 31 deletions(-) create mode 100644 modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index d6af551..186c382 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -23,8 +23,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | pip install --upgrade pip - pip install dvc dvc_s3 - # pip install -r src/requirements/predictions/requirements-dev.txt + pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: @@ -52,8 +51,7 @@ jobs: - name: Install packages to retrieve artifacts run: | pip install --upgrade pip - pip install dvc dvc_s3 - # pip install -r src/requirements/predictions/requirements-dev.txt + pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt - name: Retrieve artifacts (dvc.lock) env: diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index c9b0ddf..45a16f5 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -11,7 +11,7 @@ dev-pyenv: pyenv install ${PYTHON_VERSION} || echo "Python version already installed" pyenv global ${PYTHON_VERSION} python3 -m venv .dev_env - . .dev_env/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/src/requirements/training/requirements-dev.txt && pre-commit install + . .dev_env/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/src/requirements/training/requirements-dev.txt && pip install -r src/pipeline/src/requirements/version_control/requirements.txt && pre-commit install echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" echo "source .dev_env/bin/activate" diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml index 27bedaa..9a0c3bd 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml @@ -1,6 +1,6 @@ dataclient_type: minio data_location: s3://dev_bucket -train_proportion: 0.85 +train_proportion: 0.8 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/src/dvc.lock index 220ef2a..ed7c57c 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock @@ -11,12 +11,12 @@ stages: configs/prepare_data.yaml: output_test_filepath: ./data/prepared_data/test.parquet output_train_filepath: ./data/prepared_data/train.parquet - train_proportion: 0.85 + train_proportion: 0.8 outs: - path: data/prepared_data/ hash: md5 - md5: e933c262266b270aa53bf63dc8bf8a64.dir - size: 35580 + md5: f0d462fe6b1a856a827409a745539285.dir + size: 36169 nfiles: 2 build_model: cmd: python build_model.py @@ -27,8 +27,8 @@ stages: size: 3576 - path: data/prepared_data hash: md5 - md5: e933c262266b270aa53bf63dc8bf8a64.dir - size: 35580 + md5: f0d462fe6b1a856a827409a745539285.dir + size: 36169 nfiles: 2 params: configs/build_model.yaml: @@ -41,7 +41,7 @@ stages: outs: - path: data/model/ hash: md5 - md5: ee01cc8135569d30b42ef4b7d181548f.dir + md5: fb7ae4137b445dc91e840b794d72e940.dir size: 1096 nfiles: 1 generate_predictions: @@ -49,13 +49,13 @@ stages: deps: - path: data/model hash: md5 - md5: ee01cc8135569d30b42ef4b7d181548f.dir + md5: fb7ae4137b445dc91e840b794d72e940.dir size: 1096 nfiles: 1 - path: data/prepared_data hash: md5 - md5: e933c262266b270aa53bf63dc8bf8a64.dir - size: 35580 + md5: f0d462fe6b1a856a827409a745539285.dir + size: 36169 nfiles: 2 - path: generate_predictions.py hash: md5 @@ -69,21 +69,21 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: b847bb0e13ae3af76c0d0b66d2bc0bfb.dir - size: 2334 + md5: 4d5854903b25bdae15d99c934ebcfb99.dir + size: 2531 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: b847bb0e13ae3af76c0d0b66d2bc0bfb.dir - size: 2334 + md5: 4d5854903b25bdae15d99c934ebcfb99.dir + size: 2531 nfiles: 1 - path: data/prepared_data hash: md5 - md5: e933c262266b270aa53bf63dc8bf8a64.dir - size: 35580 + md5: f0d462fe6b1a856a827409a745539285.dir + size: 36169 nfiles: 2 - path: generate_metrics.py hash: md5 @@ -96,5 +96,5 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: b3a6747855e3da77946292edddb0bf9e - size: 181 + md5: 3c9306e992b07491ff7e642949d6bc47 + size: 182 diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt index 28f6af4..5aac406 100644 --- a/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/src/requirements/predictions/requirements-dev.txt @@ -1,9 +1,6 @@ joblib==1.3.2 boto3==1.28.17 pandas==1.5.3 -dvc==3.18.0 -dvc-s3==2.23.0 -gto==1.0.4 scikit-learn==1.3.0 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt b/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt index 28f6af4..5aac406 100644 --- a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt +++ b/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements-dev.txt @@ -1,9 +1,6 @@ joblib==1.3.2 boto3==1.28.17 pandas==1.5.3 -dvc==3.18.0 -dvc-s3==2.23.0 -gto==1.0.4 scikit-learn==1.3.0 pyarrow==13.0.0 pre-commit==3.3.3 diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt b/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt index 8c146e8..196dfe7 100644 --- a/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt +++ b/modules/ml-pipeline/src/pipeline/src/requirements/training/requirements.txt @@ -1,5 +1,3 @@ boto3==1.28.41 pandas==1.5.3 -dvc==3.18.0 -gto==1.0.4 scikit-learn==1.3.0 diff --git a/modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt b/modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt new file mode 100644 index 0000000..8459d38 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt @@ -0,0 +1,3 @@ +dvc==3.18.0 +dvc-s3==2.23.0 +gto==1.0.4