diff --git a/modules/ml-pipeline/src/pipeline/.devcontainer/devcontainer.json b/modules/ml-pipeline/src/pipeline/.devcontainer/devcontainer.json new file mode 100644 index 0000000..222e7be --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/.devcontainer/devcontainer.json @@ -0,0 +1,29 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile +{ + "name": "Existing Dockerfile", + "build": { + // Sets the run context to one level up instead of the .devcontainer folder. + "context": "..", + // Update the 'dockerfile' property if you aren't using the standard 'Dockerfile' filename. + "dockerfile": "../Dockerfile" + }, + "features": { + "ghcr.io/devcontainers/features/git:1": {} + } + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Uncomment the next line to run commands after the container is created. + // "postCreateCommand": "cat /etc/os-release", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. 
+ // "remoteUser": "devcontainer" +} diff --git a/modules/ml-pipeline/src/pipeline/Dockerfile b/modules/ml-pipeline/src/pipeline/Dockerfile index b94ddc0..224be6b 100644 --- a/modules/ml-pipeline/src/pipeline/Dockerfile +++ b/modules/ml-pipeline/src/pipeline/Dockerfile @@ -1,4 +1,9 @@ # Dockerfile that grabs the current dvc hashed model -FROM python:3.9-slim +FROM python:3.10.12-slim + +COPY training/requirements/requirements-dev.txt requirements.txt + +RUN pip install --upgrade pip +RUN pip install -r requirements.txt + -RUN pip install -r experimentation/requirements/training.txt diff --git a/modules/ml-pipeline/src/pipeline/training/.dvc/config b/modules/ml-pipeline/src/pipeline/training/.dvc/config index e69de29..03ccfbc 100644 --- a/modules/ml-pipeline/src/pipeline/training/.dvc/config +++ b/modules/ml-pipeline/src/pipeline/training/.dvc/config @@ -0,0 +1,2 @@ +['remote "myremote"'] + url = /tmp/dvcstore diff --git a/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml index 9a0c3bd..de82af2 100644 --- a/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/training/configs/prepare_data.yaml @@ -1,6 +1,6 @@ dataclient_type: minio data_location: s3://dev_bucket -train_proportion: 0.8 +train_proportion: 0.75 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/training/dvc.lock b/modules/ml-pipeline/src/pipeline/training/dvc.lock index 01ff355..4f0e5d4 100644 --- a/modules/ml-pipeline/src/pipeline/training/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/training/dvc.lock @@ -11,24 +11,24 @@ stages: configs/prepare_data.yaml: output_test_filepath: ./data/prepared_data/test.parquet output_train_filepath: ./data/prepared_data/train.parquet - train_proportion: 0.8 + train_proportion: 0.75 outs: - path: data/prepared_data/ 
hash: md5 - md5: 9afb06007a6da3cef1619e937e9c413e.dir - size: 36344 + md5: 5cededc181e2b86c8ff63498a3e79462.dir + size: 36481 nfiles: 2 build_model: cmd: python build_model.py deps: - path: build_model.py hash: md5 - md5: 43ff6a4781efacff4234fe261022a5dd + md5: 152d52b7754b4c6f96f3481dc26562fc size: 3576 - path: data/prepared_data hash: md5 - md5: 9afb06007a6da3cef1619e937e9c413e.dir - size: 36344 + md5: 5cededc181e2b86c8ff63498a3e79462.dir + size: 36481 nfiles: 2 params: configs/build_model.yaml: @@ -41,7 +41,7 @@ stages: outs: - path: data/model/ hash: md5 - md5: e0f58c7c4e12fa92b29d973dd5f3f565.dir + md5: e4904765c79b0139ccb80d56ef044383.dir size: 1096 nfiles: 1 generate_predictions: @@ -49,18 +49,18 @@ stages: deps: - path: data/model hash: md5 - md5: e0f58c7c4e12fa92b29d973dd5f3f565.dir + md5: e4904765c79b0139ccb80d56ef044383.dir size: 1096 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 9afb06007a6da3cef1619e937e9c413e.dir - size: 36344 + md5: 5cededc181e2b86c8ff63498a3e79462.dir + size: 36481 nfiles: 2 - path: generate_predictions.py hash: md5 - md5: 209fe6efbebfd3d7aa1a1bb27885d3c1 - size: 3114 + md5: b3250f5b597fe33bf57e8bc225606be7 + size: 3268 params: configs/generate_predictions.yaml: predictions_output_filepath: ./data/predictions/predictions.parquet @@ -68,26 +68,26 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 28ad87e9a47ea8815a0b8ba5808e750c.dir - size: 2531 + md5: d78af354c4ecb0b0296ac093e515a01b.dir + size: 2734 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: 28ad87e9a47ea8815a0b8ba5808e750c.dir - size: 2531 + md5: d78af354c4ecb0b0296ac093e515a01b.dir + size: 2734 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 9afb06007a6da3cef1619e937e9c413e.dir - size: 36344 + md5: 5cededc181e2b86c8ff63498a3e79462.dir + size: 36481 nfiles: 2 - path: generate_metrics.py hash: md5 - md5: 4c1379bf37f5e5ad5843eb5b5a22ebc5 - size: 3407 + md5: 8c78578a8c45edf4b93a85c42c2b2192 + 
size: 3561 params: configs/generate_metrics.yaml: metrics_output_filepath: ./metrics/metrics.json @@ -95,5 +95,5 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 82e90d3f00bfd7c5a3fb83c2c14f4cff + md5: a32cd5e65ca89db0920297a7c6ca39bf size: 182