From e04f6125e0db47ec8d37761e1d281f8f6bef2a8c Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sun, 2 Nov 2025 13:11:33 +0000 Subject: [PATCH] add single row dataset for testing --- .github/workflows/MLPipelinePullRequest.yml | 31 ++++++++- .../src/pipeline/1_prepare_data.py | 5 ++ .../src/pipeline/configs/build_model.yaml | 2 +- .../src/pipeline/configs/settings.yaml | 1 + modules/ml-pipeline/src/pipeline/dvc.lock | 68 +++++++++---------- .../src/pipeline/metrics/.gitignore | 4 +- 6 files changed, 72 insertions(+), 39 deletions(-) diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 94dc17e..30641cf 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -54,13 +54,40 @@ jobs: cd modules/ml-pipeline/src/pipeline dvc pull -r experiments + - name: Set timestamp + id: set_timestamp + run: | + echo "timestamp=$(date +%Y%m%d)" >> $GITHUB_ENV + echo "Generated timestamp: $timestamp" + + - name: Upload sample row dataset to S3 + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | + cd modules/ml-pipeline/src/pipeline/data/prepared_data/ + aws s3 cp sample_test.parquet s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}_sample_test.parquet + - name: Build Lambda docker Image run: | docker build . --file ./deployment/Dockerfile.prediction.lambda --tag lambda_test - - name: Run lambda docker container + - name: Remove uploaded sample row dataset from S3 + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | - docker run lambda_test + aws s3 rm s3://retrofit-data-dev/sap_change_model/sample_data_for_cicd/${timestamp}_sample_test.parquet + # - name: Run lambda docker container + # env: + # AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + # AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + # run: | + # docker run -p 9000:8080 \ + # -e AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} \ + # -e AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} \ + # -e RUNTIME_ENVIRONMENT=dev \ + # -e PREDICTIONS_BUCKET=retrofit-sap-predictions-dev lambda_test Verify-Model: diff --git a/modules/ml-pipeline/src/pipeline/1_prepare_data.py b/modules/ml-pipeline/src/pipeline/1_prepare_data.py index 75d784f..6b4ab84 100644 --- a/modules/ml-pipeline/src/pipeline/1_prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/1_prepare_data.py @@ -29,6 +29,7 @@ data_filepath = prepare_data_params["data_filepath"] train_proportion = prepare_data_params["train_proportion"] output_train_filepath = prepare_data_params["output_train_filepath"] output_test_filepath = prepare_data_params["output_test_filepath"] +sample_test_filepath = prepare_data_params["sample_test_filepath"] feature_processor_config = feature_process_params["feature_processor_config"] logger.info(f"--- Initiate DataClient ---") @@ -99,6 +100,10 @@ def prepare_data( logger.info("--- Outputting data ---") + output_dataclient.save_data( + obj=data.sample(1), location=sample_test_filepath, save_config=None + ) + output_dataclient.save_data( obj=train, location=output_train_filepath, save_config=None ) diff --git a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml index 38c0910..69349ba 100644 --- a/modules/ml-pipeline/src/pipeline/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/build_model.yaml @@ -14,7 +14,7 @@ default: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error #mean_absolute_error - time_limit: 1800 + time_limit: 180 presets: medium_quality excluded_model_types: ['RF', 'CAT', 'NN_TORCH', 'KNN', 'XT'] infer_limit: 0.0005 diff --git a/modules/ml-pipeline/src/pipeline/configs/settings.yaml b/modules/ml-pipeline/src/pipeline/configs/settings.yaml index 28d5cd9..a6b493e 100644 --- a/modules/ml-pipeline/src/pipeline/configs/settings.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/settings.yaml @@ -26,6 +26,7 @@ default: train_proportion: 0.9 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet + sample_test_filepath: ./data/prepared_data/sample_test.parquet feature_processor: feature_processor_type: dataframe diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index 5502f03..1a3143a 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -16,8 +16,8 @@ stages: deps: - path: 1_prepare_data.py hash: md5 - md5: 11a3b8bfdfe199ab7ecc39ccc5652649 - size: 4298 + md5: a5ce162e1c402c0f811a80ef78cf4dd5 + size: 4481 params: configs/settings.yaml: default.feature_processor.feature_processor_config.drop_columns: @@ -61,9 +61,9 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: 5c56787d9e6450e26a78c15700e104c7.dir - size: 45746089 - nfiles: 2 + md5: 02b2c25e488f75c4a676540c127b8930.dir + size: 45890160 + nfiles: 3 build_model: cmd: python 2_build_model.py deps: @@ -73,9 +73,9 @@ stages: size: 4820 - path: data/prepared_data hash: md5 - md5: 5c56787d9e6450e26a78c15700e104c7.dir - size: 45746089 - nfiles: 2 + md5: 02b2c25e488f75c4a676540c127b8930.dir + size: 45890160 + nfiles: 3 params: configs/build_model.yaml: default: @@ -91,7 +91,7 @@ stages: output_filepath: ./data/model/allmodels/ problem_type: regression eval_metric: mean_squared_error - time_limit: 1800 + time_limit: 180 presets: medium_quality excluded_model_types: - RF @@ -107,18 +107,18 @@ stages: outs: - path: data/fit_predictions/ hash: md5 - md5: 4fa77e3f129d2e6f9ef7222c44978c26.dir - size: 3474669 + md5: 7f9a534daf824434262bee89e2ee2cfd.dir + size: 3475064 nfiles: 1 - path: data/model/ hash: md5 - md5: e27b9216bc7455f8245d5b49f27b2707.dir - size: 753575768 - nfiles: 30 + md5: c67bb2e8b24d9c574bc7c522ac3d66b9.dir + size: 414148418 + nfiles: 24 - path: metrics/fit_metrics.json hash: md5 - md5: 426a162284ca9e29c043eb1d72e547e6 - size: 224 + md5: 7763f689b46c38ec8f0cc605deac4c2a + size: 221 generate_predictions: cmd: python 3_generate_predictions.py deps: @@ -128,14 +128,14 @@ stages: size: 2464 - path: data/model hash: md5 - md5: e27b9216bc7455f8245d5b49f27b2707.dir - size: 753575768 - nfiles: 30 + md5: c67bb2e8b24d9c574bc7c522ac3d66b9.dir + size: 414148418 + nfiles: 24 - path: data/prepared_data hash: md5 - md5: 5c56787d9e6450e26a78c15700e104c7.dir - size: 45746089 - nfiles: 2 + md5: 02b2c25e488f75c4a676540c127b8930.dir + size: 45890160 + nfiles: 3 params: configs/settings.yaml: default.generate_predictions.input_dataclient_type: local @@ -148,8 +148,8 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 6e004c7f4812b5cabbee62fe8fb0d82f.dir - size: 484524 + md5: 2d9353f60e16d4f85dd4a08a71dce548.dir + size: 483856 nfiles: 1 generate_metrics: cmd: python 4_generate_metrics.py @@ -160,14 +160,14 @@ stages: size: 3484 - path: data/predictions hash: md5 - md5: 6e004c7f4812b5cabbee62fe8fb0d82f.dir - size: 484524 + md5: 2d9353f60e16d4f85dd4a08a71dce548.dir + size: 483856 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 5c56787d9e6450e26a78c15700e104c7.dir - size: 45746089 - nfiles: 2 + md5: 02b2c25e488f75c4a676540c127b8930.dir + size: 45890160 + nfiles: 3 params: configs/settings.yaml: default.generate_metrics.dataclient_type: local @@ -176,8 +176,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: b9ae6d24424f2d5389697577e9076b91 - size: 223 + md5: 8a52e3a0047c68b9de5c371a1d406f73 + size: 224 generate_scenerio_metrics: cmd: python 5_generate_scenarios.py deps: @@ -197,9 +197,9 @@ stages: outs: - path: metrics/scenario_metrics.md hash: md5 - md5: 32d78c20d91fedf2f5dbb4162f323e25 - size: 356 + md5: 666f73f6fdb49484737f1a7edd798727 + size: 363 - path: metrics/scenario_table.md hash: md5 - md5: 52cbd19566151b0c300f9673252704d2 + md5: 71c9fcb9ec304353aba0d7f5c58ca8b2 size: 872 diff --git a/modules/ml-pipeline/src/pipeline/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/metrics/.gitignore index a19c4d2..6427764 100644 --- a/modules/ml-pipeline/src/pipeline/metrics/.gitignore +++ b/modules/ml-pipeline/src/pipeline/metrics/.gitignore @@ -1,4 +1,4 @@ +/fit_metrics.json +/metrics.json /scenario_table.md /scenario_metrics.md -/metrics.json -/fit_metrics.json