From ddcce2b56d58a28b2edf641db24919b79c4f7d0d Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 14 Sep 2023 23:28:00 +0100 Subject: [PATCH 1/2] cleaned up workflow --- .github/workflows/MLPipelinePostMerge.yml | 153 ++++++++++++------ .github/workflows/MLPipelinePullRequest.yml | 2 +- .../src/pipeline/src/startup_cleanup.py | 17 +- 3 files changed, 116 insertions(+), 56 deletions(-) diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index 6ea18c5..19c6ce1 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -15,34 +15,88 @@ on: permissions: write-all jobs: - Use-Major-Label-on-Merge: + Register-Major-Model-Dev: if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'major')) }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Test echoo - run: | - echo "Hello Major" + - uses: actions/checkout@v4 + with: + fetch-depth: 0 - Use-Minor-Label-on-Merge: + - name: Install packages to register model + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + + - name: Register Model + run: | + REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + + git config user.name "Github-Bot" + git config user.email "Github-Bot@no-reply.com" + + # gto register test --repo https://github.com/Hestia-Homes/ML/ + # echo "chicken" >> test.md + + # gto register ${REGISTER_MODEL_NAME} --bump-major + # gto assign regression --stage dev + # gto show >> Model_Register.md + + Register-Minor-Model-Dev: if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'minor')) }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Test echoo - run: | - echo "Hello Minor" + - uses: actions/checkout@v4 + with: + fetch-depth: 0 - 
Use-Patch-Label-on-Merge: + - name: Install packages to register model + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + + - name: Register Model + run: | + REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + + git config user.name "Github-Bot" + git config user.email "Github-Bot@no-reply.com" + + # gto register test --repo https://github.com/Hestia-Homes/ML/ + # echo "chicken" >> test.md + + # gto register ${REGISTER_MODEL_NAME} --bump-minor + # gto assign regression --stage dev + # gto show >> Model_Register.md + + Register-Patch-Model-Dev: if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'patch')) }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Test echoo - run: | - echo "Hello Patch" + - uses: actions/checkout@v4 + with: + fetch-depth: 0 - Promote-Model-To-Dev: + - name: Install packages to register model + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + + - name: Register Model + run: | + REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + + git config user.name "Github-Bot" + git config user.email "Github-Bot@no-reply.com" + + # gto register test --repo https://github.com/Hestia-Homes/ML/ + # echo "chicken" >> test.md + + # gto register ${REGISTER_MODEL_NAME} --bump-patch + # gto assign regression --stage dev + # gto show >> Model_Register.md + + Promote-Artefacts-To-Dev: if: github.event.pull_request.merged == true runs-on: ubuntu-latest @@ -70,55 +124,52 @@ jobs: dvc push -r dev - Register-New-Model-Dev: - if: github.event.pull_request.merged == true - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 + # Register-New-Model-Dev: + # if: github.event.pull_request.merged == true +
# runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v4 + # with: + # fetch-depth: 0 - - name: Install packages to register model - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | - pip install --upgrade pip - pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + # - name: Install packages to register model + # env: + # AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + # AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + # run: | + # pip install --upgrade pip + # pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt - - name: Register Model - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - run: | + # - name: Register Model + # env: + # AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + # AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + # run: | - # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') - REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') + # # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') - git config user.name "Github-Bot" - git config user.email "Github-Bot@no-reply.com" + # git config user.name "Github-Bot" + # git config user.email "Github-Bot@no-reply.com" - # git tag model@v0.0.1 - # git push origin model@v0.0.1 + # # git tag model@v0.0.1 + # # git push origin model@v0.0.1 - # gto register test --repo https://github.com/Hestia-Homes/ML/ - # echo "chicken" >> test.md + # # gto register test --repo https://github.com/Hestia-Homes/ML/ + # # echo "chicken" >> test.md - # gto -v register ${REGISTER_MODEL_NAME} - # gto assign 
regression --stage dev - # gto show + # # gto -v register ${REGISTER_MODEL_NAME} + # # gto assign regression --stage dev + # # gto show Register-Prediction-Image-Dev: - needs: Promote-Model-To-Dev - # needs: [Promote-Model-To-Dev, Register-New-Model-Dev] WILL ADD BACK ONCE REGISTER WORKS + needs: Promote-Artefacts-To-Dev + # needs: [Promote-Artefacts-To-Dev, Register-New-Model-Dev] WILL ADD BACK ONCE REGISTER WORKS runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Install packages to retrieve artifacts - env: - AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} run: | pip install --upgrade pip pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 9ac5602..e1ebd5d 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -13,7 +13,7 @@ permissions: write-all jobs: - No-Label: + Check-Label: runs-on: ubuntu-latest steps: - uses: yogevbd/enforce-label-action@2.1.0 diff --git a/modules/ml-pipeline/src/pipeline/src/startup_cleanup.py b/modules/ml-pipeline/src/pipeline/src/startup_cleanup.py index d30308c..af63291 100644 --- a/modules/ml-pipeline/src/pipeline/src/startup_cleanup.py +++ b/modules/ml-pipeline/src/pipeline/src/startup_cleanup.py @@ -15,6 +15,15 @@ def run_cleanup(artefacts_directory: str, metrics_directory: str) -> None: """ Remove the directory where artefacts are stored """ + + logger.info("---------------------") + logger.info(f"--- Run Clean up ---") + logger.info("---------------------") + + logger.info("-------------------------") + logger.info(f"--- Delete artefacts ---") + logger.info("-------------------------") + artefact_directory_path = Path(artefacts_directory) if artefact_directory_path.exists(): @@ -22,6 +31,10 @@ def run_cleanup(artefacts_directory: 
str, metrics_directory: str) -> None: logger.info(f"Removing the directory: {artefacts_directory}") shutil.rmtree(artefact_directory_path) + logger.info("-----------------------") + logger.info(f"--- Delete metrics ---") + logger.info("-----------------------") + metrics_directory_path = Path(metrics_directory) if metrics_directory_path.exists(): @@ -36,10 +49,6 @@ if __name__ == "__main__": logger.info(f"--- {__file__} - Start! ---") logger.info("----------------------------") - logger.info("---------------------") - logger.info(f"--- Run Clean up ---") - logger.info("---------------------") - run_cleanup( artefacts_directory=startup_cleanup_params["artefacts"], metrics_directory=startup_cleanup_params["metrics"], From 88319c1480c106afe6dc640feff7f3453c4f6d98 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Fri, 15 Sep 2023 00:12:08 +0100 Subject: [PATCH 2/2] fixed bug and using sklearn for simplicity --- .../src/pipeline/src/configs/build_model.yaml | 4 +- .../src/pipeline/src/configs/client.yaml | 10 +++ .../src/configs/feature_processor.yaml | 4 +- .../pipeline/src/configs/prepare_data.yaml | 8 +-- modules/ml-pipeline/src/pipeline/src/dvc.lock | 62 +++++++++---------- modules/ml-pipeline/src/pipeline/src/dvc.yaml | 1 + .../src/pipeline/src/generate_metrics.py | 10 ++- .../src/pipeline/src/generate_predictions.py | 20 ++++-- .../src/pipeline/src/prepare_data.py | 20 +++--- 9 files changed, 82 insertions(+), 57 deletions(-) create mode 100644 modules/ml-pipeline/src/pipeline/src/configs/client.yaml diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml index a1307c1..d97cf22 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml @@ -1,5 +1,5 @@ -model_type: AutogluonAutoML -model_save_filepath: ./data/model/autogluonmodel/ +model_type: SKLearnLinearRegression +model_save_filepath:
./data/model/model.joblib SKLearnLinearRegression: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/client.yaml b/modules/ml-pipeline/src/pipeline/src/configs/client.yaml new file mode 100644 index 0000000..65dc7a2 --- /dev/null +++ b/modules/ml-pipeline/src/pipeline/src/configs/client.yaml @@ -0,0 +1,10 @@ +aws-s3: + AWS_ACCESS_KEY_ID: null + AWS_SECRET_ACCESS_KEY: null + ENDPOINT_URL: null +aws-s3-mock: + AWS_ACCESS_KEY_ID: minio + AWS_SECRET_ACCESS_KEY: minio123 + ENDPOINT_URL: http://localhost:9000 +local: + null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml index 233a329..18e6f84 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml @@ -4,5 +4,5 @@ feature_processor_config: subsample_seed: 0 target: RDSAP_CHANGE drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"] - # retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] - retain_features: null + retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] + # retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml index 736f5d2..af8a802 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml @@ -1,14 +1,8 @@ input_dataclient_type: aws-s3 -input_dataclient: - AWS_ACCESS_KEY_ID: null - AWS_SECRET_ACCESS_KEY: null - ENDPOINT_URL: null output_dataclient_type: local -output_dataclient: - null datahandler_type: parquet data_filepath: s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet -train_proportion: 0.8 +train_proportion: 0.1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git 
a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/src/dvc.lock index 01a400f..b5d7e23 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock @@ -5,18 +5,18 @@ stages: deps: - path: prepare_data.py hash: md5 - md5: 87a83e62512bff93c89f3e93c1ed248d - size: 5593 + md5: 9c31bfb1b75ea3c9685ec459cbb50e62 + size: 5921 params: configs/prepare_data.yaml: output_test_filepath: ./data/prepared_data/test.parquet output_train_filepath: ./data/prepared_data/train.parquet - train_proportion: 0.8 + train_proportion: 0.1 outs: - path: data/prepared_data/ hash: md5 - md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir - size: 4428909 + md5: f7e45d3997cf165904174b2bc2d2eba5.dir + size: 4396934 nfiles: 2 build_model: cmd: python build_model.py @@ -27,8 +27,8 @@ stages: size: 3948 - path: data/prepared_data hash: md5 - md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir - size: 4428909 + md5: f7e45d3997cf165904174b2bc2d2eba5.dir + size: 4396934 nfiles: 2 params: configs/build_model.yaml: @@ -42,31 +42,31 @@ stages: SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear - model_save_filepath: ./data/model/autogluonmodel/ - model_type: AutogluonAutoML + model_save_filepath: ./data/model/model.joblib + model_type: SKLearnLinearRegression outs: - path: data/model/ hash: md5 - md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir - size: 1264795580 - nfiles: 28 + md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir + size: 904 + nfiles: 1 generate_predictions: cmd: python generate_predictions.py deps: - path: data/model hash: md5 - md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir - size: 1264795580 - nfiles: 28 + md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir + size: 904 + nfiles: 1 - path: data/prepared_data hash: md5 - md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir - size: 4428909 + md5: f7e45d3997cf165904174b2bc2d2eba5.dir + size: 4396934 nfiles: 2 - path: generate_predictions.py hash: md5 - md5: 76c45e7575ec979e6c4c8e2cf754a720 - size: 4225 + md5: 
32c0ecd082e1f8fc4426338d6629979c + size: 4686 params: configs/generate_predictions.yaml: input_dataclient_type: local @@ -77,26 +77,26 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir - size: 672577 + md5: ea0431b600f0ef357de3a543482cefe7.dir + size: 4085105 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir - size: 672577 + md5: ea0431b600f0ef357de3a543482cefe7.dir + size: 4085105 nfiles: 1 - path: data/prepared_data hash: md5 - md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir - size: 4428909 + md5: f7e45d3997cf165904174b2bc2d2eba5.dir + size: 4396934 nfiles: 2 - path: generate_metrics.py hash: md5 - md5: cc368845f62523575a9ed5c791e27815 - size: 4329 + md5: 4709c42d93f8e717a3d9e4958e46cd76 + size: 4587 params: configs/generate_metrics.yaml: dataclient_type: local @@ -107,15 +107,15 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: 3f03e50a419af6730351a5016e2ae98a - size: 182 + md5: ae53c4781cb8a754d24e29ba7ddb16ea + size: 183 startup_cleanup: cmd: python startup_cleanup.py deps: - path: startup_cleanup.py hash: md5 - md5: f7fe2ca33004b34530da0a3ab48c1790 - size: 1458 + md5: 2e51fbcac960d0f960bf32a8ec7486a0 + size: 1748 params: configs/startup_cleanup.yaml: artefacts: ./data diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.yaml b/modules/ml-pipeline/src/pipeline/src/dvc.yaml index 7e98535..42e8947 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/src/dvc.yaml @@ -7,6 +7,7 @@ stages: - configs/startup_cleanup.yaml: - artefacts - metrics + always_changed: true prepare_data: cmd: python prepare_data.py deps: diff --git a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py index a7def45..3a5c668 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_metrics.py +++ 
b/modules/ml-pipeline/src/pipeline/src/generate_metrics.py @@ -21,6 +21,9 @@ from core.Logger import logger RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" prepare_data_params = yaml.safe_load(open(prepare_data_path)) @@ -96,7 +99,12 @@ if __name__ == "__main__": logger.info("----------------------------") model = model_factory(build_model_params["model_type"]) - dataclient = dataclient_factory(generate_metrics_params["dataclient_type"]) + + dataclient_type = generate_metrics_params["dataclient_type"] + dataclient = dataclient_factory(dataclient_type) + dataclient.ingest_configurations(client_params[dataclient_type]) + dataclient.establish_client() + input_datahandler = datahandler_factory( generate_metrics_params["input_datahandler_type"] ) diff --git a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py index 552db47..48e192b 100644 --- a/modules/ml-pipeline/src/pipeline/src/generate_predictions.py +++ b/modules/ml-pipeline/src/pipeline/src/generate_predictions.py @@ -19,6 +19,9 @@ from core.Logger import logger RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" prepare_data_params = yaml.safe_load(open(prepare_data_path)) @@ -97,12 +100,19 @@ if __name__ == "__main__": # We may have different locations of loading hence why we use one specified in generate_predictions.yaml # I.e. 
for metric runs, this will be a local data client # For predictions, we will want a cloud data client - input_dataclient = dataclient_factory( - generate_predictions_params["input_dataclient_type"] - ) - output_dataclient = dataclient_factory( - generate_predictions_params["output_dataclient_type"] + + input_dataclient_type = generate_predictions_params["input_dataclient_type"] + input_dataclient = dataclient_factory(input_dataclient_type) + input_dataclient.ingest_configurations(config=client_params[input_dataclient_type]) + input_dataclient.establish_client() + + output_dataclient_type = generate_predictions_params["output_dataclient_type"] + output_dataclient = dataclient_factory(output_dataclient_type) + output_dataclient.ingest_configurations( + config=client_params[output_dataclient_type] ) + output_dataclient.establish_client() + datahandler = datahandler_factory(prepare_data_params["datahandler_type"]) generate_predictions( diff --git a/modules/ml-pipeline/src/pipeline/src/prepare_data.py b/modules/ml-pipeline/src/pipeline/src/prepare_data.py index 6df07fb..400adbf 100644 --- a/modules/ml-pipeline/src/pipeline/src/prepare_data.py +++ b/modules/ml-pipeline/src/pipeline/src/prepare_data.py @@ -21,6 +21,9 @@ from core.FeatureProcessor import feature_processor_factory RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local") +client_path = Path(__file__).parent / "configs" / "client.yaml" +client_params = yaml.safe_load(open(client_path)) + prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml" prepare_data_params = yaml.safe_load(open(prepare_data_path)) @@ -94,7 +97,7 @@ def prepare_data( dataclient=output_dataclient, obj=train, location=output_train_filepath ) - if test: + if test is not None: datahandler.save_data( dataclient=output_dataclient, obj=test, location=output_test_filepath ) @@ -112,18 +115,17 @@ if __name__ == "__main__": logger.info(f"--- Initiate DataClient ---") logger.info("----------------------------") - 
input_dataclient = dataclient_factory(prepare_data_params["input_dataclient_type"]) - output_dataclient = dataclient_factory( - prepare_data_params["output_dataclient_type"] - ) + input_dataclient_type = prepare_data_params["input_dataclient_type"] + output_dataclient_type = prepare_data_params["output_dataclient_type"] - input_dataclient.ingest_configurations( - config=prepare_data_params["input_dataclient"] - ) + input_dataclient = dataclient_factory(input_dataclient_type) + output_dataclient = dataclient_factory(output_dataclient_type) + + input_dataclient.ingest_configurations(config=client_params[input_dataclient_type]) input_dataclient.establish_client() output_dataclient.ingest_configurations( - config=prepare_data_params["output_dataclient"] + config=client_params[output_dataclient_type] ) output_dataclient.establish_client()