diff --git a/.github/workflows/MLMonitoringData.yml b/.github/workflows/MLMonitoringData.yml new file mode 100644 index 0000000..df764fd --- /dev/null +++ b/.github/workflows/MLMonitoringData.yml @@ -0,0 +1,36 @@ +name: (REPLACE WITH LAMBDA) Run monitoring on data to ensure that fundamentally, the data and its relationships haven't changed + +on: + push: + branches: + - "off-**" + +jobs: + + Verify-Data: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Install packages to generate data report + run: | + pip install --upgrade pip + pip install -r modules/ml-monitoring/src/evidently/src/requirements/requirements.txt + + - name: Run Monitoring Data report + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | + cd modules/ml-monitoring/src/evidently/src + python regression_report.py + + # - name: Build Monitoring Data docker Image + # run: | + # cd modules/ml-monitoring/src/evidently/ + # docker build . --file MonitoringData.Dockerfile --tag monitoring_data_test + + # - name: Run Monitoring Data docker container + # run: | + # docker run monitoring_data_test diff --git a/.github/workflows/MLMonitoringPromotion.yml b/.github/workflows/MLMonitoringPromotion.yml new file mode 100644 index 0000000..89d6977 --- /dev/null +++ b/.github/workflows/MLMonitoringPromotion.yml @@ -0,0 +1,27 @@ +name: Run monitoring on an potential promotions (i.e. a new model registering, make sure results are not just "metric" better but everything makes sense) + +on: + push: + tags: + - 'NewModel**' + +jobs: + + Verify-Model: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Install packages to generate Model report + run: | + pip install --upgrade pip + pip install -r modules/ml-monitoring/src/evidently/src/requirements/requirements.txt + + - name: Build Monitoring Model docker Image + run: | + cd modules/ml-monitoring/src/evidently/ + docker build . 
--file MonitoringModel.Dockerfile --tag monitoring_model_test + + - name: Run Monitoring Model docker container + run: | + docker run monitoring_model_test diff --git a/.github/workflows/MLPipelinePostMerge.yml b/.github/workflows/MLPipelinePostMerge.yml index 19c6ce1..b8ab439 100644 --- a/.github/workflows/MLPipelinePostMerge.yml +++ b/.github/workflows/MLPipelinePostMerge.yml @@ -31,16 +31,27 @@ jobs: - name: Register Model run: | - REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + REGISTER_MODEL_NAME=$(echo "${GITHUB_HEAD_REF}" | awk -F"-" '{print $1}') # PR branch read from the runner env, not inlined into the script + # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - # gto register test --repo https://github.com/Hestia-Homes/ML/ - # echo "chicken" >> test.md + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') + if [ -z "${latest_version}" ]; then + increment_version="1.0.0" + else + increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$1++; $2=0; $3=0; print}') # SemVer: a major bump resets minor and patch + fi - # gto register ${REGISTER_MODEL_NAME} --bump-major - # gto assign regression --stage dev - # gto show >> Model_Register.md + new_tag=${REGISTER_MODEL_NAME}@v${increment_version} + + git tag -a ${new_tag} -m "Registering new Major Version" + git push origin ${new_tag} + + gto show > MODEL_REGISTRY.md + git add MODEL_REGISTRY.md # stage only the regenerated registry file 
+ git commit -m "Update Registry" + git push Register-Minor-Model-Dev: if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'minor')) }} @@ -58,16 +69,27 @@ jobs: - name: Register Model run: | - REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + REGISTER_MODEL_NAME=$(echo "${GITHUB_HEAD_REF}" | awk -F"-" '{print $1}') # PR branch read from the runner env, not inlined into the script + # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - # gto register test --repo https://github.com/Hestia-Homes/ML/ - # echo "chicken" >> test.md + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') + if [ -z "${latest_version}" ]; then + increment_version="0.1.0" + else + increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$2++; $3=0; print}') # SemVer: a minor bump resets patch + fi - # gto register ${REGISTER_MODEL_NAME} --bump-minor - # gto assign regression --stage dev - # gto show >> Model_Register.md + new_tag=${REGISTER_MODEL_NAME}@v${increment_version} + + git tag -a ${new_tag} -m "Registering new Minor Version" + git push origin ${new_tag} + + gto show > MODEL_REGISTRY.md + git add MODEL_REGISTRY.md # stage only the regenerated registry file 
+ git commit -m "Update Registry" + git push Register-Patch-Model-Dev: if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'patch')) }} @@ -85,16 +107,27 @@ jobs: - name: Register Model run: | - REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') + REGISTER_MODEL_NAME=$(echo "${GITHUB_HEAD_REF}" | awk -F"-" '{print $1}') # PR branch read from the runner env, not inlined into the script + # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') git config user.name "Github-Bot" git config user.email "Github-Bot@no-reply.com" - # gto register test --repo https://github.com/Hestia-Homes/ML/ - # echo "chicken" >> test.md + latest_version=$(gto show ${REGISTER_MODEL_NAME}@latest --ref | awk -F"@v" '{print $2}') + if [ -z "${latest_version}" ]; then + increment_version="0.0.1" + else + increment_version=$(echo ${latest_version} | awk 'BEGIN{FS=OFS="."} {$3++; print}') + fi - # gto register ${REGISTER_MODEL_NAME} --bump-major - # gto assign regression --stage dev - # gto show >> Model_Register.md + new_tag=${REGISTER_MODEL_NAME}@v${increment_version} + + git tag -a ${new_tag} -m "Registering new Patch Version" + git push origin ${new_tag} + + gto show > MODEL_REGISTRY.md + git add MODEL_REGISTRY.md # stage only the regenerated registry file 
+ git commit -m "Update Registry" + git push Promote-Artefacts-To-Dev: if: github.event.pull_request.merged == true @@ -123,44 +156,43 @@ jobs: cd modules/ml-pipeline/src/pipeline/src dvc push -r dev + Register-New-Model-Dev: + if: github.event.pull_request.merged == true + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 - # Register-New-Model-Dev: - # if: github.event.pull_request.merged == true - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v4 - # with: - # fetch-depth: 0 + - name: Install packages to register model + run: | + pip install --upgrade pip + pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt - # - name: Install packages to register model - # env: - # AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - # AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - # run: | - # pip install --upgrade pip - # pip install -r modules/ml-pipeline/src/pipeline/src/requirements/version_control/requirements.txt + - name: Register Model + env: + AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} + run: | - # - name: Register Model - # env: - # AWS_ACCESS_KEY_ID: ${{ secrets.ROBOT_AWS_ACCESS_KEY_ID }} - # AWS_SECRET_ACCESS_KEY: ${{ secrets.ROBOT_AWS_SECRET_ACCESS_KEY }} - # run: | + REGISTER_MODEL_NAME=$(echo "${GITHUB_HEAD_REF}" | awk -F"-" '{print $1}') # PR branch read from the runner env, not inlined into the script + # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') - # # REGISTER_MODEL_NAME=$(echo ${{ github.event.pull_request.head.ref }} | awk -F"-" '{print $1}') - # REGISTER_MODEL_NAME=$(echo ${{github.ref_name}} | awk -F"-" '{print $1}') + git config user.name "Github-Bot" + git config user.email "Github-Bot@no-reply.com" - # git config user.name "Github-Bot" - # git config user.email "Github-Bot@no-reply.com" + latest_version=$(gto show 
${REGISTER_MODEL_NAME}@latest --ref) + + 
new_tag=${latest_version}#dev - # # git tag model@v0.0.1 - # # git push origin model@v0.0.1 + git tag -a ${new_tag} -m "Registering Latest Version to Dev" + git push origin ${new_tag} - # # gto register test --repo https://github.com/Hestia-Homes/ML/ - # # echo "chicken" >> test.md + gto show > MODEL_REGISTRY.md + git add MODEL_REGISTRY.md # stage only the regenerated registry file + git commit -m "Update Registry" + git push - # # gto -v register ${REGISTER_MODEL_NAME} - # # gto assign regression --stage dev - # # gto show Register-Prediction-Image-Dev: needs: Promote-Artefacts-To-Dev diff --git a/MODEL_REGISTRY.md b/MODEL_REGISTRY.md new file mode 100644 index 0000000..117ac39 --- /dev/null +++ b/MODEL_REGISTRY.md @@ -0,0 +1,11 @@ +╒════════╤══════════╕ +│ name │ latest │ +╞════════╪══════════╡ +│ model │ v2.1.3 │ +╘════════╧══════════╛ +╒════════╤══════════╕ +│ name │ latest │ +╞════════╪══════════╡ +│ bob │ v1.0.0 │ +│ model │ v2.1.3 │ +╘════════╧══════════╛ diff --git a/modules/ml-monitoring/.gitignore b/modules/ml-monitoring/.gitignore new file mode 100644 index 0000000..832692f --- /dev/null +++ b/modules/ml-monitoring/.gitignore @@ -0,0 +1 @@ +.dev_env_monitoring/ diff --git a/modules/ml-monitoring/Makefile b/modules/ml-monitoring/Makefile new file mode 100644 index 0000000..20767ff --- /dev/null +++ b/modules/ml-monitoring/Makefile @@ -0,0 +1,20 @@ +export PYENV_ROOT=$(HOME)/.pyenv +export PATH := $(PYENV_ROOT)/bin:$(PATH) +PYTHON_VERSION ?= 3.10.12 + +.PHONY: init +init: dev-pyenv + +.PHONY: dev-pyenv +dev-pyenv: + curl https://pyenv.run | bash || echo "Pyenv - Already installed" + pyenv install ${PYTHON_VERSION} || echo "Python version already installed" + pyenv global ${PYTHON_VERSION} + python3 -m venv .dev_env_monitoring + . 
.dev_env_monitoring/bin/activate && pip install --upgrade pip && pip install -r src/evidently/src/requirements/requirements-dev.txt && pre-commit install + echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" + echo "source .dev_env_monitoring/bin/activate" + +.PHONY: dvc-init +dvc-init: + . .dev_env_monitoring/bin/activate && dvc init --subdir diff --git a/modules/ml-monitoring/src/evidently/.DS_Store b/modules/ml-monitoring/src/evidently/.DS_Store new file mode 100644 index 0000000..21bc00d Binary files /dev/null and b/modules/ml-monitoring/src/evidently/.DS_Store differ diff --git a/modules/ml-monitoring/src/evidently/MonitoringData.Dockerfile b/modules/ml-monitoring/src/evidently/MonitoringData.Dockerfile new file mode 100644 index 0000000..71661d8 --- /dev/null +++ b/modules/ml-monitoring/src/evidently/MonitoringData.Dockerfile @@ -0,0 +1,14 @@ +# Dockerfile that can be used to test loading a model to generate a prediction (part of CI/CD flow) +FROM python:3.10.12-slim + +COPY src/requirements/requirements.txt requirements.txt + +RUN pip install --upgrade pip +RUN pip install -r requirements.txt + +# Assuming in the CI/CD step, there will be a dvc pull step to get data and model, so will just need to run a single script +COPY src/ /home/src/ + +WORKDIR /home/src/ + +CMD [ "python", "regression_report.py"] diff --git a/modules/ml-monitoring/src/evidently/src/regression_report.py b/modules/ml-monitoring/src/evidently/src/regression_report.py new file mode 100644 index 0000000..49f424c --- /dev/null +++ b/modules/ml-monitoring/src/evidently/src/regression_report.py @@ -0,0 +1,40 @@ +import boto3 +import pandas as pd +from evidently.report import Report +from evidently.metric_preset import ( + DataDriftPreset, + DataQualityPreset, +) + + +def run_evidently_dashboard(local_output: str = "./report.html"): + + # DUMMY TEST CASE + ref = pd.read_parquet( + "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet" + ).head(100) 
+ cur = pd.read_parquet( + "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data.parquet" + ).head(100) + + report = Report( + metrics=[ + DataDriftPreset(), + DataQualityPreset(), + ] + ) + + report.run(reference_data=ref, current_data=cur) + report.save_html(local_output) + + s3 = boto3.client("s3") + + s3.upload_file(local_output, "retrofit-data-dev", "monitoring/test-report.html") + + print( + f'{local_output} uploaded to {"retrofit-data-dev" + "/" + "monitoring/test-report.html"} successfully.' + ) + + +if __name__ == "__main__": + run_evidently_dashboard() diff --git a/modules/ml-monitoring/src/evidently/src/requirements/requirements-dev.txt b/modules/ml-monitoring/src/evidently/src/requirements/requirements-dev.txt new file mode 100644 index 0000000..b5e534e --- /dev/null +++ b/modules/ml-monitoring/src/evidently/src/requirements/requirements-dev.txt @@ -0,0 +1,4 @@ +evidently==0.4.4 +pre-commit==3.3.3 +sphinx==7.2.5 +sphinx_rtd_theme==1.3.0 diff --git a/modules/ml-monitoring/src/evidently/src/requirements/requirements.txt b/modules/ml-monitoring/src/evidently/src/requirements/requirements.txt new file mode 100644 index 0000000..6c60e50 --- /dev/null +++ b/modules/ml-monitoring/src/evidently/src/requirements/requirements.txt @@ -0,0 +1,2 @@ +boto3==1.28.41 +evidently==0.4.4 diff --git a/modules/ml-pipeline/Makefile b/modules/ml-pipeline/Makefile index 45a16f5..d4d6fb7 100644 --- a/modules/ml-pipeline/Makefile +++ b/modules/ml-pipeline/Makefile @@ -10,11 +10,11 @@ dev-pyenv: curl https://pyenv.run | bash || echo "Pyenv - Already installed" pyenv install ${PYTHON_VERSION} || echo "Python version already installed" pyenv global ${PYTHON_VERSION} - python3 -m venv .dev_env - . 
.dev_env/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/src/requirements/training/requirements-dev.txt && pip install -r src/pipeline/src/requirements/version_control/requirements.txt && pre-commit install + python3 -m venv .dev_env_pipeline + . .dev_env_pipeline/bin/activate && pip install --upgrade pip && pip install -r src/pipeline/src/requirements/training/requirements-dev.txt && pip install -r src/pipeline/src/requirements/version_control/requirements.txt && pre-commit install echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" - echo "source .dev_env/bin/activate" + echo "source .dev_env_pipeline/bin/activate" .PHONY: dvc-init dvc-init: - . .dev_env/bin/activate && dvc init --subdir + . .dev_env_pipeline/bin/activate && dvc init --subdir diff --git a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml index d97cf22..a1307c1 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/build_model.yaml @@ -1,5 +1,5 @@ -model_type: SKLearnLinearRegression -model_save_filepath: ./data/model/model.joblib +model_type: AutogluonAutoML +model_save_filepath: ./data/model/autogluonmodel/ SKLearnLinearRegression: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml index 18e6f84..9aa02f0 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/feature_processor.yaml @@ -3,6 +3,6 @@ feature_processor_config: subsample_amount: null subsample_seed: 0 target: RDSAP_CHANGE - drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"] - retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"] - # retain_features: null + drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE"] + # retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", 
"TOTAL_FLOOR_AREA_ENDING"] + retain_features: null diff --git a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml index af8a802..273e78d 100644 --- a/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml +++ b/modules/ml-pipeline/src/pipeline/src/configs/prepare_data.yaml @@ -1,7 +1,7 @@ input_dataclient_type: aws-s3 output_dataclient_type: local datahandler_type: parquet -data_filepath: s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet +data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet train_proportion: 0.1 output_train_filepath: ./data/prepared_data/train.parquet output_test_filepath: ./data/prepared_data/test.parquet diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.lock b/modules/ml-pipeline/src/pipeline/src/dvc.lock index b5d7e23..f04423a 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/src/dvc.lock @@ -15,8 +15,8 @@ stages: outs: - path: data/prepared_data/ hash: md5 - md5: f7e45d3997cf165904174b2bc2d2eba5.dir - size: 4396934 + md5: febdc8362200167078dfa578cf2bc889.dir + size: 24296908 nfiles: 2 build_model: cmd: python build_model.py @@ -27,8 +27,8 @@ stages: size: 3948 - path: data/prepared_data hash: md5 - md5: f7e45d3997cf165904174b2bc2d2eba5.dir - size: 4396934 + md5: febdc8362200167078dfa578cf2bc889.dir + size: 24296908 nfiles: 2 params: configs/build_model.yaml: @@ -42,26 +42,26 @@ stages: SKLearnLinearRegression: SKLearnSVMRegression: kernel: linear - model_save_filepath: ./data/model/model.joblib - model_type: SKLearnLinearRegression + model_save_filepath: ./data/model/autogluonmodel/ + model_type: AutogluonAutoML outs: - path: data/model/ hash: md5 - md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir - size: 904 - nfiles: 1 + md5: 154f823d56a9892948a633789d9b08a5.dir + size: 680552724 + nfiles: 18 generate_predictions: cmd: python generate_predictions.py 
deps: - path: data/model hash: md5 - md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir - size: 904 - nfiles: 1 + md5: 154f823d56a9892948a633789d9b08a5.dir + size: 680552724 + nfiles: 18 - path: data/prepared_data hash: md5 - md5: f7e45d3997cf165904174b2bc2d2eba5.dir - size: 4396934 + md5: febdc8362200167078dfa578cf2bc889.dir + size: 24296908 nfiles: 2 - path: generate_predictions.py hash: md5 @@ -77,21 +77,21 @@ stages: outs: - path: data/predictions/ hash: md5 - md5: ea0431b600f0ef357de3a543482cefe7.dir - size: 4085105 + md5: d8abefde18d78588158ef6acf282e2ed.dir + size: 2948553 nfiles: 1 generate_metrics: cmd: python generate_metrics.py deps: - path: data/predictions hash: md5 - md5: ea0431b600f0ef357de3a543482cefe7.dir - size: 4085105 + md5: d8abefde18d78588158ef6acf282e2ed.dir + size: 2948553 nfiles: 1 - path: data/prepared_data hash: md5 - md5: f7e45d3997cf165904174b2bc2d2eba5.dir - size: 4396934 + md5: febdc8362200167078dfa578cf2bc889.dir + size: 24296908 nfiles: 2 - path: generate_metrics.py hash: md5 @@ -107,8 +107,8 @@ stages: outs: - path: metrics/metrics.json hash: md5 - md5: ae53c4781cb8a754d24e29ba7ddb16ea - size: 183 + md5: f5aaae75ea74241500cd1ce76751c579 + size: 182 startup_cleanup: cmd: python startup_cleanup.py deps: diff --git a/modules/ml-pipeline/src/pipeline/src/dvc.yaml b/modules/ml-pipeline/src/pipeline/src/dvc.yaml index 42e8947..afaaa71 100644 --- a/modules/ml-pipeline/src/pipeline/src/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/src/dvc.yaml @@ -19,6 +19,7 @@ stages: - train_proportion outs: - data/prepared_data/ + always_changed: true build_model: cmd: python build_model.py deps: @@ -28,6 +29,7 @@ stages: - configs/build_model.yaml: outs: - data/model/ + always_changed: true generate_predictions: cmd: python generate_predictions.py deps: @@ -38,6 +40,7 @@ stages: - configs/generate_predictions.yaml: outs: - data/predictions/ + always_changed: true generate_metrics: cmd: python generate_metrics.py deps: @@ -48,5 +51,6 @@ stages: - 
configs/generate_metrics.yaml: outs: - metrics/metrics.json + always_changed: true metrics: - metrics/metrics.json