From f2de544b6b579620034c990f6321e6ba72183909 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Tue, 17 Oct 2023 23:52:51 +0000 Subject: [PATCH] push up temp code --- .vscode/settings.json | 5 + deployment/Dockerfile.monitoring.lambda | 29 +++ modules/ml-monitoring/.gitignore | 1 + modules/ml-monitoring/Makefile | 17 +- .../src/evidently/src/configs/settings.yaml | 0 .../src/evidently/src/core/MLReport.py | 38 ++++ .../src/core/interface/InterfaceMLReport.py | 19 ++ .../src/evidently/src/generate_report.py | 174 ++++++++++++++++++ .../src/requirements/requirements-dev.txt | 3 +- .../src/requirements/requirements.txt | 2 +- 10 files changed, 284 insertions(+), 4 deletions(-) create mode 100644 .vscode/settings.json create mode 100644 deployment/Dockerfile.monitoring.lambda create mode 100644 modules/ml-monitoring/src/evidently/src/configs/settings.yaml create mode 100644 modules/ml-monitoring/src/evidently/src/core/MLReport.py create mode 100644 modules/ml-monitoring/src/evidently/src/core/interface/InterfaceMLReport.py create mode 100644 modules/ml-monitoring/src/evidently/src/generate_report.py diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..3baa4e1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "python.analysis.extraPaths": [ + "./modules/ml-monitoring/src/evidently/src" + ] +} diff --git a/deployment/Dockerfile.monitoring.lambda b/deployment/Dockerfile.monitoring.lambda new file mode 100644 index 0000000..2649f9b --- /dev/null +++ b/deployment/Dockerfile.monitoring.lambda @@ -0,0 +1,29 @@ +FROM public.ecr.aws/lambda/python:3.10 + +# Set the working directory +WORKDIR ${LAMBDA_TASK_ROOT} +ENV PYTHONPATH "${PYTHONPATH}:${LAMBDA_TASK_ROOT}" + +# Environment variables +ARG RUNTIME_ENVIRONMENT +ENV RUNTIME_ENVIRONMENT=${RUNTIME_ENVIRONMENT} + +# Install necessary build tools - required to test locally +RUN yum install -y gcc python3-devel + +# Install python packages +COPY 
modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./predictions_requirements.txt +COPY modules/ml-monitoring/src/evidently/src/requirements/requirements.txt ./monitoring_requirements.txt +RUN pip install --no-cache-dir -r ./predictions_requirements.txt -r ./monitoring_requirements.txt + +# Copy the project code +COPY modules/ml-pipeline/src/pipeline ./pipeline +# Copy the monitoring code +COPY modules/ml-monitoring/src/evidently/src ./monitoring + +# Copy the handler +COPY deployment/handlers/monitoring_app.py ./pipeline/monitoring_app.py +WORKDIR ${LAMBDA_TASK_ROOT}/pipeline + + +CMD [ "monitoring_app.handler" ] diff --git a/modules/ml-monitoring/.gitignore b/modules/ml-monitoring/.gitignore index 832692f..a795b06 100644 --- a/modules/ml-monitoring/.gitignore +++ b/modules/ml-monitoring/.gitignore @@ -1 +1,2 @@ .dev_env_monitoring/ +workspace/ diff --git a/modules/ml-monitoring/Makefile b/modules/ml-monitoring/Makefile index 20767ff..6fc7b4d 100644 --- a/modules/ml-monitoring/Makefile +++ b/modules/ml-monitoring/Makefile @@ -1,9 +1,24 @@ export PYENV_ROOT=$(HOME)/.pyenv export PATH := $(PYENV_ROOT)/bin:$(PATH) PYTHON_VERSION ?= 3.10.12 +CONDA_ENV=dev_env_monitoring .PHONY: init -init: dev-pyenv +init: dev-conda + +.PHONY: dev-conda +dev-conda: + # conda deactivate || echo "Not in conda environment" + # conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously" + conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y + conda init bash + conda run -v -n ${CONDA_ENV} pip install --upgrade pip + conda run -v -n ${CONDA_ENV} pip install -r src/evidently/src/requirements/requirements-dev.txt + # conda run -vvvv -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt + conda run -v -n ${CONDA_ENV} pre-commit install + conda run -v -n ${CONDA_ENV} pip install ipykernel + echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND" + echo "conda activate ${CONDA_ENV}" .PHONY: dev-pyenv 
dev-pyenv:
diff --git a/modules/ml-monitoring/src/evidently/src/configs/settings.yaml b/modules/ml-monitoring/src/evidently/src/configs/settings.yaml
new file mode 100644
index 0000000..e69de29
diff --git a/modules/ml-monitoring/src/evidently/src/core/MLReport.py b/modules/ml-monitoring/src/evidently/src/core/MLReport.py
new file mode 100644
index 0000000..897e15a
--- /dev/null
+++ b/modules/ml-monitoring/src/evidently/src/core/MLReport.py
@@ -0,0 +1,62 @@
+"""
+Implementations of the MLReport protocol
+"""
+
+import pandas as pd
+
+from core.interface.InterfaceMLReport import MLReport
+
+
+def report_factory(report_type: str, report_config: dict | None = None) -> MLReport:
+    """
+    Select the type of reporting you require
+
+    Args:
+        report_type: Registry key of the report, "data-quality" or "regression".
+        report_config: Keyword arguments forwarded to the report class
+            constructor. Defaults to no arguments.
+
+    Returns:
+        A new instance of the report class registered under report_type.
+
+    Raises:
+        ValueError: If report_type is not registered in the factory.
+    """
+
+    if report_config is None:
+        report_config = {}
+
+    reports = {
+        "data-quality": DataQualityReport,
+        "regression": RegressionReport,
+        # Add more report types here
+    }
+
+    if report_type not in reports:
+        raise ValueError("Report type specified is not in factory")
+
+    # NOTE: neither report class defines __init__ yet, so a non-empty
+    # report_config raises TypeError here.
+    return reports[report_type](**report_config)
+
+
+class DataQualityReport:
+    """Data quality implementation of the MLReport protocol (stub)."""
+
+    def generate_report(
+        self, reference_data: pd.DataFrame, current_data: pd.DataFrame, location: str
+    ):
+        """Stub: ignores its arguments and returns the constant 1."""
+        return 1
+
+
+class RegressionReport:
+    """Regression implementation of the MLReport protocol (stub)."""
+
+    report_type = "regression"
+
+    def generate_report(
+        self, reference_data: pd.DataFrame, current_data: pd.DataFrame, location: str
+    ):
+        """Stub: ignores its arguments and returns the constant 1."""
+        return 1
diff --git a/modules/ml-monitoring/src/evidently/src/core/interface/InterfaceMLReport.py b/modules/ml-monitoring/src/evidently/src/core/interface/InterfaceMLReport.py
new file mode 100644
index 0000000..bbf7b3d
--- /dev/null
+++ b/modules/ml-monitoring/src/evidently/src/core/interface/InterfaceMLReport.py
@@ -0,0 +1,25 @@
+"""
+Interface for generating MLReports
+"""
+
+from typing import Protocol
+
+import pandas as pd
+
+
+class MLReport(Protocol):
+    """
+    Declare methods for MLReport implementation
+    """
+
+    def generate_report(
+        self, reference_data: pd.DataFrame, current_data: pd.DataFrame, location: str
+    ) -> None:
+        """
+        Create a html report
+
+        Args:
+            reference_data: Reference dataset (presumably the comparison baseline).
+            current_data: Current dataset (presumably the data being evaluated).
+            location: Where the html report goes; format not defined here - TODO confirm.
+        """
diff --git a/modules/ml-monitoring/src/evidently/src/generate_report.py b/modules/ml-monitoring/src/evidently/src/generate_report.py
new file
mode 100644
index 0000000..fcaf1e0
--- /dev/null
+++ b/modules/ml-monitoring/src/evidently/src/generate_report.py
@@ -0,0 +1,195 @@
+"""
+Create a report regarding the data quality
+
+Run as a script, this module builds an evidently workspace holding one project
+with five data drift reports and five data drift test suites.
+"""
+
+import datetime
+
+from sklearn import datasets
+
+from evidently.metrics import ColumnDriftMetric
+from evidently.metrics import ColumnSummaryMetric
+from evidently.metrics import DatasetDriftMetric
+from evidently.metrics import DatasetMissingValuesMetric
+from evidently.report import Report
+from evidently.test_preset import DataDriftTestPreset
+from evidently.test_suite import TestSuite
+from evidently.ui.dashboards import CounterAgg
+from evidently.ui.dashboards import DashboardPanelCounter
+from evidently.ui.dashboards import DashboardPanelPlot
+from evidently.ui.dashboards import PanelValue
+from evidently.ui.dashboards import PlotType
+from evidently.ui.dashboards import ReportFilter
+from evidently.ui.remote import RemoteWorkspace
+from evidently.ui.workspace import Workspace
+from evidently.ui.workspace import WorkspaceBase
+import pandas as pd
+
+# DUMMY TEST CASE
+# Both frames are read from S3 when this module is imported; the reference
+# frame keeps only its first 1000 rows.
+ref = pd.read_parquet(
+    "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet"
+).head(1000)
+cur = pd.read_parquet(
+    "s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/test_data.parquet"
+)
+
+WORKSPACE = "workspace"
+
+YOUR_PROJECT_NAME = "Data Drift Monitoring"
+YOUR_PROJECT_DESCRIPTION = "Monitoring Data for modelling process"
+
+
+def create_report(i: int):
+    """
+    Build and run the data drift report for batch ``i``
+
+    Rows ``100 * i`` up to ``100 * (i + 1)`` of ``cur`` are compared against
+    ``ref``, and the report is timestamped ``i`` days after the current time.
+    """
+    metrics = [
+        DatasetDriftMetric(),
+        DatasetMissingValuesMetric(),
+        # ColumnDriftMetric(column_name="age", stattest="wasserstein"),
+        # ColumnSummaryMetric(column_name="age"),
+        # ColumnDriftMetric(column_name="education-num", stattest="wasserstein"),
+        # ColumnSummaryMetric(column_name="education-num"),
+    ]
+    stamp = datetime.datetime.now() + datetime.timedelta(days=i)
+    report = Report(metrics=metrics, timestamp=stamp)
+
+    batch = cur.iloc[100 * i : 100 * (i + 1), :]
+    report.run(reference_data=ref, current_data=batch)
+    return report
+
+
+def create_test_suite(i: int):
+    """
+    Build and run the data drift test suite for batch ``i``
+
+    Uses the same 100-row batch of ``cur`` and the same ``i``-day timestamp
+    offset as ``create_report``.
+    """
+    stamp = datetime.datetime.now() + datetime.timedelta(days=i)
+    suite = TestSuite(tests=[DataDriftTestPreset()], timestamp=stamp)
+
+    batch = cur.iloc[100 * i : 100 * (i + 1), :]
+    suite.run(reference_data=ref, current_data=batch)
+    return suite
+
+
+def create_project(workspace: WorkspaceBase):
+    """
+    Create the monitoring project and its dashboard in ``workspace``
+
+    The project receives its description and one titled counter panel, is
+    saved, and is returned.
+    """
+    project = workspace.create_project(YOUR_PROJECT_NAME)
+    project.description = YOUR_PROJECT_DESCRIPTION
+
+    title_panel = DashboardPanelCounter(
+        filter=ReportFilter(metadata_values={}, tag_values=[]),
+        agg=CounterAgg.NONE,
+        title="Census Income Dataset (Adult)",
+    )
+    project.dashboard.add_panel(title_panel)
+    # Disabled dashboard panels, kept as templates:
+    # project.dashboard.add_panel(
+    #     DashboardPanelCounter(
+    #         title="Model Calls",
+    #         filter=ReportFilter(metadata_values={}, tag_values=[]),
+    #         value=PanelValue(
+    #             metric_id="DatasetMissingValuesMetric",
+    #             field_path=DatasetMissingValuesMetric.fields.current.number_of_rows,
+    #             legend="count",
+    #         ),
+    #         text="count",
+    #         agg=CounterAgg.SUM,
+    #         size=1,
+    #     )
+    # )
+    # project.dashboard.add_panel(
+    #     DashboardPanelCounter(
+    #         title="Share of Drifted Features",
+    #         filter=ReportFilter(metadata_values={}, tag_values=[]),
+    #         value=PanelValue(
+    #             metric_id="DatasetDriftMetric",
+    #             field_path="share_of_drifted_columns",
+    #             legend="share",
+    #         ),
+    #         text="share",
+    #         agg=CounterAgg.LAST,
+    #         size=1,
+    #     )
+    # )
+    # project.dashboard.add_panel(
+    #     DashboardPanelPlot(
+    #         title="Dataset Quality",
+    #         filter=ReportFilter(metadata_values={}, tag_values=[]),
+    #         values=[
+    #             PanelValue(metric_id="DatasetDriftMetric", field_path="share_of_drifted_columns", legend="Drift Share"),
+    #             PanelValue(
+    #                 metric_id="DatasetMissingValuesMetric",
+    #                 field_path=DatasetMissingValuesMetric.fields.current.share_of_missing_values,
+    #                 legend="Missing Values Share",
+    #             ),
+    #         ],
+    #         plot_type=PlotType.LINE,
+    #     )
+    # )
+    # project.dashboard.add_panel(
+    #     DashboardPanelPlot(
+    #         title="Age: Wasserstein drift distance",
+    #         filter=ReportFilter(metadata_values={}, tag_values=[]),
+    #         values=[
+    #             PanelValue(
+    #                 metric_id="ColumnDriftMetric",
+    #                 metric_args={"column_name.name": "age"},
+    #                 field_path=ColumnDriftMetric.fields.drift_score,
+    #                 legend="Drift Score",
+    #             ),
+    #         ],
+    #         plot_type=PlotType.BAR,
+    #         size=1,
+    #     )
+    # )
+    # project.dashboard.add_panel(
+    #     DashboardPanelPlot(
+    #         title="Education-num: Wasserstein drift distance",
+    #         filter=ReportFilter(metadata_values={}, tag_values=[]),
+    #         values=[
+    #             PanelValue(
+    #                 metric_id="ColumnDriftMetric",
+    #                 metric_args={"column_name.name": "education-num"},
+    #                 field_path=ColumnDriftMetric.fields.drift_score,
+    #                 legend="Drift Score",
+    #             ),
+    #         ],
+    #         plot_type=PlotType.BAR,
+    #         size=1,
+    #     )
+    # )
+    project.save()
+    return project
+
+
+def create_demo_project(workspace: str):
+    """
+    Populate the workspace at ``workspace`` with the demo project
+
+    Adds one report and one test suite for each ``i`` from 0 to 4.
+    """
+    ws = Workspace.create(workspace)
+    project = create_project(ws)
+
+    for batch_index in range(5):
+        ws.add_report(project.id, create_report(i=batch_index))
+        ws.add_test_suite(project.id, create_test_suite(i=batch_index))
+
+
+if __name__ == "__main__":
+    create_demo_project(WORKSPACE)
diff --git a/modules/ml-monitoring/src/evidently/src/requirements/requirements-dev.txt b/modules/ml-monitoring/src/evidently/src/requirements/requirements-dev.txt
index b5e534e..0a2d320 100644
--- a/modules/ml-monitoring/src/evidently/src/requirements/requirements-dev.txt
+++ b/modules/ml-monitoring/src/evidently/src/requirements/requirements-dev.txt
@@ -1,4 +1,3 @@
+boto3==1.28.41
 evidently==0.4.4
 pre-commit==3.3.3
-sphinx==7.2.5
-sphinx_rtd_theme==1.3.0
diff --git a/modules/ml-monitoring/src/evidently/src/requirements/requirements.txt b/modules/ml-monitoring/src/evidently/src/requirements/requirements.txt
index 6c60e50..291da8b 100644
--- a/modules/ml-monitoring/src/evidently/src/requirements/requirements.txt
+++ b/modules/ml-monitoring/src/evidently/src/requirements/requirements.txt
@@ -1,2 +1,2 @@
boto3==1.28.41 -evidently==0.4.4 +evidently==0.4.6