From c3985e2104d9acfa112ad4b0247a47755c552e97 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Wed, 27 Mar 2024 12:22:58 +0000 Subject: [PATCH] add metrics for scenarios --- .github/workflows/MLPipelinePullRequest.yml | 6 +++- .../src/pipeline/5_generate_scenarios.py | 33 ++++++++++++++++--- .../src/pipeline/configs/scenarios.yaml | 6 ++-- modules/ml-pipeline/src/pipeline/dvc.lock | 15 ++++++--- modules/ml-pipeline/src/pipeline/dvc.yaml | 1 + .../src/pipeline/metrics/.gitignore | 1 + 6 files changed, 50 insertions(+), 12 deletions(-) diff --git a/.github/workflows/MLPipelinePullRequest.yml b/.github/workflows/MLPipelinePullRequest.yml index 493aef9..8e59cc8 100644 --- a/.github/workflows/MLPipelinePullRequest.yml +++ b/.github/workflows/MLPipelinePullRequest.yml @@ -98,10 +98,14 @@ jobs: git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH} dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md - echo "## Scenario metrics" >> report.md + echo "## Scenario comparison" >> report.md cat metrics/scenario_table.md >> report.md + echo "## Scenario metrics" >> report.md + + cat metrics/scenario_metrics.md >> report.md + cml comment create report.md # echo "## Residuals plot from model" >> report.md diff --git a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py index 28bcb9d..9d2fa68 100644 --- a/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py +++ b/modules/ml-pipeline/src/pipeline/5_generate_scenarios.py @@ -8,9 +8,11 @@ import os import pandas as pd from core.interface.InterfaceModels import MLModel from core.interface.InterfaceDataClient import DataClient +from core.interface.InterfaceMetrics import MLMetrics from configs.post_prediction_logic import post_prediction_logic from core.DataClient import dataclient_factory from core.MLModels import model_factory +from core.MLMetrics import metrics_factory from core.Logger import logger from config import settings @@ -30,7 +32,8 @@ model_filepath = build_model_params["model_save_filepath"] target = feature_process_params["feature_processor_config"]["target"] scenario_data_filepaths = scenarios_params["scenario_data_filepaths"] predictions_column_name = generate_predictions_params["predictions_column_name"] -output_filepath = scenarios_params["output_filepath"] +comparison_output_filepath = scenarios_params["comparison_output_filepath"] +metrics_output_filepath = scenarios_params["metrics_output_filepath"] logger.info(f"--- Initiate MLModel ---") @@ -51,15 +54,21 @@ output_dataclient = dataclient_factory( dataclient_config=client_params[output_dataclient_type], ) +logger.info(f"--- Initiate MLMetrics ---") + +metrics = metrics_factory(generate_metrics_params["metrics_type"]) + def generate_scenario_predictions( input_dataclient: DataClient, output_dataclient: DataClient, model: MLModel, + metrics: MLMetrics, model_filepath: str, scenario_data_filepaths: list, predictions_column_name: str, - output_filepath: str, + comparison_output_filepath: str, + metrics_output_filepath: str, ): """ Given the new model, we generate prediction for expected scenarios @@ -98,16 +107,30 @@ def generate_scenario_predictions( scenario_data[predictions_column_name] - scenario_data["sap_starting"] ) + logger.info("--- Generate Metrics ---") + + metrics_dict = metrics.generate_metrics( + scenario_data["impact"], scenario_data["predicted_impact"] + ) + + metrics_df = pd.DataFrame(metrics_dict, index=[0]).T.reset_index() + metrics_df.columns = ["metric", "value"] + logger.info("--- Save prediction into metrics ---") output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]] output_dataclient.save_data( - obj=output_df, location=output_filepath, save_config=None + obj=output_df, location=comparison_output_filepath, save_config=None + ) + + output_dataclient.save_data( + obj=metrics_df, location=metrics_output_filepath, save_config=None ) if __name__ == "__main__": + logger.info(f"--- {__file__} - Start! ---") logger.info(f"--- Generate Scenario Predictions ---") @@ -116,10 +139,12 @@ if __name__ == "__main__": input_dataclient=input_dataclient, output_dataclient=output_dataclient, model=model, + metrics=metrics, model_filepath=model_filepath, scenario_data_filepaths=scenario_data_filepaths, predictions_column_name=predictions_column_name, - output_filepath=output_filepath, + comparison_output_filepath=comparison_output_filepath, + metrics_output_filepath=metrics_output_filepath, ) logger.info(f"--- {__file__} - Complete! ---") diff --git a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml index e76336a..2df0cb6 100644 --- a/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml +++ b/modules/ml-pipeline/src/pipeline/configs/scenarios.yaml @@ -4,5 +4,7 @@ default: output_dataclient_type: local scenario_data_filepaths: # - s3://retrofit-data-dev/scenario_data/22-03-2024-19-20-09/recommendations_scoring_data.parquet - - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet - output_filepath: ./metrics/scenario_table.md + # - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet + - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + comparison_output_filepath: ./metrics/scenario_table.md + metrics_output_filepath: ./metrics/scenario_metrics.md diff --git a/modules/ml-pipeline/src/pipeline/dvc.lock b/modules/ml-pipeline/src/pipeline/dvc.lock index d6bce15..104dc83 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.lock +++ b/modules/ml-pipeline/src/pipeline/dvc.lock @@ -208,18 +208,23 @@ stages: deps: - path: 5_generate_scenarios.py hash: md5 - md5: 30f80ffeb6ee50c5f7b82943a4dc7702 - size: 4014 + md5: a18f6c6ae2082f038df47386cf3e418e + size: 4896 params: configs/scenarios.yaml: default.scenarios: input_dataclient_type: aws-s3 output_dataclient_type: local scenario_data_filepaths: - - s3://retrofit-data-dev/scenario_data/24-03-2024-20-23-25/recommendations_scoring_data.parquet - output_filepath: ./metrics/scenario_table.md + - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet + comparison_output_filepath: ./metrics/scenario_table.md + metrics_output_filepath: ./metrics/scenario_metrics.md outs: + - path: metrics/scenario_metrics.md + hash: md5 + md5: 64e7db945ff655ae03c20c9845f19106 + size: 363 - path: metrics/scenario_table.md hash: md5 - md5: 54856c66fca8b2ebd1fa4dea2d25734a + md5: d4f8afe07b774374aeaa48f1b7b8a5fc size: 2133 diff --git a/modules/ml-pipeline/src/pipeline/dvc.yaml b/modules/ml-pipeline/src/pipeline/dvc.yaml index 5ce35ce..6026a83 100644 --- a/modules/ml-pipeline/src/pipeline/dvc.yaml +++ b/modules/ml-pipeline/src/pipeline/dvc.yaml @@ -80,6 +80,7 @@ stages: - default.scenarios outs: - metrics/scenario_table.md + - metrics/scenario_metrics.md always_changed: true metrics: - metrics/metrics.json diff --git a/modules/ml-pipeline/src/pipeline/metrics/.gitignore b/modules/ml-pipeline/src/pipeline/metrics/.gitignore index 189c2ee..6427764 100644 --- a/modules/ml-pipeline/src/pipeline/metrics/.gitignore +++ b/modules/ml-pipeline/src/pipeline/metrics/.gitignore @@ -1,3 +1,4 @@ /fit_metrics.json /metrics.json /scenario_table.md +/scenario_metrics.md