try the scenario cml

This commit is contained in:
Michael Duong 2024-02-16 16:43:23 +00:00
parent 6e76716fbc
commit 2221283de4
9 changed files with 205 additions and 26 deletions

View file

@ -98,6 +98,10 @@ jobs:
# Fetch the target branch so DVC can diff the current metrics against it
git fetch --depth=1 origin ${TARGET_BRANCH}:${TARGET_BRANCH}
dvc metrics diff --md --all ${TARGET_BRANCH} >> report.md
# Append (>>) instead of truncating (>): a plain ">" here would wipe the
# metrics diff that was just written to report.md
echo "## Scenario metrics" >> report.md
# Path matches scenarios.output_filepath (./metrics/scenario_table.md), which is
# also the "outs" entry of the scenario stage in dvc.yaml
cat metrics/scenario_table.md >> report.md
cml comment create report.md
# echo "## Residuals plot from model" >> report.md

View file

@ -0,0 +1,125 @@
"""
Fifth part of the pipeline:
After the model is built and metrics are generated,
we want to test this model against known scenarios
"""
import os
import pandas as pd
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
from configs.post_prediction_logic import post_prediction_logic
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.Logger import logger
from config import settings
# NOTE: everything in this section runs at import time, so importing this module
# reads the settings and builds the model object and both data clients.
logger.info("--- Initiate Parameters ---")
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")

# One settings section per pipeline step. Not every section is used by the
# visible code in this script (prepare_data / generate_metrics are only read here).
client_params = settings.client
prepare_data_params = settings.prepare_data
build_model_params = settings.build_model
generate_predictions_params = settings.generate_predictions
generate_metrics_params = settings.generate_metrics
feature_process_params = settings.feature_processor
scenarios_params = settings.scenarios

# Individual values pulled out of the sections above
model_filepath = build_model_params["model_save_filepath"]
target = feature_process_params["feature_processor_config"]["target"]
scenario_data_filepaths = scenarios_params["scenario_data_filepaths"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
output_filepath = scenarios_params["output_filepath"]

logger.info("--- Initiate MLModel ---")
model = model_factory(build_model_params["model_type"])

logger.info("--- Initiate DataClient ---")
# Use data client for input and output, as we use dvc to cache later to the cloud
# The configured client type doubles as the key into the client settings section.
input_dataclient_type = scenarios_params["input_dataclient_type"]
input_dataclient = dataclient_factory(
    dataclient_type=input_dataclient_type,
    dataclient_config=client_params[input_dataclient_type],
)
output_dataclient_type = scenarios_params["output_dataclient_type"]
output_dataclient = dataclient_factory(
    dataclient_type=output_dataclient_type,
    dataclient_config=client_params[output_dataclient_type],
)
def generate_scenario_predictions(
    input_dataclient: DataClient,
    output_dataclient: DataClient,
    model: MLModel,
    model_filepath: str,
    scenario_data_filepaths: list,
    predictions_column_name: str,
    output_filepath: str,
):
    """
    Given the new model, generate predictions for the expected scenarios and
    save a summary table of expected vs predicted impact.

    Args:
        input_dataclient: client used to load every scenario data file.
        output_dataclient: client used to save the resulting table.
        model: model object; its saved state is loaded from ``model_filepath``.
        model_filepath: location the model is loaded from.
        scenario_data_filepaths: locations of the scenario data files; all
            files are stacked row-wise into a single frame.
        predictions_column_name: name given to the column of predictions.
        output_filepath: location the output table is saved to.

    Raises:
        ValueError: if ``scenario_data_filepaths`` is empty.

    The scenario data must provide the columns ``sap_starting``, ``uprn``,
    ``id`` and ``impact``. The stacked data is re-indexed 0..n-1.
    """
    logger.info("--- Loading Scenario Data ---")
    # Can have multiple scenario data files
    scenario_frames = [
        input_dataclient.load_data(scenario_data_filepath, load_config=None)
        for scenario_data_filepath in scenario_data_filepaths
    ]
    if not scenario_frames:
        raise ValueError(
            "scenario_data_filepaths is empty: no scenario data to score"
        )
    # ignore_index gives the stacked frame a unique 0..n-1 index. Without it each
    # file keeps its own labels, so several files produce duplicate labels and
    # the column-wise concat with the predictions below fails or misaligns.
    scenario_data = pd.concat(scenario_frames, ignore_index=True)

    logger.info("--- Loading Model ---")
    model.load_model(model_filepath)

    logger.info("--- Generating Predictions ---")
    predictions = model.predict(
        data=scenario_data, post_prediction_logic=post_prediction_logic
    )

    logger.info("--- Generate Scenario Predicted Impact ---")
    # NOTE(review): assumes predict() returns one value per input row, either
    # positional or indexed like the input data - confirm against MLModel.
    predictions_df = pd.DataFrame(predictions)
    predictions_df.columns = [predictions_column_name]
    scenario_data = pd.concat([scenario_data, predictions_df], axis=1)
    # Predicted impact is the absolute change from the starting SAP value
    scenario_data["predicted_impact"] = (
        scenario_data[predictions_column_name] - scenario_data["sap_starting"]
    ).abs()

    logger.info("--- Save prediction into metrics ---")
    output_df = scenario_data[["uprn", "id", "impact", "predicted_impact"]]
    output_dataclient.save_data(
        obj=output_df, location=output_filepath, save_config=None
    )
if __name__ == "__main__":
    # Script entry point: score the scenarios using the objects and settings
    # prepared at module level, logging the start and end of the run.
    script_path = __file__
    logger.info(f"--- {script_path} - Start! ---")
    logger.info("--- Generate Scenario Predictions ---")
    scenario_run_kwargs = {
        "input_dataclient": input_dataclient,
        "output_dataclient": output_dataclient,
        "model": model,
        "model_filepath": model_filepath,
        "scenario_data_filepaths": scenario_data_filepaths,
        "predictions_column_name": predictions_column_name,
        "output_filepath": output_filepath,
    }
    generate_scenario_predictions(**scenario_run_kwargs)
    logger.info(f"--- {script_path} - Complete! ---")

View file

@ -7,6 +7,7 @@ settings = Dynaconf(
"./configs/settings.yaml",
"./configs/build_model.yaml",
"./configs/analysis.yaml",
"./configs/scenarios.yaml",
],
)

View file

@ -14,7 +14,7 @@ default:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 4000
time_limit: 60
presets: medium_quality
excluded_model_types: ['RF', 'NN_TORCH', 'KNN', 'XT', 'CAT', 'FASTAI']
infer_limit: 0.05

View file

@ -0,0 +1,9 @@
# Settings for the scenario prediction step (read as settings.scenarios by the
# scenario script).
default:
scenarios:
# Data client type used to load the scenario data; also the key looked up in
# the client settings to configure that client
input_dataclient_type: aws-s3
# Data client type used to save the output table; looked up the same way
output_dataclient_type: local
# One or more scenario data files; the script loads each one and stacks them
scenario_data_filepaths:
[
s3://retrofit-data-dev/scenario_data/recommendations_scoring_data.parquet,
]
# Where the resulting table is saved
output_filepath: ./metrics/scenario_table.md

View file

@ -245,7 +245,8 @@ class LocalClient:
save_methods = {
".parquet": self._save_parquet,
".json": self._save_json
".json": self._save_json,
".md": self._save_md,
# "": _save_directory(**save_config),
# ADD MORE save_methods HERE
}
@ -294,3 +295,10 @@ class LocalClient:
# Write the contents of the buffer to the local file
with open(location, "wb") as f:
f.write(buffer.getvalue())
def _save_md(self, obj: pd.DataFrame, location: str, save_config: dict):
    """
    Save object as markdown

    obj: DataFrame to write out as a markdown table
    location: path of the file to write
    save_config: extra keyword arguments forwarded to DataFrame.to_markdown;
        None is treated as "no extra arguments"
    """
    # Callers may pass save_config=None, and ** unpacking requires a mapping,
    # so fall back to an empty dict instead of raising TypeError
    obj.to_markdown(location, **(save_config or {}))

View file

@ -31,8 +31,8 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: 8f0f5481075094460ab852ace2fa9b7a.dir
size: 43692138
md5: 86d085385f7e170d951e95d5e9d0f0bc.dir
size: 43684784
nfiles: 2
build_model:
cmd: python 2_build_model.py
@ -43,8 +43,8 @@ stages:
size: 4820
- path: data/prepared_data
hash: md5
md5: 8f0f5481075094460ab852ace2fa9b7a.dir
size: 43692138
md5: 86d085385f7e170d951e95d5e9d0f0bc.dir
size: 43684784
nfiles: 2
params:
configs/build_model.yaml:
@ -61,7 +61,7 @@ stages:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error
time_limit: 4000
time_limit: 60
presets: medium_quality
excluded_model_types:
- RF
@ -75,17 +75,17 @@ stages:
outs:
- path: data/fit_predictions/
hash: md5
md5: e2a05a84a14d35516a6cda8e0a1e963c.dir
size: 3681005
md5: 69cbcceee3e360e0040a7c45ed72ef7f.dir
size: 3674358
nfiles: 1
- path: data/model/
hash: md5
md5: 7b0382d001ed2bd7aec5c8112f69d129.dir
size: 793365790
nfiles: 30
md5: 09757210fdbaa9ad216a84285cf1cbf2.dir
size: 353975267
nfiles: 21
- path: metrics/fit_metrics.json
hash: md5
md5: bcfd8d3bd3af858fa3dc26433bc8cd9e
md5: 69be95e8d60eb7cef41ec1e69fa9d2ce
size: 224
generate_predictions:
cmd: python 3_generate_predictions.py
@ -96,13 +96,13 @@ stages:
size: 2464
- path: data/model
hash: md5
md5: 7b0382d001ed2bd7aec5c8112f69d129.dir
size: 793365790
nfiles: 30
md5: 09757210fdbaa9ad216a84285cf1cbf2.dir
size: 353975267
nfiles: 21
- path: data/prepared_data
hash: md5
md5: 8f0f5481075094460ab852ace2fa9b7a.dir
size: 43692138
md5: 86d085385f7e170d951e95d5e9d0f0bc.dir
size: 43684784
nfiles: 2
params:
configs/settings.yaml:
@ -114,8 +114,8 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 90b5275b5d9829a42573ade3f5a025d2.dir
size: 648526
md5: 2a0421436d59d95e52a51571c34e0ce9.dir
size: 647012
nfiles: 1
generate_metrics:
cmd: python 4_generate_metrics.py
@ -126,13 +126,13 @@ stages:
size: 3484
- path: data/predictions
hash: md5
md5: 90b5275b5d9829a42573ade3f5a025d2.dir
size: 648526
md5: 2a0421436d59d95e52a51571c34e0ce9.dir
size: 647012
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 8f0f5481075094460ab852ace2fa9b7a.dir
size: 43692138
md5: 86d085385f7e170d951e95d5e9d0f0bc.dir
size: 43684784
nfiles: 2
params:
configs/settings.yaml:
@ -142,8 +142,8 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: be48389ba2755e6c18e41243aaa9bb81
size: 226
md5: 83698142cedb9fb4df5ab82f408690a2
size: 222
startup_cleanup:
cmd: python 0_startup_cleanup.py
deps:
@ -155,3 +155,23 @@ stages:
configs/settings.yaml:
default.startup_cleanup.artefacts: ./data
default.startup_cleanup.metrics: ./metrics
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- path: 5_generate_scenarios.py
hash: md5
md5: 30f80ffeb6ee50c5f7b82943a4dc7702
size: 4014
params:
configs/scenarios.yaml:
default.scenarios:
input_dataclient_type: aws-s3
output_dataclient_type: local
scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/recommendations_scoring_data.parquet
output_filepath: ./metrics/scenario_table.md
outs:
- path: metrics/scenario_table.md
hash: md5
md5: 36b1b26224ebbbfd5b2bbb15ae173247
size: 1648

View file

@ -71,6 +71,17 @@ stages:
outs:
- metrics/metrics.json
always_changed: true
# Stage that runs the scenario prediction script and tracks the markdown table
# it writes.
# NOTE(review): the stage name spells "scenerio"; dvc.lock records the same
# spelling, so rename both together if this is corrected.
generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py
deps:
- 5_generate_scenarios.py
params:
- configs/scenarios.yaml:
- default.scenarios
outs:
- metrics/scenario_table.md
# Stage is always considered changed, so it re-runs on every reproduction.
# NOTE(review): presumably set because the model and the scenario data are
# not listed under deps - confirm.
always_changed: true
metrics:
- metrics/metrics.json
- metrics/fit_metrics.json
- metrics/scenario_table.md

View file

@ -1,2 +1,3 @@
/fit_metrics.json
/metrics.json
/scenario_table.md