From 1028861a1b0f8e9bb4a117335150923d6101f222 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 12 Feb 2024 16:54:16 +0000 Subject: [PATCH] implementing the scoring process into testing pipeline --- etl/testing_data/sap_model_simulation.py | 82 +++++++++++++++++++++--- 1 file changed, 73 insertions(+), 9 deletions(-) diff --git a/etl/testing_data/sap_model_simulation.py b/etl/testing_data/sap_model_simulation.py index 6ff89691..0434ffd5 100644 --- a/etl/testing_data/sap_model_simulation.py +++ b/etl/testing_data/sap_model_simulation.py @@ -2,9 +2,12 @@ import json import pandas as pd from tqdm import tqdm -from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3 +from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3, save_dataframe_to_s3_parquet from backend.Property import Property +# This is the github pr number +MODEL_VERSION = "100" + def app(): dataset = read_dataframe_from_s3_parquet( @@ -76,6 +79,20 @@ def app(): ~dataset["internal_insulation"]) ]["walls_thermal_transmittance"].min() + ending_colums = [col for col in dataset.columns if col.endswith("_ending")] + # For the purpose of scoring, we want to simulate JUST the impact of the measure we're testing. We therefore + # need to make sure that every "_ending" column is equal to its starting value + column_config = {} + for ending_col in ending_colums: + base_col = ending_col.replace("_ending", "") + # We check if the starting column ends with _starting or is just the base col + if base_col + "_starting" in dataset.columns: + column_config[ending_col] = base_col + "_starting" + elif base_col in dataset.columns: + column_config[ending_col] = base_col + else: + raise ValueError("something went wrong") + loft_insulation_testing_data = [] solid_wall_testing_data = [] cavity_wall_testing_data = [] @@ -93,7 +110,11 @@ def app(): (dataset["built_form"] == property_config.built_form) & (dataset["floor_area_quantile"] == property_config.floor_area_quantile) & (dataset["construction_age_band"] == property_config.construction_age_band) - ] + ].copy() + + # Re-set all of the ending columns + for col in ending_colums: + population[col] = population[column_config[col]] # 1) Loft insulation @@ -185,9 +206,9 @@ def app(): ) # Simulated EWI - best_external_wall_uvalue_wall_insulation_simulation = Property.create_recommendation_scoring_data( + external_wall_insulation_simulation = Property.create_recommendation_scoring_data( property_id=row["uprn"].values[0], - recommendation_record=row.copy(), + recommendation_record=row.copy().to_dict("records")[0], recommendation={ "recommendation_id": "external_wall_insulation", "type": "external_wall_insulation", @@ -198,7 +219,7 @@ def app(): # The iww/ewi simulations will be next to each other, so we can see how they differ for the same property solid_wall_testing_data.append(internal_wall_insulation_simulation) - solid_wall_testing_data.append(best_external_wall_uvalue_wall_insulation_simulation) + solid_wall_testing_data.append(external_wall_insulation_simulation) # 3) Cavity wall insulation cavity_wall_sample = population[ @@ -291,9 +312,7 @@ def app(): ] if not single_glazing_sample.empty: - row = single_glazing_sample[ - single_glazing_sample["multi_glaze_proportion_starting"] == value - ].sample(1) + row = single_glazing_sample.sample(1) # For single glazed windows, we can recommend double glazing or secondary glazing @@ -456,7 +475,7 @@ def app(): # We store all of this data in s3, as it is save_data_to_s3( bucket_name="retrofit-datalake-dev", - s3_file_name="sap_change_model/loft_insulation_testing_data.parquet", + s3_file_name="sap_change_model/simulation-pipeline-data.json", data=json.dumps( { "loft_insulation_testing_data": loft_insulation_testing_data, @@ -472,3 +491,48 @@ def app(): } ) ) + + # For each simulation type, we score against the model + from backend.ml_models.api import ModelApi + from datetime import datetime + + created_at = datetime.now().isoformat() + model_api = ModelApi(portfolio_id="simulation-testing-pipeline", timestamp=created_at) + model_api.MODEL_PREFIXES = ["sap_change_predictions"] + + # 1) Loft insulation + # We chunk up the data into 200 rows + loft_insulation_testing_df = pd.DataFrame(loft_insulation_testing_data) + + loft_insulation_predictions = [] + loft_to_loop_over = range(0, loft_insulation_testing_df.shape[0], 200) + for chunk in tqdm(loft_to_loop_over, total=len(loft_to_loop_over)): + loft_insulation_predictions_dict = model_api.predict_all( + df=loft_insulation_testing_df.iloc[chunk:chunk + 200], + bucket="retrofit-data-dev", + prediction_buckets={ + "sap_change_predictions": "retrofit-sap-predictions-dev", + } + ) + + loft_insulation_predictions.append(loft_insulation_predictions_dict["sap_change_predictions"]) + + loft_insulation_predictions = pd.concat(loft_insulation_predictions) + # Store final parquet in s3 + save_dataframe_to_s3_parquet( + df=loft_insulation_predictions, + bucket_name="retrofit-data-dev", + file_key=f"sap_change_model/simulation-pipeline-loft-insulation-predictions_{MODEL_VERSION}.parquet" + ) + + # We now merge the loft insulation predictions onto the scoring data and calculate exactly how much the insulation + # is worth + + loft_insulation_comparison_df = loft_insulation_testing_df[ + ["simulation_ending_insulation_thickness", "simulation_starting_insulation_thickness", "uprn", "id", ""] + ].merge( + loft_insulation_predictions, + left_on="id", + right_on="id", + how="left" + )