Implement the scoring process in the testing pipeline

2026-06-08 11:17:27 +00:00 · 2024-02-12 16:54:16 +00:00 · 2024-02-12 16:54:16 +00:00 · 1028861a1b
commit 1028861a1b
parent 7c109ebf5d
1 changed files with 73 additions and 9 deletions
--- a/etl/testing_data/sap_model_simulation.py
+++ b/etl/testing_data/sap_model_simulation.py
@ -2,9 +2,12 @@ import json

 import pandas as pd
 from tqdm import tqdm
-from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3
+from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3, save_dataframe_to_s3_parquet
 from backend.Property import Property

+# Model version identifier; this is the GitHub PR number
+MODEL_VERSION = "100"
+

 def app():
    dataset = read_dataframe_from_s3_parquet(
@ -76,6 +79,20 @@ def app():
            ~dataset["internal_insulation"])
        ]["walls_thermal_transmittance"].min()

+    ending_colums = [col for col in dataset.columns if col.endswith("_ending")]
+    # For the purpose of scoring, we want to simulate JUST the impact of the measure we're testing. We therefore
+    # need to make sure that every "_ending" column is equal to its starting value
+    column_config = {}
+    for ending_col in ending_colums:
+        base_col = ending_col.replace("_ending", "")
+        # We check if the starting column ends with _starting or is just the base col
+        if base_col + "_starting" in dataset.columns:
+            column_config[ending_col] = base_col + "_starting"
+        elif base_col in dataset.columns:
+            column_config[ending_col] = base_col
+        else:
+            raise ValueError("something went wrong")
+
    loft_insulation_testing_data = []
    solid_wall_testing_data = []
    cavity_wall_testing_data = []
@ -93,7 +110,11 @@ def app():
            (dataset["built_form"] == property_config.built_form) &
            (dataset["floor_area_quantile"] == property_config.floor_area_quantile) &
            (dataset["construction_age_band"] == property_config.construction_age_band)
-            ]
+            ].copy()
+
+        # Reset every "_ending" column to the value of its corresponding starting column
+        for col in ending_colums:
+            population[col] = population[column_config[col]]

        # 1) Loft insulation

@ -185,9 +206,9 @@ def app():
            )

            # Simulated EWI
-            best_external_wall_uvalue_wall_insulation_simulation = Property.create_recommendation_scoring_data(
+            external_wall_insulation_simulation = Property.create_recommendation_scoring_data(
                property_id=row["uprn"].values[0],
-                recommendation_record=row.copy(),
+                recommendation_record=row.copy().to_dict("records")[0],
                recommendation={
                    "recommendation_id": "external_wall_insulation",
                    "type": "external_wall_insulation",
@ -198,7 +219,7 @@ def app():

            # The iww/ewi simulations will be next to each other, so we can see how they differ for the same property
            solid_wall_testing_data.append(internal_wall_insulation_simulation)
-            solid_wall_testing_data.append(best_external_wall_uvalue_wall_insulation_simulation)
+            solid_wall_testing_data.append(external_wall_insulation_simulation)

        # 3) Cavity wall insulation
        cavity_wall_sample = population[
@ -291,9 +312,7 @@ def app():
        ]

        if not single_glazing_sample.empty:
-            row = single_glazing_sample[
-                single_glazing_sample["multi_glaze_proportion_starting"] == value
-                ].sample(1)
+            row = single_glazing_sample.sample(1)

            # For single glazed windows, we can recommend double glazing or secondary glazing

@ -456,7 +475,7 @@ def app():
    # We store all of this data in s3, as it is
    save_data_to_s3(
        bucket_name="retrofit-datalake-dev",
-        s3_file_name="sap_change_model/loft_insulation_testing_data.parquet",
+        s3_file_name="sap_change_model/simulation-pipeline-data.json",
        data=json.dumps(
            {
                "loft_insulation_testing_data": loft_insulation_testing_data,
@ -472,3 +491,48 @@ def app():
            }
        )
    )
+
+    # For each simulation type, we score against the model
+    from backend.ml_models.api import ModelApi
+    from datetime import datetime
+
+    created_at = datetime.now().isoformat()
+    model_api = ModelApi(portfolio_id="simulation-testing-pipeline", timestamp=created_at)
+    model_api.MODEL_PREFIXES = ["sap_change_predictions"]
+
+    # 1) Loft insulation
+    # We score the data in chunks of 200 rows at a time
+    loft_insulation_testing_df = pd.DataFrame(loft_insulation_testing_data)
+
+    loft_insulation_predictions = []
+    loft_to_loop_over = range(0, loft_insulation_testing_df.shape[0], 200)
+    for chunk in tqdm(loft_to_loop_over, total=len(loft_to_loop_over)):
+        loft_insulation_predictions_dict = model_api.predict_all(
+            df=loft_insulation_testing_df.iloc[chunk:chunk + 200],
+            bucket="retrofit-data-dev",
+            prediction_buckets={
+                "sap_change_predictions": "retrofit-sap-predictions-dev",
+            }
+        )
+
+        loft_insulation_predictions.append(loft_insulation_predictions_dict["sap_change_predictions"])
+
+    loft_insulation_predictions = pd.concat(loft_insulation_predictions)
+    # Store final parquet in s3
+    save_dataframe_to_s3_parquet(
+        df=loft_insulation_predictions,
+        bucket_name="retrofit-data-dev",
+        file_key=f"sap_change_model/simulation-pipeline-loft-insulation-predictions_{MODEL_VERSION}.parquet"
+    )
+
+    # We now merge the loft insulation predictions onto the scoring data and calculate exactly how much the insulation
+    # is worth
+
+    loft_insulation_comparison_df = loft_insulation_testing_df[
+        ["simulation_ending_insulation_thickness", "simulation_starting_insulation_thickness", "uprn", "id", ""]
+    ].merge(
+        loft_insulation_predictions,
+        left_on="id",
+        right_on="id",
+        how="left"
+    )