From 1028861a1b0f8e9bb4a117335150923d6101f222 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 12 Feb 2024 16:54:16 +0000
Subject: [PATCH] implementing the scoring process into testing pipeline

---
 etl/testing_data/sap_model_simulation.py | 82 +++++++++++++++++++++---
 1 file changed, 73 insertions(+), 9 deletions(-)

diff --git a/etl/testing_data/sap_model_simulation.py b/etl/testing_data/sap_model_simulation.py
index 6ff89691..0434ffd5 100644
--- a/etl/testing_data/sap_model_simulation.py
+++ b/etl/testing_data/sap_model_simulation.py
@@ -2,9 +2,12 @@ import json
 
 import pandas as pd
 from tqdm import tqdm
-from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3
+from utils.s3 import read_dataframe_from_s3_parquet, save_data_to_s3, save_dataframe_to_s3_parquet
 from backend.Property import Property
 
+# This is the github pr number
+MODEL_VERSION = "100"
+
 
 def app():
     dataset = read_dataframe_from_s3_parquet(
@@ -76,6 +79,20 @@ def app():
             ~dataset["internal_insulation"])
         ]["walls_thermal_transmittance"].min()
 
+    ending_colums = [col for col in dataset.columns if col.endswith("_ending")]
+    # For the purpose of scoring, we want to simulate JUST the impact of the measure we're testing. We therefore
+    # need to make sure that every "_ending" column is equal to its starting value
+    column_config = {}
+    for ending_col in ending_colums:
+        base_col = ending_col.replace("_ending", "")
+        # We check if the starting column ends with _starting or is just the base col
+        if base_col + "_starting" in dataset.columns:
+            column_config[ending_col] = base_col + "_starting"
+        elif base_col in dataset.columns:
+            column_config[ending_col] = base_col
+        else:
+            raise ValueError("something went wrong")
+
     loft_insulation_testing_data = []
     solid_wall_testing_data = []
     cavity_wall_testing_data = []
@@ -93,7 +110,11 @@ def app():
             (dataset["built_form"] == property_config.built_form) &
             (dataset["floor_area_quantile"] == property_config.floor_area_quantile) &
             (dataset["construction_age_band"] == property_config.construction_age_band)
-            ]
+            ].copy()
+
+        # Re-set all of the ending columns
+        for col in ending_colums:
+            population[col] = population[column_config[col]]
 
         # 1) Loft insulation
 
@@ -185,9 +206,9 @@ def app():
             )
 
             # Simulated EWI
-            best_external_wall_uvalue_wall_insulation_simulation = Property.create_recommendation_scoring_data(
+            external_wall_insulation_simulation = Property.create_recommendation_scoring_data(
                 property_id=row["uprn"].values[0],
-                recommendation_record=row.copy(),
+                recommendation_record=row.copy().to_dict("records")[0],
                 recommendation={
                     "recommendation_id": "external_wall_insulation",
                     "type": "external_wall_insulation",
@@ -198,7 +219,7 @@ def app():
 
             # The iww/ewi simulations will be next to each other, so we can see how they differ for the same property
             solid_wall_testing_data.append(internal_wall_insulation_simulation)
-            solid_wall_testing_data.append(best_external_wall_uvalue_wall_insulation_simulation)
+            solid_wall_testing_data.append(external_wall_insulation_simulation)
 
         # 3) Cavity wall insulation
         cavity_wall_sample = population[
@@ -291,9 +312,7 @@ def app():
         ]
 
         if not single_glazing_sample.empty:
-            row = single_glazing_sample[
-                single_glazing_sample["multi_glaze_proportion_starting"] == value
-                ].sample(1)
+            row = single_glazing_sample.sample(1)
 
             # For single glazed windows, we can recommend double glazing or secondary glazing
 
@@ -456,7 +475,7 @@ def app():
     # We store all of this data in s3, as it is
     save_data_to_s3(
         bucket_name="retrofit-datalake-dev",
-        s3_file_name="sap_change_model/loft_insulation_testing_data.parquet",
+        s3_file_name="sap_change_model/simulation-pipeline-data.json",
         data=json.dumps(
             {
                 "loft_insulation_testing_data": loft_insulation_testing_data,
@@ -472,3 +491,48 @@ def app():
             }
         )
     )
+
+    # For each simulation type, we score against the model
+    from backend.ml_models.api import ModelApi
+    from datetime import datetime
+
+    created_at = datetime.now().isoformat()
+    model_api = ModelApi(portfolio_id="simulation-testing-pipeline", timestamp=created_at)
+    model_api.MODEL_PREFIXES = ["sap_change_predictions"]
+
+    # 1) Loft insulation
+    # We chunk up the data into 200 rows
+    loft_insulation_testing_df = pd.DataFrame(loft_insulation_testing_data)
+
+    loft_insulation_predictions = []
+    loft_to_loop_over = range(0, loft_insulation_testing_df.shape[0], 200)
+    for chunk in tqdm(loft_to_loop_over, total=len(loft_to_loop_over)):
+        loft_insulation_predictions_dict = model_api.predict_all(
+            df=loft_insulation_testing_df.iloc[chunk:chunk + 200],
+            bucket="retrofit-data-dev",
+            prediction_buckets={
+                "sap_change_predictions": "retrofit-sap-predictions-dev",
+            }
+        )
+
+        loft_insulation_predictions.append(loft_insulation_predictions_dict["sap_change_predictions"])
+
+    loft_insulation_predictions = pd.concat(loft_insulation_predictions)
+    # Store final parquet in s3
+    save_dataframe_to_s3_parquet(
+        df=loft_insulation_predictions,
+        bucket_name="retrofit-data-dev",
+        file_key=f"sap_change_model/simulation-pipeline-loft-insulation-predictions_{MODEL_VERSION}.parquet"
+    )
+
+    # We now merge the loft insulation predictions onto the scoring data and calculate exactly how much the insulation
+    # is worth
+
+    loft_insulation_comparison_df = loft_insulation_testing_df[
+        ["simulation_ending_insulation_thickness", "simulation_starting_insulation_thickness", "uprn", "id", ""]
+    ].merge(
+        loft_insulation_predictions,
+        left_on="id",
+        right_on="id",
+        how="left"
+    )