From bd131a2f663056fb46a906d8f148b2bcc06cd871 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 13 Feb 2025 22:32:31 +0000
Subject: [PATCH] preparing outputs for stonewater

---
 .../stonewater/Wave 3 Preparation.py          | 77 +++++++++++++++----
 1 file changed, 62 insertions(+), 15 deletions(-)

diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 94904aae..50dadcaf 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py	
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -2984,6 +2984,8 @@ def revised_model():
     original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
     original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str)
 
+    wave_21_folder_name = "Wave 2.1 Surveys - 2"
+
     # Check if we have all of the addresses
     missed = original_archetypes[
         ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values)
@@ -3028,7 +3030,6 @@ def revised_model():
         "10. Little Island",
         "11. CCS Dorset"
     ]
-    wave_21_folder_name = "Wave 2.1 Surveys - 2"
 
     for wave_2_1_folder in wave_21_folders:
         folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder)
@@ -3252,7 +3253,9 @@ def revised_model():
         'Main Wall Thickness', 'Main Building Alternative Wall Type',
         'Main Building Alternative Wall Insulation',
         'Main Building Alternative Wall Dry-lining',
-        'Main Building Alternative Wall Thickness', 'Main Fuel'
+        'Main Building Alternative Wall Thickness',
+        'Main Fuel',
+        'Main Building Age Band',
     ]
     # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey:
     retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns]
@@ -3795,7 +3798,8 @@ def revised_model():
                     "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
                     'SAP Band Install Package', 'Package Approved (Client)',
                     'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
-                    'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y",
+                    'Ventilation', 'Heating', 'Other Measures', 'PV System',
+                    "Asset ID.1_y",
                 ] + retrofit_assessments_data_columns_prefixed
                 ].rename(
                 columns={
@@ -3811,6 +3815,7 @@ def revised_model():
                     'Heating': 'Main Heating',
                     'Other Measures': 'Other measures',
                     'Asset ID.1_y': 'Organisation Reference',
+                    "PV System": "Solar PV",
                 }
             ),
             wates_coordination[
@@ -3818,8 +3823,7 @@ def revised_model():
                     "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
                     'SAP Band Install Package', 'Package Approved (Client)',
                     'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
-                    'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x'
-
+                    'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System"
                 ] + retrofit_assessments_data_columns_prefixed
                 ].rename(
                 columns={
@@ -3835,6 +3839,7 @@ def revised_model():
                     'Heating': 'Main Heating',
                     'Other Measures': 'Other measures',
                     'Asset ID_x': 'Organisation Reference',
+                    "PV System": "Solar PV",
                 }
             )
         ]
@@ -3857,12 +3862,12 @@ def revised_model():
 
     def find_nearest_matching_property(coordinated_packages, home):
         filter_levels = [
-            (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1),
-            (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
-            (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
-            (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4),
-            (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5),
-            (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6),
+            (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
+            (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
+            (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4),
+            (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5),
+            (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6),
+            (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7),
         ]
 
         max_confidence = max([confidence for (_, confidence) in filter_levels])
@@ -3911,12 +3916,13 @@ def revised_model():
                 {
                     "Organisation Reference": home["Organisation Reference"],
                     "Best Match Organisation Reference": m,
+                    "match_confidence": 1,
                     "Was Surveyed": True
                 } for m in survey_result["Organisation Reference"].values
             ]
             matches.extend(to_extend)
             continue
-        blah
+
         closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home)
         if closest_match is None:
             no_match.append(home["Organisation Reference"])
@@ -3926,6 +3932,7 @@ def revised_model():
             {
                 "Organisation Reference": home["Organisation Reference"],
                 "Best Match Organisation Reference": m,
+                "match_confidence": match_confidence,
                 "Was Surveyed": False
             } for m in closest_match["Organisation Reference"].values
         ]
@@ -3953,10 +3960,29 @@ def revised_model():
         suffixes=("", " - Closest Match")
     )
 
+    measures_columns = [
+        'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+        'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+        'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+        'Solar PV', 'Other measures'
+    ]
+
     # We want to aggregate the matches, when we have multiple
     aggregated_matches_df = []
     for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
+
+        measures = coordinated_packages[
+            (
+                coordinated_packages["Organisation Reference"].isin(
+                    mapped_matches['Best Match Organisation Reference'].values
+                )
+            )
+        ][measures_columns]
+
         if mapped_matches.shape[0] == 1:
+            # Get the measures for this property
+            measures = measures.squeeze()
+
             aggregated_matches_df.append(
                 {
                     "Organisation Reference": org_ref,
@@ -3965,6 +3991,7 @@ def revised_model():
                     "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0],
                     "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0],
                     "Was Surveyed": mapped_matches["Was Surveyed"].values[0],
+                    **measures
                 }
             )
             continue
@@ -3978,6 +4005,17 @@ def revised_model():
             mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[
                 0] / number_of_matches * 100
         )
+
+        measures_aggregated = {}
+        for m in measures_columns:
+            if any(~pd.isnull(measures[m])):
+                # More than one distinct non-null value across the matches: list them all
+                vals = measures[~pd.isnull(measures[m])][m].unique()
+                if len(vals) > 1:
+                    measures_aggregated[m] = ", ".join(vals)
+                else:
+                    measures_aggregated[m] = vals[0]
+
         aggregated_matches_df.append(
             {
                 "Organisation Reference": org_ref,
@@ -3985,7 +4023,8 @@ def revised_model():
                 "Proportion": proportion_with_this_epc,
                 "Estimated SAP Rating": average_rating,
                 "Estimated EPC Rating": average_epc_rating,
-                "Was Surveyed": False
+                "Was Surveyed": False,
+                **measures_aggregated
             }
         )
 
@@ -4002,7 +4041,6 @@ def revised_model():
     def remove_leading_zero(address):
         return re.sub(r"^0([1-9]) ", r"\1 ", address)
 
-    # Example usage
     mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero)
     mapped_priority_list["address1"] = np.where(
         mapped_priority_list["Organisation Reference"] == 37004,
@@ -4020,6 +4058,13 @@ def revised_model():
     )
     mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"]
 
+    # Flag rows where at least two of SAP Band, EPC Band and Estimated EPC Rating agree
+    mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = (
+        (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) |
+        (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) |
+        (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"])
+    )
+
     # Let's get the newest EPC data for these properties
     # We merge on UPRN, when we have it
     # from etl.route_march_data_pull.app import get_data
@@ -4081,6 +4126,7 @@ def revised_model():
             'Survey: Main Building Alternative Wall Dry-lining',
             'Survey: Main Building Alternative Wall Thickness',
             'Survey: Main Fuel',
+            'Survey: Main Building Age Band',
             'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type'
         ]
     ].rename(
@@ -4133,7 +4179,8 @@ def revised_model():
             [
                 "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation',
                 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness',
-                'Survey: Existing Primary Heating System',
+                'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band',
+                'Survey: Main Building Wall Area (m2)',
             ]
         ].rename(
             columns={