preparing outputs for stonewater

2026-07-27 23:35:01 +00:00 · 2025-02-13 22:32:31 +00:00 · 2025-02-13 22:32:31 +00:00 · bd131a2f66
commit bd131a2f66
parent b8a094106c
1 changed file with 62 additions and 15 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -2984,6 +2984,8 @@ def revised_model():
    original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
    original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str)

+    wave_21_folder_name = "Wave 2.1 Surveys - 2"
+
    # Check if we have all of the addresses
    missed = original_archetypes[
        ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values)
@ -3028,7 +3030,6 @@ def revised_model():
        "10. Little Island",
        "11. CCS Dorset"
    ]
-    wave_21_folder_name = "Wave 2.1 Surveys - 2"

    for wave_2_1_folder in wave_21_folders:
        folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder)
@ -3252,7 +3253,9 @@ def revised_model():
        'Main Wall Thickness', 'Main Building Alternative Wall Type',
        'Main Building Alternative Wall Insulation',
        'Main Building Alternative Wall Dry-lining',
-        'Main Building Alternative Wall Thickness', 'Main Fuel'
+        'Main Building Alternative Wall Thickness',
+        'Main Fuel',
+        'Main Building Age Band',
    ]
    # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey:
    retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns]
@ -3795,7 +3798,8 @@ def revised_model():
                    "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
                    'SAP Band Install Package', 'Package Approved (Client)',
                    'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
-                    'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y",
+                    'Ventilation', 'Heating', 'Other Measures', 'PV System',
+                    "Asset ID.1_y",
                ] + retrofit_assessments_data_columns_prefixed
                ].rename(
                columns={
@ -3811,6 +3815,7 @@ def revised_model():
                    'Heating': 'Main Heating',
                    'Other Measures': 'Other measures',
                    'Asset ID.1_y': 'Organisation Reference',
+                    "PV System": "Solar PV",
                }
            ),
            wates_coordination[
@ -3818,8 +3823,7 @@ def revised_model():
                    "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
                    'SAP Band Install Package', 'Package Approved (Client)',
                    'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
-                    'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x'
-
+                    'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System"
                ] + retrofit_assessments_data_columns_prefixed
                ].rename(
                columns={
@ -3835,6 +3839,7 @@ def revised_model():
                    'Heating': 'Main Heating',
                    'Other Measures': 'Other measures',
                    'Asset ID_x': 'Organisation Reference',
+                    "PV System": "Solar PV",
                }
            )
        ]
@ -3857,12 +3862,12 @@ def revised_model():

    def find_nearest_matching_property(coordinated_packages, home):
        filter_levels = [
-            (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 1),
-            (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
-            (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
-            (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 4),
-            (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 5),
-            (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 6),
+            (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
+            (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
+            (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4),
+            (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5),
+            (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6),
+            (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7),
        ]

        max_confidence = max([confidence for (_, confidence) in filter_levels])
@ -3911,12 +3916,13 @@ def revised_model():
                {
                    "Organisation Reference": home["Organisation Reference"],
                    "Best Match Organisation Reference": m,
+                    "match_confidence": 1,
                    "Was Surveyed": True
                } for m in survey_result["Organisation Reference"].values
            ]
            matches.extend(to_extend)
            continue
-        blah
+
        closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home)
        if closest_match is None:
            no_match.append(home["Organisation Reference"])
@ -3926,6 +3932,7 @@ def revised_model():
            {
                "Organisation Reference": home["Organisation Reference"],
                "Best Match Organisation Reference": m,
+                "match_confidence": match_confidence,
                "Was Surveyed": False
            } for m in closest_match["Organisation Reference"].values
        ]
@ -3953,10 +3960,29 @@ def revised_model():
        suffixes=("", " - Closest Match")
    )

+    measures_columns = [
+        'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+        'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+        'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+        'Solar PV', 'Other measures'
+    ]
+
    # We want to aggregate the matches, when we have multiple
    aggregated_matches_df = []
    for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
+
+        measures = coordinated_packages[
+            (
+                coordinated_packages["Organisation Reference"].isin(
+                    mapped_matches['Best Match Organisation Reference'].values
+                )
+            )
+        ][measures_columns]
+
        if mapped_matches.shape[0] == 1:
+            # Get the measures for this property
+            measures = measures.squeeze()
+
            aggregated_matches_df.append(
                {
                    "Organisation Reference": org_ref,
@ -3965,6 +3991,7 @@ def revised_model():
                    "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0],
                    "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0],
                    "Was Surveyed": mapped_matches["Was Surveyed"].values[0],
+                    **measures
                }
            )
            continue
@ -3978,6 +4005,17 @@ def revised_model():
            mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[
                0] / number_of_matches * 100
        )
+
+        measures_aggregated = {}
+        for m in measures_columns:
+            if any(~pd.isnull(measures[m])):
+                # Check if we have 2 unique values
+                vals = measures[~pd.isnull(measures[m])][m].unique()
+                if len(vals) > 1:
+                    measures_aggregated[m] = ", ".join(vals)
+                else:
+                    measures_aggregated[m] = vals[0]
+
        aggregated_matches_df.append(
            {
                "Organisation Reference": org_ref,
@ -3985,7 +4023,8 @@ def revised_model():
                "Proportion": proportion_with_this_epc,
                "Estimated SAP Rating": average_rating,
                "Estimated EPC Rating": average_epc_rating,
-                "Was Surveyed": False
+                "Was Surveyed": False,
+                **measures_aggregated
            }
        )

@ -4002,7 +4041,6 @@ def revised_model():
    def remove_leading_zero(address):
        return re.sub(r"^0([1-9]) ", r"\1 ", address)

-    # Example usage
    mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero)
    mapped_priority_list["address1"] = np.where(
        mapped_priority_list["Organisation Reference"] == 37004,
@ -4020,6 +4058,13 @@ def revised_model():
    )
    mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"]

+    # Flag where 2 out of the three columns have consensus
+    mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = (
+        (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) |
+        (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) |
+        (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"])
+    )
+
    # Let's get the newest EPC data for these properties
    # We merge on UPRN, when we have it
    # from etl.route_march_data_pull.app import get_data
@ -4081,6 +4126,7 @@ def revised_model():
            'Survey: Main Building Alternative Wall Dry-lining',
            'Survey: Main Building Alternative Wall Thickness',
            'Survey: Main Fuel',
+            'Survey: Main Building Age Band',
            'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type'
        ]
    ].rename(
@ -4133,7 +4179,8 @@ def revised_model():
            [
                "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation',
                'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness',
-                'Survey: Existing Primary Heating System',
+                'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band',
+                'Survey: Main Building Wall Area (m2)',
            ]
        ].rename(
            columns={