fixing unhandled cases in matching algorithm

2026-07-27 23:35:01 +00:00 · 2024-11-17 19:46:17 +00:00 · 2024-11-17 19:46:17 +00:00 · a630fe05c4
commit a630fe05c4
parent eff80e637f
1 changed files with 78 additions and 14 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -1756,20 +1756,44 @@ def propsed_wave_3_sample():
            ][["Archetype ID", "Current EPC Band"]].drop_duplicates()

        if region_surveyed["Archetype ID"].duplicated().sum():
-            blah1
-            region_surveyed = survey_results[
-                survey_results["Archetype ID"].isin(archetypes) &
-                (survey_results["Postal Region"] == region)
-                ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
-            region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc)
-            region_surveyed = region_surveyed.drop(columns=["Current SAP Rating"])
+            region_surveyed = []
+            for arch_id in archetypes:
+                for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+                    archetype_data = survey_results_with_original_features[
+                        survey_results["Archetype ID"] == arch_id
+                        ].copy()
+                    if archetype_data.empty:
+                        continue
+                    archetype_data["distance_meters"] = haversine(
+                        lat1=property.latitude, lon1=property.longitude,
+                        lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+                    )
+                    expected_sap = np.average(
+                        archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+                    )
+                    expected_epc = sap_to_epc(expected_sap)
+                    region_surveyed.append(
+                        {
+                            "Archetype ID": arch_id,
+                            "Address ID": property["Address ID"],
+                            "Current EPC Band": expected_epc
+                        }
+                    )

-        region_assets = region_assets.merge(
-            region_surveyed,
-            on="Archetype ID",
-            how="left",
-            suffixes=("", "_method1")
-        )
+            region_surveyed = pd.DataFrame(region_surveyed)
+            region_assets = region_assets.merge(
+                region_surveyed,
+                on=["Archetype ID", "Address ID"],
+                how="left",
+                suffixes=("", "_method1")
+            )
+        else:
+            region_assets = region_assets.merge(
+                region_surveyed,
+                on="Archetype ID",
+                how="left",
+                suffixes=("", "_method1")
+            )

        # Label the tier 1 properties
        region_assets["Confidence Tier"] = np.where(
@ -1897,7 +1921,47 @@ def propsed_wave_3_sample():
                ].copy()

            if surveyed.empty:
-                blah3
+                # Still no match: retry with a looser filter. Property, wall and roof type only need to agree on the
+                # text before the ":", and heating only has to fall into the same broad category.
+                surveyed = survey_results_with_original_features[
+                    (
+                        survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
+                        property["Property Type"].split(":")[0]
+                    ) &
+                    (
+                        survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+                        property["Wall Type"].split(":")[0]
+                    ) &
+                    (
+                        survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+                        property["Roof Type"].split(":")[0]
+                    )
+                    ].copy()
+
+                if "Electric" in property["Heating"]:
+                    # Take other electric heating systems
+                    surveyed = surveyed[surveyed["Heating"].str.contains("Electric")]
+                elif property["Heating"] == "Community Heating Systems: Community boilers only (RdSAP)":
+                    # Take other community heating systems
+                    surveyed = surveyed[surveyed["Heating"].str.contains("Community")]
+                elif property["Heating"] == 'Heat Pump: (from database)':
+                    # Take other heat pumps
+                    surveyed = surveyed[surveyed["Heating"].str.contains("Heat Pump")]
+                elif property["Heating"] == "Solid fuel room heaters: Open fire in grate":
+                    # Take other properties with room heaters
+                    surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")]
+                else:
+                    raise Exception("Fix me")
+
+            if surveyed.empty:
+                final_missed_matches.append(
+                    {
+                        "Address ID": a_id,
+                        "Confidence Tier": "4 - no similar property, needs survey to confirm",
+                        "Current EPC Band": "Needs Survey"
+                    }
+                )
+                continue

            # Calculate distance
            surveyed["distance_meters"] = haversine(