From a630fe05c485aca2c5509748eecb5544ddc78dbe Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Nov 2024 19:46:17 +0000 Subject: [PATCH] fixing unhandled cases in matching algorithm --- .../stonewater/Wave 3 Preparation.py | 92 ++++++++++++++++--- 1 file changed, 78 insertions(+), 14 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 3b44d560..460aa8ee 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1756,20 +1756,44 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): - blah1 - region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(archetypes) & - (survey_results["Postal Region"] == region) - ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() - region_surveyed["Current EPC Band"] = region_surveyed["Current SAP Rating"].apply(sap_to_epc) - region_surveyed = region_surveyed.drop(columns=["Current SAP Rating"]) + region_surveyed = [] + for arch_id in archetypes: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) + region_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc + } + ) - region_assets = region_assets.merge( - region_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method1") - ) + region_surveyed = pd.DataFrame(region_surveyed) + region_assets = region_assets.merge( + region_surveyed, + on=["Archetype ID", "Address ID"], + how="left", + suffixes=("", "_method1") + ) + else: + region_assets = region_assets.merge( + region_surveyed, + on="Archetype ID", + how="left", + suffixes=("", "_method1") + ) # Label the tier 1 properties region_assets["Confidence Tier"] = np.where( @@ -1897,7 +1921,47 @@ def propsed_wave_3_sample(): ].copy() if surveyed.empty: - blah3 + # In this case, we do one additional check where we filter on everything the same apart from heating, + # where we do a slightly more rough match + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) + ].copy() + + if "Electric" in property["Heating"]: + # Take other electric heating systems + surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] + elif property["Heating"] == "Community Heating Systems: Community boilers only (RdSAP)": + # Take other community heating systems + surveyed = surveyed[surveyed["Heating"].str.contains("Community")] + elif property["Heating"] == 'Heat Pump: (from database)': + # Take other heat pumps + surveyed = surveyed[surveyed["Heating"].str.contains("Heat Pump")] + elif property["Heating"] == "Solid fuel room heaters: Open fire in grate": + # Take other properties with room heaters + surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")] + else: + raise Exception("Fix me") + + if surveyed.empty: + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": "4 - no similar property, needs survey to confirm", + "Current EPC Band": "Needs Survey" + } + ) + continue # Calculate distance surveyed["distance_meters"] = haversine(