From ac9b7b37300204c83f862871ebd511208625978b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 22:08:10 +0000 Subject: [PATCH] updating methodology for matching --- .../stonewater/Wave 3 Preparation.py | 193 +++++++++++------- 1 file changed, 114 insertions(+), 79 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 08236d5b..f74dc19d 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1867,6 +1867,19 @@ def propsed_wave_3_sample(): return surveyed + def fill_survey_columns(region_assets, suffix): + for col in [ + 'Current EPC Band', 'Current SAP Rating', + 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', + 'Survey: Main Roof Type', 'Survey: Primary Heating System', + 'Survey: Matching Address ID', 'Distance to Closest Match (m)' + ]: + region_assets[col] = np.where( + pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]), + region_assets[col + suffix], region_assets[col] + ) + return region_assets + survey_attribute_columns = [ "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', 'Survey: Primary Heating System' @@ -1920,6 +1933,14 @@ def propsed_wave_3_sample(): ].copy() if archetype_data.empty: continue + + match_type = "2 - same archetype" + if any(archetype_data["Postal Region"] == property["Postal Region"]): + match_type = "1 - same archetype, same postal region" + archetype_data = archetype_data[ + archetype_data["Postal Region"] == property["Postal Region"] + ] + if archetype_data.shape[0] > 1: # Look for an exact match, or as close as possible archetype_data_filtered = match_property_to_surveyed(property, archetype_data) @@ -1949,11 +1970,21 @@ def propsed_wave_3_sample(): 'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"], 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], "Survey: 
Matching Address ID": closest_match["Address ID"], - 'Distance to Closest Match (m)': closest_match["distance_meters"] + 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Match Type": match_type } ) - region_surveyed = pd.DataFrame(region_surveyed) + + if region_surveyed.empty: + region_surveyed = pd.DataFrame( + columns=[ + "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", + 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' + ] + ) + starting_shape = region_assets.shape[0] region_assets = region_assets.merge( region_surveyed, @@ -1968,95 +1999,99 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & pd.isnull(region_assets["Confidence Tier"]), - "1 - Archetype surveyed", region_assets["Confidence Tier"] + "1 - Archetype surveyed in region", region_assets["Confidence Tier"] ) - region_assets["Current EPC Band"] = np.where( - pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]), - region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"] - ) # Handle EPC C region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + region_assets["Current EPC Band_method1"].isin(["C", "B", "F", "G"]) & + pd.isnull(region_assets["Confidence Tier"]), "5 - EPC C or above", region_assets["Confidence Tier"] ) - region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) - # TODO: Turn into a function - missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + region_assets = fill_survey_columns(region_assets, suffix="_method1") - archetype_surveyed = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) - 
][["Archetype ID", "Current EPC Band"]].drop_duplicates() + method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")] + region_assets = region_assets.drop(columns=method_1_columns) - if archetype_surveyed["Archetype ID"].duplicated().sum(): + missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"]) - archetype_surveyed = [] - for arch_id in missed_archetypes: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - archetype_data["distance_meters"] = haversine( - lat1=property.latitude, lon1=property.longitude, - lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - ) - expected_sap = np.average( - archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - ) - expected_epc = sap_to_epc(expected_sap) - archetype_surveyed.append( - { - "Archetype ID": arch_id, - "Address ID": property["Address ID"], - "Current EPC Band": expected_epc - } - ) - archetype_surveyed = pd.DataFrame(archetype_surveyed) - region_assets = region_assets.merge( - archetype_surveyed, - on=["Archetype ID", "Address ID"], - how="left", - suffixes=("", "_method2") - ) - else: - region_assets = region_assets.merge( - archetype_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method2") - ) - - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( - region_assets["Confidence Tier"]), - "2 - same archetype", region_assets["Confidence Tier"] - ) - - region_assets["Current EPC Band"] = np.where( - pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method2"]), - region_assets["Current EPC Band_method2"], region_assets["Current EPC Band"] - ) - - region_assets = 
region_assets.drop(columns=["Current EPC Band_method2"]) + archetype_surveyed = [] + for arch_id in missed_archetypes: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + raise Exception("IMPLEMENT ME") + # archetype_data["distance_meters"] = haversine( + # lat1=property.latitude, lon1=property.longitude, + # lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + # ) + # expected_sap = np.average( + # archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + # ) + # expected_epc = sap_to_epc(expected_sap) + # archetype_surveyed.append( + # { + # "Archetype ID": arch_id, + # "Address ID": property["Address ID"], + # "Current EPC Band": expected_epc + # } + # ) + # archetype_surveyed = pd.DataFrame(archetype_surveyed) + # if archetype_surveyed.empty: + # archetype_surveyed = pd.DataFrame( + # columns=[ + # "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", + # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + # 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)' + # ] + # ) + # + # region_assets = region_assets.merge( + # archetype_surveyed, + # on=["Archetype ID", "Address ID"], + # how="left", + # suffixes=("", "_method2") + # ) + # + # region_assets["Confidence Tier"] = np.where( + # region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( + # region_assets["Confidence Tier"]), + # "2 - same archetype", region_assets["Confidence Tier"] + # ) + # + # for col in [ + # 'Current EPC Band', 'Current SAP Rating', + # 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', + # 'Survey: Main Roof Type', 'Survey: Primary Heating System', + # 'Survey: Matching Address ID', 'Distance to Closest Match 
(m)' + # ]: + # region_assets[col] = np.where( + # pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]), + # region_assets[col + "_method2"], region_assets[col] + # ) + # + # method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")] + # region_assets = region_assets.drop(columns=method_2_columns) # We label EPC C properties - region_assets["Confidence Tier"] = np.where( - region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), - "5 - EPC C or above", region_assets["Confidence Tier"] - ) - - region_assets["Confidence Tier"] = np.where( - region_assets["Archetype ID"] == "EPC C OR ABOVE", - "5 - EPC C or above", region_assets["Confidence Tier"] - ) - - region_assets["Current EPC Band"] = np.where( - region_assets["Archetype ID"] == "EPC C OR ABOVE", - "C", region_assets["Current EPC Band"] - ) + # region_assets["Confidence Tier"] = np.where( + # region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]), + # "5 - EPC C or above", region_assets["Confidence Tier"] + # ) + # + # region_assets["Confidence Tier"] = np.where( + # region_assets["Archetype ID"] == "EPC C OR ABOVE", + # "5 - EPC C or above", region_assets["Confidence Tier"] + # ) + # + # region_assets["Current EPC Band"] = np.where( + # region_assets["Archetype ID"] == "EPC C OR ABOVE", + # "C", region_assets["Current EPC Band"] + # ) missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()