From 5d5001fec3114eab4ba84e7fc0e40270ec017d35 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 22:47:39 +0000 Subject: [PATCH] added de-duping --- .../stonewater/Wave 3 Preparation.py | 221 ++++++------------ etl/find_my_epc/RetrieveFindMyEpc.py | 6 + etl/route_march_data_pull/app.py | 7 + 3 files changed, 85 insertions(+), 149 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index f74dc19d..744b3400 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1803,21 +1803,26 @@ def propsed_wave_3_sample(): def match_property_to_surveyed(property, survey_results_with_original_features): surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Postal Region"] == + property["Postal Region"] + ) & ( survey_results_with_original_features["Property Type"] == property["Property Type"] + ) + & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] ) & ( - survey_results_with_original_features["Wall Type"] == - property["Wall Type"] + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] ) & ( - survey_results_with_original_features["Roof Type"] == - property["Roof Type"] - ) & - ( - survey_results_with_original_features["Heating"] == - property["Heating"] + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] ) ].copy() @@ -1826,23 +1831,47 @@ def propsed_wave_3_sample(): surveyed = survey_results_with_original_features[ ( - survey_results_with_original_features["Property Type"] == - property["Property Type"] + survey_results_with_original_features["Postal Region"] == + property["Postal Region"] ) & ( - survey_results_with_original_features["Wall Type"] == - property["Wall Type"] + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] + ) + & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] ) & ( survey_results_with_original_features["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0] ) & ( - survey_results_with_original_features["Heating"] == - property["Heating"] + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] ) ].copy() + # surveyed = survey_results_with_original_features[ + # ( + # survey_results_with_original_features["Property Type"] == + # property["Property Type"] + # ) & + # ( + # survey_results_with_original_features["Wall Type"] == + # property["Wall Type"] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"] == + # property["Heating"] + # ) + # ].copy() + if not surveyed.empty: return surveyed @@ -1906,7 +1935,12 @@ def propsed_wave_3_sample(): on="Address ID", how="left" ) - region_assets['Distance to Closest Match (m)'] = 0 + region_assets['Distance to Closest Match (m)'] = None + region_assets["Distance to Closest Match (m)"] = np.where( + ~pd.isnull(region_assets["Current EPC Band"]), + 0, + region_assets["Distance to Closest Match (m)"] + ) # Label the tier 1 properties region_assets["Confidence Tier"] = None @@ -2016,7 +2050,7 @@ def propsed_wave_3_sample(): missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"]) - archetype_surveyed = [] + # archetype_surveyed = [] for arch_id in missed_archetypes: for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): archetype_data = survey_results_with_original_features[ @@ -2175,7 +2209,14 @@ def propsed_wave_3_sample(): { "Address ID": a_id, "Confidence Tier": "4 - no similar property, needs survey to confirm", - "Current EPC Band": "Needs Survey" + "Current EPC Band": "Needs Survey", + "Current SAP Rating": "Needs Survey", + 'Survey: Main Wall Type': "Not Surveyed", + "Survey: Main Alternative Wall": "Not Surveyed", + "Survey: Main Roof Type": "Not Surveyed", + "Survey: Primary Heating System": "Not Surveyed", + "Survey: Matching Address ID": "Not Surveyed", + 'Distance to Closest Match (m)': 9999999, } ) continue @@ -2197,18 +2238,6 @@ def propsed_wave_3_sample(): # Take the 3 nearest surveyed = surveyed.head(3) - # # We allow a max distance of 10km - # surveyed = surveyed[surveyed["distance_meters"] < 10000] - # if surveyed.empty: - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": "4 - no similar property, needs survey to confirm", - # "Current EPC Band": "Needs Survey" - # } - # ) - # continue - # perform a weighted mean of SAP rating - the closer the better expected_sap = np.average( surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1) @@ -2218,129 +2247,24 @@ def propsed_wave_3_sample(): if expected_epc in ["C", "B", "A"]: match_type = "5 - EPC C or above" + closest_match = surveyed.iloc[0] + final_missed_matches.append( { "Address ID": a_id, "Confidence Tier": match_type, - "Current EPC Band": expected_epc + "Current EPC Band": expected_epc, + "Current SAP Rating": expected_sap, + 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], + "Survey: Main Alternative Wall": closest_match["Survey: Main Alternative Wall"], + "Survey: Main Roof Type": closest_match["Survey: Main Roof Type"], + "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], + "Survey: Matching Address ID": closest_match["Address ID"], + 'Distance to Closest Match (m)': closest_match["distance_meters"], } ) continue - # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: - # filter_property_types = ["House", "Bungalow"] - # else: - # filter_property_types = ["Flat"] - # - # surveyed_similar = survey_results_with_original_features[ - # (survey_results_with_original_features["Postcode"] == property["Postcode"]) & - # ( - # survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - # filter_property_types - # ) - # ) & - # ( - # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - # property["Wall Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Heating"].str.split(":").str[0] == - # property["Heating"].split(":")[0] - # ) - # ] - # if surveyed_similar.empty: - # surveyed_similar = survey_results_with_original_features[ - # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - # filter_property_types - # )) & - # (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - # property["Wall Type"].split(":")[0]) & - # (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0]) & - # (survey_results_with_original_features["Heating"].str.split(":").str[0] == - # property["Heating"].split(":")[0]) - # ] - # - # if surveyed_similar.empty: - # - # # We get an average based on the postcode - # surveyed_similar = survey_results_with_original_features[ - # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - # filter_property_types - # )) - # ] - # if surveyed_similar.empty: - # surveyed_similar_entire_population = survey_results_with_original_features[ - # ( - # survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[ - # "Property Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - # property["Wall Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - # property["Roof Type"].split(":")[0] - # ) & - # ( - # survey_results_with_original_features["Heating"].str.split(":").str[0] == - # property["Heating"].split(":")[0] - # ) - # ] - # - # # We order them by distance on postcode - # - # # Average - # expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean() - # expected_epc = sap_to_epc(expected_sap) - # - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": "3 - similar property, all areas searched", - # "Current EPC Band": expected_epc - # } - # - # ) - # else: - # expected_sap = surveyed_similar["Current SAP Rating"].mean() - # expected_epc = sap_to_epc(expected_sap) - # if expected_epc in ["C", "B", "A"]: - # tier = "5 - EPC C or above" - # else: - # tier = "3 - similar property, relaxed conditions" - # - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": tier, - # "Current EPC Band": expected_epc - # } - # ) - # continue - # # We take an average - # expected_sap = surveyed_similar["Current SAP Rating"].mean() - # expected_epc = sap_to_epc(expected_sap) - # if expected_epc in ["C", "B", "A"]: - # tier = "5 - EPC C or above" - # else: - # tier = "3 - similar property" - # - # final_missed_matches.append( - # { - # "Address ID": a_id, - # "Confidence Tier": tier, - # "Current EPC Band": expected_epc - # } - # ) - final_missed_matches = pd.DataFrame(final_missed_matches) region_assets = region_assets.merge( @@ -2353,12 +2277,11 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna( region_assets["Confidence Tier_method3"] ) - region_assets["Current EPC Band"] = np.where( - pd.isnull(region_assets["Current EPC Band"]), - region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"] - ) - region_assets = region_assets.drop(columns=["Confidence Tier_method3", "Current EPC Band_method3"]) + region_assets = fill_survey_columns(region_assets, suffix="_method3") + + method_3_columns = [c for c in region_assets.columns if c.endswith("_method3")] + region_assets = region_assets.drop(columns=method_3_columns) if pd.isnull(region_assets["Current EPC Band"]).sum(): raise Exception("Something went wrong") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index ac0e8235..b6394275 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -289,6 +289,12 @@ class RetrieveFindMyEpc: "Fuel change recommendation": [], "PV Cells recommendation": [], "Replacement glazing units": ["double_glazing"], + "Heating controls (time and temperature zone control)": ["time_temperature_zone_control"], + "High heat retention storage heaters": ["high_heat_retention_storage_heaters"], + "Gas condensing boiler": ["boiler_upgrade"], + "Change room heaters to condensing boiler": ["boiler_upgrade"], + "Cylinder thermostat": ["cylinder_thermostat"], + "Heat recovery system for mixer showers": ["heat_recovery_shower"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index d9f6bf43..6f9dd135 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -150,6 +150,13 @@ def app(): # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1) + # We check for duplicated addresses + asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] + if asset_list["deduper"].duplicated().sum(): + # Drop the dupes + print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping") + asset_list = asset_list[~asset_list["deduper"].duplicated()] + epc_data, errors, no_epc = get_data( asset_list=asset_list, fulladdress_column=FULLADDRESS_COLUMN,