From a7857c0375949f5d45d47afe41f59e07de883e71 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Nov 2024 20:30:57 +0000 Subject: [PATCH] pulling out data from best match --- .../stonewater/Wave 3 Preparation.py | 111 ++++++++++-------- etl/find_my_epc/RetrieveFindMyEpc.py | 1 + etl/route_march_data_pull/app.py | 65 ++++------ 3 files changed, 83 insertions(+), 94 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index aa9e4488..08236d5b 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1727,7 +1727,7 @@ def propsed_wave_3_sample(): "Existing Primary Heating System": "Survey: Primary Heating System" } ) - + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] # Concatenate from the wall information survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ "Main Wall Insulation Type"].astype(str) @@ -1872,6 +1872,8 @@ def propsed_wave_3_sample(): 'Survey: Primary Heating System' ] + survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy() + results = [] for region in tqdm(unique_postal_regions): # Take all of the properties in that region @@ -1884,10 +1886,14 @@ def propsed_wave_3_sample(): region_assets = region_assets.merge( exact_surveyed[ - ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns], + ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [ + "Survey: Matching Address ID" + ] + ], on="Address ID", how="left" ) + region_assets['Distance to Closest Match (m)'] = 0 # Label the tier 1 properties region_assets["Confidence Tier"] = None @@ -1901,61 +1907,62 @@ def propsed_wave_3_sample(): "5 - property was surveyed", region_assets["Confidence Tier"] ) - archetypes = region_assets[ + archetype_ids = region_assets[ pd.isnull(region_assets["Confidence Tier"]) ]["Archetype ID"].unique() # We get the properties that have been surveyed - region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(archetypes) & - (survey_results["Postal Region"] == region) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() - if region_surveyed["Archetype ID"].duplicated().sum(): + region_surveyed = [] + for arch_id in archetype_ids: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + if archetype_data.shape[0] > 1: + # Look for an exact match, or as close as possible + archetype_data_filtered = match_property_to_surveyed(property, archetype_data) + if not archetype_data_filtered.empty: + archetype_data = archetype_data_filtered - region_surveyed = [] - for arch_id in archetypes: - for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): - archetype_data = survey_results_with_original_features[ - survey_results["Archetype ID"] == arch_id - ].copy() - if archetype_data.empty: - continue - if archetype_data.shape[0] > 1: - # Look for an exact match, or as close as possible - archetype_data_filtered = match_property_to_surveyed(property, archetype_data) - if not archetype_data_filtered.empty: - archetype_data = archetype_data_filtered + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) - archetype_data["distance_meters"] = haversine( - lat1=property.latitude, lon1=property.longitude, - lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values - ) - expected_sap = np.average( - archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) - ) - expected_epc = sap_to_epc(expected_sap) - region_surveyed.append( - { - "Archetype ID": arch_id, - "Address ID": property["Address ID"], - "Current EPC Band": expected_epc - } - ) + # We take the features of the closest matching property + closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0] - region_surveyed = pd.DataFrame(region_surveyed) - region_assets = region_assets.merge( - region_surveyed, - on=["Archetype ID", "Address ID"], - how="left", - suffixes=("", "_method1") - ) - else: - region_assets = region_assets.merge( - region_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method1") - ) + region_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc, + "Current SAP Rating": expected_sap, + 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], + 'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"], + 'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"], + 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], + "Survey: Matching Address ID": closest_match["Address ID"], + 'Distance to Closest Match (m)': closest_match["distance_meters"] + } + ) + + region_surveyed = pd.DataFrame(region_surveyed) + starting_shape = region_assets.shape[0] + region_assets = region_assets.merge( + region_surveyed, + on=["Archetype ID", "Address ID"], + how="left", + suffixes=("", "_method1") + ) + if region_assets.shape[0] != starting_shape: + raise ValueError("Something went wrong") # Label the tier 1 properties region_assets["Confidence Tier"] = np.where( @@ -2326,7 +2333,9 @@ def propsed_wave_3_sample(): results = pd.concat(results) # Check if there are missings in current epc band, current sap rating or any of the survey attributes - for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns: + for c in ( + ["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + + survey_attribute_columns): if pd.isnull(results[c]).sum(): raise Exception("Something went wrong") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 913a04b8..d5a5134f 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -269,6 +269,7 @@ class RetrieveFindMyEpc: "Loft insulation": ["loft_insulation"], "Solar photovoltaic (PV) panels": ["solar_pv"], "Party wall insulation": ["party_wall_insulation"], + 'Draught proofing': ["draught_proofing"], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index f24c5bb2..1e478b0c 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -23,41 +23,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") def get_data(asset_list, fulladdress_column, address1_column, postcode_column): epc_data = [] errors = [] + no_epc = [] for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - postcode = home[postcode_column] - house_number = home[address1_column] - full_address = home[fulladdress_column] - - searcher = SearchEpc( - address1=str(house_number), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5 - ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - if searcher.newest_epc is None: - continue - - # Look for EPC recommendatons - try: - property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} - - # Retrieve data from FindMyEPC - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - time.sleep(np.random.uniform(0.1, 1)) try: postcode = home[postcode_column] house_number = home[address1_column] @@ -79,6 +46,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): searcher.find_property(skip_os=True) if searcher.newest_epc is None: + no_epc.append(home["row_id"]) continue # Look for EPC recommendatons @@ -106,7 +74,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): errors.append(home["row_id"]) time.sleep(5) - return epc_data, errors + return epc_data, errors, no_epc def extract_address1(asset_list, full_address_col, method="first_two_words"): @@ -140,26 +108,37 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/" - DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/" + DATA_FILENAME = "Bromford programme review.xlsx" + SHEET_NAME = "Bromford" POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = "Address" - ADDRESS1_COLUMN = None + FULLADDRESS_COLUMN = None + ADDRESS1_COLUMN = "No." ADDRESS1_METHOD = "first_two_words" + ADDRESS_COLS_TO_CONCAT = ["No.", "Address"] - asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0) + asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) + asset_list = asset_list[~pd.isnull(asset_list["Postcode"])] asset_list["row_id"] = asset_list.index # We clean up portential non-breaking spaces, and double spaces for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]: + asset_list[col] = asset_list[col].astype(str) asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) if ADDRESS1_COLUMN is None: ADDRESS1_COLUMN = "address1_extracted" - asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD) + asset_list = extract_address1( + asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD + ) - epc_data, errors = get_data( + if FULLADDRESS_COLUMN is None: + FULLADDRESS_COLUMN = "fulladdress_extracted" + # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas + asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1) + + epc_data, errors, no_epc = get_data( asset_list=asset_list, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, @@ -168,7 +147,7 @@ def app(): # We now retrieve any failed properties asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] - epc_data_failed, _ = get_data( + epc_data_failed, _, _ = get_data( asset_list=asset_list_failed, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN,