From 9a0c6c3e8fbae7a23980aa7e75912ef6202ab29d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 14:18:08 +0000 Subject: [PATCH] expanded eco3 matching --- .../ha_15_32/ha_analysis_batch_3.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 21509923..06bb0d96 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -172,7 +172,7 @@ class DataLoader: } UNMATCHED_ECO3 = { - "HA25": 119 + "HA25": 154 } def __init__(self, directories, december_figures_filepath, use_cache, rebuild): @@ -1508,12 +1508,16 @@ class DataLoader: eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") eco3_list = eco3_list_correction_function(eco3_list) + asset_list["matching_postcode_nospace"] = asset_list["matching_postcode"].str.replace(" ", "").str.lower() + eco3_list["postcode_no_space"] = eco3_list["Post Code"].str.lower().str.replace(" ", "") + if ha_name == "HA25": + # 317 -> 259 missed_postcodes = { - postcode.lower() for postcode in eco3_list["Post Code"] if - postcode.lower() not in asset_list["matching_postcode"].values + postcode for postcode in eco3_list["postcode_no_space"] if + postcode not in asset_list["matching_postcode_nospace"].values } - eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)] + eco3_list = eco3_list[~eco3_list["postcode_no_space"].isin(missed_postcodes)] # For the asset list, we create a matching address without any punctuation # TODO: We should generally just remove puncutation from addresses when matching @@ -1530,11 +1534,11 @@ class DataLoader: for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): # if row["eco3_list_row_id"] == "HA25_Eco3_5422": # raise Exception() - postcode = row["Post Code"].lower().strip() + postcode = row["postcode_no_space"] # df will never be empty, since we've already done a check for common postcodes df = asset_list[ - asset_list["matching_postcode"].str.contains(postcode) + asset_list["matching_postcode_nospace"].str.contains(postcode) ] house_number = row["NO "] @@ -1588,6 +1592,8 @@ class DataLoader: f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" ) + # 154 missed, 2827 matched for HA 25 + matching_lookup = pd.DataFrame(matching_lookup) # Check dupes as this will cause problems later on if matching_lookup["asset_list_row_id"].duplicated().any():