From 022244377d36557f83081e505b8068ab2bd98004 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 12:26:16 +0000 Subject: [PATCH] working on fixing missed matched in eco3 matching --- .../ha_15_32/ha_analysis_batch_3.py | 84 +++++++++++++++---- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index ea5b0456..a5845990 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -171,6 +171,10 @@ class DataLoader: "HA107": 51, } + UNMATCHED_ECO3 = { + "HA25": 94 + } + def __init__(self, directories, december_figures_filepath, use_cache, rebuild): self.directories = directories self.use_cache = use_cache @@ -1458,9 +1462,6 @@ class DataLoader: def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): - # We add on a matching postcode without spaces for this - # asset_list["matching_postcode_no_space"] = asset_list["matching_postcode"].str.lower().str.replace(" ", "") - # May need an eco3 list correction function # NEADS DRIVE, postcode with bs305dt, is not found in the asset list @@ -1471,8 +1472,17 @@ class DataLoader: eco3_list = eco3_list[ ~pd.isnull(eco3_list["Post Code"]) ] + # We have a bunch of genuine duplicates + eco3_list = eco3_list.drop_duplicates(["NO ", "Street / Block Name", "Post Code"]) + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "HALWILL MEADOOW", "HALWILL MEADOW" + ) + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "Hall Road", "Hall Rd" + ) - missed_postcodes = [] if ha_name == "HA25": missed_postcodes = { postcode.lower() for postcode in eco3_list["Post Code"] if @@ -1480,10 +1490,18 @@ class DataLoader: } eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)] + # For the asset list, we create a matching address without any punctuation + # TODO: 
We should generally just remove punctuation from addresses when matching + asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(r'[^\w\s]', '', + regex=True) + # Remove double spaces + asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace( + " ", " " + ) + matching_lookup = [] missed = [] for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): - postcode = row["Post Code"].lower().strip() # df will never be empty, since we've already done a check for common postcodes @@ -1507,24 +1525,20 @@ class DataLoader: if " " in str(house_number): house_number = house_number.split(" ")[0].strip() - df = df[df["matching_address"].str.contains(str(house_number))] + # We must do the house number filter + df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] + + # Perform a search on streetname + # We do this to prevent duplicate matches to properties with the same postcode and house number, + # but different streets + street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0] + street_name_section1 = re.sub(r'[^\w\s]', '', street_name_section1) + df = df[df["matching_address_no_punctuation"].str.contains(street_name_section1)] if df.empty: missed.append(row["eco3_list_row_id"]) continue - if df.shape[0] != 1: - df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] - - if df.empty: - missed.append(row["eco3_list_row_id"]) - continue - - if df.shape[0] != 1: - # Perform a search on streetname - street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0] - df = df[df["matching_address"].str.contains(street_name_section1)] - if df.shape[0] != 1: print(row["Street / Block Name"]) print(house_number) @@ -1538,6 +1552,40 @@ class DataLoader: } ) + # We verify the missed + # -HA25 contains 88 missed entries.
These are actually 8 unique postcodes, where surveys were conducted + # on properties that had house numbers outside of the asset list + if len(missed) != self.UNMATCHED_ECO3[ha_name]: + raise ValueError( + f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" + ) + + # TODO: 194 missed + + matching_lookup = pd.DataFrame(matching_lookup) + # Check dupes as this will cause problems later on + if matching_lookup["asset_list_row_id"].duplicated().any(): + raise ValueError("Duplicated asset list row ids") + + missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)] + missed_df.head(3).tail(1)["eco3_list_row_id"] + + duped_ids = matching_lookup[matching_lookup["asset_list_row_id"].duplicated()]["asset_list_row_id"].tolist() + duped_df = matching_lookup[ + matching_lookup["asset_list_row_id"].isin(duped_ids) + ] + duped_surveys = eco3_list[ + eco3_list["eco3_list_row_id"].isin(duped_df["eco3_list_row_id"].values) + ].copy() + + duped_surveys = duped_surveys.merge(matching_lookup, how="left", on="eco3_list_row_id") + + duped_surveys[ + ["NO ", "Street / Block Name", "Post Code", "eco3_list_row_id", "asset_list_row_id"] + ].sort_values("asset_list_row_id").head() + + asset_list[asset_list["asset_list_row_id"] == "HA2515145"]["matching_address"].values + @staticmethod def extract_streetname(address, house_number=None, postcode=None): """