From 8ef0198606486cf3eee9abf84723181ef221ea6b Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Mon, 26 Feb 2024 16:22:50 +0000
Subject: [PATCH] handling deduping ciga match

---
Review notes (this section is ignored by `git am`):
* dedupe_ciga_list now passes regex=True when stripping punctuation. From pandas 2.0
  Series.str.replace defaults to regex=False, so the pattern was treated as a literal string
  and no punctuation was removed. If the expected UNMATCHED_CIGA counts were tuned against the
  literal behaviour they may need re-validating.
* The comment above "HA14" now agrees with the new expected count of 3.
* This patch was received with its line breaks collapsed. Indentation and blank context lines
  below are reconstructed from the hunk headers and Python syntax - use
  `git apply --ignore-whitespace` if the context does not match exactly. The blob hash on the
  `index` line is carried over from the original patch.

 .../ha_15_32/ha_analysis_batch_3.py           | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index d75a9f34..6ffe50e3 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -41,7 +41,7 @@ class DataLoader:
     UNMATCHED_CIGA = {
-        # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not
+        # We expect 3 unmatched addresses, which have been validated manually as being in the ciga file but not
         # the asset list
-        "HA14": 4,
+        "HA14": 3,
         # There's just too many unmatched here
         "HA6": 117,
         "HA107": 52
@@ -147,6 +147,26 @@ class DataLoader:
 
         return ciga_list
 
+    @staticmethod
+    def dedupe_ciga_list(ciga_list):
+        """
+        Drop ciga rows that refer to the same address, keeping the first occurrence of each.
+
+        Rows are compared on a "unique_key" column, which is added to ciga_list and is built from
+        "Matched Address" + "Matched Postcode" with all spaces and punctuation removed.
+        :param ciga_list: DataFrame with "Matched Address" and "Matched Postcode" columns
+        :return: ciga_list without the rows whose unique_key duplicates that of an earlier row
+        """
+        ciga_list["unique_key"] = ciga_list["Matched Address"] + ciga_list["Matched Postcode"]
+        # Remove spaces from the unique key
+        ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(" ", "", regex=False)
+        # Remove punctuation from the unique key. regex=True must be explicit: from pandas 2.0 str.replace treats
+        # the pattern as a literal string by default, which would leave the punctuation in place
+        ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(r'[^\w\s]', '', regex=True)
+        # Drop duplicated keys
+        ciga_list = ciga_list[~ciga_list["unique_key"].duplicated()]
+        return ciga_list
+
     @staticmethod
     def get_asset_sheetname(workbook):
         if "Asset List" in workbook.sheetnames:
@@ -244,6 +264,7 @@ class DataLoader:
         ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
         ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
         ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list)
+        ciga_list = self.dedupe_ciga_list(ciga_list)
         ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)
 
         return asset_list, survey_list, ciga_list
@@ -686,10 +707,15 @@ class DataLoader:
 
         # We have an acceptable number of ciga failures for each HA
         if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
-            raise ValueError(f"Unmatched addresses for {ha_name} is not as expected")
+            raise ValueError(
+                f"Unmatched addresses for {ha_name} is not as expected, got {len(unmatched_addresses)} unmatched")
 
         matching_lookup = pd.DataFrame(matching_lookup)
 
+        # Check dupes as this will cause problems later on
+        if matching_lookup["asset_list_row_id"].duplicated().any():
+            raise ValueError("Duplicated asset list row ids")
+
         # Merge onto the ciga list
         ciga_list = ciga_list.merge(matching_lookup, how='left', on="ciga_list_row_id")
 