handling deduping ciga match

This commit is contained in:
Khalim Conn-Kowlessar 2024-02-26 16:22:50 +00:00
parent ae2cc3fab5
commit 8ef0198606

View file

@ -41,7 +41,7 @@ class DataLoader:
UNMATCHED_CIGA = {
# We expect a small number of unmatched addresses (the count given below), which have been validated manually
# as being in the CIGA file but not in the asset list
"HA14": 4,
"HA14": 3,
# There's just too many unmatched here
"HA6": 117,
"HA107": 52
@ -147,6 +147,17 @@ class DataLoader:
return ciga_list
@staticmethod
def dedupe_ciga_list(ciga_list):
ciga_list["unique_key"] = ciga_list["Matched Address"] + ciga_list["Matched Postcode"]
# Remove spaces from the unique key
ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(" ", "")
# Remove punctuation from the unique key
ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(r'[^\w\s]', '')
# Drop duplicated keys
ciga_list = ciga_list[~ciga_list["unique_key"].duplicated()]
return ciga_list
@staticmethod
def get_asset_sheetname(workbook):
if "Asset List" in workbook.sheetnames:
@ -244,6 +255,7 @@ class DataLoader:
ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list)
ciga_list = self.dedupe_ciga_list(ciga_list)
ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)
return asset_list, survey_list, ciga_list
@ -686,10 +698,15 @@ class DataLoader:
# We have an acceptable number of ciga failures for each HA
if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
raise ValueError(f"Unmatched addresses for {ha_name} is not as expected")
raise ValueError(
f"Unmatched addresses for {ha_name} is not as expected, got {len(unmatched_addresses)} unmatched")
matching_lookup = pd.DataFrame(matching_lookup)
# Check dupes as this will cause problems later on
if matching_lookup["asset_list_row_id"].duplicated().any():
raise ValueError("Duplicated asset list row ids")
# Merge onto the ciga list
ciga_list = ciga_list.merge(matching_lookup, how='left', on="ciga_list_row_id")