mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
handling deduping ciga match
This commit is contained in:
parent
ae2cc3fab5
commit
8ef0198606
1 changed files with 19 additions and 2 deletions
|
|
@ -41,7 +41,7 @@ class DataLoader:
|
|||
UNMATCHED_CIGA = {
|
||||
# The unmatched addresses counted below have been validated manually as being in the ciga file but not
|
||||
# the asset list
|
||||
"HA14": 4,
|
||||
"HA14": 3,
|
||||
# There are just too many unmatched addresses here
|
||||
"HA6": 117,
|
||||
"HA107": 52
|
||||
|
|
@ -147,6 +147,17 @@ class DataLoader:
|
|||
|
||||
return ciga_list
|
||||
|
||||
@staticmethod
def dedupe_ciga_list(ciga_list):
    """
    Drop ciga rows that refer to the same address, keeping the first occurrence.

    Two rows are treated as the same address when the concatenation of their
    "Matched Address" and "Matched Postcode" values is identical once spaces
    and punctuation have been removed.

    :param ciga_list: DataFrame with string columns "Matched Address" and
        "Matched Postcode". A "unique_key" column is added to it in place.
    :return: the rows of ciga_list whose "unique_key" has not been seen on an
        earlier row (original index preserved, "unique_key" column included).
    """
    ciga_list["unique_key"] = ciga_list["Matched Address"] + ciga_list["Matched Postcode"]
    # Remove spaces from the unique key (plain literal replacement)
    ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(" ", "", regex=False)
    # Remove punctuation from the unique key. regex=True must be explicit: from
    # pandas 2.0 the default is regex=False, which would look for the literal
    # text "[^\w\s]" and leave all punctuation in place.
    ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(r"[^\w\s]", "", regex=True)
    # NOTE(review): a missing address or postcode gives a NaN key, and pandas
    # flags repeated NaN keys as duplicates, so only the first such row is
    # kept - confirm that is intended.
    # Drop duplicated keys
    ciga_list = ciga_list[~ciga_list["unique_key"].duplicated()]
    return ciga_list
|
||||
|
||||
@staticmethod
|
||||
def get_asset_sheetname(workbook):
|
||||
if "Asset List" in workbook.sheetnames:
|
||||
|
|
@ -244,6 +255,7 @@ class DataLoader:
|
|||
ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
|
||||
ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
|
||||
ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list)
|
||||
ciga_list = self.dedupe_ciga_list(ciga_list)
|
||||
ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)
|
||||
|
||||
return asset_list, survey_list, ciga_list
|
||||
|
|
@ -686,10 +698,15 @@ class DataLoader:
|
|||
|
||||
# We have an acceptable number of ciga failures for each HA
|
||||
if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
|
||||
raise ValueError(f"Unmatched addresses for {ha_name} is not as expected")
|
||||
raise ValueError(
|
||||
f"Unmatched addresses for {ha_name} is not as expected, got {len(unmatched_addresses)} unmatched")
|
||||
|
||||
matching_lookup = pd.DataFrame(matching_lookup)
|
||||
|
||||
# Check dupes as this will cause problems later on
|
||||
if matching_lookup["asset_list_row_id"].duplicated().any():
|
||||
raise ValueError("Duplicated asset list row ids")
|
||||
|
||||
# Merge onto the ciga list
|
||||
ciga_list = ciga_list.merge(matching_lookup, how='left', on="ciga_list_row_id")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue