From a6c8ca0e1d56e7f76361c6c159437f3d832b245c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Dec 2023 16:08:24 +0000 Subject: [PATCH] cleaning up ha15 merging process --- etl/eligibility/ha_15_32/app.py | 155 ++++++++++++++++++++------------ 1 file changed, 96 insertions(+), 59 deletions(-) diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py index 52f37caf..cf836439 100644 --- a/etl/eligibility/ha_15_32/app.py +++ b/etl/eligibility/ha_15_32/app.py @@ -33,6 +33,11 @@ def marge_ha_32(asset_list, identified_addresses): This method merges the asset list onto the list of identified addresses, forming a singular file for ha32 """ + dropped_identified_merge_keys = [] + + # ha32 starts with 1418 rows + starting_rows = len(asset_list) + # We update how the Coxwold are listed in the identified addresses identified_addresses["Address"] = np.where( identified_addresses["Address"] == "Coxwold", @@ -68,11 +73,6 @@ def marge_ha_32(asset_list, identified_addresses): identified_addresses["Address"] ) - dropped_identified_merge_keys = [] - - # ha32 starts with 1418 rows - starting_rows = len(asset_list) - asset_list["merge_key"] = ( asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") + asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "") + @@ -158,62 +158,64 @@ def marge_ha_32(asset_list, identified_addresses): # TODO: Finish me -def merge_ha_15(asset_list, identified_addresses, ha): +def merge_ha_15(asset_list, identified_addresses): """ This method merges the asset list onto the list of identified addresses, forming a singular file """ - if ha not in ["ha32", "ha15"]: - raise ValueError("ha must be either ha32 or ha15") - - if ha == "ha32": - - - else: - raise NotImplementedError("We haven't implemented HA15 yet") - dropped_identified_merge_keys = [] - dropped_asset_list_merge_keys = [] + + # Update how Mary Mac Manus Drive, Milton Keynes is listed in the identified addresses + 
identified_addresses["Address"] = identified_addresses["Address"].str.replace( + "Mary Mac Manus Drive, Milton Keynes", "Mary Mac Manus Drive" + ) + + # This address has the wrong postcode in the original asset list + asset_list["Postcode"] = np.where( + asset_list["Address Line 1"] == "103 Priory Crescent", + "HP19 9NY", + asset_list["Postcode"] + ) # ha32 starts with 1418 rows - # HA15 starts with 7665 rows starting_rows = len(asset_list) - # We create a merge key on both files, based on concateneated, processed columns - if ha == "ha32": - asset_list["merge_key"] = ( - asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") + - asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "") + - asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "") - ) + asset_list["merge_key"] = ( + asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") + + asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "") + ).str.replace(',', '').str.replace('.', '') - asset_list["merge_key2"] = ( - asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") + - asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "") - ) + asset_list["merge_key2"] = ( + asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") + + asset_list["Address Line 2"].astype(str).str.lower().str.strip().str.replace(" ", "") + + asset_list["Address Line 3"].astype(str).str.lower().str.strip().str.replace(" ", "") + + asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "") + ).str.replace(',', '').str.replace('.', '') - identified_addresses["merge_key"] = ( - identified_addresses["No."].astype(str).str.lower().str.strip().str.replace(" ", "") + - identified_addresses["Address"].astype(str).str.lower().str.strip().str.replace(" ", "") + -
identified_addresses["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "") - ) + asset_list["merge_key3"] = ( + asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") + + asset_list["Address Line 2"].astype(str).str.lower().str.strip().str.replace(" ", "") + + asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "") + ).str.replace(',', '').str.replace('.', '') - identified_addresses["merge_key2"] = ( - identified_addresses["No."].astype(str).str.lower().str.strip().str.replace(" ", "") + - identified_addresses["Address"].astype(str).str.lower().str.strip().str.replace(" ", "") - ) + asset_list["merge_key4"] = ( + asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") + + asset_list["Address Line 2"].astype(str).str.lower().str.strip().str.replace(" ", "") + + asset_list["Address Line 4"].astype(str).str.lower().str.strip().str.replace(" ", "") + + asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "") + ).str.replace(',', '').str.replace('.', '') - else: - asset_list["merge_key"] = ( - asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") + - asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "") - ) + identified_addresses["merge_key"] = ( + identified_addresses["Address"].astype(str).str.lower().str.strip().str.replace(" ", "") + + identified_addresses["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "") + ).str.replace(',', '').str.replace('.', '') # We check for duplicated identified addresses and in the asset list identified_dupes = identified_addresses["merge_key"].duplicated() if identified_dupes.sum(): logger.warning("We have %s duplicated identified addresses that will be dropped", identified_dupes.sum()) + dropped_identified_merge_keys.extend(identified_addresses[identified_dupes]["merge_key"].tolist()) identified_addresses = 
identified_addresses.drop_duplicates("merge_key") @@ -234,7 +236,7 @@ def merge_ha_15(asset_list, identified_addresses, ha): # Merge the asset list onto the identified addresses merged_data = pd.merge( asset_list, - identified_addresses.drop(columns="merge_key2"), + identified_addresses, how="left", left_on="merge_key", right_on="merge_key", @@ -244,19 +246,50 @@ def merge_ha_15(asset_list, identified_addresses, ha): if merged_data.shape[0] != starting_rows: raise ValueError("Row numbers have changed") - merged_data = merged_data.merge( - identified_addresses.drop(columns="merge_key"), + # merge on the second merge key + merged_data = pd.merge( + merged_data, + identified_addresses, how="left", left_on="merge_key2", - right_on="merge_key2", + right_on="merge_key", suffixes=("", "_identified_addresses2") ) + if merged_data.shape[0] != starting_rows: + raise ValueError("Row numbers have changed") + + # merge on the third merge key + merged_data = pd.merge( + merged_data, + identified_addresses, + how="left", + left_on="merge_key3", + right_on="merge_key", + suffixes=("", "_identified_addresses3") + ) + + if merged_data.shape[0] != starting_rows: + raise ValueError("Row numbers have changed") + + # merge on the fourth merge key + merged_data = pd.merge( + merged_data, + identified_addresses, + how="left", + left_on="merge_key4", + right_on="merge_key", + suffixes=("", "_identified_addresses4") + ) + if merged_data.shape[0] != starting_rows: raise ValueError("Row numbers have changed") merged_data["identified"] = ( - merged_data["Postcode_identified_addresses"].notnull() | merged_data["Postcode_identified_addresses2"].notnull() + merged_data["Postcode_identified_addresses"].notnull() | + merged_data["Postcode_identified_addresses2"].notnull() | + merged_data["Postcode_identified_addresses3"].notnull() | + merged_data["Postcode_identified_addresses4"].notnull() ) # HA 32 issues: @@ -265,7 +298,9 @@ def merge_ha_15(asset_list, identified_addresses, ha): missed = 
identified_addresses[ ~identified_addresses["merge_key"].isin(merged_data["merge_key"]) & - ~identified_addresses["merge_key2"].isin(merged_data["merge_key2"]) + ~identified_addresses["merge_key"].isin(merged_data["merge_key2"]) & + ~identified_addresses["merge_key"].isin(merged_data["merge_key3"]) & + ~identified_addresses["merge_key"].isin(merged_data["merge_key4"]) ] if ha == "ha32": @@ -274,19 +309,21 @@ def merge_ha_15(asset_list, identified_addresses, ha): missed.shape - m1 = missed[missed["Address"].str.contains("Hessle")] - - m1.head() - - [x for x in m1["merge_key"] if x in asset_list["merge_key"].tolist()] - [x for x in m1["merge_key2"] if x in asset_list["merge_key2"].tolist()] - missed["Address"].unique() - z = merged_data[merged_data["Street"].str.contains("Hessle") & ~merged_data["identified"]] + len([m for m in missed["Address"].unique() if "Mary Mac" in m]) - identified_addresses[identified_addresses["Address"].str.contains("Barringhton")] - asset_list[asset_list["Street"].str.contains("Hessle")] + a = identified_addresses[ + identified_addresses["Address"].str.contains("103 Priory Crescent") + ] + b = asset_list[ + asset_list["Address Line 1"].str.contains("103 Priory Crescent") + ] + + a["merge_key"] + b["merge_key"] + b["merge_key2"] + b["merge_key3"] identified_addresses["merge_key"].isin(merged_data["merge_key"]) @@ -317,4 +354,4 @@ def app(): ha32_asset_list, ha15_asset_list, ha32_identified_addresses, ha15_identified_addresses = load_data() ha32 = marge_ha_32(asset_list=ha32_asset_list, identified_addresses=ha32_identified_addresses) - ha15 = merge_ha_15(asset_list=ha15_asset_list, identified_addresses=ha15_identified_addresses, ha="ha15") + ha15 = merge_ha_15(asset_list=ha15_asset_list, identified_addresses=ha15_identified_addresses)