cleaning up ha15 merging process

This commit is contained in:
Khalim Conn-Kowlessar 2023-12-06 16:08:24 +00:00
parent d457ab46fb
commit a6c8ca0e1d

View file

@ -33,6 +33,11 @@ def marge_ha_32(asset_list, identified_addresses):
This method merges the asset list onto the list of identified addresses, forming a singular file for ha32
"""
dropped_identified_merge_keys = []
# ha32 starts with 1418 rows
starting_rows = len(asset_list)
# We update how Coxwold is listed in the identified addresses
identified_addresses["Address"] = np.where(
identified_addresses["Address"] == "Coxwold",
@ -68,11 +73,6 @@ def marge_ha_32(asset_list, identified_addresses):
identified_addresses["Address"]
)
dropped_identified_merge_keys = []
# ha32 starts with 1418 rows
starting_rows = len(asset_list)
asset_list["merge_key"] = (
asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "") +
@ -158,62 +158,64 @@ def marge_ha_32(asset_list, identified_addresses):
# TODO: Finish me
def merge_ha_15(asset_list, identified_addresses, ha):
def merge_ha_15(asset_list, identified_addresses):
"""
This method merges the asset list onto the list of identified addresses, forming a singular file
"""
if ha not in ["ha32", "ha15"]:
raise ValueError("ha must be either ha32 or ha15")
if ha == "ha32":
else:
raise NotImplementedError("We haven't implemented HA15 yet")
dropped_identified_merge_keys = []
dropped_asset_list_merge_keys = []
# Update how Mary Mac Manus Drive, Milton Keynes is listed in the identified addresses
identified_addresses["Address"] = identified_addresses["Address"].str.replace(
"Mary Mac Manus Drive, Milton Keynes", "Mary Mac Manus Drive"
)
# This address has the wrong postcode in the original asset list
asset_list["Postcode"] = np.where(
asset_list["Address Line 1"] == "103 Priory Crescent",
"HP19 9NY",
asset_list["Postcode"]
)
# ha32 starts with 1418 rows
# HA15 starts with 7665 rows
starting_rows = len(asset_list)
# We create a merge key on both files, based on concatenated, processed columns
if ha == "ha32":
asset_list["merge_key"] = (
asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
)
asset_list["merge_key"] = (
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
).str.replace(',', '').str.replace('.', '')
asset_list["merge_key2"] = (
asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "")
)
asset_list["merge_key2"] = (
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Address Line 2"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Address Line 3"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
).str.replace(',', '').str.replace('.', '')
identified_addresses["merge_key"] = (
identified_addresses["No."].astype(str).str.lower().str.strip().str.replace(" ", "") +
identified_addresses["Address"].astype(str).str.lower().str.strip().str.replace(" ", "") +
identified_addresses["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
)
asset_list["merge_key3"] = (
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Address Line 2"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
).str.replace(',', '').str.replace('.', '')
identified_addresses["merge_key2"] = (
identified_addresses["No."].astype(str).str.lower().str.strip().str.replace(" ", "") +
identified_addresses["Address"].astype(str).str.lower().str.strip().str.replace(" ", "")
)
asset_list["merge_key4"] = (
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Address Line 2"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Address Line 4"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
).str.replace(',', '').str.replace('.', '')
else:
asset_list["merge_key"] = (
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
)
identified_addresses["merge_key"] = (
identified_addresses["Address"].astype(str).str.lower().str.strip().str.replace(" ", "") +
identified_addresses["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
).str.replace(',', '').str.replace('.', '')
# We check for duplicated identified addresses and in the asset list
identified_dupes = identified_addresses["merge_key"].duplicated()
if identified_dupes.sum():
logger.warning("We have %s duplicated identified addresses that will be dropped", identified_dupes.sum())
dropped_identified_merge_keys.extend(identified_addresses[identified_dupes]["merge_key"].tolist())
identified_addresses = identified_addresses.drop_duplicates("merge_key")
@ -234,7 +236,7 @@ def merge_ha_15(asset_list, identified_addresses, ha):
# Merge the asset list onto the identified addresses
merged_data = pd.merge(
asset_list,
identified_addresses.drop(columns="merge_key2"),
identified_addresses,
how="left",
left_on="merge_key",
right_on="merge_key",
@ -244,19 +246,50 @@ def merge_ha_15(asset_list, identified_addresses, ha):
if merged_data.shape[0] != starting_rows:
raise ValueError("Row numbers have changed")
merged_data = merged_data.merge(
identified_addresses.drop(columns="merge_key"),
# merge on the second merge key
merged_data = pd.merge(
merged_data,
identified_addresses,
how="left",
left_on="merge_key2",
right_on="merge_key2",
right_on="merge_key",
suffixes=("", "_identified_addresses2")
)
if merged_data.shape[0] != starting_rows:
raise ValueError("Row numbers have changed")
# merge on the third merge key
merged_data = pd.merge(
merged_data,
identified_addresses,
how="left",
left_on="merge_key3",
right_on="merge_key",
suffixes=("", "_identified_addresses3")
)
if merged_data.shape[0] != starting_rows:
raise ValueError("Row numbers have changed")
# merge on the fourth merge key
merged_data = pd.merge(
merged_data,
identified_addresses,
how="left",
left_on="merge_key4",
right_on="merge_key",
suffixes=("", "_identified_addresses4")
)
if merged_data.shape[0] != starting_rows:
raise ValueError("Row numbers have changed")
merged_data["identified"] = (
merged_data["Postcode_identified_addresses"].notnull() | merged_data["Postcode_identified_addresses2"].notnull()
merged_data["Postcode_identified_addresses"].notnull() |
merged_data["Postcode_identified_addresses2"].notnull() |
merged_data["Postcode_identified_addresses3"].notnull() |
merged_data["Postcode_identified_addresses4"].notnull()
)
# HA 32 issues:
@ -265,7 +298,9 @@ def merge_ha_15(asset_list, identified_addresses, ha):
missed = identified_addresses[
~identified_addresses["merge_key"].isin(merged_data["merge_key"]) &
~identified_addresses["merge_key2"].isin(merged_data["merge_key2"])
~identified_addresses["merge_key"].isin(merged_data["merge_key2"]) &
~identified_addresses["merge_key"].isin(merged_data["merge_key3"]) &
~identified_addresses["merge_key"].isin(merged_data["merge_key4"])
]
if ha == "ha32":
@ -274,19 +309,21 @@ def merge_ha_15(asset_list, identified_addresses, ha):
missed.shape
m1 = missed[missed["Address"].str.contains("Hessle")]
m1.head()
[x for x in m1["merge_key"] if x in asset_list["merge_key"].tolist()]
[x for x in m1["merge_key2"] if x in asset_list["merge_key2"].tolist()]
missed["Address"].unique()
z = merged_data[merged_data["Street"].str.contains("Hessle") & ~merged_data["identified"]]
len([m for m in missed["Address"].unique() if "Mary Mac" in m])
identified_addresses[identified_addresses["Address"].str.contains("Barringhton")]
asset_list[asset_list["Street"].str.contains("Hessle")]
a = identified_addresses[
identified_addresses["Address"].str.contains("103 Priory Crescent")
]
b = asset_list[
asset_list["Address Line 1"].str.contains("103 Priory Crescent")
]
a["merge_key"]
b["merge_key"]
b["merge_key2"]
b["merge_key3"]
identified_addresses["merge_key"].isin(merged_data["merge_key"])
@ -317,4 +354,4 @@ def app():
ha32_asset_list, ha15_asset_list, ha32_identified_addresses, ha15_identified_addresses = load_data()
ha32 = marge_ha_32(asset_list=ha32_asset_list, identified_addresses=ha32_identified_addresses)
ha15 = merge_ha_15(asset_list=ha15_asset_list, identified_addresses=ha15_identified_addresses, ha="ha15")
ha15 = merge_ha_15(asset_list=ha15_asset_list, identified_addresses=ha15_identified_addresses)