mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
cleaning up ha15 merging process
This commit is contained in:
parent
d457ab46fb
commit
a6c8ca0e1d
1 changed files with 96 additions and 59 deletions
|
|
@ -33,6 +33,11 @@ def marge_ha_32(asset_list, identified_addresses):
|
|||
This method merges the asset list onto the list of identified addresses, forming a singular file for ha32
|
||||
"""
|
||||
|
||||
dropped_identified_merge_keys = []
|
||||
|
||||
# ha32 starts with 1418 rows
|
||||
starting_rows = len(asset_list)
|
||||
|
||||
# We update how the Coxwold are listed in the identified addresses
|
||||
identified_addresses["Address"] = np.where(
|
||||
identified_addresses["Address"] == "Coxwold",
|
||||
|
|
@ -68,11 +73,6 @@ def marge_ha_32(asset_list, identified_addresses):
|
|||
identified_addresses["Address"]
|
||||
)
|
||||
|
||||
dropped_identified_merge_keys = []
|
||||
|
||||
# ha32 starts with 1418 rows
|
||||
starting_rows = len(asset_list)
|
||||
|
||||
asset_list["merge_key"] = (
|
||||
asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
|
|
@ -158,62 +158,64 @@ def marge_ha_32(asset_list, identified_addresses):
|
|||
# TODO: Finish me
|
||||
|
||||
|
||||
def merge_ha_15(asset_list, identified_addresses, ha):
|
||||
def merge_ha_15(asset_list, identified_addresses):
|
||||
"""
|
||||
This method merges the asset list onto the list of identified addresses, forming a singular file
|
||||
"""
|
||||
|
||||
if ha not in ["ha32", "ha15"]:
|
||||
raise ValueError("ha must be either ha32 or ha15")
|
||||
|
||||
if ha == "ha32":
|
||||
|
||||
|
||||
else:
|
||||
raise NotImplementedError("We haven't implemented HA15 yet")
|
||||
|
||||
dropped_identified_merge_keys = []
|
||||
dropped_asset_list_merge_keys = []
|
||||
|
||||
# Update how Mary Mac Manus Drive, Milton Keynes is listed in the identified addresses
|
||||
identified_addresses["Address"] = identified_addresses["Address"].str.replace(
|
||||
"Mary Mac Manus Drive, Milton Keynes", "Mary Mac Manus Drive"
|
||||
)
|
||||
|
||||
# This address has the wrong postcode in the original asset list
|
||||
asset_list["Postcode"] = np.where(
|
||||
asset_list["Address Line 1"] == "103 Priory Crescent",
|
||||
"HP19 9NY",
|
||||
asset_list["Postcode"]
|
||||
)
|
||||
|
||||
# ha32 starts with 1418 rows
|
||||
# HA15 starts with 7665 rows
|
||||
starting_rows = len(asset_list)
|
||||
|
||||
# We create a merge key on both files, based on concatenated, processed columns
|
||||
if ha == "ha32":
|
||||
asset_list["merge_key"] = (
|
||||
asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
)
|
||||
asset_list["merge_key"] = (
|
||||
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
).str.replace(',', '').str.replace('.', '')
|
||||
|
||||
asset_list["merge_key2"] = (
|
||||
asset_list["Dwelling num"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Street"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
)
|
||||
asset_list["merge_key2"] = (
|
||||
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Address Line 2"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Address Line 3"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
).str.replace(',', '').str.replace('.', '')
|
||||
|
||||
identified_addresses["merge_key"] = (
|
||||
identified_addresses["No."].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
identified_addresses["Address"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
identified_addresses["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
)
|
||||
asset_list["merge_key3"] = (
|
||||
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Address Line 2"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
).str.replace(',', '').str.replace('.', '')
|
||||
|
||||
identified_addresses["merge_key2"] = (
|
||||
identified_addresses["No."].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
identified_addresses["Address"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
)
|
||||
asset_list["merge_key4"] = (
|
||||
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Address Line 2"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Address Line 4"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
).str.replace(',', '').str.replace('.', '')
|
||||
|
||||
else:
|
||||
asset_list["merge_key"] = (
|
||||
asset_list["Address Line 1"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
asset_list["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
)
|
||||
identified_addresses["merge_key"] = (
|
||||
identified_addresses["Address"].astype(str).str.lower().str.strip().str.replace(" ", "") +
|
||||
identified_addresses["Postcode"].astype(str).str.lower().str.strip().str.replace(" ", "")
|
||||
).str.replace(',', '').str.replace('.', '')
|
||||
|
||||
# We check for duplicated identified addresses and in the asset list
|
||||
|
||||
identified_dupes = identified_addresses["merge_key"].duplicated()
|
||||
if identified_dupes.sum():
|
||||
logger.warning("We have %s duplicated identified addresses that will be dropped", identified_dupes.sum())
|
||||
|
||||
dropped_identified_merge_keys.extend(identified_addresses[identified_dupes]["merge_key"].tolist())
|
||||
|
||||
identified_addresses = identified_addresses.drop_duplicates("merge_key")
|
||||
|
|
@ -234,7 +236,7 @@ def merge_ha_15(asset_list, identified_addresses, ha):
|
|||
# Merge the asset list onto the identified addresses
|
||||
merged_data = pd.merge(
|
||||
asset_list,
|
||||
identified_addresses.drop(columns="merge_key2"),
|
||||
identified_addresses,
|
||||
how="left",
|
||||
left_on="merge_key",
|
||||
right_on="merge_key",
|
||||
|
|
@ -244,19 +246,50 @@ def merge_ha_15(asset_list, identified_addresses, ha):
|
|||
if merged_data.shape[0] != starting_rows:
|
||||
raise ValueError("Row numbers have changed")
|
||||
|
||||
merged_data = merged_data.merge(
|
||||
identified_addresses.drop(columns="merge_key"),
|
||||
# merge on the second merge key
|
||||
merged_data = pd.merge(
|
||||
merged_data,
|
||||
identified_addresses,
|
||||
how="left",
|
||||
left_on="merge_key2",
|
||||
right_on="merge_key2",
|
||||
right_on="merge_key",
|
||||
suffixes=("", "_identified_addresses2")
|
||||
)
|
||||
|
||||
if merged_data.shape[0] != starting_rows:
|
||||
raise ValueError("Row numbers have changed")
|
||||
|
||||
# merge on the third merge key
|
||||
merged_data = pd.merge(
|
||||
merged_data,
|
||||
identified_addresses,
|
||||
how="left",
|
||||
left_on="merge_key3",
|
||||
right_on="merge_key",
|
||||
suffixes=("", "_identified_addresses3")
|
||||
)
|
||||
|
||||
if merged_data.shape[0] != starting_rows:
|
||||
raise ValueError("Row numbers have changed")
|
||||
|
||||
# merge on the fourth merge key
|
||||
merged_data = pd.merge(
|
||||
merged_data,
|
||||
identified_addresses,
|
||||
how="left",
|
||||
left_on="merge_key4",
|
||||
right_on="merge_key",
|
||||
suffixes=("", "_identified_addresses4")
|
||||
)
|
||||
|
||||
if merged_data.shape[0] != starting_rows:
|
||||
raise ValueError("Row numbers have changed")
|
||||
|
||||
merged_data["identified"] = (
|
||||
merged_data["Postcode_identified_addresses"].notnull() | merged_data["Postcode_identified_addresses2"].notnull()
|
||||
merged_data["Postcode_identified_addresses"].notnull() |
|
||||
merged_data["Postcode_identified_addresses2"].notnull() |
|
||||
merged_data["Postcode_identified_addresses3"].notnull() |
|
||||
merged_data["Postcode_identified_addresses4"].notnull()
|
||||
)
|
||||
|
||||
# HA 32 issues:
|
||||
|
|
@ -265,7 +298,9 @@ def merge_ha_15(asset_list, identified_addresses, ha):
|
|||
|
||||
missed = identified_addresses[
|
||||
~identified_addresses["merge_key"].isin(merged_data["merge_key"]) &
|
||||
~identified_addresses["merge_key2"].isin(merged_data["merge_key2"])
|
||||
~identified_addresses["merge_key"].isin(merged_data["merge_key2"]) &
|
||||
~identified_addresses["merge_key"].isin(merged_data["merge_key3"]) &
|
||||
~identified_addresses["merge_key"].isin(merged_data["merge_key4"])
|
||||
]
|
||||
|
||||
if ha == "ha32":
|
||||
|
|
@ -274,19 +309,21 @@ def merge_ha_15(asset_list, identified_addresses, ha):
|
|||
|
||||
missed.shape
|
||||
|
||||
m1 = missed[missed["Address"].str.contains("Hessle")]
|
||||
|
||||
m1.head()
|
||||
|
||||
[x for x in m1["merge_key"] if x in asset_list["merge_key"].tolist()]
|
||||
[x for x in m1["merge_key2"] if x in asset_list["merge_key2"].tolist()]
|
||||
|
||||
missed["Address"].unique()
|
||||
|
||||
z = merged_data[merged_data["Street"].str.contains("Hessle") & ~merged_data["identified"]]
|
||||
len([m for m in missed["Address"].unique() if "Mary Mac" in m])
|
||||
|
||||
identified_addresses[identified_addresses["Address"].str.contains("Barringhton")]
|
||||
asset_list[asset_list["Street"].str.contains("Hessle")]
|
||||
a = identified_addresses[
|
||||
identified_addresses["Address"].str.contains("103 Priory Crescent")
|
||||
]
|
||||
b = asset_list[
|
||||
asset_list["Address Line 1"].str.contains("103 Priory Crescent")
|
||||
]
|
||||
|
||||
a["merge_key"]
|
||||
b["merge_key"]
|
||||
b["merge_key2"]
|
||||
b["merge_key3"]
|
||||
|
||||
identified_addresses["merge_key"].isin(merged_data["merge_key"])
|
||||
|
||||
|
|
@ -317,4 +354,4 @@ def app():
|
|||
ha32_asset_list, ha15_asset_list, ha32_identified_addresses, ha15_identified_addresses = load_data()
|
||||
|
||||
ha32 = marge_ha_32(asset_list=ha32_asset_list, identified_addresses=ha32_identified_addresses)
|
||||
ha15 = merge_ha_15(asset_list=ha15_asset_list, identified_addresses=ha15_identified_addresses, ha="ha15")
|
||||
ha15 = merge_ha_15(asset_list=ha15_asset_list, identified_addresses=ha15_identified_addresses)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue