adding to land registry matching logic

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-28 15:21:05 +01:00
parent a2a5094b01
commit 2174a85a8b

View file

@ -357,6 +357,8 @@ def app():
properties = properties.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")
# TODO: Do we want to filter properties based on lodgement dates?
# E.g. we might want to filter properties that have had a sale EPC lodged in the last x months, because
# this could be indicative of a sale happening, and the land registry data may not have caught up yet
# Remove entries where the address begins with the term "land adjoining", or other records that don't reference
# the property itself
@ -456,13 +458,9 @@ def app():
freehold_matching_lookup = pd.DataFrame(freehold_matching_lookup)
leasehold_matching_lookup = pd.DataFrame(leasehold_matching_lookup)
# shared_leasehold_match = pd.concat(shared_leasehold_match)
# shared_freehold_match = pd.concat(shared_freehold_match)
# freehold_matching_lookup.to_excel("freehold_matching_lookup_new.xlsx")
# leasehold_matching_lookup.to_excel("leasehold_matching_lookup_new.xlsx")
# shared_leasehold_match.to_excel("shared_leasehold_match_new.xlsx")
# shared_freehold_match.to_excel("shared_freehold_match_new.xlsx")
# freehold_matching_lookup.to_excel("freehold_matching_lookup V2.xlsx")
# leasehold_matching_lookup.to_excel("leasehold_matching_lookup V2.xlsx")
# The approximate matches aren't very good
freehold_matching_lookup = freehold_matching_lookup[freehold_matching_lookup["match_type"] == "exact"]
@ -477,10 +475,6 @@ def app():
# We also have duplicates at a UPRN level
combined_matching_lookup = remove_duplicate_uprn_matches(combined_matching_lookup, properties, company_ownership)
# There are some cases where we have duplicates
# freehold_matching_lookup = remove_duplicate_matches(freehold_matching_lookup, properties, company_ownership)
# leasehold_matching_lookup = remove_duplicate_matches(leasehold_matching_lookup, properties, company_ownership)
matched_addresses = combined_matching_lookup.merge(
properties[
[
@ -534,6 +528,7 @@ def app():
land_registry["postcode"] = land_registry["postcode"].str.lower().str.strip()
land_registry["street"] = land_registry["street"].str.lower().str.strip()
land_registry["paon"] = land_registry["paon"].str.lower().str.strip()
land_registry["saon"] = land_registry["saon"].str.lower().str.strip()
land_registry["date_of_transfer"] = pd.to_datetime(land_registry["date_of_transfer"])
def is_substring(x, match_string):
@ -576,8 +571,9 @@ def app():
# Filter further, when the street is in the address
# street should be contained in epc_address
lr_filtered = lr_filtered[
lr_filtered["street"].apply(lambda x: is_substring(x, match["epc_address"].lower()))
]
lr_filtered["street"].apply(lambda x: is_substring(x, match["epc_address"].lower())) |
lr_filtered["street"].apply(lambda x: is_substring(x, match["Property Address"].lower()))
]
if lr_filtered.empty:
continue
@ -585,10 +581,11 @@ def app():
# We now check if paon is in address 1
lr_filtered["paon_match"] = lr_filtered["paon"].apply(lambda x: house_number_match(x, match["house_number"]))
# We also try the secondary match
lr_filtered["saon_match"] = lr_filtered["saon"].apply(
lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address1"])
lr_filtered["saon_match"] = (
lr_filtered["saon"].apply(
lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address1"])
)
)
# We filter where we have a primary or secondary match
lr_filtered = lr_filtered[
lr_filtered["paon_match"] | lr_filtered["saon_match"]
@ -599,6 +596,7 @@ def app():
elif lr_filtered.shape[0] == 1:
land_registry_matches.append(
{
"uprn": match["UPRN"],
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
@ -616,11 +614,13 @@ def app():
lr_filtered = lr_filtered.head(1)
land_registry_matches.append(
{
"uprn": match["UPRN"],
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
}
)
continue
elif has_paon_match and all_street_equal:
# Perform filter on paon
lr_filtered = lr_filtered[lr_filtered["paon_match"]]
@ -631,15 +631,92 @@ def app():
lr_filtered = lr_filtered.head(1)
land_registry_matches.append(
{
"uprn": match["UPRN"],
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
}
)
else:
raise NotImplementedError("wtf")
# We do a match on saon
lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address"])
)
lr_filtered = lr_filtered[lr_filtered["saon_match2"]]
if lr_filtered.empty:
continue
elif lr_filtered.shape[0] == 1:
land_registry_matches.append(
{
"uprn": match["UPRN"],
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
}
)
continue
else:
raise NotImplementedError("wtf")
else:
raise NotImplementedError("wtf")
# We have a final check, based on an observed case
lr_address_1 = " ".join([x.lower().strip() for x in match["Property Address"].split(",")[0:2]])
lr_filtered["paon_match2"] = lr_filtered["paon"].apply(
lambda x: False if pd.isnull(x) else is_substring(x, lr_address_1)
)
lr_filtered = lr_filtered[lr_filtered["paon_match2"]]
if lr_filtered.empty:
continue
elif lr_filtered.shape[0] == 1:
land_registry_matches.append(
{
"uprn": match["UPRN"],
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
}
)
continue
else:
# Check all the same
all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered)
# Check saon is house number with exact match
lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
lambda x: False if pd.isnull(x) else house_number_match(x, match["house_number"])
)
if all_paon_equal and all_saon_equal and all_street_equal:
# Take the newest record
lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
lr_filtered = lr_filtered.head(1)
land_registry_matches.append(
{
"uprn": match["UPRN"],
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
}
)
continue
elif any(lr_filtered["saon_match2"]):
lr_filtered = lr_filtered[lr_filtered["saon_match2"]]
if lr_filtered.shape[0] == 1:
land_registry_matches.append(
{
"uprn": match["UPRN"],
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
}
)
continue
raise NotImplementedError("wtf")
else:
raise NotImplementedError("What happened here?")