working on land registry matches

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-27 22:37:13 +01:00
parent 971a74017e
commit a2a5094b01

View file

@ -345,9 +345,6 @@ def app():
company_ownership["Postcode"].str.lower().isin(properties["POSTCODE"].str.lower().unique())
]
# Read in land registry
land_registry = pd.read_csv("/Users/khalimconn-kowlessar/Downloads/land_registry_prices_paid_filtered.csv")
# Now we filter properties the other way around
properties = properties[properties["POSTCODE"].str.lower().isin(company_ownership["Postcode"].str.lower().unique())]
# We end up with 7.4k entries on a postcode match, however we now need to do a direct address match
@ -485,14 +482,167 @@ def app():
# leasehold_matching_lookup = remove_duplicate_matches(leasehold_matching_lookup, properties, company_ownership)
matched_addresses = combined_matching_lookup.merge(
properties[["UPRN", "ADDRESS", "CURRENT_ENERGY_EFFICIENCY", "CURRENT_ENERGY_RATING"]].rename(
columns={"ADDRESS": "epc_address"}),
properties[
[
"UPRN",
"ADDRESS",
"ADDRESS1",
"CURRENT_ENERGY_EFFICIENCY",
"CURRENT_ENERGY_RATING",
"POSTCODE"
]
].rename(
columns={
"ADDRESS": "epc_address",
"ADDRESS1": "epc_address1",
"POSTCODE": "epc_postcode"
}
),
how="left", on="UPRN"
).merge(
company_ownership[["Title Number", "Property Address", "Company Registration No. (1)", "Proprietor Name (1)"]],
company_ownership[
[
"Title Number",
"Property Address",
"Postcode",
"Company Registration No. (1)",
"Proprietor Name (1)",
]
],
how="left", on="Title Number"
)
# Let's try and get the house number
matched_addresses["house_number"] = (
matched_addresses["epc_address"]
.apply(remove_text_in_brackets)
.apply(SearchEpc.get_house_number)
.str.lower()
.str.replace(",", "")
)
# Read in land registry
land_registry = pd.read_csv(
"/Users/khalimconn-kowlessar/Downloads/land_registry_prices_paid_filtered.csv",
)
# We now perform a match between the land registry data and the matched address, in an attempt to find
# out when these properties last sold. The land registry data has been pre filtered on the postcodes in this
# data, and for sales within the last 5 years, to ensure the file isn't too large.
land_registry["postcode"] = land_registry["postcode"].str.lower().str.strip()
land_registry["street"] = land_registry["street"].str.lower().str.strip()
land_registry["paon"] = land_registry["paon"].str.lower().str.strip()
land_registry["date_of_transfer"] = pd.to_datetime(land_registry["date_of_transfer"])
def is_substring(x, match_string):
    """Return True when ``x`` occurs inside ``match_string``, ignoring case.

    Args:
        x: The candidate fragment (e.g. a land registry street or saon value).
        match_string: The text to search within (e.g. an EPC address).

    Returns:
        False when either argument is missing (None/NaN); otherwise whether
        the lower-cased fragment is contained in the lower-cased text.
    """
    # A missing value on either side can never be a match; guarding the
    # haystack as well avoids calling .lower() on a NaN float.
    if pd.isnull(x) or pd.isnull(match_string):
        return False
    # Lower-case both sides: previously only the haystack was folded, so a
    # needle containing capitals could never be found.
    return str(x).lower() in str(match_string).lower()
def house_number_match(paon, house_number):
    """Decide whether a land registry paon matches an extracted house number.

    Both values are compared as integers when both can be converted, so that
    e.g. "012" and "12" match; otherwise they are compared for plain equality
    (which covers values such as "12a").

    Args:
        paon: The paon value from the land registry row.
        house_number: The house number extracted from the EPC address.

    Returns:
        True when the two values match, False otherwise. Missing values
        (None/NaN) never match anything, including each other.
    """
    # Two missing values must not be treated as the same house number.
    if pd.isnull(paon) or pd.isnull(house_number):
        return False
    # Firstly try to convert both to numeric
    try:
        return int(paon) == int(house_number)
    except (TypeError, ValueError):
        # If we can't convert both to numeric, fall back to an equality check
        return paon == house_number
def check_equalities(lr_filtered):
    """Report whether every row shares the first row's paon, saon and street.

    Args:
        lr_filtered: A non-empty frame with "paon", "saon" and "street" columns.

    Returns:
        A tuple ``(all_paon_equal, all_saon_equal, all_street_equal)``.
        For saon, a missing first value counts as equal only when every saon
        in the frame is missing.
    """

    def same_as_first(column):
        # Compare each entry of the column against the entry in the first row.
        series = lr_filtered[column]
        return bool(series.eq(series.iloc[0]).all())

    paon_uniform = same_as_first("paon")

    saon = lr_filtered["saon"]
    if pd.isnull(saon.iloc[0]):
        # A plain equality test against a missing value is never true, so
        # check instead that the whole column is missing.
        saon_uniform = bool(saon.isnull().all())
    else:
        saon_uniform = same_as_first("saon")

    street_uniform = same_as_first("street")
    return paon_uniform, saon_uniform, street_uniform
land_registry_matches = []
for _, match in tqdm(matched_addresses.iterrows(), total=len(matched_addresses)):
# Filter land registry on the postcode
lr_filtered = land_registry[
(land_registry["postcode"] == match["epc_postcode"].lower().strip())
]
# Filter further, keeping rows where the street is in the address
# street should be contained in epc_address
lr_filtered = lr_filtered[
lr_filtered["street"].apply(lambda x: is_substring(x, match["epc_address"].lower()))
]
if lr_filtered.empty:
continue
# We now check if paon is in address 1
lr_filtered["paon_match"] = lr_filtered["paon"].apply(lambda x: house_number_match(x, match["house_number"]))
# We also try the secondary match
lr_filtered["saon_match"] = lr_filtered["saon"].apply(
lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address1"])
)
# We filter where we have a primary or secondary match
lr_filtered = lr_filtered[
lr_filtered["paon_match"] | lr_filtered["saon_match"]
]
if lr_filtered.empty:
continue
elif lr_filtered.shape[0] == 1:
land_registry_matches.append(
{
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
}
)
continue
elif lr_filtered.shape[0] > 1:
# We make sure all records are the same and take the newest
all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered)
has_paon_match = any(lr_filtered["paon_match"])
if all_paon_equal and all_street_equal and all_saon_equal:
# Take the newest record, append and continue
lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
lr_filtered = lr_filtered.head(1)
land_registry_matches.append(
{
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
}
)
elif has_paon_match and all_street_equal:
# Perform filter on paon
lr_filtered = lr_filtered[lr_filtered["paon_match"]]
# Do an additional equality check
all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered)
if all_paon_equal and all_street_equal and all_saon_equal:
lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
lr_filtered = lr_filtered.head(1)
land_registry_matches.append(
{
"transaction_id": lr_filtered['transaction_id'].values[0],
"price": lr_filtered["price"].values[0],
"date_of_transfer": lr_filtered["date_of_transfer"].values[0],
}
)
else:
raise NotImplementedError("wtf")
else:
raise NotImplementedError("wtf")
else:
raise NotImplementedError("What happened here?")
# shared_freehold_match = pd.DataFrame(shared_freehold_match)
# Store these files
# freehold_matching_lookup.to_excel("freehold_matching_lookup.xlsx")