mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
working on land registry matches
This commit is contained in:
parent
971a74017e
commit
a2a5094b01
1 changed files with 156 additions and 6 deletions
|
|
@ -345,9 +345,6 @@ def app():
|
|||
company_ownership["Postcode"].str.lower().isin(properties["POSTCODE"].str.lower().unique())
|
||||
]
|
||||
|
||||
# Read in land registry
|
||||
land_registry = pd.read_csv("/Users/khalimconn-kowlessar/Downloads/land_registry_prices_paid_filtered.csv")
|
||||
|
||||
# Restrict the properties to those whose postcode (compared in lower case)
# also appears in the company ownership data.
ownership_postcodes = company_ownership["Postcode"].str.lower().unique()
postcode_is_owned = properties["POSTCODE"].str.lower().isin(ownership_postcodes)
properties = properties[postcode_is_owned]
|
||||
# We end up with 7.4k entries on a postcode match; however, we now need to do a direct address match
|
||||
|
|
@ -485,14 +482,167 @@ def app():
|
|||
# leasehold_matching_lookup = remove_duplicate_matches(leasehold_matching_lookup, properties, company_ownership)
|
||||
|
||||
# Enrich every matched (UPRN, Title Number) pair with the EPC details of the
# property and the ownership details of the title. Both joins are left joins,
# so a pair with no counterpart keeps its row and gets empty values in the
# joined columns.
epc_details = properties[
    [
        "UPRN",
        "ADDRESS",
        "ADDRESS1",
        "CURRENT_ENERGY_EFFICIENCY",
        "CURRENT_ENERGY_RATING",
        "POSTCODE",
    ]
].rename(
    # Prefix the EPC address fields so they cannot be confused with the
    # ownership columns ("Property Address", "Postcode") joined below.
    columns={
        "ADDRESS": "epc_address",
        "ADDRESS1": "epc_address1",
        "POSTCODE": "epc_postcode",
    }
)

ownership_details = company_ownership[
    [
        "Title Number",
        "Property Address",
        "Postcode",
        "Company Registration No. (1)",
        "Proprietor Name (1)",
    ]
]

matched_addresses = combined_matching_lookup.merge(
    epc_details, how="left", on="UPRN"
).merge(
    ownership_details, how="left", on="Title Number"
)
|
||||
|
||||
# Derive a house number for each EPC address: drop any bracketed text, let
# SearchEpc pull out the house number, then lower-case it and remove commas.
epc_address_without_brackets = matched_addresses["epc_address"].apply(remove_text_in_brackets)
extracted_house_number = epc_address_without_brackets.apply(SearchEpc.get_house_number)
matched_addresses["house_number"] = extracted_house_number.str.lower().str.replace(",", "")
|
||||
|
||||
# Read in land registry
# NOTE(review): this is an absolute path on one developer's machine - confirm
# where the file should live before running this anywhere else.
land_registry = pd.read_csv(
    "/Users/khalimconn-kowlessar/Downloads/land_registry_prices_paid_filtered.csv",
    # Read the address parts as text. Left to dtype inference, a column that
    # happens to hold only numbers (typically house numbers in "paon") comes
    # back numeric and the `.str` calls below fail, while a mixed column can
    # hold ints next to strings, which `.str.lower()` turns into NaN.
    dtype={"postcode": str, "street": str, "paon": str, "saon": str},
)

# We now perform a match between the land registry data and the matched
# addresses, in an attempt to find out when these properties last sold. The
# land registry data has been pre-filtered on the postcodes in this data, and
# for sales within the last 5 years, to ensure the file isn't too large.

# Normalise the text columns we compare against the (lower-cased) EPC fields.
for text_column in ("postcode", "street", "paon"):
    land_registry[text_column] = land_registry[text_column].str.lower().str.strip()

# Parse the transfer date so sales can be ordered newest-first later on.
land_registry["date_of_transfer"] = pd.to_datetime(land_registry["date_of_transfer"])
|
||||
|
||||
def is_substring(x, match_string):
    """Return True when ``x`` occurs inside ``match_string``, ignoring case.

    Args:
        x: The text to look for (for example a street or secondary address).
        match_string: The text to search in (for example an EPC address).

    Returns:
        bool: False when either value is missing (None/NaN); otherwise
        whether the lower-cased ``x`` is contained in the lower-cased
        ``match_string``.
    """
    # A missing value on either side can never be a match; checking both also
    # avoids calling ``.lower()`` on a NaN taken from an empty column.
    if pd.isnull(x) or pd.isnull(match_string):
        return False

    # Lower-case BOTH sides: previously only the haystack was lowered, so an
    # upper-case needle could never be found.
    return str(x).lower() in str(match_string).lower()
|
||||
|
||||
def house_number_match(paon, house_number):
    """Return True when a land registry PAON and a house number agree.

    Both values are first compared as integers, so "012" and "12" match. If
    either one cannot be converted to an integer (for example "12a" or a
    building name), they are compared for plain equality instead.

    Args:
        paon: The value from the land registry "paon" column.
        house_number: The house number derived from the EPC address.

    Returns:
        bool: Whether the two values identify the same house number. A
        missing value (None/NaN) on either side is never a match.
    """
    # Two missing values must not count as "the same house number"
    # (None == None would otherwise be True in the fallback below).
    if pd.isnull(paon) or pd.isnull(house_number):
        return False

    try:
        # Firstly try and compare numerically
        return int(paon) == int(house_number)
    except (TypeError, ValueError, OverflowError):
        # If we can't convert both to numeric, we do an equality
        return paon == house_number
|
||||
|
||||
def check_equalities(lr_filtered):
    """Report whether all rows share the same paon, saon and street.

    Each column is compared against the value in its first row. When that
    first value is missing, the column counts as equal only if every row is
    missing it too. This rule is applied to all three columns alike.

    Args:
        lr_filtered: DataFrame with "paon", "saon" and "street" columns.

    Returns:
        tuple[bool, bool, bool]: ``(all_paon_equal, all_saon_equal,
        all_street_equal)``. An empty frame yields ``(True, True, True)``.
    """

    def _all_equal(column):
        # True when every row of `column` matches the first row's value.
        values = lr_filtered[column]
        if values.empty:
            return True

        first = values.values[0]
        if pd.isnull(first):
            # NaN never compares equal to itself, so test for "all missing".
            return bool(values.isnull().all())

        return bool((values == first).all())

    return _all_equal("paon"), _all_equal("saon"), _all_equal("street")
|
||||
|
||||
def _latest_sale(candidates):
    # Build the result record from the most recently transferred row.
    latest = candidates.sort_values("date_of_transfer", ascending=False).head(1)
    return {
        "transaction_id": latest["transaction_id"].values[0],
        "price": latest["price"].values[0],
        "date_of_transfer": latest["date_of_transfer"].values[0],
    }


# For every matched address, look for land registry sales at the same
# postcode, street and house number (or secondary address) and record the
# newest one.
# NOTE(review): the stored records carry no UPRN / Title Number, so they cannot
# be joined back to `matched_addresses` - confirm whether one should be added.
land_registry_matches = []
for _, match in tqdm(matched_addresses.iterrows(), total=len(matched_addresses)):

    epc_postcode = match["epc_postcode"]
    epc_address = match["epc_address"]
    # These fields come from a left merge and can be missing; without a
    # postcode and an address there is nothing to match on.
    if pd.isnull(epc_postcode) or pd.isnull(epc_address):
        continue

    epc_postcode = epc_postcode.lower().strip()
    epc_address = epc_address.lower()
    epc_address1 = match["epc_address1"]
    if pd.isnull(epc_address1):
        epc_address1 = ""

    # Filter land registry on the postcode
    lr_filtered = land_registry[land_registry["postcode"] == epc_postcode]
    if lr_filtered.empty:
        continue

    # Filter further: the land registry street should be contained in the
    # EPC address. Copy, because match columns are added to this frame below.
    lr_filtered = lr_filtered[
        lr_filtered["street"].apply(lambda x: is_substring(x, epc_address))
    ].copy()
    if lr_filtered.empty:
        continue

    # Primary match: paon against the house number taken from the EPC address
    lr_filtered["paon_match"] = lr_filtered["paon"].apply(
        lambda x: house_number_match(x, match["house_number"])
    )
    # Secondary match: saon contained in the first EPC address line. saon is
    # not lower-cased when loaded, so lower it here; missing values never match.
    lr_filtered["saon_match"] = lr_filtered["saon"].apply(
        lambda x: isinstance(x, str) and is_substring(x.lower(), epc_address1)
    )

    # We keep the rows with a primary or secondary match
    lr_filtered = lr_filtered[lr_filtered["paon_match"] | lr_filtered["saon_match"]]
    if lr_filtered.empty:
        continue

    if lr_filtered.shape[0] > 1:
        # Several candidate sales: they must all describe the same address
        # before we take the newest.
        all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered)

        if not (all_paon_equal and all_street_equal and all_saon_equal):
            has_paon_match = any(lr_filtered["paon_match"])
            if not (has_paon_match and all_street_equal):
                raise NotImplementedError("wtf")

            # Narrow down to the primary (paon) matches and check once more
            lr_filtered = lr_filtered[lr_filtered["paon_match"]]
            all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered)
            if not (all_paon_equal and all_street_equal and all_saon_equal):
                raise NotImplementedError("wtf")

    # Exactly one address remains (one row, or several sales of it)
    land_registry_matches.append(_latest_sale(lr_filtered))
|
||||
|
||||
# shared_freehold_match = pd.DataFrame(shared_freehold_match)
|
||||
# Store these files
|
||||
# freehold_matching_lookup.to_excel("freehold_matching_lookup.xlsx")
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue