mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
finished land registry client for the moment
This commit is contained in:
parent
c7075c5432
commit
f61b7d371d
2 changed files with 42 additions and 10 deletions
|
|
@ -32,13 +32,13 @@ class LandRegistryClient:
|
|||
# A score of 70-100 is a high match
|
||||
SIMILARITY_THRESHOLD = 70
|
||||
|
||||
def __init__(self, paths: List[str], addresses: List[Dict[str: str]]):
|
||||
def __init__(self, paths: List[str], addresses: List[Dict[str, str]]):
|
||||
self.paths = paths
|
||||
self.addresses = pd.DataFrame(addresses)
|
||||
|
||||
translation_table = str.maketrans("", "", string.punctuation)
|
||||
# Use the translation table to remove punctuation from the text
|
||||
self.addresses['address'] = self.addresses['address'].str.translate(translation_table)
|
||||
self.addresses['address_match'] = self.addresses['address'].str.upper().str.translate(translation_table)
|
||||
|
||||
def read(self):
|
||||
logger.info("Reading in land registry data")
|
||||
|
|
@ -61,19 +61,48 @@ class LandRegistryClient:
|
|||
res = res[(res["ppd_category_type"] == "A") & (res["property_type"] != "O")]
|
||||
# Construct address
|
||||
res['address'] = res[
|
||||
['paon', 'saon', 'street', 'locality', 'town_city', 'district', 'county']
|
||||
['saon', 'paon', 'street', 'locality']
|
||||
].fillna('').agg(' '.join, axis=1)
|
||||
|
||||
res["address1_land_registry"] = res[
|
||||
['paon', 'street']
|
||||
].fillna('').agg(' '.join, axis=1)
|
||||
|
||||
# We now want to fuzzy match between res and self.addresses on postcode and take the
|
||||
# best fuzzy match
|
||||
res = res.merge(self.addresses, how="left", on="postcode", suffixes=("_land_registry", "_epc"))
|
||||
|
||||
res = res[res["address_epc"].str.contains(res["paon"])]
|
||||
res = res[
|
||||
((res["address1_land_registry"] == res["address1"]) |
|
||||
(res["address1_land_registry"] == res["address2"]))
|
||||
]
|
||||
|
||||
res = res[res['address_epc'].str.contains(res['paon'])]
|
||||
res = res[res.apply(lambda row: row['paon'] in row['address_match'], axis=1)]
|
||||
res = res[
|
||||
res.apply(lambda row: row['saon'] in row['address_match'] if not pd.isnull(row["saon"]) else False, axis=1)
|
||||
]
|
||||
|
||||
res = res[res.apply(lambda row: row['paon'] in row['address_epc'], axis=1)]
|
||||
res['match_similarity'] = np.vectorize(fuzz.ratio)(res['address_land_registry'], res['address_match'])
|
||||
|
||||
res['match_similarity'] = np.vectorize(fuzz.ratio)(res['address_land_registry'], res['address_epc'])
|
||||
res = res[res["match_similarity"] >= self.SIMILARITY_THRESHOLD]
|
||||
|
||||
res2 = res[res["match_similarity"] >= self.SIMILARITY_THRESHOLD]
|
||||
# Take the largest match_similarity for each id
|
||||
res = (
|
||||
res.sort_values("match_similarity", ascending=False)
|
||||
.groupby("id", as_index=False)
|
||||
.head(1)
|
||||
)
|
||||
|
||||
# Drop extra stuff
|
||||
res = res[
|
||||
[
|
||||
"price", "date_of_transfer", "property_type", "old_new", "duration", "ppd_category_type",
|
||||
"record_status",
|
||||
"uprn",
|
||||
"address_epc"
|
||||
]
|
||||
].rename(
|
||||
columns={"address_epc": "address"}
|
||||
)
|
||||
|
||||
return res
|
||||
|
|
|
|||
|
|
@ -49,7 +49,10 @@ def handler():
|
|||
{
|
||||
"postcode": x["postcode"].upper(),
|
||||
"address1": x["address1"].upper(),
|
||||
"address": x["address"].upper()
|
||||
"address2": x["address2"].upper(),
|
||||
"address3": x["address3"].upper(),
|
||||
"address": x["address"],
|
||||
"uprn": x["uprn"]
|
||||
} for x in data
|
||||
]
|
||||
|
||||
|
|
@ -103,7 +106,7 @@ def handler():
|
|||
|
||||
land_registry_client = LandRegistryClient(
|
||||
paths=[
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv"
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2022 (1).csv",
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2021.csv",
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2020.csv",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue