From f61b7d371da958ecc2c00b8792c4024da49e8a7d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 16 Jun 2023 09:17:46 +0100 Subject: [PATCH] finished land registry client for the moment --- model_data/LandRegistryClient.py | 45 ++++++++++++++++++++++++++------ model_data/app.py | 7 +++-- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/model_data/LandRegistryClient.py b/model_data/LandRegistryClient.py index 67dfc0d3..7a025f4a 100644 --- a/model_data/LandRegistryClient.py +++ b/model_data/LandRegistryClient.py @@ -32,13 +32,13 @@ class LandRegistryClient: # A score of 70-100 is a high match SIMILARITY_THRESHOLD = 70 - def __init__(self, paths: List[str], addresses: List[Dict[str: str]]): + def __init__(self, paths: List[str], addresses: List[Dict[str, str]]): self.paths = paths self.addresses = pd.DataFrame(addresses) translation_table = str.maketrans("", "", string.punctuation) # Use the translation table to remove punctuation from the text - self.addresses['address'] = self.addresses['address'].str.translate(translation_table) + self.addresses['address_match'] = self.addresses['address'].str.upper().str.translate(translation_table) def read(self): logger.info("Reading in land registry data") @@ -61,19 +61,48 @@ class LandRegistryClient: res = res[(res["ppd_category_type"] == "A") & (res["property_type"] != "O")] # Construct address res['address'] = res[ - ['paon', 'saon', 'street', 'locality', 'town_city', 'district', 'county'] + ['saon', 'paon', 'street', 'locality'] + ].fillna('').agg(' '.join, axis=1) + + res["address1_land_registry"] = res[ + ['paon', 'street'] ].fillna('').agg(' '.join, axis=1) # We now want to fuzzy match between res and self.addresses on postcode and take the # best fuzzy match res = res.merge(self.addresses, how="left", on="postcode", suffixes=("_land_registry", "_epc")) - res = res[res["address_epc"].str.contains(res["paon"])] + res = res[ + ((res["address1_land_registry"] == res["address1"]) | + (res["address1_land_registry"] == res["address2"])) + ] - res = res[res['address_epc'].str.contains(res['paon'])] + res = res[res.apply(lambda row: row['paon'] in row['address_match'], axis=1)] + res = res[ + res.apply(lambda row: row['saon'] in row['address_match'] if not pd.isnull(row["saon"]) else False, axis=1) + ] - res = res[res.apply(lambda row: row['paon'] in row['address_epc'], axis=1)] + res['match_similarity'] = np.vectorize(fuzz.ratio)(res['address_land_registry'], res['address_match']) - res['match_similarity'] = np.vectorize(fuzz.ratio)(res['address_land_registry'], res['address_epc']) + res = res[res["match_similarity"] >= self.SIMILARITY_THRESHOLD] - res2 = res[res["match_similarity"] >= self.SIMILARITY_THRESHOLD] + # Take the largest match_similarity for each id + res = ( + res.sort_values("match_similarity", ascending=False) + .groupby("id", as_index=False) + .head(1) + ) + + # Drop extra stuff + res = res[ + [ + "price", "date_of_transfer", "property_type", "old_new", "duration", "ppd_category_type", + "record_status", + "uprn", + "address_epc" + ] + ].rename( + columns={"address_epc": "address"} + ) + + return res diff --git a/model_data/app.py b/model_data/app.py index 6d900001..424c19ca 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -49,7 +49,10 @@ def handler(): { "postcode": x["postcode"].upper(), "address1": x["address1"].upper(), - "address": x["address"].upper() + "address2": x["address2"].upper(), + "address3": x["address3"].upper(), + "address": x["address"], + "uprn": x["uprn"] } for x in data ] @@ -103,7 +106,7 @@ def handler(): land_registry_client = LandRegistryClient( paths=[ - os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv" + os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv", os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2022 (1).csv", os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2021.csv", os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2020.csv",