import string
from typing import Dict, List

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from tqdm import tqdm

from model_data.utils import setup_logger

logger = setup_logger()


class LandRegistryClient:
    """Match land registry sale records to a caller-supplied list of addresses.

    The client reads header-less land registry CSV files, keeps only the sales
    whose postcode appears in ``addresses``, and pairs every sale with the
    best fuzzy-matching address in the same postcode.

    Each entry of ``addresses`` must provide the keys ``address``,
    ``postcode``, ``address1``, ``address2`` and ``uprn``; all of them are
    read by :meth:`read`.  (Presumably these are EPC records, going by the
    ``_epc`` merge suffix -- confirm against the caller.)
    """

    COLUMN_NAMES = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category_type",
        "record_status",
    ]

    # A score of 70-100 is a high match
    SIMILARITY_THRESHOLD = 70

    # Address columns that are joined / substring-matched as text.  They are
    # read as ``str`` so that an all-numeric column (e.g. house numbers) is
    # not inferred as an integer column, which would break ``" ".join``.
    _TEXT_COLUMNS = ("postcode", "paon", "saon", "street", "locality")

    # Columns kept in the result of ``read`` (before ``address_epc`` is
    # renamed to ``address``).
    _OUTPUT_COLUMNS = (
        "price",
        "date_of_transfer",
        "property_type",
        "old_new",
        "duration",
        "ppd_category_type",
        "record_status",
        "uprn",
        "address_epc",
    )

    def __init__(self, paths: List[str], addresses: List[Dict[str, str]]):
        """Store the CSV paths and pre-compute a normalised address column.

        Args:
            paths: Paths of the header-less land registry CSV files to read.
            addresses: One mapping per address to match against (see the
                class docstring for the required keys).
        """
        self.paths = paths
        self.addresses = pd.DataFrame(addresses)

        # Upper-case and strip punctuation so the address can be compared
        # with the land registry address parts.
        strip_punctuation = str.maketrans("", "", string.punctuation)
        self.addresses["address_match"] = (
            self.addresses["address"].str.upper().str.translate(strip_punctuation)
        )

    def read(self) -> pd.DataFrame:
        """Read all files and return the best address match for each sale.

        Returns:
            A DataFrame with the columns ``price``, ``date_of_transfer``,
            ``property_type``, ``old_new``, ``duration``,
            ``ppd_category_type``, ``record_status``, ``uprn`` and
            ``address`` (the matched entry's original ``address``).  At most
            one row is returned per sale record.  The frame is empty when
            ``paths`` is empty or when nothing matches.
        """
        logger.info("Reading in land registry data")

        result_columns = [
            "address" if name == "address_epc" else name
            for name in self._OUTPUT_COLUMNS
        ]

        frames = [self._read_sales_file(path) for path in tqdm(self.paths)]
        if not frames:
            # pd.concat raises on an empty list; no input means no matches.
            return pd.DataFrame(columns=result_columns)

        sales = pd.concat(frames).reset_index(drop=True)
        # Stable per-sale identifier, used to pick one match per sale later.
        sales["id"] = sales.index

        # We want to remove records that were
        # 1) not sold at market value (this is when ppd_category_type is not A)
        # 2) property type is other (this is when property_type is O)
        market_value = sales["ppd_category_type"] == "A"
        not_other = sales["property_type"] != "O"
        sales = sales[market_value & not_other].copy()

        # Construct the full address and the "first line" used for matching.
        sales["address"] = self._join_columns(
            sales, ["saon", "paon", "street", "locality"]
        )
        sales["address1_land_registry"] = self._join_columns(
            sales, ["paon", "street"]
        )

        # Pair every sale with every candidate address in the same postcode;
        # the clashing ``address`` column becomes ``address_land_registry`` /
        # ``address_epc``.
        candidates = sales.merge(
            self.addresses,
            how="left",
            on="postcode",
            suffixes=("_land_registry", "_epc"),
        )

        # The land registry first line must equal one of the two address lines.
        first_line = candidates["address1_land_registry"]
        candidates = candidates[
            (first_line == candidates["address1"])
            | (first_line == candidates["address2"])
        ]

        # Both the primary (paon) and secondary (saon) names must occur in
        # the normalised address.
        # NOTE(review): a missing saon is treated as "no match", exactly as in
        # the previous implementation, so every sale without a secondary name
        # is dropped here.  That looks unintended -- confirm before changing.
        paon_found = self._contains_mask(
            candidates["paon"], candidates["address_match"]
        )
        saon_found = self._contains_mask(
            candidates["saon"], candidates["address_match"]
        )
        candidates = candidates[paon_found & saon_found].copy()

        # Explicit loop instead of np.vectorize: np.vectorize raises on
        # size-0 input unless ``otypes`` is given, which used to crash
        # whenever no candidate survived the filters above.
        scores = [
            fuzz.ratio(registry_address, normalised_address)
            for registry_address, normalised_address in zip(
                candidates["address_land_registry"], candidates["address_match"]
            )
        ]
        candidates["match_similarity"] = pd.Series(
            scores, index=candidates.index, dtype="int64"
        )
        candidates = candidates[
            candidates["match_similarity"] >= self.SIMILARITY_THRESHOLD
        ]

        # Take the largest match_similarity for each id
        best = (
            candidates.sort_values("match_similarity", ascending=False)
            .groupby("id", as_index=False)
            .head(1)
        )

        # Drop extra stuff
        return best[list(self._OUTPUT_COLUMNS)].rename(
            columns={"address_epc": "address"}
        )

    def _read_sales_file(self, path: str) -> pd.DataFrame:
        """Load one CSV file and keep the rows whose postcode is of interest."""
        # The files have no header, so at read time columns are addressed by
        # position; map each text column's position to ``str``.
        text_dtypes = {
            self.COLUMN_NAMES.index(name): str for name in self._TEXT_COLUMNS
        }
        frame = pd.read_csv(path, header=None, dtype=text_dtypes)
        frame.columns = self.COLUMN_NAMES
        return frame[frame["postcode"].isin(self.addresses["postcode"])]

    @staticmethod
    def _join_columns(frame: pd.DataFrame, columns: List[str]) -> pd.Series:
        """Join ``columns`` row-wise with single spaces, missing values as ''."""
        # Built row by row so that an empty frame still yields a Series
        # (``DataFrame.agg(..., axis=1)`` returns a DataFrame in that case).
        rows = frame[columns].fillna("").itertuples(index=False, name=None)
        return pd.Series(
            [" ".join(parts) for parts in rows], index=frame.index, dtype=object
        )

    @staticmethod
    def _contains_mask(needles: pd.Series, haystacks: pd.Series) -> np.ndarray:
        """Return a boolean array: is ``needle`` a substring of ``haystack``?

        Pairs in which either value is not a string (e.g. NaN) yield False
        instead of raising ``TypeError``.
        """
        return np.array(
            [
                isinstance(needle, str)
                and isinstance(haystack, str)
                and needle in haystack
                for needle, haystack in zip(needles, haystacks)
            ],
            dtype=bool,
        )