mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
108 lines
3.4 KiB
Python
108 lines
3.4 KiB
Python
from typing import List, Dict
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
import string
|
|
from utils.logger import setup_logger
|
|
from fuzzywuzzy import fuzz
|
|
import numpy as np
|
|
|
|
# Module-level logger shared by everything in this file, built by the
# project's `utils.logger.setup_logger` helper.
logger = setup_logger()
|
|
|
|
|
|
class LandRegistryClient:
    """Match Land Registry transactions to a set of known addresses.

    The client reads one or more headerless Land Registry CSV files, keeps
    the transactions whose postcode appears in the supplied addresses, and
    pairs every remaining transaction with its most similar address.
    """

    # Column names assigned, in order, to the headerless CSV files.
    COLUMN_NAMES = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category_type",
        "record_status",
    ]

    # A score of 70-100 is a high match
    SIMILARITY_THRESHOLD = 70

    # Columns of the frame returned by `read`, in order.
    OUTPUT_COLUMNS = [
        "price",
        "date_of_transfer",
        "property_type",
        "old_new",
        "duration",
        "ppd_category_type",
        "record_status",
        "uprn",
        "address",
    ]

    # Address fields that are joined into strings and therefore must be read
    # as text: pandas would otherwise infer numbers for values such as "12".
    _TEXT_COLUMNS = ("paon", "saon", "street", "locality")

    def __init__(self, paths: List[str], addresses: List[Dict[str, str]]):
        """Store the CSV paths and prepare the addresses for matching.

        Args:
            paths: Locations of the Land Registry CSV files to read.
            addresses: One dict per known address. Each must provide an
                "address" key (normalised here); `read` additionally uses
                the "postcode", "address1", "address2" and "uprn" keys.
        """
        self.paths = paths
        self.addresses = pd.DataFrame(addresses)

        # Upper-case the address and strip punctuation so that it can be
        # compared against the upper-case Land Registry fields.
        translation_table = str.maketrans("", "", string.punctuation)
        self.addresses["address_match"] = (
            self.addresses["address"].str.upper().str.translate(translation_table)
        )

    @staticmethod
    def _address_contains(address_match, component) -> bool:
        """Return whether `component` occurs inside `address_match`.

        A missing component (NaN/None) imposes no constraint and yields True;
        a missing or non-string `address_match` can never match.
        """
        if not isinstance(address_match, str):
            return False
        if pd.isnull(component):
            return True
        return str(component) in address_match

    @staticmethod
    def _join_fields(frame: pd.DataFrame, columns: List[str]) -> pd.Series:
        """Join `columns` row-wise with single spaces, missing values as ''."""
        return frame[columns].fillna("").agg(" ".join, axis=1)

    @staticmethod
    def _filter_candidates(candidates: pd.DataFrame) -> pd.DataFrame:
        """Keep the transaction/address pairs that plausibly refer to one home.

        A pair survives when "<paon> <street>" equals the address's
        `address1` or `address2`, and both the PAON and (when present) the
        SAON occur in the normalised address.
        """
        first_line = candidates["address1_land_registry"]
        exact = (first_line == candidates["address1"]) | (
            first_line == candidates["address2"]
        )
        candidates = candidates[exact]

        contains = LandRegistryClient._address_contains
        keep = [
            contains(address, paon) and contains(address, saon)
            for address, paon, saon in zip(
                candidates["address_match"], candidates["paon"], candidates["saon"]
            )
        ]
        # An explicit boolean array keeps this valid for zero rows as well.
        return candidates.loc[np.asarray(keep, dtype=bool)]

    def _load_transactions(self) -> pd.DataFrame:
        """Read every CSV, keeping only rows whose postcode is of interest."""
        text_dtypes = {
            self.COLUMN_NAMES.index(name): str for name in self._TEXT_COLUMNS
        }

        frames = []
        for path in tqdm(self.paths):
            df = pd.read_csv(path, header=None, dtype=text_dtypes)
            df.columns = self.COLUMN_NAMES
            frames.append(df[df["postcode"].isin(self.addresses["postcode"])])

        if not frames:
            # pd.concat raises on an empty list; return an empty frame instead.
            return pd.DataFrame(columns=self.COLUMN_NAMES + ["id"])

        res = pd.concat(frames).reset_index(drop=True)
        # Stable per-transaction identifier used to pick one address each.
        res["id"] = res.index
        return res

    def _empty_result(self) -> pd.DataFrame:
        """Return an empty frame that has the columns `read` produces."""
        return pd.DataFrame(columns=self.OUTPUT_COLUMNS)

    def read(self) -> pd.DataFrame:
        """Load the transactions and attach the best-matching known address.

        Returns:
            A DataFrame with the columns listed in `OUTPUT_COLUMNS`, one row
            per matched transaction, where "address" and "uprn" come from the
            supplied addresses. The frame is empty (same columns) when there
            are no paths or nothing matches.
        """
        logger.info("Reading in land registry data")
        res = self._load_transactions()

        # We want to remove records that were
        # 1) not sold at market value (this is when ppd_category_type is not A)
        # 2) property type is other (this is when property_type is O)
        at_market_value = res["ppd_category_type"] == "A"
        not_other = res["property_type"] != "O"
        res = res[at_market_value & not_other].copy()
        if res.empty:
            return self._empty_result()

        # Construct the full address and its first line. Missing fields are
        # joined as empty strings, so the separating spaces are kept as-is.
        res["address"] = self._join_fields(
            res, ["saon", "paon", "street", "locality"]
        )
        res["address1_land_registry"] = self._join_fields(res, ["paon", "street"])

        # Pair every transaction with every known address in its postcode;
        # the clashing "address" columns get the suffixes below.
        res = res.merge(
            self.addresses,
            how="left",
            on="postcode",
            suffixes=("_land_registry", "_epc"),
        )

        res = self._filter_candidates(res).copy()
        if res.empty:
            return self._empty_result()

        # Score the remaining pairs and keep the confident ones.
        res["match_similarity"] = [
            fuzz.ratio(land_registry, known)
            for land_registry, known in zip(
                res["address_land_registry"], res["address_match"]
            )
        ]
        res = res[res["match_similarity"] >= self.SIMILARITY_THRESHOLD]

        # Take the largest match_similarity for each id
        res = res.sort_values("match_similarity", ascending=False).drop_duplicates(
            subset="id"
        )

        # Drop extra stuff
        selected = [
            "address_epc" if name == "address" else name
            for name in self.OUTPUT_COLUMNS
        ]
        return res[selected].rename(columns={"address_epc": "address"})
|