# Model/model_data/LandRegistryClient.py

from typing import List, Dict
import pandas as pd
from tqdm import tqdm
import string
from utils.logger import setup_logger
from fuzzywuzzy import fuzz
import numpy as np

# Module-level logger used by LandRegistryClient; obtained from the project's
# setup_logger() helper (its configuration is defined in utils.logger).
logger = setup_logger()


class LandRegistryClient:
    """Load Land Registry price-paid CSV files and link each sale to a known address.

    The client is given a list of headerless CSV paths and a list of address
    records. ``read`` keeps the sales whose postcode appears in the address
    records, pairs every sale with the address records sharing its postcode,
    and returns the best fuzzy match per sale.

    The address records must provide the keys ``address``, ``address1``,
    ``address2``, ``postcode`` and ``uprn``; those are the columns this class
    reads from them.
    """

    COLUMN_NAMES = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category_type",
        "record_status",
    ]

    # A score of 70-100 is a high match
    SIMILARITY_THRESHOLD = 70

    # Columns that are always parsed as text. Without this, pandas may infer a
    # numeric dtype for e.g. a "paon" column holding only house numbers, which
    # breaks the string joins and substring checks further down.
    _TEXT_COLUMNS = ("postcode", "paon", "saon", "street", "locality")

    # Columns kept in the frame returned by ``read`` (before "address_epc" is
    # renamed to "address").
    _OUTPUT_COLUMNS = [
        "price",
        "date_of_transfer",
        "property_type",
        "old_new",
        "duration",
        "ppd_category_type",
        "record_status",
        "uprn",
        "address_epc",
    ]

    def __init__(self, paths: List[str], addresses: List[Dict[str, str]]):
        """Store the CSV paths and build the normalised address lookup.

        Args:
            paths: Paths of the headerless Land Registry CSV files to read.
            addresses: Address records; each is turned into one row of
                ``self.addresses``.
        """
        self.paths = paths
        self.addresses = pd.DataFrame(addresses)

        # "address_match" is the upper-cased address with all ASCII
        # punctuation removed; it is the text the sale address is compared to.
        translation_table = str.maketrans("", "", string.punctuation)
        self.addresses["address_match"] = (
            self.addresses["address"].str.upper().str.translate(translation_table)
        )

    def read(self) -> pd.DataFrame:
        """Read every CSV and return the best address match for each sale.

        Returns:
            A DataFrame with the columns ``price``, ``date_of_transfer``,
            ``property_type``, ``old_new``, ``duration``,
            ``ppd_category_type``, ``record_status``, ``uprn`` and ``address``
            (the matched record's original address). The frame is empty when
            nothing matches or when no paths were given.
        """
        logger.info("Reading in land registry data")
        transactions = self._load_transactions()
        candidates = self._candidate_matches(transactions)
        return self._best_matches(candidates)

    def _load_transactions(self) -> pd.DataFrame:
        """Read all CSVs, keep rows with a known postcode and add a unique ``id``."""
        # With header=None the parsed columns are labelled by position, so the
        # dtype mapping is keyed by each text column's position.
        text_dtypes = {
            self.COLUMN_NAMES.index(name): str for name in self._TEXT_COLUMNS
        }

        frames = []
        for path in tqdm(self.paths):
            df = pd.read_csv(path, header=None, dtype=text_dtypes)
            df.columns = self.COLUMN_NAMES
            frames.append(df[df["postcode"].isin(self.addresses["postcode"])])

        if frames:
            res = pd.concat(frames).reset_index(drop=True)
        else:
            # pd.concat raises on an empty list; fall back to an empty frame
            # so the rest of the pipeline yields an empty result instead.
            res = pd.DataFrame(columns=self.COLUMN_NAMES)

        # One id per sale, assigned before any row is duplicated by the merge.
        res["id"] = res.index
        return res

    def _candidate_matches(self, transactions: pd.DataFrame) -> pd.DataFrame:
        """Pair each eligible sale with the plausible address records of its postcode."""
        # We want to remove records that were
        # 1) not sold at market value (this is when ppd_category_type is not A)
        # 2) property type is other (this is when property_type is O)
        sales = transactions[
            (transactions["ppd_category_type"] == "A")
            & (transactions["property_type"] != "O")
        ].copy()

        # Construct the sale addresses; missing parts become empty strings.
        sales["address"] = self._join_columns(
            sales, ["saon", "paon", "street", "locality"]
        )
        sales["address1_land_registry"] = self._join_columns(sales, ["paon", "street"])

        # Every sale is paired with every address record sharing its postcode.
        # The clashing "address" columns become "address_land_registry" (sale)
        # and "address_epc" (address record).
        merged = sales.merge(
            self.addresses,
            how="left",
            on="postcode",
            suffixes=("_land_registry", "_epc"),
        )

        merged = merged[
            (merged["address1_land_registry"] == merged["address1"])
            | (merged["address1_land_registry"] == merged["address2"])
        ]

        # Both name components, when present, must occur in the normalised
        # address. A sale without a SAON is kept; requiring one would drop
        # every sale that has no secondary name.
        keep = pd.Series(
            [
                self._component_in_address(paon, target)
                and self._component_in_address(saon, target)
                for paon, saon, target in zip(
                    merged["paon"], merged["saon"], merged["address_match"]
                )
            ],
            index=merged.index,
            dtype=bool,
        )
        return merged[keep]

    def _best_matches(self, candidates: pd.DataFrame) -> pd.DataFrame:
        """Score the candidates with ``fuzz.ratio`` and keep the best one per sale."""
        scored = candidates.copy()
        # A plain loop (rather than np.vectorize) also works when there are no
        # candidates at all.
        scored["match_similarity"] = pd.Series(
            [
                fuzz.ratio(registry_address, epc_address)
                for registry_address, epc_address in zip(
                    scored["address_land_registry"], scored["address_match"]
                )
            ],
            index=scored.index,
            dtype="int64",
        )

        scored = scored[scored["match_similarity"] >= self.SIMILARITY_THRESHOLD]

        # Take the largest match_similarity for each id
        best = (
            scored.sort_values("match_similarity", ascending=False)
            .groupby("id", as_index=False)
            .head(1)
        )

        # Drop extra stuff
        return best[self._OUTPUT_COLUMNS].rename(columns={"address_epc": "address"})

    @staticmethod
    def _join_columns(frame: pd.DataFrame, columns: List[str]) -> pd.Series:
        """Join *columns* row-wise with single spaces, treating nulls as ``""``."""
        parts = [frame[name] for name in columns]
        joined = [
            " ".join("" if pd.isnull(value) else str(value) for value in row)
            for row in zip(*parts)
        ]
        return pd.Series(joined, index=frame.index, dtype=object)

    @staticmethod
    def _component_in_address(component, address) -> bool:
        """Return whether *component* is compatible with *address*.

        A null address never matches. A null component imposes no constraint.
        Otherwise the component must be a substring of the address.
        """
        if pd.isnull(address):
            return False
        if pd.isnull(component):
            return True
        return str(component) in str(address)