setting up land registry data

2026-06-08 11:17:27 +00:00 · 2023-06-15 22:04:21 +01:00 · 2023-06-15 22:04:21 +01:00 · c7075c5432
commit c7075c5432
parent f0f13cbf8a
7 changed files with 224 additions and 31 deletions
--- a/.gitignore
+++ b/.gitignore
@ -237,3 +237,5 @@ fabric.properties
 # Android studio 3.1+ serialized cache file
 .idea/caches/build_file_checksums.ser

+# Locally stored data
+/model_data/local_data/*
--- a/.idea/encodings.xml
+++ b/.idea/encodings.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding">
+    <file url="file://$USER_HOME$/Downloads/NEED_2021_anon_dataset_metadata (1).ods" charset="US-ASCII" />
+  </component>
+</project>
--- a/model_data/LandRegistryClient.py
+++ b/model_data/LandRegistryClient.py
@ -0,0 +1,107 @@
+from typing import List, Dict
+import pandas as pd
+from tqdm import tqdm
+import string
+from model_data.utils import setup_logger
+from fuzzywuzzy import fuzz
+import numpy as np
+
+logger = setup_logger()
+
+
+class LandRegistryClient:
+    """
+    Reads land registry sales from headerless csv files and matches each sale to one of the
+    supplied addresses (the "_epc" side of the merge) that shares its postcode.
+    """
+
+    # The csv files are read with header=None, so column names are assigned positionally
+    COLUMN_NAMES = [
+        "transaction_id",
+        "price",
+        "date_of_transfer",
+        "postcode",
+        "property_type",
+        "old_new",
+        "duration",
+        "paon",
+        "saon",
+        "street",
+        "locality",
+        "town_city",
+        "district",
+        "county",
+        "ppd_category_type",
+        "record_status",
+    ]
+
+    # Columns joined, in this order, to build the land registry address string
+    ADDRESS_COLUMNS = ["paon", "saon", "street", "locality", "town_city", "district", "county"]
+
+    # A score of 70-100 is a high match
+    SIMILARITY_THRESHOLD = 70
+
+    def __init__(self, paths: List[str], addresses: List[Dict[str, str]]):
+        """
+        :param paths:       paths to the land registry csv files to read
+        :param addresses:   one dict per known address; each needs a "postcode" and an "address" key
+        """
+        self.paths = paths
+        self.addresses = pd.DataFrame(addresses)
+
+        translation_table = str.maketrans("", "", string.punctuation)
+        # Use the translation table to remove punctuation from the text
+        self.addresses['address'] = self.addresses['address'].str.translate(translation_table)
+
+    def read(self) -> pd.DataFrame:
+        """
+        Reads every file in self.paths and returns the sales that match a supplied address.
+        :return:    one row per matched sale, holding the land registry columns, the columns of the
+                    matched address and a "match_similarity" score (0-100)
+        """
+        logger.info("Reading in land registry data")
+        frames = []
+
+        for path in tqdm(self.paths):
+            df = pd.read_csv(path, header=None)
+            df.columns = self.COLUMN_NAMES
+            # Filter per file so that only the relevant postcodes are held in memory
+            frames.append(df[df["postcode"].isin(self.addresses["postcode"])])
+            del df
+
+        res = pd.concat(frames).reset_index(drop=True)
+        # Identifier for a sale; used to pick one row per sale after the one-to-many merge below
+        res["id"] = res.index
+
+        # We want to remove records that were
+        # 1) not sold at market value (this is when ppd_category_type is not A)
+        # 2) property type is other (this is when property_type is O)
+        res = res[(res["ppd_category_type"] == "A") & (res["property_type"] != "O")].copy()
+
+        # Construct address
+        address_parts = res[self.ADDRESS_COLUMNS].fillna('').astype(str)
+        res['address'] = [' '.join(parts) for parts in address_parts.to_numpy()]
+
+        # Pair every sale with every supplied address in the same postcode
+        res = res.merge(self.addresses, how="left", on="postcode", suffixes=("_land_registry", "_epc"))
+
+        # Keep pairs where the paon appears in the supplied address. Series.str.contains only accepts
+        # a single pattern, so the check is done row by row; pairs with a missing value are dropped
+        paon_in_address = [
+            pd.notna(paon) and pd.notna(address) and str(paon) in str(address)
+            for paon, address in zip(res['paon'], res['address_epc'])
+        ]
+        res = res[np.array(paon_in_address, dtype=bool)].copy()
+
+        # A list comprehension rather than np.vectorize, which raises on empty input
+        res['match_similarity'] = pd.Series(
+            [fuzz.ratio(ours, theirs) for ours, theirs in zip(res['address_land_registry'], res['address_epc'])],
+            index=res.index,
+            dtype="int64",
+        )
+        res = res[res["match_similarity"] >= self.SIMILARITY_THRESHOLD]
+
+        # Take the best fuzzy match for each sale
+        res = res.sort_values("match_similarity", ascending=False, kind="stable")
+        res = res.drop_duplicates(subset="id", keep="first")
+        return res.reset_index(drop=True)
--- a/model_data/Need.py
+++ b/model_data/Need.py
@ -0,0 +1,31 @@
+import pandas as pd
+from model_data.utils import setup_logger
+
+logger = setup_logger()
+
+
+class Need:
+    """
+    Contains methods to read and interface with the NEED dataset.
+
+    The current iteration of this data is the 2021 anonymised dataset, which can be found here:
+    https://www.gov.uk/government/statistics/national-energy-efficiency-data-framework-need-anonymised-data-2021
+    """
+
+    def __init__(self, local_authorities, path):
+        """
+        :param local_authorities:   values of the REGION column to keep when reading
+        :param path:                path to the csv file
+        """
+        self.local_authorities = local_authorities
+        self.path = path
+
+    def read(self) -> pd.DataFrame:
+        """
+        Reads the NEED dataset from the csv file at self.path, keeping only the rows whose REGION is
+        one of self.local_authorities.
+        :return:        pandas dataframe containing the filtered data
+        """
+        logger.info("Reading NEED data - could take a moment")
+        df = pd.read_csv(self.path)
+        return df[df["REGION"].isin(self.local_authorities)]
--- a/model_data/app.py
+++ b/model_data/app.py
@ -39,37 +39,78 @@ def handler():

    cleaner.clean()

-    # For testing:
-    from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
-    from collections import Counter
-    count = Counter([x["main-fuel"] for x in data])
-    descriptions = {x["hotwater-description"] for x in data}
-    out = []
-    for description in descriptions:
-        res = HotWaterAttributes(description).process()
-        out.append(
+    import pickle
+    import os
+    with open(os.path.abspath(os.path.dirname(__file__)) + "/data.pkl", "rb") as f:
+        data = pickle.load(f)
+
+    postcodes = [x["postcode"].upper() for x in data]
+    address_meta = [
        {
-                "original_description": description,
-                **res
-            }
+            "postcode": x["postcode"].upper(),
+            "address1": x["address1"].upper(),
+            "address": x["address"].upper()
+        } for x in data
+    ]
+
+    # For testing:
+    # from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
+    # from collections import Counter
+    # count = Counter([x["main-fuel"] for x in data])
+    # descriptions = {x["hotwater-description"] for x in data}
+    # out = []
+    # for description in descriptions:
+    #     res = HotWaterAttributes(description).process()
+    #     out.append(
+    #         {
+    #             "original_description": description,
+    #             **res
+    #         }
+    #     )
+    # df = pd.DataFrame(out)
+    # df = df.sort_values("original_description")
+    # df = df.reset_index(drop=True)
+    #
+    # import numpy as np
+    # idx = 1
+    # record = df[df.index == idx].to_dict("records")[0]
+    # record = {k: v for k, v in record.items() if v not in [None, np.nan]}
+    # from pprint import pprint
+    # pprint(record)
+    #
+    # # Issues:
+    # # 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact this is a time and
+    # # temperature zone control
+    # #     and we only pick up temperature zone control at the moment. Can we capture this too
+    # # 2) 'Charging system linked to use of community heating, programmer and at least two room stats' - what are room
+    # # stats and how should
+    # #     we capture this?
+    #
+    # df.to_dict("records")
+
+    from model_data.Need import Need
+    import os
+
+    need_client = Need(
+        local_authorities=local_authorities,
+        path=os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/need_2021_anon_dataset_4million.csv"
    )
-    df = pd.DataFrame(out)
-    df = df.sort_values("original_description")
-    df = df.reset_index(drop=True)
+    need_data = need_client.read()

-    import numpy as np
-    idx = 1
-    record = df[df.index == idx].to_dict("records")[0]
-    record = {k: v for k, v in record.items() if v not in [None, np.nan]}
-    from pprint import pprint
-    pprint(record)
+    ## Land registry
+    from model_data.LandRegistryClient import LandRegistryClient
+    import os

-    # Issues:
-    # 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact this is a time and
-    # temperature zone control
-    #     and we only pick up temperature zone control at the moment. Can we capture this too
-    # 2) 'Charging system linked to use of community heating, programmer and at least two room stats' - what are room
-    # stats and how should
-    #     we capture this?
-
-    df.to_dict("records")
+    land_registry_client = LandRegistryClient(
+        paths=[  # every csv listed here is read and concatenated by LandRegistryClient.read
+            os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",
+            os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2022 (1).csv",
+            os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2021.csv",
+            os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2020.csv",
+            os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2019.csv",
+            os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2018.csv",
+            os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part1.csv",
+            os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
+        ],
+        addresses=address_meta
+    )
--- a/model_data/requirements.txt
+++ b/model_data/requirements.txt
@ -7,3 +7,5 @@ pytest
 mock
 pytest-cov
 pytest-mock
+fuzzywuzzy
+python-Levenshtein
--- a/model_data/utils.py
+++ b/model_data/utils.py
@ -0,0 +1,49 @@
+import logging
+import os
+
+
+def _find_handler(logger, tag):
+    # Return the handler a previous setup_logger call registered under this tag, if any
+    return next((h for h in logger.handlers if getattr(h, "_setup_logger_tag", None) == tag), None)
+
+
+def setup_logger(log_file=None, level=logging.INFO):
+    """
+    Configures the root logger with a console handler and, optionally, a file handler.
+
+    Safe to call from several modules: handlers created by an earlier call are reused, so each
+    message is emitted once per destination rather than once per call.
+    :param log_file:    optional path of a file to also write log messages to
+    :param level:       logging level applied to the root logger and to the handlers
+    :return:            the root logger
+    """
+    # Create a logger and set the logging level
+    logger = logging.getLogger()
+    logger.setLevel(level)
+
+    # Define the log message format
+    log_format = "%(asctime)s [%(levelname)s] %(message)s"
+    date_format = "%Y-%m-%d %H:%M:%S"
+    formatter = logging.Formatter(log_format, datefmt=date_format)
+
+    # Create a file handler and set the file path and format
+    if log_file:
+        file_tag = ("file", os.path.abspath(log_file))
+        file_handler = _find_handler(logger, file_tag)
+        if file_handler is None:
+            file_handler = logging.FileHandler(log_file)
+            file_handler._setup_logger_tag = file_tag
+            logger.addHandler(file_handler)
+        file_handler.setLevel(level)
+        file_handler.setFormatter(formatter)
+
+    # Create a console handler and set the format
+    console_handler = _find_handler(logger, "console")
+    if console_handler is None:
+        console_handler = logging.StreamHandler()
+        console_handler._setup_logger_tag = "console"
+        logger.addHandler(console_handler)
+    console_handler.setLevel(level)
+    console_handler.setFormatter(formatter)
+
+    return logger