setting up land registry data

This commit is contained in:
Khalim Conn-Kowlessar 2023-06-15 22:04:21 +01:00
parent f0f13cbf8a
commit c7075c5432
7 changed files with 224 additions and 31 deletions

2
.gitignore vendored
View file

@ -237,3 +237,5 @@ fabric.properties
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
# Locally stored data
/model_data/local_data/*

6
.idea/encodings.xml generated Normal file
View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$USER_HOME$/Downloads/NEED_2021_anon_dataset_metadata (1).ods" charset="US-ASCII" />
</component>
</project>

View file

@ -0,0 +1,79 @@
from typing import List, Dict
import pandas as pd
from tqdm import tqdm
import string
from model_data.utils import setup_logger
from fuzzywuzzy import fuzz
import numpy as np
logger = setup_logger()
class LandRegistryClient:
    """
    Reads land registry transaction CSVs and matches them to a set of known addresses.

    Transactions are first narrowed to the postcodes of the supplied addresses, then
    paired with every supplied address in the same postcode and scored with a fuzzy
    string comparison.
    """

    # Column order of the header-less land registry CSV files.
    COLUMN_NAMES = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category_type",
        "record_status",
    ]
    # Columns joined (in this order, space separated) to build the land registry address.
    ADDRESS_COLUMNS = ["paon", "saon", "street", "locality", "town_city", "district", "county"]
    # A score of 70-100 is a high match
    SIMILARITY_THRESHOLD = 70

    def __init__(self, paths: List[str], addresses: List[Dict[str, str]]):
        """
        :param paths: paths of the land registry CSV files to read
        :param addresses: one dict per known address; the "postcode" and "address" keys
            are used here (presumably EPC records, going by the "_epc" suffix used in
            read() - confirm against the caller)
        """
        self.paths = paths
        self.addresses = pd.DataFrame(addresses)
        translation_table = str.maketrans("", "", string.punctuation)
        # Use the translation table to remove punctuation from the text
        self.addresses["address"] = self.addresses["address"].str.translate(translation_table)

    def read(self) -> pd.DataFrame:
        """
        Reads every file in self.paths and fuzzy matches the transactions to self.addresses.

        :return: one row per (transaction, known address) pair that shares a postcode, where
            the transaction's paon appears in the known address and "match_similarity" is
            at least SIMILARITY_THRESHOLD
        :raises ValueError: if self.paths is empty (pandas cannot concatenate zero frames)
        """
        logger.info("Reading in land registry data")

        # Force the text columns to be read as strings. Left to inference, house numbers
        # can come back as ints/floats, which breaks both the address join and the
        # substring check below. With header=None the column labels are positions.
        text_columns = {
            self.COLUMN_NAMES.index(name): str
            for name in ["postcode", *self.ADDRESS_COLUMNS]
        }
        known_postcodes = self.addresses["postcode"]

        frames = []
        for path in tqdm(self.paths):
            df = pd.read_csv(path, header=None, dtype=text_columns)
            df.columns = self.COLUMN_NAMES
            # Filter per file so only the relevant rows are held in memory
            frames.append(df[df["postcode"].isin(known_postcodes)])

        res = pd.concat(frames).reset_index(drop=True)
        res["id"] = res.index

        # We want to remove records that were
        # 1) not sold at market value (this is when ppd_category_type is not A)
        # 2) property type is other (this is when property_type is O)
        res = res[(res["ppd_category_type"] == "A") & (res["property_type"] != "O")].copy()

        # Construct address. Missing components become empty strings, so the spacing is
        # the same as joining every component with a single space.
        address_parts = res[self.ADDRESS_COLUMNS].fillna("")
        res["address"] = [" ".join(parts) for parts in address_parts.itertuples(index=False)]

        # Pair every transaction with every known address in the same postcode
        res = res.merge(
            self.addresses, how="left", on="postcode", suffixes=("_land_registry", "_epc")
        )

        # Keep only pairs where the paon occurs in the known address. This has to be done
        # row by row: Series.str.contains only accepts a single pattern, not a Series.
        # Rows with a missing paon or address cannot match and are dropped.
        paon_in_address = pd.Series(
            [
                isinstance(paon, str) and isinstance(address, str) and paon in address
                for paon, address in zip(res["paon"], res["address_epc"])
            ],
            index=res.index,
            dtype=bool,
        )
        res = res[paon_in_address].copy()

        # A plain loop (rather than np.vectorize) also copes with zero remaining rows
        res["match_similarity"] = [
            fuzz.ratio(land_registry_address, epc_address)
            for land_registry_address, epc_address in zip(
                res["address_land_registry"], res["address_epc"]
            )
        ]
        return res[res["match_similarity"] >= self.SIMILARITY_THRESHOLD]

32
model_data/Need.py Normal file
View file

@ -0,0 +1,32 @@
import pandas as pd
from model_data.utils import setup_logger
logger = setup_logger()
class Need:
    """
    Contains methods to read and interface with the NEED dataset.
    The current iteration of this data is the 2021 anonymised dataset, which can be found here:
    https://www.gov.uk/government/statistics/national-energy-efficiency-data-framework-need-anonymised-data-2021
    """

    def __init__(self, local_authorities, path):
        """
        :param local_authorities: values to keep; rows are retained when their REGION
            column is one of these
        :param path: path to the NEED csv file
        """
        self.local_authorities = local_authorities
        self.path = path

    def read(self) -> pd.DataFrame:
        """
        Reads the NEED dataset from the csv file at self.path.

        :return: pandas dataframe containing only the rows whose REGION value is in
            self.local_authorities (empty if nothing matches)
        """
        logger.info("Reading NEED data - could take a moment")
        df = pd.read_csv(self.path)
        return df[df["REGION"].isin(self.local_authorities)]

View file

@ -39,37 +39,78 @@ def handler():
cleaner.clean()
# For testing:
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
from collections import Counter
count = Counter([x["main-fuel"] for x in data])
descriptions = {x["hotwater-description"] for x in data}
out = []
for description in descriptions:
res = HotWaterAttributes(description).process()
out.append(
import pickle
import os
with open(os.path.abspath(os.path.dirname(__file__)) + "/data.pkl", "rb") as f:
data = pickle.load(f)
postcodes = [x["postcode"].upper() for x in data]
address_meta = [
{
"original_description": description,
**res
}
"postcode": x["postcode"].upper(),
"address1": x["address1"].upper(),
"address": x["address"].upper()
} for x in data
]
# For testing:
# from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
# from collections import Counter
# count = Counter([x["main-fuel"] for x in data])
# descriptions = {x["hotwater-description"] for x in data}
# out = []
# for description in descriptions:
# res = HotWaterAttributes(description).process()
# out.append(
# {
# "original_description": description,
# **res
# }
# )
# df = pd.DataFrame(out)
# df = df.sort_values("original_description")
# df = df.reset_index(drop=True)
#
# import numpy as np
# idx = 1
# record = df[df.index == idx].to_dict("records")[0]
# record = {k: v for k, v in record.items() if v not in [None, np.nan]}
# from pprint import pprint
# pprint(record)
#
# # Issues:
# # 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact this is a time and
# # temperature zone control
# # and we only pick up temperature zone control at the moment. Can we capture this too
# # 2) 'Charging system linked to use of community heating, programmer and at least two room stats' - what are room
# # stats and how should
# # we capture this?
#
# df.to_dict("records")
from model_data.Need import Need
import os
need_client = Need(
local_authorities=local_authorities,
path=os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/need_2021_anon_dataset_4million.csv"
)
df = pd.DataFrame(out)
df = df.sort_values("original_description")
df = df.reset_index(drop=True)
need_data = need_client.read()
import numpy as np
idx = 1
record = df[df.index == idx].to_dict("records")[0]
record = {k: v for k, v in record.items() if v not in [None, np.nan]}
from pprint import pprint
pprint(record)
## Land registry
from model_data.LandRegistryClient import LandRegistryClient
import os
# Issues:
# 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact this is a time and
# temperature zone control
# and we only pick up temperature zone control at the moment. Can we capture this too
# 2) 'Charging system linked to use of community heating, programmer and at least two room stats' - what are room
# stats and how should
# we capture this?
df.to_dict("records")
land_registry_client = LandRegistryClient(
paths=[
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv"
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2022 (1).csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2021.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2020.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2019.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2018.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part1.csv",
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
],
addresses=address_meta
)

View file

@ -7,3 +7,5 @@ pytest
mock
pytest-cov
pytest-mock
fuzzywuzzy
python-Levenshtein

31
model_data/utils.py Normal file
View file

@ -0,0 +1,31 @@
import logging
def setup_logger(log_file=None, level=logging.INFO):
    """
    Configure the root logger with a console handler and, optionally, a file handler.

    The function is safe to call repeatedly (several modules call it at import time):
    handlers it has already installed are reused and updated instead of being added
    again, so each record is emitted once per destination.

    :param log_file: optional path of a file to also write log records to
    :param level: logging level applied to the root logger and to the handlers
    :return: the root logger
    """
    import os  # local import: this module only imports logging at file level

    # Name used to recognise the console handler installed by a previous call
    console_handler_name = "model_data.console"

    # Create a logger and set the logging level
    logger = logging.getLogger()
    logger.setLevel(level)

    # Define the log message format
    log_format = "%(asctime)s [%(levelname)s] %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"
    formatter = logging.Formatter(log_format, datefmt=date_format)

    # Create (or reuse) a file handler and set the file path and format
    if log_file:
        # FileHandler stores the absolute path in baseFilename
        target = os.path.abspath(os.fspath(log_file))
        file_handler = next(
            (
                handler
                for handler in logger.handlers
                if isinstance(handler, logging.FileHandler) and handler.baseFilename == target
            ),
            None,
        )
        if file_handler is None:
            file_handler = logging.FileHandler(log_file)
            logger.addHandler(file_handler)
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)

    # Create (or reuse) the console handler. It is matched by name rather than by type
    # so that stream handlers installed by other code are left untouched.
    console_handler = next(
        (handler for handler in logger.handlers if handler.get_name() == console_handler_name),
        None,
    )
    if console_handler is None:
        console_handler = logging.StreamHandler()
        console_handler.set_name(console_handler_name)
        logger.addHandler(console_handler)
    console_handler.setLevel(level)
    console_handler.setFormatter(formatter)

    return logger