mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
setting up land registry data
This commit is contained in:
parent
f0f13cbf8a
commit
c7075c5432
7 changed files with 224 additions and 31 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -237,3 +237,5 @@ fabric.properties
|
|||
# Android studio 3.1+ serialized cache file
|
||||
.idea/caches/build_file_checksums.ser
|
||||
|
||||
# Locally stored data
|
||||
/model_data/local_data/*
|
||||
|
|
|
|||
6
.idea/encodings.xml
generated
Normal file
6
.idea/encodings.xml
generated
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Encoding">
|
||||
<file url="file://$USER_HOME$/Downloads/NEED_2021_anon_dataset_metadata (1).ods" charset="US-ASCII" />
|
||||
</component>
|
||||
</project>
|
||||
79
model_data/LandRegistryClient.py
Normal file
79
model_data/LandRegistryClient.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
from typing import List, Dict
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
import string
|
||||
from model_data.utils import setup_logger
|
||||
from fuzzywuzzy import fuzz
|
||||
import numpy as np
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class LandRegistryClient:
    """
    Reads land registry transaction CSV files and matches each transaction to one of a
    supplied set of addresses.

    Matching is done per postcode: a transaction is a candidate for every supplied address
    sharing its postcode, candidates whose address does not contain the transaction's
    ``paon`` are discarded, and the remainder are scored with a fuzzy string ratio.
    """

    # Column names applied, in order, to the header-less CSV files
    COLUMN_NAMES = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category_type",
        "record_status",
    ]

    # Columns joined (in this order, space separated) to build the land registry address
    _ADDRESS_COLUMNS = ["paon", "saon", "street", "locality", "town_city", "district", "county"]

    # A score of 70-100 is a high match
    SIMILARITY_THRESHOLD = 70

    def __init__(self, paths: List[str], addresses: List[Dict[str, str]]):
        """
        :param paths: paths of the land registry CSV files to read
        :param addresses: records to match against; each record must provide at least the
            keys "postcode" and "address"
        """
        self.paths = paths
        self.addresses = pd.DataFrame(addresses)

        translation_table = str.maketrans("", "", string.punctuation)
        # Use the translation table to remove punctuation from the text
        self.addresses['address'] = self.addresses['address'].str.translate(translation_table)

    @staticmethod
    def _paon_in_address(paon, address) -> bool:
        """Return True when ``paon`` is present and occurs as a substring of ``address``."""
        # A missing value on either side cannot confirm a match (and `nan in "text"` would raise)
        if pd.isna(paon) or pd.isna(address):
            return False
        return str(paon) in str(address)

    def read(self) -> pd.DataFrame:
        """
        Reads every file in ``self.paths`` and fuzzy matches the transactions to ``self.addresses``.

        :return: one row per matched transaction (identified by the "id" column), holding the
            land registry columns, the columns of the best matching supplied address and the
            "match_similarity" score. Only matches scoring at least SIMILARITY_THRESHOLD are
            returned; the frame is empty when nothing matches.
        """
        logger.info("Reading in land registry data")
        frames = []

        for path in tqdm(self.paths):
            df = pd.read_csv(path, header=None)
            df.columns = self.COLUMN_NAMES
            # Only keep transactions for postcodes we hold an address for; this keeps memory down
            frames.append(df[df["postcode"].isin(self.addresses["postcode"])])
            del df

        # pd.concat raises on an empty list, so fall back to an empty frame when no paths are given
        if frames:
            res = pd.concat(frames).reset_index(drop=True)
        else:
            res = pd.DataFrame(columns=self.COLUMN_NAMES)
        res["id"] = res.index

        # We want to remove records that were
        # 1) not sold at market value (this is when ppd_category_type is not A)
        # 2) property type is other (this is when property_type is O)
        res = res[(res["ppd_category_type"] == "A") & (res["property_type"] != "O")].copy()

        # Construct address
        address_parts = res[self._ADDRESS_COLUMNS].fillna('').astype(str)
        res['address'] = [' '.join(parts) for parts in address_parts.itertuples(index=False, name=None)]

        # We now want to fuzzy match between res and self.addresses on postcode and take the
        # best fuzzy match
        res = res.merge(self.addresses, how="left", on="postcode", suffixes=("_land_registry", "_epc"))

        # Series.str.contains only accepts a single pattern, so the row-wise check has to be
        # done pair by pair
        paon_matches = pd.Series(
            [self._paon_in_address(paon, address) for paon, address in zip(res["paon"], res["address_epc"])],
            index=res.index,
            dtype=bool,
        )
        res = res[paon_matches].copy()

        # A plain comprehension (rather than np.vectorize) also works when there are no rows left
        res['match_similarity'] = pd.Series(
            [fuzz.ratio(ours, theirs) for ours, theirs in zip(res['address_land_registry'], res['address_epc'])],
            index=res.index,
            dtype="int64",
        )

        res = res[res["match_similarity"] >= self.SIMILARITY_THRESHOLD]

        # Keep only the highest scoring candidate address for each transaction
        res = res.sort_values("match_similarity", ascending=False, kind="stable").drop_duplicates(subset="id")

        return res.sort_values("id", kind="stable").reset_index(drop=True)
|
||||
32
model_data/Need.py
Normal file
32
model_data/Need.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
import pandas as pd
|
||||
from model_data.utils import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class Need:
    """
    Contains methods to read and interface with the NEED dataset.

    The current iteration of this data is the 2021 anonymised dataset, which can be found here:
    https://www.gov.uk/government/statistics/national-energy-efficiency-data-framework-need-anonymised-data-2021
    """

    def __init__(self, local_authorities, path):
        """
        :param local_authorities: values of the "REGION" column to keep when reading
        :param path: path to the NEED csv file
        """
        self.local_authorities = local_authorities
        self.path = path

    def read(self) -> pd.DataFrame:
        """
        Reads the NEED dataset from the csv file at ``self.path``.

        :return: pandas dataframe containing only the rows whose "REGION" value is in
            ``self.local_authorities`` (empty when no row matches)
        """
        logger.info("Reading NEED data - could take a moment")
        df = pd.read_csv(self.path)
        return df[df["REGION"].isin(self.local_authorities)]
|
||||
|
|
@ -39,37 +39,78 @@ def handler():
|
|||
|
||||
cleaner.clean()
|
||||
|
||||
import pickle
|
||||
import os
|
||||
with open(os.path.abspath(os.path.dirname(__file__)) + "/data.pkl", "rb") as f:
|
||||
data = pickle.load(f)
|
||||
|
||||
postcodes = [x["postcode"].upper() for x in data]
|
||||
address_meta = [
|
||||
{
|
||||
"postcode": x["postcode"].upper(),
|
||||
"address1": x["address1"].upper(),
|
||||
"address": x["address"].upper()
|
||||
} for x in data
|
||||
]
|
||||
|
||||
# For testing:
|
||||
from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
from collections import Counter
|
||||
count = Counter([x["main-fuel"] for x in data])
|
||||
descriptions = {x["hotwater-description"] for x in data}
|
||||
out = []
|
||||
for description in descriptions:
|
||||
res = HotWaterAttributes(description).process()
|
||||
out.append(
|
||||
{
|
||||
"original_description": description,
|
||||
**res
|
||||
}
|
||||
)
|
||||
df = pd.DataFrame(out)
|
||||
df = df.sort_values("original_description")
|
||||
df = df.reset_index(drop=True)
|
||||
# from model_data.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
# from collections import Counter
|
||||
# count = Counter([x["main-fuel"] for x in data])
|
||||
# descriptions = {x["hotwater-description"] for x in data}
|
||||
# out = []
|
||||
# for description in descriptions:
|
||||
# res = HotWaterAttributes(description).process()
|
||||
# out.append(
|
||||
# {
|
||||
# "original_description": description,
|
||||
# **res
|
||||
# }
|
||||
# )
|
||||
# df = pd.DataFrame(out)
|
||||
# df = df.sort_values("original_description")
|
||||
# df = df.reset_index(drop=True)
|
||||
#
|
||||
# import numpy as np
|
||||
# idx = 1
|
||||
# record = df[df.index == idx].to_dict("records")[0]
|
||||
# record = {k: v for k, v in record.items() if v not in [None, np.nan]}
|
||||
# from pprint import pprint
|
||||
# pprint(record)
|
||||
#
|
||||
# # Issues:
|
||||
# # 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact this is a time and
|
||||
# # temperature zone control
|
||||
# # and we only pick up temperature zone control at the moment. Can we capture this too
|
||||
# # 2) 'Charging system linked to use of community heating, programmer and at least two room stats' - what are room
|
||||
# # stats and how should
|
||||
# # we capture this?
|
||||
#
|
||||
# df.to_dict("records")
|
||||
|
||||
import numpy as np
|
||||
idx = 1
|
||||
record = df[df.index == idx].to_dict("records")[0]
|
||||
record = {k: v for k, v in record.items() if v not in [None, np.nan]}
|
||||
from pprint import pprint
|
||||
pprint(record)
|
||||
from model_data.Need import Need
|
||||
import os
|
||||
|
||||
# Issues:
|
||||
# 1) '2207 Time and temperature zone control' - we don't pick up any reference to the fact this is a time and
|
||||
# temperature zone control
|
||||
# and we only pick up temperature zone control at the moment. Can we capture this too
|
||||
# 2) 'Charging system linked to use of community heating, programmer and at least two room stats' - what are room
|
||||
# stats and how should
|
||||
# we capture this?
|
||||
need_client = Need(
|
||||
local_authorities=local_authorities,
|
||||
path=os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/need_2021_anon_dataset_4million.csv"
|
||||
)
|
||||
need_data = need_client.read()
|
||||
|
||||
df.to_dict("records")
|
||||
## Land registry
|
||||
from model_data.LandRegistryClient import LandRegistryClient
|
||||
import os
|
||||
|
||||
land_registry_client = LandRegistryClient(
|
||||
paths=[
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv"
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2022 (1).csv",
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2021.csv",
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2020.csv",
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2019.csv",
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2018.csv",
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part1.csv",
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
|
||||
],
|
||||
addresses=address_meta
|
||||
)
|
||||
|
|
|
|||
|
|
@ -6,4 +6,6 @@ mypy
|
|||
pytest
|
||||
mock
|
||||
pytest-cov
|
||||
pytest-mock
|
||||
pytest-mock
|
||||
fuzzywuzzy
|
||||
python-Levenshtein
|
||||
31
model_data/utils.py
Normal file
31
model_data/utils.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
import logging
|
||||
|
||||
|
||||
def setup_logger(log_file=None, level=logging.INFO):
    """
    Configure the root logger with a console handler and, optionally, a file handler.

    The function is safe to call repeatedly (for example once per module): handlers it has
    already attached are reused instead of being added again, so each message is emitted
    only once per destination.

    :param log_file: optional path of a file to also write log messages to
    :param level: logging level applied to the root logger and to the handlers managed here
    :return: the configured root logger
    """
    import os  # local import: only needed to normalise the log file path

    # Create a logger and set the logging level
    logger = logging.getLogger()
    logger.setLevel(level)

    # Define the log message format
    log_format = "%(asctime)s [%(levelname)s] %(message)s"
    date_format = "%Y-%m-%d %H:%M:%S"
    formatter = logging.Formatter(log_format, datefmt=date_format)

    # Handlers attached by earlier calls are recognised by the name given to them below
    attached = {handler.get_name(): handler for handler in logger.handlers}

    def _ensure_handler(name, factory):
        handler = attached.get(name)
        if handler is None:
            handler = factory()
            handler.set_name(name)
            logger.addHandler(handler)
        # (Re)apply level and format so the most recent call's settings win
        handler.setLevel(level)
        handler.setFormatter(formatter)

    # Create a file handler and set the file path and format
    if log_file:
        file_path = os.path.abspath(log_file)
        _ensure_handler(f"model_data.file:{file_path}", lambda: logging.FileHandler(file_path))

    # Create a console handler and set the format
    _ensure_handler("model_data.console", logging.StreamHandler)

    return logger
|
||||
Loading…
Add table
Reference in a new issue