mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
1139 lines
49 KiB
Python
1139 lines
49 KiB
Python
from datetime import datetime
|
|
from typing import List
|
|
from tqdm import tqdm
|
|
import pandas as pd
|
|
import Levenshtein
|
|
import re
|
|
from utils.s3 import save_excel_to_s3, read_excel_from_s3
|
|
from utils.logger import setup_logger
|
|
from backend.SearchEpc import SearchEpc
|
|
from etl.spatial.OpenUprnClient import OpenUprnClient
|
|
|
|
# Module-level logger, created once via the project's setup_logger helper and used by the methods in this file
logger = setup_logger()
|
|
|
|
|
|
class Ownership:
    """
    Links EPC certificate records to corporate property ownership titles and to land registry transactions, and
    writes the intermediate and final tables to s3.
    """

    # These are a number of prefix phrases found in the ownership data. If an address begins with any of these
    # terms, the record is removed, because it does not reference the property itself
    OWNERSHIP_STARTING_TERMS = [
        "land adjoining", "land on the", "land to the rear of", "land and buildings on the",
        "garage adjoining", "car park adjoining", "the land adjoining", "land and buildings adjoining",
        "all royal mines"
    ]

    # Anything that is sold within this many months is flagged as sold recently and is then
    # considered to be dropped from matching
    SOLD_RECENTLY_MONTHS = 12

    # Anything that has been lodged for a marketed or unmarketed sale within this many months is
    # flagged as potentially in the process of being sold
    LODGED_RECENTLY_MONTHS = 12

    # These are the columns in the land registry data. The file is read without a header row, so these names are
    # assigned positionally
    LAND_REGISTRY_COLUMNS = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category_type",
        "record_status",
    ]
|
|
|
|
def __init__(
|
|
self,
|
|
epc_paths: List[str],
|
|
domestic_ownership_path: str,
|
|
overseas_ownership_path: str,
|
|
land_registry_path: str,
|
|
project_name: str,
|
|
bucket: str,
|
|
average_property_value: float,
|
|
portfolio_value: float,
|
|
excluded_owners: List[str] = None,
|
|
excluded_uprns: List[int] = None,
|
|
save=True
|
|
):
|
|
"""
|
|
|
|
:param epc_paths: A list of strings, which points to the location of the EPC data to be used. TO date, this
|
|
data has been held locally, and so will require extension to read from remote locaations like
|
|
s3
|
|
:param domestic_ownership_path: A string which points to the location of the CCOD ownership data, that details
|
|
corporate ownership of properties in the UK, where the companies are UK based
|
|
:param overseas_ownership_path: A string which points to the location of the OCOD ownership data, that details
|
|
corporate ownership of properties in the UK, where the companies are overseas
|
|
:param land_registry_path: A string that points to the location of the land registry data
|
|
:param project_name: A string that is used to identify the project
|
|
:param bucket: The name of the s3 bucket where the data will be stored
|
|
:param average_property_value: The average property value in the area
|
|
"""
|
|
|
|
# All epc paths should end with certificates.csv
|
|
if not any(path for path in epc_paths if path.endswith("certificates.csv")):
|
|
raise ValueError("epc_paths contains a path that does not end with certificates.csv")
|
|
self.epc_paths = epc_paths
|
|
self.domestic_ownership_path = domestic_ownership_path
|
|
self.overseas_ownership_path = overseas_ownership_path
|
|
self.land_registry_path = land_registry_path
|
|
|
|
self.excluded_owners = [] if excluded_owners is None else excluded_owners
|
|
self.excluded_uprns = [] if excluded_uprns is None else excluded_uprns
|
|
|
|
self.run_timestamp = str(datetime.now())
|
|
self.project_name = project_name
|
|
self.bucket = bucket
|
|
|
|
self.average_property_value = average_property_value
|
|
self.portfolio_value = portfolio_value
|
|
|
|
# Data storage paths
|
|
self.epc_data_filepath = f"ownership/{self.project_name}/{self.run_timestamp}/epc_data.xlsx"
|
|
self.filtered_land_registry_filepath = (
|
|
f"ownership/{self.project_name}/{self.run_timestamp}/filtered_land_registry.xlsx"
|
|
)
|
|
self.matched_addresses_pre_filter_filepath = (
|
|
f"ownership/{self.project_name}/{self.run_timestamp}/matched_addresses_pre_filter.xlsx"
|
|
)
|
|
self.combined_matching_lookup_pre_filter_filepath = (
|
|
f"ownership/{self.project_name}/{self.run_timestamp}/combined_matching_lookup_pre_filter.xlsx"
|
|
)
|
|
# Final output paths
|
|
self.portfolio_owners_filepath = f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_owners.xlsx"
|
|
self.portfolio_properties_filepath = (
|
|
f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_properties.xlsx"
|
|
)
|
|
self.portfolio_epc_data_filepath = (
|
|
f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_epc_data.xlsx"
|
|
)
|
|
|
|
self.save = save
|
|
|
|
# Data
|
|
self.epc_data = None
|
|
self.ownership_data = None
|
|
self.freehold_matching_lookup = None
|
|
self.leasehold_matching_lookup = None
|
|
self.shared_freehold_match = None
|
|
self.shared_leasehold_match = None
|
|
self.land_registry = None
|
|
|
|
# Match tables
|
|
self.combined_matching_lookup = None
|
|
self.matched_addresses = None
|
|
self.land_registry_matches = None
|
|
|
|
# Final outputs data
|
|
self.portfolio_owners = None
|
|
self.portfolio_properties = None
|
|
self.portfolio_epc_data = None
|
|
|
|
def pipeline(self, column_filters=None):
|
|
"""
|
|
Runs the full ownership process
|
|
:param column_filters: Dictionary with column names as keys and list of acceptable values as values. This
|
|
dictionary is is used to filter the EPC data and should look like this:
|
|
{"column_name": ["value1", "value2", ...]}, where column_name is the name of the column
|
|
in the EPC data and ["value1", "value2", ...] is a list of acceptable values for that
|
|
column. If a column is not found in the EPC data, an exception is raised.
|
|
"""
|
|
# Step 1: Get EPC data
|
|
self.source_epc_properties(column_filters=column_filters)
|
|
|
|
# Step 2: Get company ownership data
|
|
self.load_company_ownership()
|
|
|
|
# Step 3: Prepare data for matching
|
|
self.prepare_for_matching()
|
|
|
|
# Step 4: Match EPC data to ownership data
|
|
self.match()
|
|
|
|
# Step 5: Match land registry data to existing matches
|
|
self.match_with_land_registry()
|
|
# We store this data in s3 before we perform any filtering
|
|
if self.save:
|
|
save_excel_to_s3(
|
|
df=self.matched_addresses,
|
|
bucket_name=self.bucket,
|
|
file_key=self.matched_addresses_pre_filter_filepath
|
|
)
|
|
save_excel_to_s3(
|
|
df=self.combined_matching_lookup,
|
|
bucket_name=self.bucket,
|
|
file_key=self.combined_matching_lookup_pre_filter_filepath
|
|
)
|
|
|
|
# Prepare the final outputs:
|
|
self.create_final_matches()
|
|
|
|
def source_epc_properties(self, column_filters=None, postcodes=None):
    """
    Reads every EPC file, keeps the newest certificate per UPRN, applies the requested filters and stores the
    combined result on self.epc_data (and in s3 when self.save is set).

    :param column_filters: Dictionary with column names as keys and list of acceptable values as values. This
                           dictionary is used to filter the EPC data and should look like this:
                           {"column_name": ["value1", "value2", ...]}, where column_name is the name of the column
                           in the EPC data and ["value1", "value2", ...] is a list of acceptable values for that
                           column. If a column is not found in the EPC data, an exception is raised.
    :param postcodes: A list of postcodes to filter the data on. Compared case-insensitively
    :raises Exception: If LODGEMENT_DATETIME cannot be parsed, or a filter column is missing
    :raises ValueError: If no EPC file contributes any data
    """

    column_filters = {} if column_filters is None else column_filters
    # The EPC postcodes are lower-cased before comparison, so the requested postcodes must be too
    wanted_postcodes = None if postcodes is None else [str(postcode).lower() for postcode in postcodes]

    data = []
    for path in tqdm(self.epc_paths):
        epc_data = pd.read_csv(path, low_memory=False)
        # Copy so the column assignments below do not write into a slice of the raw frame
        epc_data = epc_data[~pd.isnull(epc_data["UPRN"])].copy()
        epc_data["UPRN"] = epc_data["UPRN"].astype(int).astype(str)

        # Parse once; anything that fails to parse becomes NaT and is treated as invalid input
        lodgement_datetime = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")
        if lodgement_datetime.isnull().any():
            raise Exception("Lodgement datetime contains invalid data")

        epc_data["LODGEMENT_DATETIME"] = lodgement_datetime
        # Newest certificate first, so drop_duplicates keeps the latest lodgement per UPRN
        epc_data = epc_data.sort_values(["LODGEMENT_DATETIME"], ascending=False).drop_duplicates("UPRN")

        # Apply column filters
        for column, values in column_filters.items():
            if column not in epc_data.columns:
                raise Exception(f"Column {column} not found in data. column_filters is malformed")
            epc_data = epc_data[epc_data[column].isin(values)]

        if wanted_postcodes is not None:
            epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(wanted_postcodes)]
            # NOTE(review): the source indentation was lost; the empty check is assumed to belong to the
            # postcode filter
            if epc_data.empty:
                continue

        data.append(epc_data)

    if not data:
        # pd.concat would fail with an opaque "No objects to concatenate"; say what actually happened
        raise ValueError("No EPC data left after filtering - check epc_paths, column_filters and postcodes")

    self.epc_data = pd.concat(data, ignore_index=True)

    if self.excluded_uprns:
        self.epc_data = self.epc_data[~self.epc_data["UPRN"].astype(float).isin(self.excluded_uprns)]

    if self.save:
        # We now store the data in s3
        save_excel_to_s3(
            df=self.epc_data,
            bucket_name=self.bucket,
            file_key=self.epc_data_filepath
        )
|
|
|
|
def load_company_ownership(self):
    """
    Loads the domestic and overseas company ownership files, stacks them with an is_overseas flag, and narrows
    the result to the EPC postcodes and to owners that are not excluded. The result is stored on
    self.ownership_data.
    """
    logger.info("Reading in company ownership data")
    domestic = pd.read_csv(self.domestic_ownership_path)
    domestic["is_overseas"] = False
    self.ownership_data = domestic
    overseas = pd.read_csv(self.overseas_ownership_path)
    overseas["is_overseas"] = True

    self.ownership_data = pd.concat([domestic, overseas])

    # Restricting to the EPC postcodes keeps the (large) ownership dataset manageable
    logger.info("Filtering ownership data on EPC postcodes")
    epc_postcodes = self.epc_data["POSTCODE"].str.lower().unique()
    in_epc_postcodes = self.ownership_data["Postcode"].str.lower().isin(epc_postcodes)
    self.ownership_data = self.ownership_data[in_epc_postcodes]

    logger.info("Removing excluded owners")
    # Excluded owners are identified by their company registration number
    registration_numbers = self.ownership_data["Company Registration No. (1)"].astype(str)
    self.ownership_data = self.ownership_data[~registration_numbers.isin(self.excluded_owners)]
|
|
|
|
def prepare_for_matching(self):
    """
    Given the epc properties and the ownership data, this function performs a number of operations on both datasets
    to prepare them for matching:

    - EPC rows whose postcode has no ownership record are dropped
    - EPC rows are de-duplicated on UPRN, keeping the newest LODGEMENT_DATE
    - ownership rows whose address starts with one of OWNERSHIP_STARTING_TERMS are dropped
    """

    logger.info("Preparing data for matching")
    # Now we filter properties the other way around, since the ownership data might not have all of the
    # postcodes that appear in the EPC data
    ownership_postcodes = self.ownership_data["Postcode"].str.lower().unique()
    self.epc_data = self.epc_data[self.epc_data["POSTCODE"].str.lower().isin(ownership_postcodes)]
    # We have some duplicates on UPRN - take the newest record per UPRN
    self.epc_data = self.epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")

    # Remove entries where the address begins with the term "land adjoining", or other records that don't
    # reference the property itself. The address is lower-cased once, and a missing address counts as
    # "does not start with the term" (na=False) so that the negation below cannot fail on NaN
    lowered_address = self.ownership_data["Property Address"].str.lower()
    drop_mask = None
    for starting_term in self.OWNERSHIP_STARTING_TERMS:
        hits = lowered_address.str.startswith(starting_term, na=False).to_numpy(dtype=bool)
        drop_mask = hits if drop_mask is None else (drop_mask | hits)

    if drop_mask is not None:
        self.ownership_data = self.ownership_data[~drop_mask]
|
|
|
|
@staticmethod
|
|
def extract_numeric_part(house_number: str) -> str:
|
|
"""
|
|
Extracts only the numeric part from a house number that may contain letters.
|
|
|
|
Parameters:
|
|
- house_number (str): The house number string possibly containing letters.
|
|
|
|
Returns:
|
|
- str: The numeric part of the house number.
|
|
"""
|
|
# Use regular expression to replace all non-digit characters with nothing
|
|
numeric_part = re.sub(r'\D', '', house_number)
|
|
return numeric_part
|
|
|
|
@staticmethod
|
|
def remove_text_in_brackets(address: str) -> str:
|
|
"""
|
|
Removes any text within parentheses, including the parentheses themselves.
|
|
|
|
Parameters:
|
|
- address (str): The address string to clean.
|
|
|
|
Returns:
|
|
- str: The cleaned address with text in parentheses removed.
|
|
"""
|
|
# Regex to find and remove content in parentheses
|
|
cleaned_address = re.sub(r'\s*\([^)]*\)', '', address)
|
|
return cleaned_address
|
|
|
|
@staticmethod
|
|
def extract_range_from_house_number(house_number_range: str):
|
|
"""
|
|
Detects if the house number includes a numeric range (formatted as 'x-y') and extracts all values within this
|
|
range.
|
|
Non-numeric strings containing hyphens are ignored.
|
|
|
|
Parameters:
|
|
- house_number_range (str): The house number string that might contain a range.
|
|
|
|
Returns:
|
|
- list of str: A list of all numbers within the range if it is a range; otherwise, returns None.
|
|
"""
|
|
|
|
if not house_number_range:
|
|
return None
|
|
|
|
if '-' in house_number_range:
|
|
parts = house_number_range.split('-')
|
|
if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
|
|
# Both parts are numeric, so it's a valid range
|
|
start, end = map(int, parts) # Convert parts to integers
|
|
return [str(x) for x in range(start, end + 1)]
|
|
else:
|
|
# Not a valid numeric range
|
|
return None
|
|
else:
|
|
# No hyphen present or not a range
|
|
return None
|
|
|
|
@staticmethod
|
|
def is_in_range(row, house_no):
|
|
""" Check if the house number is within the range provided in the row. """
|
|
if row and any(house_no == num for num in row):
|
|
return True
|
|
return False
|
|
|
|
@staticmethod
def levenstein_match(matching_string, df, address_col):
    """
    Returns the single row of df whose address is closest, by Levenshtein distance, to matching_string.

    Both sides are normalised the same way before comparison (punctuation and spaces removed, lower-cased), so
    that the distance reflects the address content rather than its formatting.

    :param matching_string: The address to match against
    :param df: The candidate rows
    :param address_col: The column of df holding the candidate addresses
    :return: A one-row DataFrame holding the best candidate (the first one, on ties)
    :raises ValueError: If df has no rows
    """
    def normalise(value):
        # Missing addresses are compared as the empty string rather than crashing the regex
        if pd.isnull(value):
            return ""
        # Strip out punctuation and spaces
        return re.sub(r'[^\w\s]', '', str(value)).replace(" ", "").lower()

    if df.empty:
        raise ValueError("levenstein_match requires at least one candidate row")

    target = normalise(matching_string)
    match_to = [normalise(candidate) for candidate in df[address_col].tolist()]

    # Perform matching between the normalised key and the normalised candidates
    distances = [Levenshtein.distance(target, candidate) for candidate in match_to]
    best_match_index = distances.index(min(distances))
    # We might want to consider a threshold for the distance, however we don't consider this for the moment
    return df.iloc[best_match_index:best_match_index + 1]
|
|
|
|
@classmethod
|
|
def remove_duplicate_matches(cls, matching_lookup, properties, company_ownership):
|
|
duplicated_titles = matching_lookup[matching_lookup["Title Number"].duplicated()]["Title Number"].unique()
|
|
|
|
to_drop = []
|
|
for dupe_title in duplicated_titles:
|
|
dupe_data = matching_lookup[matching_lookup["Title Number"] == dupe_title].copy()
|
|
matched_addresses = dupe_data.merge(
|
|
properties[["UPRN", "ADDRESS"]].rename(columns={"ADDRESS": "epc_address"}),
|
|
how="left", on="UPRN"
|
|
).merge(
|
|
company_ownership[["Title Number", "Property Address"]],
|
|
how="left", on="Title Number"
|
|
)
|
|
# We perform levenstein to get the best match
|
|
best_match = cls.levenstein_match(
|
|
matching_string=matched_addresses["Property Address"].values[0],
|
|
df=matched_addresses,
|
|
address_col="epc_address"
|
|
)
|
|
matches_to_drop = matched_addresses[
|
|
~matched_addresses["UPRN"].isin(best_match["UPRN"].values)
|
|
]
|
|
|
|
to_drop.append(
|
|
matches_to_drop[["UPRN", "Title Number"]].copy()
|
|
)
|
|
|
|
to_drop = pd.concat(to_drop) if to_drop else pd.DataFrame()
|
|
|
|
if not to_drop.empty:
|
|
merged = pd.merge(matching_lookup, to_drop, on=['UPRN', 'Title Number'], how='left', indicator=True)
|
|
merged = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
|
|
|
|
return merged
|
|
|
|
return matching_lookup
|
|
|
|
@classmethod
|
|
def remove_duplicate_uprn_matches(cls, matching_lookup, properties, company_ownership):
|
|
dupe_uprns = matching_lookup[matching_lookup["UPRN"].duplicated()]["UPRN"].unique().tolist()
|
|
|
|
to_drop = []
|
|
for dupe_uprn in dupe_uprns:
|
|
dupe_data = matching_lookup[matching_lookup["UPRN"] == dupe_uprn].copy()
|
|
matched_addresses = dupe_data.merge(
|
|
properties[["UPRN", "ADDRESS"]].rename(columns={"ADDRESS": "epc_address"}),
|
|
how="left", on="UPRN"
|
|
).merge(
|
|
company_ownership[["Title Number", "Property Address"]],
|
|
how="left", on="Title Number"
|
|
)
|
|
# We perform levenstein to get the best match
|
|
best_match = cls.levenstein_match(
|
|
matching_string=matched_addresses["Property Address"].values[0],
|
|
df=matched_addresses,
|
|
address_col="epc_address"
|
|
)
|
|
matches_to_drop = matched_addresses[
|
|
~matched_addresses["Title Number"].isin(best_match["Title Number"].values)
|
|
]
|
|
|
|
to_drop.append(
|
|
matches_to_drop[["UPRN", "Title Number"]].copy()
|
|
)
|
|
|
|
to_drop = pd.concat(to_drop)
|
|
|
|
if not to_drop.empty:
|
|
merged = pd.merge(matching_lookup, to_drop, on=['UPRN', 'Title Number'], how='left', indicator=True)
|
|
merged = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
|
|
|
|
return merged
|
|
|
|
return matching_lookup
|
|
|
|
@staticmethod
|
|
def is_substring(x, match_string):
|
|
if pd.isnull(x):
|
|
return False
|
|
return x in match_string.lower()
|
|
|
|
@staticmethod
|
|
def house_number_match(paon, house_number):
|
|
# Firstly try and convert to numberic
|
|
try:
|
|
paon_numeric = int(paon)
|
|
house_number_numeric = int(house_number)
|
|
return paon_numeric == house_number_numeric
|
|
except Exception as e: # noqa
|
|
# If we can't convert both to numeric, we do an equality
|
|
|
|
return paon == house_number
|
|
|
|
@staticmethod
|
|
def check_equalities(lr_filtered):
|
|
all_paon_equal = all(lr_filtered["paon"] == lr_filtered["paon"].values[0])
|
|
if pd.isnull(lr_filtered["saon"].values[0]):
|
|
all_saon_equal = all(pd.isnull(lr_filtered["saon"]))
|
|
else:
|
|
all_saon_equal = all(lr_filtered["saon"] == lr_filtered["saon"].values[0])
|
|
|
|
all_street_equal = all(lr_filtered["street"] == lr_filtered["street"].values[0])
|
|
|
|
return all_paon_equal, all_saon_equal, all_street_equal
|
|
|
|
def match(self):
    """
    Matches every EPC property to the ownership titles registered at the same postcode and house number (or, for
    properties without a house number, the same property name).

    Results are stored on the instance:
    - freehold_matching_lookup / leasehold_matching_lookup: one title per UPRN, exact matches only
    - shared_freehold_match / shared_leasehold_match: lists of frames for UPRNs with several titles of a tenure
    - combined_matching_lookup: freehold and leasehold lookups combined and de-duplicated
    - matched_addresses: combined_matching_lookup with EPC and ownership details merged on

    :raises ValueError: If the EPC or ownership data has not been loaded
    """
    if (self.epc_data is None) or (self.ownership_data is None):
        raise ValueError("epc_data and ownership_data should not be null")

    logger.info("Matching EPC data to ownership data")
    lookup_columns = ["UPRN", "Title Number", "match_type"]
    freehold_matching_lookup = []
    leasehold_matching_lookup = []
    shared_leasehold_match = []
    shared_freehold_match = []

    # Lower-casing the ownership postcodes does not depend on the EPC row, so it is done once
    ownership_postcodes = self.ownership_data["Postcode"].str.lower()

    for _, address in tqdm(self.epc_data.iterrows(), total=len(self.epc_data)):
        match_type = "exact"
        filtered = self.ownership_data[ownership_postcodes == address["POSTCODE"].lower()].copy()

        # Remove bracketed text (such as the postcode) and remove trailing commas
        filtered["house_number"] = (
            filtered["Property Address"]
            .apply(self.remove_text_in_brackets)
            .apply(SearchEpc.get_house_number)
            .str.lower()
            .str.replace(",", "")
        )
        house_no = SearchEpc.get_house_number(address["ADDRESS1"])
        if house_no is not None:
            # The ownership house numbers are lower-cased above, so the EPC one has to be as well,
            # otherwise "12A" never equals "12a"
            house_no = house_no.replace(",", "").lower()

        if house_no is None:
            # If the house number is missing, it means that we usually have a named property so we look for an
            # exact match on that name. The name is literal text, not a regex, and a missing ownership address
            # is simply not a match
            filtered = filtered[
                filtered["Property Address"].str.lower().str.contains(
                    address["ADDRESS"].lower(), regex=False, na=False
                )
            ]
            if filtered.shape[0] != 1:
                continue

        else:
            # NOTE(review): the source indentation was lost; the nesting below is the most plausible reading
            if house_no not in filtered["house_number"].values:
                # If this happens, we check house_number for a x-y range of addresses
                filtered["house_number_range"] = filtered["house_number"].apply(
                    self.extract_range_from_house_number
                )
                # If we have found a house number range, we check if the house number is in the range
                filtered['is_in_range'] = filtered['house_number_range'].apply(
                    lambda x: self.is_in_range(x, house_no)
                )

                if filtered['is_in_range'].any():
                    # If house_no is found in any range, keep only rows where it is in range
                    filtered = filtered[filtered['is_in_range']]
                else:
                    # If house_no is not found in any range, drop the rows that are ranges
                    filtered = filtered[filtered['house_number_range'].isnull()]

                # Strip out letters from house_no and house_number
                # NOTE(review): under this reading a row kept for its range ("1-5" -> "15") cannot pass the
                # equality filter below - confirm against the original file
                house_no = self.extract_numeric_part(house_no)
                filtered["house_number"] = filtered["house_number"].astype(str).apply(self.extract_numeric_part)
                match_type = "approximate"

            filtered = filtered[filtered["house_number"] == house_no]

        if filtered.empty:
            continue

        filtered_freehold = filtered[filtered["Tenure"] == "Freehold"]
        filtered_leasehold = filtered[filtered["Tenure"] == "Leasehold"]

        if filtered_freehold.shape[0] > 1:
            # Several freehold titles for one property: record the freehold titles themselves
            matched = filtered_freehold[["Title Number"]].copy()
            matched.insert(0, "UPRN", address["UPRN"])
            shared_freehold_match.append(matched)
        elif not filtered_freehold.empty:
            freehold_matching_lookup.append(
                {
                    "UPRN": address["UPRN"],
                    "Title Number": filtered_freehold["Title Number"].values[0],
                    "match_type": match_type,
                }
            )

        if filtered_leasehold.shape[0] > 1:
            matched = filtered_leasehold[["Title Number"]].copy()
            matched.insert(0, "UPRN", address["UPRN"])
            shared_leasehold_match.append(matched)
        elif not filtered_leasehold.empty:
            leasehold_matching_lookup.append(
                {
                    "UPRN": address["UPRN"],
                    "Title Number": filtered_leasehold["Title Number"].values[0],
                    "match_type": match_type,
                }
            )

    logger.info("Matching complete - creating lookup tables")

    # Passing the columns explicitly keeps them present even when nothing matched
    self.freehold_matching_lookup = pd.DataFrame(freehold_matching_lookup, columns=lookup_columns)
    self.leasehold_matching_lookup = pd.DataFrame(leasehold_matching_lookup, columns=lookup_columns)

    # Only exact matches are carried forward
    self.freehold_matching_lookup = self.freehold_matching_lookup[
        self.freehold_matching_lookup["match_type"] == "exact"
    ]
    self.leasehold_matching_lookup = self.leasehold_matching_lookup[
        self.leasehold_matching_lookup["match_type"] == "exact"
    ]

    self.shared_leasehold_match = shared_leasehold_match
    self.shared_freehold_match = shared_freehold_match

    # finally, we create matched addresses
    self.combined_matching_lookup = pd.concat([self.freehold_matching_lookup, self.leasehold_matching_lookup])

    # Remove duplicates at a title level
    self.combined_matching_lookup = self.remove_duplicate_matches(
        matching_lookup=self.combined_matching_lookup,
        properties=self.epc_data,
        company_ownership=self.ownership_data
    )
    # We also have duplicates at a UPRN level
    self.combined_matching_lookup = self.remove_duplicate_uprn_matches(
        matching_lookup=self.combined_matching_lookup,
        properties=self.epc_data,
        company_ownership=self.ownership_data
    )

    self.matched_addresses = self.combined_matching_lookup.merge(
        self.epc_data[
            [
                "UPRN",
                "ADDRESS",
                "ADDRESS1",
                "CURRENT_ENERGY_EFFICIENCY",
                "CURRENT_ENERGY_RATING",
                "POSTCODE",
                "LODGEMENT_DATE",
                "TRANSACTION_TYPE",
                "TENURE",
            ]
        ].rename(
            columns={
                "ADDRESS": "epc_address",
                "ADDRESS1": "epc_address1",
                "POSTCODE": "epc_postcode"
            }
        ),
        how="left", on="UPRN"
    ).merge(
        self.ownership_data[
            [
                "Title Number",
                "Property Address",
                "Postcode",
                "Company Registration No. (1)",
                "Proprietor Name (1)",
                "Date Proprietor Added",
            ]
        ],
        how="left", on="Title Number"
    )

    # Let's try and get the house number
    self.matched_addresses["house_number"] = (
        self.matched_addresses["epc_address"]
        .apply(self.remove_text_in_brackets)
        .apply(SearchEpc.get_house_number)
        .str.lower()
        .str.replace(",", "")
    )

    logger.info("Successfully completed matching")
|
|
|
|
def get_land_registry(self):
|
|
"""
|
|
This function reads in the land registry data and filters it on the postcodes found in the EPC data
|
|
"""
|
|
land_registry = pd.read_csv(self.land_registry_path, header=None)
|
|
land_registry.columns = self.LAND_REGISTRY_COLUMNS
|
|
land_registry = land_registry[
|
|
land_registry["postcode"].str.lower().isin(self.epc_data["POSTCODE"].str.lower().unique())
|
|
]
|
|
land_registry["date_of_transfer"] = pd.to_datetime(
|
|
land_registry["date_of_transfer"], format="%Y-%m-%d", errors="coerce"
|
|
)
|
|
# Take data from the last 5 years
|
|
land_registry = land_registry[
|
|
(land_registry["date_of_transfer"] >= datetime.now() - pd.DateOffset(years=5))
|
|
]
|
|
|
|
return land_registry
|
|
|
|
def match_with_land_registry(self):
    """
    Matches the land registry transactions to the existing EPC - ownership matches.

    For every matched address, the land registry rows of the same postcode and street are narrowed on the primary
    (paon) and secondary (saon) address parts until a single property is identified; its newest transaction is
    then recorded. The results are stored on self.land_registry_matches and merged onto self.matched_addresses,
    which also receives the sold_recently and sale_lodged_recently flags.

    :raises ValueError: If match() has not been run
    :raises NotImplementedError: If the land registry rows of an address cannot be narrowed to one property
    """
    # TODO: Refactor this entire function
    if self.matched_addresses is None:
        raise ValueError("Run match() first!")

    logger.info("Reading land registry data")
    self.land_registry = self.get_land_registry()
    # Store this filtered version in s3, honouring the save flag like every other intermediate output
    if self.save:
        save_excel_to_s3(
            df=self.land_registry,
            bucket_name=self.bucket,
            file_key=self.filtered_land_registry_filepath,
        )

    for col in ["postcode", "street", "paon", "saon"]:
        self.land_registry[col] = self.land_registry[col].str.lower().str.strip()

    self.land_registry["date_of_transfer"] = pd.to_datetime(self.land_registry["date_of_transfer"])

    logger.info("Performing land registry matching")
    match_columns = ["uprn", "transaction_id", "price", "date_of_transfer"]
    land_registry_matches = []

    def record(epc_match, lr_rows):
        # Stores the first row of lr_rows as the land registry match of this address
        land_registry_matches.append(
            {
                "uprn": epc_match["UPRN"],
                "transaction_id": lr_rows['transaction_id'].values[0],
                "price": lr_rows["price"].values[0],
                "date_of_transfer": lr_rows["date_of_transfer"].values[0],
            }
        )

    def newest(lr_rows):
        # Keeps only the most recent transaction
        return lr_rows.sort_values("date_of_transfer", ascending=False).head(1)

    # NOTE(review): the source indentation was lost; the nesting below is the most plausible reading
    for _, match in tqdm(self.matched_addresses.iterrows(), total=len(self.matched_addresses)):
        # Filter land registry on the postcode
        lr_filtered = self.land_registry[
            (self.land_registry["postcode"] == match["epc_postcode"].lower().strip())
        ].copy()

        # Filter further: the street should be contained in the EPC address or in the ownership address
        lr_filtered = lr_filtered[
            lr_filtered["street"].apply(lambda x: self.is_substring(x, match["epc_address"].lower())) |
            lr_filtered["street"].apply(lambda x: self.is_substring(x, match["Property Address"].lower()))
        ]

        if lr_filtered.empty:
            continue

        # We now check if paon matches the house number
        lr_filtered["paon_match"] = lr_filtered["paon"].apply(
            lambda x: self.house_number_match(x, match["house_number"])
        )
        # We also try the secondary match
        lr_filtered["saon_match"] = (
            lr_filtered["saon"].apply(
                lambda x: False if pd.isnull(x) else self.is_substring(x, match["epc_address1"])
            )
        )
        # We filter where we have a primary or secondary match
        lr_filtered = lr_filtered[
            lr_filtered["paon_match"] | lr_filtered["saon_match"]
        ]

        if lr_filtered.empty:
            continue
        elif lr_filtered.shape[0] == 1:
            record(match, lr_filtered)
            continue
        elif lr_filtered.shape[0] > 1:
            # We make sure all records are the same and take the newest
            all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)
            has_paon_match = any(lr_filtered["paon_match"])

            if all_paon_equal and all_street_equal and all_saon_equal:
                # Take the newest record, append and continue
                record(match, newest(lr_filtered))
                continue
            elif has_paon_match and all_street_equal:
                # Perform filter on paon
                lr_filtered = lr_filtered[lr_filtered["paon_match"]]
                # Do an additional equality check
                all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)
                if all_paon_equal and all_street_equal and all_saon_equal:
                    record(match, newest(lr_filtered))
                else:
                    # We do a match on saon
                    lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
                        lambda x: False if pd.isnull(x) else self.is_substring(x, match["epc_address"])
                    )

                    lr_filtered = lr_filtered[lr_filtered["saon_match2"]]

                    if lr_filtered.empty:
                        continue
                    elif lr_filtered.shape[0] == 1:
                        record(match, lr_filtered)
                        continue
                    else:
                        raise NotImplementedError("wtf")
            else:
                # We have a final check, based on an observed case
                lr_address_1 = " ".join([x.lower().strip() for x in match["Property Address"].split(",")[0:2]])

                lr_filtered["paon_match2"] = lr_filtered["paon"].apply(
                    lambda x: False if pd.isnull(x) else self.is_substring(x, lr_address_1)
                )

                lr_filtered = lr_filtered[lr_filtered["paon_match2"]]

                if lr_filtered.empty:
                    continue
                elif lr_filtered.shape[0] == 1:
                    record(match, lr_filtered)
                    continue
                else:
                    # Check all the same
                    all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)

                    # Check saon is house number with exact match
                    lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
                        lambda x: False if pd.isnull(x) else self.house_number_match(x, match["house_number"])
                    )
                    # We check if we have a flat
                    match_flat_number = re.match(r"flat (\d+)", match["epc_address1"].lower())
                    match_apartment_number = re.match(r"apartment (\d+)", match["epc_address1"].lower())
                    lr_filtered["saon_match3"] = False
                    if match_flat_number is not None:
                        # Get out the match
                        match_flat_number = "flat " + match_flat_number.group(1)
                        lr_filtered["saon_match3"] = lr_filtered["saon"].apply(
                            lambda x: False if pd.isnull(x) else x == match_flat_number
                        )

                    if match_apartment_number is not None:
                        # Get out the match
                        match_apartment_number = "apartment " + match_apartment_number.group(1)
                        lr_filtered["saon_match3"] = lr_filtered["saon"].apply(
                            lambda x: False if pd.isnull(x) else x == match_apartment_number
                        )

                    if all_paon_equal and all_saon_equal and all_street_equal:
                        # Take the newest record
                        record(match, newest(lr_filtered))
                        continue
                    elif any(lr_filtered["saon_match2"]):
                        lr_filtered = lr_filtered[lr_filtered["saon_match2"]]
                        # check_equalities returns (paon, saon, street), in that order
                        all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)
                        if all_paon_equal and all_saon_equal and all_street_equal:
                            # Filter on the newest record
                            lr_filtered = newest(lr_filtered)
                        if lr_filtered.shape[0] == 1:
                            record(match, lr_filtered)
                            continue
                    elif any(lr_filtered["saon_match3"]):
                        lr_filtered = lr_filtered[lr_filtered["saon_match3"]]
                        if lr_filtered.shape[0] == 1:
                            record(match, lr_filtered)
                            continue

                    # None of the fallbacks narrowed the rows down to a single property
                    raise NotImplementedError("wtf")
        else:
            raise NotImplementedError("What happened here?")

    # Passing the columns explicitly keeps the merge key present even when nothing matched
    self.land_registry_matches = pd.DataFrame(land_registry_matches, columns=match_columns)

    logger.info("Sucessfully completed land registry matching - merging onto matched_addresses")
    # Merge onto the EPC - ownership matches
    self.matched_addresses = self.matched_addresses.merge(
        self.land_registry_matches,
        how="left",
        left_on="UPRN",
        right_on="uprn"
    ).drop(columns=["uprn"])

    # With no land registry matches the merged column holds no dates at all; coerce it so it stays comparable
    self.matched_addresses["date_of_transfer"] = pd.to_datetime(self.matched_addresses["date_of_transfer"])

    # Flag anything that sold in the last SOLD_RECENTLY_MONTHS months. The keyword must be the plural `months`
    # (a relative shift); the singular `month` would instead replace the month of the timestamp
    self.matched_addresses["sold_recently"] = (
        self.matched_addresses["date_of_transfer"] >= pd.Timestamp.now() -
        pd.DateOffset(months=self.SOLD_RECENTLY_MONTHS)
    )

    self.matched_addresses["sale_lodged_recently"] = (
        (
            pd.to_datetime(
                self.matched_addresses["LODGEMENT_DATE"]
            ) >= pd.Timestamp.now() - pd.DateOffset(months=self.LODGED_RECENTLY_MONTHS)
        ) &
        (self.matched_addresses["TRANSACTION_TYPE"].isin(["marketed sale", "non marketed sale"]))
    )
|
|
|
|
def aggregate_matches(self, matching_lookup, company_ownership, properties):
    """
    Roll the matched properties up to one row per owning company, with a property count per local authority.

    :param matching_lookup: DataFrame with at least "Title Number" and "UPRN" columns
    :param company_ownership: DataFrame with "Title Number", "Company Registration No. (1)" and
        "Proprietor Name (1)" columns
    :param properties: DataFrame with at least "UPRN" and "LOCAL_AUTHORITY_LABEL" columns
    :return: DataFrame restricted to companies with more than one matched property, sorted by
        "total_number_of_properties" (descending). It holds one count column per local authority plus
        "Proprietor Name (1)", "total_number_of_properties", "approx_value" and "cumulative_value"
    """
    company_col = "Company Registration No. (1)"

    # Attach the owning company and the local authority to every matched title/UPRN pair
    df = matching_lookup.merge(
        company_ownership, how="left", on="Title Number"
    ).merge(
        properties[["UPRN", "LOCAL_AUTHORITY_LABEL"]], how="left", on="UPRN"
    )

    # Number of matched properties per (company, local authority) pair
    counts = (
        df.groupby([company_col, "LOCAL_AUTHORITY_LABEL"])["UPRN"]
        .count()
        .reset_index(name="number_of_properties")
    )

    # counts holds exactly one row per (company, local authority) pair, so pivot_table's default aggregation
    # returns that single count unchanged. The row order of counts does not influence the pivoted result.
    pivot_counts = counts.pivot_table(
        index=[company_col],  # Rows: companies
        columns="LOCAL_AUTHORITY_LABEL",  # Columns: each local authority
        values="number_of_properties",  # The counts of properties
        fill_value=0,  # Fill missing values with 0 (where there are no properties owned)
    ).reset_index()

    total_counts = (
        df.groupby([company_col])["UPRN"]
        .count()
        .reset_index(name="total_number_of_properties")
    )

    # The same company registration number can appear with different proprietor names, so we pick a single
    # name (the first one encountered) per company registration number
    best_names = (
        df.groupby([company_col])["Proprietor Name (1)"]
        .first()
        .reset_index()
    )

    total_counts = best_names.merge(total_counts, how="left", on=[company_col])
    pivot_counts = pivot_counts.merge(total_counts, how="left", on=[company_col])

    pivot_counts = pivot_counts.sort_values("total_number_of_properties", ascending=False)

    # Only keep owners with more than one property. Take an explicit copy so the column assignments below
    # write to an independent frame rather than to a filtered slice (avoids SettingWithCopyWarning).
    pivot_counts = pivot_counts[pivot_counts["total_number_of_properties"] > 1].copy()

    # Running total of approximate value, largest owners first
    pivot_counts["approx_value"] = self.average_property_value * pivot_counts["total_number_of_properties"]
    pivot_counts["cumulative_value"] = pivot_counts["approx_value"].cumsum()

    return pivot_counts
|
|
|
|
def create_final_matches(self):
    """
    Build the final portfolio tables from the matching done so far.

    Drops addresses flagged as sold recently or with a sale lodged recently, removes anything whose
    conservation / listed / heritage flags are set, rolls the remainder up by owner and keeps owners while
    the cumulative approximate value stays within self.portfolio_value. The results are stored on
    self.portfolio_owners, self.portfolio_properties and self.portfolio_epc_data, and written to S3 when
    self.save is truthy.

    :return:
    :raises ValueError: if the owner, property and EPC tables disagree on the number of properties
    """
    logger.info("Creating final matches")
    all_matches = self.matched_addresses
    still_held = ~all_matches["sold_recently"] & ~all_matches["sale_lodged_recently"]
    matched_addresses_final = all_matches[still_held].copy()

    logger.info("Performing conservation area and listed/herigage building filtering")

    candidate_uprns = matched_addresses_final["UPRN"].unique().tolist()
    spatial = OpenUprnClient.get_spatial_data(candidate_uprns, bucket_name="retrofit-data-dev")
    spatial = spatial[
        ["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]
    ].copy()
    # Cast the spatial UPRNs to string before using them as the merge key
    spatial["UPRN"] = spatial["UPRN"].astype(str)

    # Attach the spatial flags, then keep only rows where every flag is None or False
    matched_addresses_final = matched_addresses_final.merge(spatial, how="left", on="UPRN")
    no_conservation = matched_addresses_final["conservation_status"].isin([None, False])
    no_listing = matched_addresses_final["is_listed_building"].isin([None, False])
    no_heritage = matched_addresses_final["is_heritage_building"].isin([None, False])
    matched_addresses_final = matched_addresses_final[no_conservation & no_listing & no_heritage]

    # Restrict the title/UPRN lookup to the surviving addresses
    lookup = self.combined_matching_lookup
    combined_matching_lookup_final = lookup[lookup["UPRN"].isin(matched_addresses_final["UPRN"])]

    # Roll up portfolio
    combined_aggregate = self.aggregate_matches(
        matching_lookup=combined_matching_lookup_final,
        company_ownership=self.ownership_data,
        properties=self.epc_data,
    )

    within_budget = combined_aggregate["cumulative_value"] <= self.portfolio_value
    self.portfolio_owners = combined_aggregate[within_budget]

    owned_by_portfolio = matched_addresses_final["Company Registration No. (1)"].isin(
        self.portfolio_owners["Company Registration No. (1)"]
    )
    self.portfolio_properties = matched_addresses_final[owned_by_portfolio]

    # Sanity check: owner totals must add up to the number of distinct properties
    owner_total = self.portfolio_owners["total_number_of_properties"].sum()
    if owner_total != self.portfolio_properties["UPRN"].nunique():
        raise ValueError("Portfolio owners and properties don't match")

    in_portfolio = self.epc_data["UPRN"].isin(self.portfolio_properties["UPRN"])
    self.portfolio_epc_data = self.epc_data[in_portfolio]

    # Sanity check: every portfolio property must have EPC data and vice versa
    if self.portfolio_properties["UPRN"].nunique() != self.portfolio_epc_data["UPRN"].nunique():
        raise ValueError("Portfolio properties and epc data don't match")

    if not self.save:
        return

    logger.info("Storing final outpus")
    # Store data: each table is written to its matching "<name>_filepath" key
    for table_name in ("portfolio_owners", "portfolio_properties", "portfolio_epc_data"):
        save_excel_to_s3(
            df=getattr(self, table_name),
            bucket_name=self.bucket,
            file_key=getattr(self, f"{table_name}_filepath"),
        )
|
|
|
|
def get_asset_list(self):
    """
    Create the asset list from the portfolio EPC data.

    :return: a new DataFrame holding the UPRN, ADDRESS1 and POSTCODE columns of self.portfolio_epc_data,
        renamed to "uprn", "address" and "postcode"
    """
    # Source column -> asset list column; the key order also fixes the output column order
    column_map = {
        "UPRN": "uprn",
        "ADDRESS1": "address",
        "POSTCODE": "postcode",
    }

    selected = self.portfolio_epc_data[list(column_map)].copy()
    return selected.rename(columns=column_map)
|
|
|
|
def create_final_outputs(self, portfolio_timestamp, storage_date, exclusion_uprns=None):
    """
    Given the completed outputs of the matching process, this function creates the final outputs, after
    matching valuation data, and stores them under a "current" directory, which is our current view of the
    sfr portfolio. This means that we can iterate on the portfolio without affecting the final outputs, and
    then once we're happy with the new version, we can commit those files. This information shouldn't update
    very often and so we're ok to store this at a daily level.

    :param portfolio_timestamp: name of the folder under ownership/<project_name>/ that holds the
        portfolio_owners, portfolio_epc_data and portfolio_properties files to read
    :param storage_date: name of the folder under ownership/<project_name>/current/ to write the outputs to
    :param exclusion_uprns: optional list of UPRNs to cut from the portfolio. Exclusion is not implemented
        yet, so the call fails if any of them is present in the portfolio properties
    :return:
    :raises ValueError: if the owner, property and EPC tables disagree on the number of properties
    :raises NotImplementedError: if any of exclusion_uprns appears in the portfolio properties
    """
    exclusion_uprns = [] if exclusion_uprns is None else exclusion_uprns

    project_prefix = f"ownership/{self.project_name}"
    input_prefix = f"{project_prefix}/{portfolio_timestamp}"
    output_prefix = f"{project_prefix}/current/{storage_date}"

    # Step 1: Read in the valuations data
    valuations = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"{project_prefix}/sfr property valuations.xlsx",
        header_row=0,
    )

    # Load in the portfolio data
    # 1) owners
    portfolio_owners = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"{input_prefix}/portfolio_owners.xlsx",
        header_row=0,
    )
    # 2) EPC
    portfolio_epc_data = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"{input_prefix}/portfolio_epc_data.xlsx",
        header_row=0,
    )
    # 3) properties
    portfolio_properties = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"{input_prefix}/portfolio_properties.xlsx",
        header_row=0,
    )

    # Check they're the right size
    if portfolio_owners["total_number_of_properties"].sum() != portfolio_properties["UPRN"].nunique():
        raise ValueError("Portfolio owners and properties don't match")

    if portfolio_properties["UPRN"].nunique() != portfolio_epc_data["UPRN"].nunique():
        raise ValueError("Portfolio owners and properties don't match") if False else ValueError(
            "Portfolio properties and epc data don't match"
        )

    # We make some final cuts based on UPRNs that at a later stage are found to be odd
    if portfolio_properties["UPRN"].isin(exclusion_uprns).any():
        # TODO: implement the exclusion. The earlier sketch counted the excluded UPRNs per
        #  "Company Registration No. (1)", merged those counts onto portfolio_owners as
        #  "number_of_properties_to_exclude", and took the minimum "total_number_of_properties" across owners
        #  as a threshold.
        raise NotImplementedError("Implement me!")

    # Step 2: Merge in the valuations data
    portfolio_properties = portfolio_properties.merge(
        valuations.rename(columns={"uprn": "UPRN"}).drop(columns=["address", "postcode"]),
        how="left",
        on="UPRN",
    )

    # Step 3: Store the final outputs
    save_excel_to_s3(
        df=portfolio_owners,
        bucket_name=self.bucket,
        file_key=f"{output_prefix}/portfolio_owners.xlsx",
    )

    save_excel_to_s3(
        df=portfolio_properties,
        bucket_name=self.bucket,
        file_key=f"{output_prefix}/portfolio_properties.xlsx",
    )

    save_excel_to_s3(
        df=portfolio_epc_data,
        bucket_name=self.bucket,
        file_key=f"{output_prefix}/portfolio_epc_data.xlsx",
    )
|