Model/etl/ownership/Ownership.py
2024-10-11 10:16:48 +01:00

1139 lines
49 KiB
Python

from datetime import datetime
from typing import List
from tqdm import tqdm
import pandas as pd
import Levenshtein
import re
from utils.s3 import save_excel_to_s3, read_excel_from_s3
from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc
from etl.spatial.OpenUprnClient import OpenUprnClient
logger = setup_logger()
class Ownership:
# Prefix phrases found in the "Property Address" field of the ownership data. A record whose address begins
# with any of these terms does not reference the property itself (adjoining land, garages, ...), so
# prepare_for_matching() drops the whole record before matching.
OWNERSHIP_STARTING_TERMS = [
"land adjoining", "land on the", "land to the rear of", "land and buildings on the",
"garage adjoining", "car park adjoining", "the land adjoining", "land and buildings adjoining",
"all royal mines"
]
# Anything with a land registry transfer within this many months is flagged as "sold_recently"; flagged
# properties are excluded when the final matches are created
SOLD_RECENTLY_MONTHS = 12
# Anything whose EPC was lodged for a marketed or non marketed sale within this many months is flagged as
# "sale_lodged_recently", i.e. potentially in the process of being sold
LODGED_RECENTLY_MONTHS = 12
# Column names of the land registry data. The land registry CSV is read without a header row and these names
# are assigned to its columns in order
LAND_REGISTRY_COLUMNS = [
"transaction_id",
"price",
"date_of_transfer",
"postcode",
"property_type",
"old_new",
"duration",
"paon",
"saon",
"street",
"locality",
"town_city",
"district",
"county",
"ppd_category_type",
"record_status",
]
def __init__(
self,
epc_paths: List[str],
domestic_ownership_path: str,
overseas_ownership_path: str,
land_registry_path: str,
project_name: str,
bucket: str,
average_property_value: float,
portfolio_value: float,
excluded_owners: List[str] = None,
excluded_uprns: List[int] = None,
save=True
):
"""
:param epc_paths: A list of strings, which points to the location of the EPC data to be used. TO date, this
data has been held locally, and so will require extension to read from remote locaations like
s3
:param domestic_ownership_path: A string which points to the location of the CCOD ownership data, that details
corporate ownership of properties in the UK, where the companies are UK based
:param overseas_ownership_path: A string which points to the location of the OCOD ownership data, that details
corporate ownership of properties in the UK, where the companies are overseas
:param land_registry_path: A string that points to the location of the land registry data
:param project_name: A string that is used to identify the project
:param bucket: The name of the s3 bucket where the data will be stored
:param average_property_value: The average property value in the area
"""
# All epc paths should end with certificates.csv
if not any(path for path in epc_paths if path.endswith("certificates.csv")):
raise ValueError("epc_paths contains a path that does not end with certificates.csv")
self.epc_paths = epc_paths
self.domestic_ownership_path = domestic_ownership_path
self.overseas_ownership_path = overseas_ownership_path
self.land_registry_path = land_registry_path
self.excluded_owners = [] if excluded_owners is None else excluded_owners
self.excluded_uprns = [] if excluded_uprns is None else excluded_uprns
self.run_timestamp = str(datetime.now())
self.project_name = project_name
self.bucket = bucket
self.average_property_value = average_property_value
self.portfolio_value = portfolio_value
# Data storage paths
self.epc_data_filepath = f"ownership/{self.project_name}/{self.run_timestamp}/epc_data.xlsx"
self.filtered_land_registry_filepath = (
f"ownership/{self.project_name}/{self.run_timestamp}/filtered_land_registry.xlsx"
)
self.matched_addresses_pre_filter_filepath = (
f"ownership/{self.project_name}/{self.run_timestamp}/matched_addresses_pre_filter.xlsx"
)
self.combined_matching_lookup_pre_filter_filepath = (
f"ownership/{self.project_name}/{self.run_timestamp}/combined_matching_lookup_pre_filter.xlsx"
)
# Final output paths
self.portfolio_owners_filepath = f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_owners.xlsx"
self.portfolio_properties_filepath = (
f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_properties.xlsx"
)
self.portfolio_epc_data_filepath = (
f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_epc_data.xlsx"
)
self.save = save
# Data
self.epc_data = None
self.ownership_data = None
self.freehold_matching_lookup = None
self.leasehold_matching_lookup = None
self.shared_freehold_match = None
self.shared_leasehold_match = None
self.land_registry = None
# Match tables
self.combined_matching_lookup = None
self.matched_addresses = None
self.land_registry_matches = None
# Final outputs data
self.portfolio_owners = None
self.portfolio_properties = None
self.portfolio_epc_data = None
def pipeline(self, column_filters=None):
    """
    Runs the full ownership process, from sourcing the EPC data through to creating the final matches
    :param column_filters: Dictionary with column names as keys and list of acceptable values as values. This
                           dictionary is used to filter the EPC data and should look like this:
                           {"column_name": ["value1", "value2", ...]}, where column_name is the name of the column
                           in the EPC data and ["value1", "value2", ...] is a list of acceptable values for that
                           column. If a column is not found in the EPC data, an exception is raised.
    """
    # Step 1: Get EPC data
    self.source_epc_properties(column_filters=column_filters)
    # Steps 2 - 5, in order: load the company ownership data, prepare both datasets for matching, match the
    # EPC data to the ownership data and finally match land registry data to those matches
    remaining_steps = (
        self.load_company_ownership,
        self.prepare_for_matching,
        self.match,
        self.match_with_land_registry,
    )
    for run_step in remaining_steps:
        run_step()
    # The unfiltered match tables are stored in s3 before any filtering takes place
    if self.save:
        pre_filter_outputs = (
            (self.matched_addresses, self.matched_addresses_pre_filter_filepath),
            (self.combined_matching_lookup, self.combined_matching_lookup_pre_filter_filepath),
        )
        for frame, key in pre_filter_outputs:
            save_excel_to_s3(df=frame, bucket_name=self.bucket, file_key=key)
    # Prepare the final outputs:
    self.create_final_matches()
def source_epc_properties(self, column_filters=None, postcodes=None):
    """
    This function will filter the epc data as specified by column filters, searching across all of the EPC tables.
    For each file, records without a UPRN are dropped and only the most recently lodged certificate per UPRN is
    kept before the filters are applied. The combined result is stored on self.epc_data
    :param column_filters: Dictionary with column names as keys and list of acceptable values as values. This
                           dictionary is used to filter the EPC data and should look like this:
                           {"column_name": ["value1", "value2", ...]}, where column_name is the name of the column
                           in the EPC data and ["value1", "value2", ...] is a list of acceptable values for that
                           column. If a column is not found in the EPC data, an exception is raised.
    :param postcodes: A list of postcodes to filter the data on. The comparison is case insensitive
    :raises Exception: If LODGEMENT_DATETIME contains invalid data or a filter column is missing
    :raises ValueError: If no EPC records remain once all of the filters have been applied
    """
    column_filters = {} if column_filters is None else column_filters
    # The EPC postcodes are lower-cased before comparison, so the requested postcodes must be as well
    wanted_postcodes = None if postcodes is None else [str(postcode).lower() for postcode in postcodes]
    data = []
    for path in tqdm(self.epc_paths):
        epc_data = pd.read_csv(path, low_memory=False)
        # Copy so that the column assignments below do not write to a slice of the raw frame
        epc_data = epc_data[~pd.isnull(epc_data["UPRN"])].copy()
        epc_data["UPRN"] = epc_data["UPRN"].astype(int).astype(str)
        lodged = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")
        if lodged.isnull().any():
            raise Exception("Lodgement datetime contains invalid data")
        epc_data["LODGEMENT_DATETIME"] = lodged
        # Keep the newest certificate per UPRN
        epc_data = epc_data.sort_values(["LODGEMENT_DATETIME"], ascending=False).drop_duplicates("UPRN")
        # Apply column filters
        for column, values in column_filters.items():
            if column not in epc_data.columns:
                raise Exception(f"Column {column} not found in data. column_filters is malformed")
            epc_data = epc_data[epc_data[column].isin(values)]
        if wanted_postcodes is not None:
            epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(wanted_postcodes)]
        if epc_data.empty:
            continue
        data.append(epc_data)
    if not data:
        # pd.concat fails with an unhelpful message on an empty list, so fail explicitly
        raise ValueError("No EPC records remain after applying column_filters and postcodes")
    self.epc_data = pd.concat(data, ignore_index=True)
    if self.excluded_uprns:
        self.epc_data = self.epc_data[~self.epc_data["UPRN"].astype(float).isin(self.excluded_uprns)]
    if self.save:
        # We now store the data in s3
        save_excel_to_s3(
            df=self.epc_data,
            bucket_name=self.bucket,
            file_key=self.epc_data_filepath
        )
def load_company_ownership(self):
    """
    This function reads in the domestic and overseas company ownership data, combines them (flagging each record
    with is_overseas), keeps only the postcodes present in the EPC data and removes the excluded owners. The
    result is stored on self.ownership_data
    :return:
    """
    logger.info("Reading in company ownership data")
    domestic = pd.read_csv(self.domestic_ownership_path)
    self.ownership_data = domestic
    domestic["is_overseas"] = False
    overseas = pd.read_csv(self.overseas_ownership_path)
    overseas["is_overseas"] = True
    combined = pd.concat([domestic, overseas])
    self.ownership_data = combined
    # Filter on relevant postcodes - this is done to reduce the large size of the ownership dataset
    logger.info("Filtering ownership data on EPC postcodes")
    ownership_postcodes = combined["Postcode"].str.lower()
    epc_postcodes = self.epc_data["POSTCODE"].str.lower().unique()
    self.ownership_data = combined[ownership_postcodes.isin(epc_postcodes)]
    logger.info("Removing excluded owners")
    # The company registration number identifies the owners to exclude
    registration_numbers = self.ownership_data["Company Registration No. (1)"].astype(str)
    is_excluded = registration_numbers.isin(self.excluded_owners)
    self.ownership_data = self.ownership_data[~is_excluded]
def prepare_for_matching(self):
    """
    Prepares the EPC properties and the ownership data for matching: the EPC data is restricted to postcodes
    that appear in the ownership data and de-duplicated on UPRN, and ownership records that do not reference
    the property itself are removed
    """
    logger.info("Preparing data for matching")
    # The ownership data might not have all of the postcodes that appear in the EPC data, so the EPC data is
    # filtered on the ownership postcodes as well
    epc_postcodes = self.epc_data["POSTCODE"].str.lower()
    owned_postcodes = self.ownership_data["Postcode"].str.lower().unique()
    self.epc_data = self.epc_data[epc_postcodes.isin(owned_postcodes)]
    # Some UPRNs are duplicated - keep the most recently lodged record for each
    newest_first = self.epc_data.sort_values("LODGEMENT_DATE", ascending=False)
    self.epc_data = newest_first.drop_duplicates("UPRN")
    # Drop ownership records whose address begins with a term such as "land adjoining", since these do not
    # reference the property itself
    for prefix in self.OWNERSHIP_STARTING_TERMS:
        lowered_addresses = self.ownership_data["Property Address"].str.lower()
        has_prefix = lowered_addresses.str.startswith(prefix)
        self.ownership_data = self.ownership_data[~has_prefix]
@staticmethod
def extract_numeric_part(house_number: str) -> str:
"""
Extracts only the numeric part from a house number that may contain letters.
Parameters:
- house_number (str): The house number string possibly containing letters.
Returns:
- str: The numeric part of the house number.
"""
# Use regular expression to replace all non-digit characters with nothing
numeric_part = re.sub(r'\D', '', house_number)
return numeric_part
@staticmethod
def remove_text_in_brackets(address: str) -> str:
"""
Removes any text within parentheses, including the parentheses themselves.
Parameters:
- address (str): The address string to clean.
Returns:
- str: The cleaned address with text in parentheses removed.
"""
# Regex to find and remove content in parentheses
cleaned_address = re.sub(r'\s*\([^)]*\)', '', address)
return cleaned_address
@staticmethod
def extract_range_from_house_number(house_number_range: str):
"""
Detects if the house number includes a numeric range (formatted as 'x-y') and extracts all values within this
range.
Non-numeric strings containing hyphens are ignored.
Parameters:
- house_number_range (str): The house number string that might contain a range.
Returns:
- list of str: A list of all numbers within the range if it is a range; otherwise, returns None.
"""
if not house_number_range:
return None
if '-' in house_number_range:
parts = house_number_range.split('-')
if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
# Both parts are numeric, so it's a valid range
start, end = map(int, parts) # Convert parts to integers
return [str(x) for x in range(start, end + 1)]
else:
# Not a valid numeric range
return None
else:
# No hyphen present or not a range
return None
@staticmethod
def is_in_range(row, house_no):
""" Check if the house number is within the range provided in the row. """
if row and any(house_no == num for num in row):
return True
return False
@staticmethod
def levenstein_match(matching_string, df, address_col):
    """
    Finds the row of df whose address is closest to matching_string, by Levenshtein distance.
    Both sides are normalised in the same way before they are compared (punctuation and whitespace removed,
    lower-cased), so formatting differences do not count towards the distance.
    Parameters:
    - matching_string (str): The address to match against.
    - df (pd.DataFrame): The candidate rows.
    - address_col (str): The column of df that holds the candidate addresses.
    Returns:
    - pd.DataFrame: A one-row slice of df holding the best match (the first one on ties). An empty df is
      returned unchanged.
    """
    def normalise(text):
        # Missing addresses are treated as empty strings so they can still be compared
        if pd.isnull(text):
            return ""
        without_punctuation = re.sub(r'[^\w\s]', '', str(text))
        return re.sub(r'\s+', '', without_punctuation).lower()

    if df.empty:
        return df
    target = normalise(matching_string)
    distances = [Levenshtein.distance(target, normalise(candidate)) for candidate in df[address_col].tolist()]
    best_match_index = distances.index(min(distances))
    # We might want to consider a threshold for the distance, however we don't do this for the moment
    return df.iloc[best_match_index:best_match_index + 1]
@classmethod
def remove_duplicate_matches(cls, matching_lookup, properties, company_ownership):
duplicated_titles = matching_lookup[matching_lookup["Title Number"].duplicated()]["Title Number"].unique()
to_drop = []
for dupe_title in duplicated_titles:
dupe_data = matching_lookup[matching_lookup["Title Number"] == dupe_title].copy()
matched_addresses = dupe_data.merge(
properties[["UPRN", "ADDRESS"]].rename(columns={"ADDRESS": "epc_address"}),
how="left", on="UPRN"
).merge(
company_ownership[["Title Number", "Property Address"]],
how="left", on="Title Number"
)
# We perform levenstein to get the best match
best_match = cls.levenstein_match(
matching_string=matched_addresses["Property Address"].values[0],
df=matched_addresses,
address_col="epc_address"
)
matches_to_drop = matched_addresses[
~matched_addresses["UPRN"].isin(best_match["UPRN"].values)
]
to_drop.append(
matches_to_drop[["UPRN", "Title Number"]].copy()
)
to_drop = pd.concat(to_drop) if to_drop else pd.DataFrame()
if not to_drop.empty:
merged = pd.merge(matching_lookup, to_drop, on=['UPRN', 'Title Number'], how='left', indicator=True)
merged = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
return merged
return matching_lookup
@classmethod
def remove_duplicate_uprn_matches(cls, matching_lookup, properties, company_ownership):
dupe_uprns = matching_lookup[matching_lookup["UPRN"].duplicated()]["UPRN"].unique().tolist()
to_drop = []
for dupe_uprn in dupe_uprns:
dupe_data = matching_lookup[matching_lookup["UPRN"] == dupe_uprn].copy()
matched_addresses = dupe_data.merge(
properties[["UPRN", "ADDRESS"]].rename(columns={"ADDRESS": "epc_address"}),
how="left", on="UPRN"
).merge(
company_ownership[["Title Number", "Property Address"]],
how="left", on="Title Number"
)
# We perform levenstein to get the best match
best_match = cls.levenstein_match(
matching_string=matched_addresses["Property Address"].values[0],
df=matched_addresses,
address_col="epc_address"
)
matches_to_drop = matched_addresses[
~matched_addresses["Title Number"].isin(best_match["Title Number"].values)
]
to_drop.append(
matches_to_drop[["UPRN", "Title Number"]].copy()
)
to_drop = pd.concat(to_drop)
if not to_drop.empty:
merged = pd.merge(matching_lookup, to_drop, on=['UPRN', 'Title Number'], how='left', indicator=True)
merged = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
return merged
return matching_lookup
@staticmethod
def is_substring(x, match_string):
if pd.isnull(x):
return False
return x in match_string.lower()
@staticmethod
def house_number_match(paon, house_number):
# Firstly try and convert to numberic
try:
paon_numeric = int(paon)
house_number_numeric = int(house_number)
return paon_numeric == house_number_numeric
except Exception as e: # noqa
# If we can't convert both to numeric, we do an equality
return paon == house_number
@staticmethod
def check_equalities(lr_filtered):
all_paon_equal = all(lr_filtered["paon"] == lr_filtered["paon"].values[0])
if pd.isnull(lr_filtered["saon"].values[0]):
all_saon_equal = all(pd.isnull(lr_filtered["saon"]))
else:
all_saon_equal = all(lr_filtered["saon"] == lr_filtered["saon"].values[0])
all_street_equal = all(lr_filtered["street"] == lr_filtered["street"].values[0])
return all_paon_equal, all_saon_equal, all_street_equal
def match(self):
    """
    Matches every EPC property to the company-owned titles in the same postcode and builds the lookup tables.
    For each EPC record:
    - without a house number (usually a named property), the full EPC address has to be contained in exactly
      one ownership address
    - with a house number, the ownership records with the same house number are taken as an "exact" match. If
      there are none, house number ranges ("x-y") are checked and, failing that, the numbers are compared with
      their letters removed, which is an "approximate" match
    Only "exact" matches are kept in the lookups. A single freehold / leasehold title per property goes into the
    respective lookup, whereas several titles of the same tenure are collected in shared_freehold_match /
    shared_leasehold_match. Finally the combined lookup is de-duplicated and merged with the EPC and ownership
    details to form matched_addresses
    :raises ValueError: If the EPC data or the ownership data has not been loaded
    """
    if (self.epc_data is None) or (self.ownership_data is None):
        raise ValueError("epc_data and ownership_data should not be null")
    logger.info("Matching EPC data to ownership data")
    lookup_columns = ["UPRN", "Title Number", "match_type"]
    freehold_matching_lookup = []
    leasehold_matching_lookup = []
    shared_leasehold_match = []
    shared_freehold_match = []
    # Lower-case the ownership postcodes once, rather than for every EPC record
    ownership_postcodes = self.ownership_data["Postcode"].str.lower()
    for _, address in tqdm(self.epc_data.iterrows(), total=len(self.epc_data)):
        match_type = "exact"
        filtered = self.ownership_data[ownership_postcodes == address["POSTCODE"].lower()].copy()
        if filtered.empty:
            continue
        # Remove bracketed text (e.g. the postcode) and trailing commas
        filtered["house_number"] = (
            filtered["Property Address"]
            .apply(self.remove_text_in_brackets)
            .apply(SearchEpc.get_house_number)
            .str.lower()
            .str.replace(",", "")
        )
        house_no = SearchEpc.get_house_number(address["ADDRESS1"])
        if house_no is None:
            # If the house number is missing, it means that we usually have a named property so we look for an
            # exact match on that name. The address is matched literally: it is not a regular expression
            contains_address = filtered["Property Address"].str.lower().str.contains(
                address["ADDRESS"].lower(), regex=False, na=False
            )
            filtered = filtered[contains_address]
            if filtered.shape[0] != 1:
                continue
        else:
            # The ownership house numbers are lower-cased, so the EPC house number has to be as well
            house_no = house_no.replace(",", "").lower()
            if house_no not in filtered["house_number"].values:
                # If this happens, we check house_number for a x-y range of addresses. Missing house numbers
                # cannot hold a range
                filtered["house_number_range"] = filtered["house_number"].apply(
                    lambda x: self.extract_range_from_house_number(x) if isinstance(x, str) else None
                )
                filtered["is_in_range"] = filtered["house_number_range"].apply(
                    lambda x: self.is_in_range(x, house_no)
                )
                if filtered["is_in_range"].any():
                    # If house_no is found in any range, keep only rows where it is in range
                    # NOTE(review): these rows still go through the equality filter below, where an "x-y"
                    # house number never equals a single number - confirm whether range matches are meant
                    # to survive
                    filtered = filtered[filtered["is_in_range"]]
                else:
                    # If house_no is not found in any range, filter out rows that hold a range
                    filtered = filtered[filtered["house_number_range"].isnull()].copy()
                    # Strip out letters from house_no and house_number
                    house_no = self.extract_numeric_part(house_no)
                    filtered["house_number"] = (
                        filtered["house_number"].astype(str).apply(self.extract_numeric_part)
                    )
                    match_type = "approximate"
            filtered = filtered[filtered["house_number"] == house_no]
        if filtered.empty:
            continue
        filtered_freehold = filtered[filtered["Tenure"] == "Freehold"]
        filtered_leasehold = filtered[filtered["Tenure"] == "Leasehold"]
        if filtered_freehold.shape[0] > 1:
            # Several freehold titles for the same property: record all of the freehold titles
            matched = filtered_freehold[["Title Number"]].copy()
            matched.insert(0, "UPRN", address["UPRN"])
            shared_freehold_match.append(matched)
        elif not filtered_freehold.empty:
            freehold_matching_lookup.append(
                {
                    "UPRN": address["UPRN"],
                    "Title Number": filtered_freehold["Title Number"].values[0],
                    "match_type": match_type,
                }
            )
        if filtered_leasehold.shape[0] > 1:
            matched = filtered_leasehold[["Title Number"]].copy()
            matched.insert(0, "UPRN", address["UPRN"])
            shared_leasehold_match.append(matched)
        elif not filtered_leasehold.empty:
            leasehold_matching_lookup.append(
                {
                    "UPRN": address["UPRN"],
                    "Title Number": filtered_leasehold["Title Number"].values[0],
                    "match_type": match_type,
                }
            )
    logger.info("Matching complete - creating lookup tables")
    # The columns are given explicitly so that the lookups keep their shape when nothing was matched
    self.freehold_matching_lookup = pd.DataFrame(freehold_matching_lookup, columns=lookup_columns)
    self.leasehold_matching_lookup = pd.DataFrame(leasehold_matching_lookup, columns=lookup_columns)
    self.freehold_matching_lookup = self.freehold_matching_lookup[
        self.freehold_matching_lookup["match_type"] == "exact"
    ]
    self.leasehold_matching_lookup = self.leasehold_matching_lookup[
        self.leasehold_matching_lookup["match_type"] == "exact"
    ]
    self.shared_leasehold_match = shared_leasehold_match
    self.shared_freehold_match = shared_freehold_match
    # finally, we create matched addresses
    self.combined_matching_lookup = pd.concat([self.freehold_matching_lookup, self.leasehold_matching_lookup])
    # Remove duplicates
    self.combined_matching_lookup = self.remove_duplicate_matches(
        matching_lookup=self.combined_matching_lookup,
        properties=self.epc_data,
        company_ownership=self.ownership_data
    )
    # We also have duplicates at a UPRN level
    self.combined_matching_lookup = self.remove_duplicate_uprn_matches(
        matching_lookup=self.combined_matching_lookup,
        properties=self.epc_data,
        company_ownership=self.ownership_data
    )
    self.matched_addresses = self.combined_matching_lookup.merge(
        self.epc_data[
            [
                "UPRN",
                "ADDRESS",
                "ADDRESS1",
                "CURRENT_ENERGY_EFFICIENCY",
                "CURRENT_ENERGY_RATING",
                "POSTCODE",
                "LODGEMENT_DATE",
                "TRANSACTION_TYPE",
                "TENURE",
            ]
        ].rename(
            columns={
                "ADDRESS": "epc_address",
                "ADDRESS1": "epc_address1",
                "POSTCODE": "epc_postcode"
            }
        ),
        how="left", on="UPRN"
    ).merge(
        self.ownership_data[
            [
                "Title Number",
                "Property Address",
                "Postcode",
                "Company Registration No. (1)",
                "Proprietor Name (1)",
                "Date Proprietor Added",
            ]
        ],
        how="left", on="Title Number"
    )
    # Let's try and get the house number
    self.matched_addresses["house_number"] = (
        self.matched_addresses["epc_address"]
        .apply(self.remove_text_in_brackets)
        .apply(SearchEpc.get_house_number)
        .str.lower()
        .str.replace(",", "")
    )
    logger.info("Successfully completed matching")
def get_land_registry(self, years: int = 5):
    """
    This function reads in the land registry data and filters it on the postcodes found in the EPC data,
    keeping only the recent transfers
    :param years: Transfers older than this many years are dropped. Defaults to 5
    :return: The filtered land registry data, with date_of_transfer parsed to datetimes (records whose date
             cannot be parsed are dropped by the date filter)
    """
    # The file has no header row, so the column names are assigned
    land_registry = pd.read_csv(self.land_registry_path, header=None)
    land_registry.columns = self.LAND_REGISTRY_COLUMNS
    epc_postcodes = self.epc_data["POSTCODE"].str.lower().unique()
    # Copy so that the date column below is not assigned on a slice of the full dataset
    land_registry = land_registry[land_registry["postcode"].str.lower().isin(epc_postcodes)].copy()
    land_registry["date_of_transfer"] = pd.to_datetime(
        land_registry["date_of_transfer"], format="%Y-%m-%d", errors="coerce"
    )
    # Take data from the last `years` years
    cutoff = datetime.now() - pd.DateOffset(years=years)
    return land_registry[land_registry["date_of_transfer"] >= cutoff]
def match_with_land_registry(self):
    """
    This function matches the land registry data to the existing matches. For every matched address, the land
    registry records in the same postcode, whose street appears in the EPC or ownership address and whose
    primary or secondary address number matches, are considered. If several records remain, a series of checks
    tries to narrow them down to a single transaction, preferring the newest one when the records describe the
    same address. The chosen transactions are merged onto matched_addresses, which additionally gets the
    sold_recently and sale_lodged_recently flags
    :raises ValueError: If match() has not been run
    :raises NotImplementedError: If several land registry records remain and none of the checks resolves them
    :return:
    """
    # TODO: Refactor this entire function
    if self.matched_addresses is None:
        raise ValueError("Run match() first!")
    logger.info("Reading land registry data")
    # Copy so that the cleaning below does not write to a slice of the source data
    self.land_registry = self.get_land_registry().copy()
    if self.save:
        # Store this filtered version in s3
        save_excel_to_s3(
            df=self.land_registry,
            bucket_name=self.bucket,
            file_key=self.filtered_land_registry_filepath,
        )
    for col in ["postcode", "street", "paon", "saon"]:
        self.land_registry[col] = self.land_registry[col].str.lower().str.strip()
    self.land_registry["date_of_transfer"] = pd.to_datetime(self.land_registry["date_of_transfer"])
    record_columns = ["uprn", "transaction_id", "price", "date_of_transfer"]

    def newest(rows):
        # The most recent transfer, as a one-row frame
        return rows.sort_values("date_of_transfer", ascending=False).head(1)

    def to_record(uprn, rows):
        # The land registry details that get merged onto matched_addresses, taken from the first row
        return {
            "uprn": uprn,
            "transaction_id": rows["transaction_id"].values[0],
            "price": rows["price"].values[0],
            "date_of_transfer": rows["date_of_transfer"].values[0],
        }

    def resolve(match, rows):
        # Narrows several candidate rows down to a single one. Returns a one-row frame, or None when the
        # property has no usable transaction
        all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(rows)
        if all_paon_equal and all_street_equal and all_saon_equal:
            # All records describe the same address, so we take the newest
            return newest(rows)
        if rows["paon_match"].any() and all_street_equal:
            # Perform filter on paon and do an additional equality check
            rows = rows[rows["paon_match"]]
            if all(self.check_equalities(rows)):
                return newest(rows)
            # We do a match on saon against the full EPC address
            saon_in_address = rows["saon"].apply(
                lambda x: False if pd.isnull(x) else self.is_substring(x, match["epc_address"])
            )
            rows = rows[saon_in_address]
            if rows.empty:
                return None
            if rows.shape[0] == 1:
                return rows
            raise NotImplementedError("wtf")
        # We have a final check, based on an observed case: paon appears in the start of the ownership address
        lr_address_1 = " ".join([x.lower().strip() for x in match["Property Address"].split(",")[0:2]])
        paon_in_title_address = rows["paon"].apply(
            lambda x: False if pd.isnull(x) else self.is_substring(x, lr_address_1)
        )
        rows = rows[paon_in_title_address]
        if rows.empty:
            return None
        if rows.shape[0] == 1:
            return rows
        # Check all the same
        equalities = self.check_equalities(rows)
        # Check saon is house number with exact match
        saon_is_house_number = rows["saon"].apply(
            lambda x: False if pd.isnull(x) else self.house_number_match(x, match["house_number"])
        )
        # We check if we have a flat or an apartment; both patterns are anchored at the start of the address
        address1 = match["epc_address1"].lower()
        unit_label = None
        flat_number = re.match(r"flat (\d+)", address1)
        apartment_number = re.match(r"apartment (\d+)", address1)
        if flat_number is not None:
            unit_label = "flat " + flat_number.group(1)
        if apartment_number is not None:
            unit_label = "apartment " + apartment_number.group(1)
        if unit_label is None:
            saon_is_unit = pd.Series(False, index=rows.index)
        else:
            saon_is_unit = rows["saon"].apply(lambda x: False if pd.isnull(x) else x == unit_label)
        if all(equalities):
            # Take the newest record
            return newest(rows)
        if saon_is_house_number.any():
            rows = rows[saon_is_house_number]
            if all(self.check_equalities(rows)):
                # Filter on the newest record
                rows = newest(rows)
            if rows.shape[0] == 1:
                return rows
        elif saon_is_unit.any():
            rows = rows[saon_is_unit]
            if rows.shape[0] == 1:
                return rows
        raise NotImplementedError("wtf")

    logger.info("Performing land registry matching")
    land_registry_matches = []
    for _, match in tqdm(self.matched_addresses.iterrows(), total=len(self.matched_addresses)):
        # Filter land registry on the postcode
        lr_filtered = self.land_registry[
            (self.land_registry["postcode"] == match["epc_postcode"].lower().strip())
        ].copy()
        if lr_filtered.empty:
            continue
        # Filter further: the street should be contained in the EPC address or in the ownership address
        epc_address = match["epc_address"].lower()
        title_address = match["Property Address"].lower()
        street_in_address = lr_filtered["street"].apply(
            lambda x: self.is_substring(x, epc_address) or self.is_substring(x, title_address)
        )
        lr_filtered = lr_filtered[street_in_address].copy()
        if lr_filtered.empty:
            continue
        # We now check if paon is the house number
        lr_filtered["paon_match"] = lr_filtered["paon"].apply(
            lambda x: self.house_number_match(x, match["house_number"])
        )
        # We also try the secondary match, against address 1
        lr_filtered["saon_match"] = lr_filtered["saon"].apply(
            lambda x: False if pd.isnull(x) else self.is_substring(x, match["epc_address1"])
        )
        # We filter where we have a primary or secondary match
        lr_filtered = lr_filtered[lr_filtered["paon_match"] | lr_filtered["saon_match"]]
        if lr_filtered.empty:
            continue
        chosen = lr_filtered if lr_filtered.shape[0] == 1 else resolve(match, lr_filtered)
        if chosen is not None:
            land_registry_matches.append(to_record(match["UPRN"], chosen))
    # The columns are given explicitly so that the merge below also works when nothing was matched. Identical
    # records (from repeated rows in matched_addresses) are dropped so that the merge does not multiply rows
    self.land_registry_matches = pd.DataFrame(land_registry_matches, columns=record_columns).drop_duplicates()
    logger.info("Sucessfully completed land registry matching - merging onto matched_addresses")
    # Merge onto the EPC - ownership matches
    self.matched_addresses = self.matched_addresses.merge(
        self.land_registry_matches,
        how="left",
        left_on="UPRN",
        right_on="uprn"
    ).drop(columns=["uprn"])
    now = pd.Timestamp.now()
    # Flag anything that sold in the last SOLD_RECENTLY_MONTHS months. The plural "months" subtracts from the
    # date, whereas the singular "month" would replace the month of the date
    transfer_dates = pd.to_datetime(self.matched_addresses["date_of_transfer"])
    self.matched_addresses["sold_recently"] = (
        transfer_dates >= now - pd.DateOffset(months=self.SOLD_RECENTLY_MONTHS)
    )
    self.matched_addresses["sale_lodged_recently"] = (
        (
            pd.to_datetime(
                self.matched_addresses["LODGEMENT_DATE"]
            ) >= now - pd.DateOffset(months=self.LODGED_RECENTLY_MONTHS)
        ) &
        (self.matched_addresses["TRANSACTION_TYPE"].isin(["marketed sale", "non marketed sale"]))
    )
def aggregate_matches(self, matching_lookup, company_ownership, properties):
    """
    Rolls the matches up to one row per company: the number of matched properties per local authority, the
    total number of properties, a proprietor name and the approximate and cumulative portfolio value.
    Companies with a single property are left out
    :param matching_lookup: The UPRN to Title Number lookup
    :param company_ownership: The ownership data, joined on Title Number
    :param properties: The EPC data, providing the LOCAL_AUTHORITY_LABEL of each UPRN
    :return: The aggregated table, sorted by the total number of properties in descending order
    """
    company = "Company Registration No. (1)"
    authority = "LOCAL_AUTHORITY_LABEL"
    total = "total_number_of_properties"
    with_owner = matching_lookup.merge(company_ownership, how="left", on="Title Number")
    df = with_owner.merge(properties[["UPRN", authority]], how="left", on="UPRN")
    # Number of properties per company and local authority
    per_authority = df.groupby([company, authority])["UPRN"].count()
    counts = per_authority.reset_index(name="number_of_properties")
    counts = counts.sort_values("number_of_properties", ascending=False)
    # One row per company, one column per local authority; 0 where a company owns nothing
    pivot_counts = counts.pivot_table(
        index=[company],
        columns=authority,
        values="number_of_properties",
        fill_value=0
    ).reset_index()
    total_counts = df.groupby([company])["UPRN"].count().reset_index(name=total)
    # The same company registration number can come with several proprietor names, so the first name found
    # is used for each registration number
    best_names = df.groupby([company])["Proprietor Name (1)"].first().reset_index()
    total_counts = best_names.merge(total_counts, how="left", on=[company])
    pivot_counts = pivot_counts.merge(total_counts, how="left", on=[company])
    pivot_counts = pivot_counts.sort_values(total, ascending=False)
    pivot_counts = pivot_counts[pivot_counts[total] > 1]
    pivot_counts["approx_value"] = self.average_property_value * pivot_counts[total]
    pivot_counts["cumulative_value"] = pivot_counts["approx_value"].cumsum()
    return pivot_counts
def create_final_matches(self):
    """
    Given the matching to this point, this method creates the final matching tables.

    Steps:
      1. Drop matched addresses flagged as "sold_recently" or "sale_lodged_recently".
      2. Fetch spatial data for the remaining UPRNs and drop any property flagged via "conservation_status",
         "is_listed_building" or "is_heritage_building". Properties with no spatial information are kept.
      3. Restrict the combined matching lookup to the surviving UPRNs and aggregate it per owner.
      4. Keep owners whose "cumulative_value" is within self.portfolio_value, plus their properties and EPC rows.
      5. Run consistency checks and, if self.save is set, write the three tables to s3.

    Sets self.portfolio_owners, self.portfolio_properties and self.portfolio_epc_data.

    :raises ValueError: if the owners' total property count differs from the number of unique portfolio UPRNs,
                        or if the portfolio properties and EPC data cover a different number of unique UPRNs
    :return: None
    """
    logger.info("Creating final matches")

    matched_addresses_final = self.matched_addresses[
        ~self.matched_addresses["sold_recently"] &
        ~self.matched_addresses["sale_lodged_recently"]
    ].copy()

    logger.info("Performing conservation area and listed/herigage building filtering")
    # NOTE(review): the bucket name is hard-coded here rather than taken from self.bucket - confirm intended
    portfolio_spatial_data = OpenUprnClient.get_spatial_data(
        matched_addresses_final["UPRN"].unique().tolist(), bucket_name="retrofit-data-dev"
    )

    spatial_flags = ["conservation_status", "is_listed_building", "is_heritage_building"]
    portfolio_spatial_data = portfolio_spatial_data[["UPRN"] + spatial_flags].copy()
    # The merge below assumes matched_addresses_final["UPRN"] holds strings too - TODO confirm
    portfolio_spatial_data["UPRN"] = portfolio_spatial_data["UPRN"].astype(str)

    # Filter matched_addresses_final and filter combined_matching_lookup_final
    matched_addresses_final = matched_addresses_final.merge(
        portfolio_spatial_data, how="left", on="UPRN"
    )

    # Keep a property only when every flag is either missing or False. The left merge fills rows that have no
    # spatial record with NaN, which isin([None, ...]) does not reliably match, so missing values are tested
    # with isna() (covers both None and NaN) instead.
    unrestricted = pd.Series(True, index=matched_addresses_final.index)
    for flag in spatial_flags:
        flag_values = matched_addresses_final[flag]
        unrestricted &= flag_values.isna() | flag_values.isin([False])
    matched_addresses_final = matched_addresses_final[unrestricted]

    # Filter combined_matching_lookup accordingly
    combined_matching_lookup_final = self.combined_matching_lookup[
        self.combined_matching_lookup["UPRN"].isin(matched_addresses_final["UPRN"])
    ]

    # Roll up portfolio
    combined_aggregate = self.aggregate_matches(
        matching_lookup=combined_matching_lookup_final,
        company_ownership=self.ownership_data,
        properties=self.epc_data
    )

    # Owners are taken in aggregate order until the running value reaches the portfolio value
    self.portfolio_owners = combined_aggregate[combined_aggregate["cumulative_value"] <= self.portfolio_value]

    self.portfolio_properties = matched_addresses_final[
        matched_addresses_final["Company Registration No. (1)"].isin(
            self.portfolio_owners["Company Registration No. (1)"]
        )
    ]

    # We perform some checks
    if self.portfolio_owners["total_number_of_properties"].sum() != self.portfolio_properties["UPRN"].nunique():
        raise ValueError("Portfolio owners and properties don't match")

    self.portfolio_epc_data = self.epc_data[self.epc_data["UPRN"].isin(self.portfolio_properties["UPRN"])]

    # Additional checks
    if self.portfolio_properties["UPRN"].nunique() != self.portfolio_epc_data["UPRN"].nunique():
        raise ValueError("Portfolio properties and epc data don't match")

    if self.save:
        logger.info("Storing final outpus")
        # Store data
        save_excel_to_s3(
            df=self.portfolio_owners,
            bucket_name=self.bucket,
            file_key=self.portfolio_owners_filepath,
        )

        save_excel_to_s3(
            df=self.portfolio_properties,
            bucket_name=self.bucket,
            file_key=self.portfolio_properties_filepath,
        )

        save_excel_to_s3(
            df=self.portfolio_epc_data,
            bucket_name=self.bucket,
            file_key=self.portfolio_epc_data_filepath,
        )
def get_asset_list(self):
    """
    Build the asset list from the portfolio EPC data.

    :return: a new DataFrame holding the UPRN, first address line and postcode of every portfolio EPC record,
             under the column names "uprn", "address" and "postcode"
    """
    # Source EPC column -> asset list column; the key order also fixes the output column order
    renamed_columns = {
        "UPRN": "uprn",
        "ADDRESS1": "address",
        "POSTCODE": "postcode",
    }

    epc_subset = self.portfolio_epc_data[list(renamed_columns)].copy()
    return epc_subset.rename(columns=renamed_columns)
def create_final_outputs(self, portfolio_timestamp, storage_date, exclusion_uprns=None):
    """
    Given the completed outputs of the matching process, this function creates the final outputs, after matching
    valuation data, and creates a "working" directory, which is our current view of the sfr portfolio. This means
    that we can iterate on the portfolio without affecting the final outputs, and then once we're happy with the
    new version, we can commit those files to the "working" directory. This information shouldn't update very
    often and so we're ok to store this at a daily level

    :param portfolio_timestamp: name of the folder under ownership/<project_name>/ that holds the
                                portfolio_owners, portfolio_epc_data and portfolio_properties workbooks to load
    :param storage_date: name of the folder under ownership/<project_name>/current/ the outputs are written to
    :param exclusion_uprns: optional collection of UPRNs to cut from the portfolio. Cutting is not implemented
                            yet: if any of these UPRNs is present in the portfolio properties, the method raises
    :raises ValueError: if the loaded owners, properties and EPC tables are inconsistent in size
    :raises NotImplementedError: if any UPRN in exclusion_uprns is found in the portfolio properties
    :return: None
    """
    exclusion_uprns = [] if exclusion_uprns is None else exclusion_uprns

    project_prefix = f"ownership/{self.project_name}"
    input_prefix = f"{project_prefix}/{portfolio_timestamp}"
    output_prefix = f"{project_prefix}/current/{storage_date}"

    # Step 1: Read in the valuations data
    valuations = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"{project_prefix}/sfr property valuations.xlsx",
        header_row=0
    )

    # Load in the portfolio data
    # 1) owners
    portfolio_owners = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"{input_prefix}/portfolio_owners.xlsx",
        header_row=0
    )
    # 2) EPC
    portfolio_epc_data = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"{input_prefix}/portfolio_epc_data.xlsx",
        header_row=0
    )
    # 3) properties
    portfolio_properties = read_excel_from_s3(
        bucket_name=self.bucket,
        file_key=f"{input_prefix}/portfolio_properties.xlsx",
        header_row=0
    )

    # Check they're the right size
    if portfolio_owners["total_number_of_properties"].sum() != portfolio_properties["UPRN"].nunique():
        raise ValueError("Portfolio owners and properties don't match")

    if portfolio_properties["UPRN"].nunique() != portfolio_epc_data["UPRN"].nunique():
        raise ValueError("Portfolio properties and epc data don't match")

    # We make some final cuts based on UPRNs that at a later stage are found to be odd
    if portfolio_properties["UPRN"].isin(exclusion_uprns).any():
        # TODO: implement the exclusion. Sketch of the intended approach, kept for reference:
        # Identify who the owners are for these uprns
        # owners = portfolio_properties[portfolio_properties["UPRN"].isin(exclusion_uprns)].groupby(
        #     "Company Registration No. (1)"
        # )["UPRN"].nunique().reset_index().rename(
        #     columns={"UPRN": "number_of_properties_to_exclude"}
        # )
        #
        # min_owners_threshold = portfolio_owners["total_number_of_properties"].min()
        #
        # portfolio_owners = portfolio_owners.merge(
        #     owners, how="left", on="Company Registration No. (1)", suffixes=("", "_excluded")
        # )
        raise NotImplementedError("Implement me!")

    # Step 2: Merge in the valuations data
    # The valuations' own address/postcode columns are dropped before the merge, leaving UPRN as the join key
    portfolio_properties = portfolio_properties.merge(
        valuations.rename(columns={"uprn": "UPRN"}).drop(columns=['address', 'postcode']), how="left", on="UPRN"
    )

    # Step 3: Store the final outputs
    save_excel_to_s3(
        df=portfolio_owners,
        bucket_name=self.bucket,
        file_key=f"{output_prefix}/portfolio_owners.xlsx",
    )

    save_excel_to_s3(
        df=portfolio_properties,
        bucket_name=self.bucket,
        file_key=f"{output_prefix}/portfolio_properties.xlsx",
    )

    save_excel_to_s3(
        df=portfolio_epc_data,
        bucket_name=self.bucket,
        file_key=f"{output_prefix}/portfolio_epc_data.xlsx",
    )