import re
from datetime import datetime
from typing import List

from tqdm import tqdm
import pandas as pd
import Levenshtein

from utils.s3 import save_excel_to_s3, read_excel_from_s3
from utils.logger import setup_logger
from backend.SearchEpc import SearchEpc
from etl.spatial.OpenUprnClient import OpenUprnClient

logger = setup_logger()


class Ownership:
    """
    Matches EPC property records to corporate ownership titles (domestic and overseas), enriches the matches
    with land registry transactions and rolls the result up into a portfolio of owners and properties.
    """

    # These are a number of prefix phrases, found in the ownership data. If an address begins with any of these
    # terms, we remove it
    OWNERSHIP_STARTING_TERMS = [
        "land adjoining",
        "land on the",
        "land to the rear of",
        "land and buildings on the",
        "garage adjoining",
        "car park adjoining",
        "the land adjoining",
        "land and buildings adjoining",
        "all royal mines"
    ]

    # Anything that is sold within this many months is flagged to have sold recently and is then
    # considered to be dropped from matching
    SOLD_RECENTLY_MONTHS = 12

    # Anything that has been lodged for a marketed or unmarketed sale within this many months is
    # flagged as potentially in the process of being sold
    LODGED_RECENTLY_MONTHS = 12

    # These are the columns in the land registry data
    LAND_REGISTRY_COLUMNS = [
        "transaction_id",
        "price",
        "date_of_transfer",
        "postcode",
        "property_type",
        "old_new",
        "duration",
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "ppd_category_type",
        "record_status",
    ]

    def __init__(
        self,
        epc_paths: List[str],
        domestic_ownership_path: str,
        overseas_ownership_path: str,
        land_registry_path: str,
        project_name: str,
        bucket: str,
        average_property_value: float,
        portfolio_value: float,
        excluded_owners: List[str] = None,
        excluded_uprns: List[int] = None,
        save=True
    ):
        """
        :param epc_paths: A list of strings, which point to the location of the EPC data to be used. To date,
            this data has been held locally, and so will require extension to read from remote locations like s3
        :param domestic_ownership_path: A string which points to the location of the CCOD ownership data, that
            details corporate ownership of properties in the UK, where the companies are UK based
        :param overseas_ownership_path: A string which points to the location of the OCOD ownership data, that
            details corporate ownership of properties in the UK, where the companies are overseas
        :param land_registry_path: A string that points to the location of the land registry data
        :param project_name: A string that is used to identify the project
        :param bucket: The name of the s3 bucket where the data will be stored
        :param average_property_value: The average property value in the area
        :param portfolio_value: Upper bound on the cumulative approximate value of the owners that are kept in
            the final portfolio
        :param excluded_owners: Company registration numbers to remove from the ownership data
        :param excluded_uprns: UPRNs to remove from the EPC data
        :param save: When True, intermediate and final tables are written to s3
        :raises ValueError: If epc_paths is empty or any path does not end with certificates.csv
        """
        # All epc paths should end with certificates.csv
        if not epc_paths or not all(path.endswith("certificates.csv") for path in epc_paths):
            raise ValueError("epc_paths contains a path that does not end with certificates.csv")

        self.epc_paths = epc_paths
        self.domestic_ownership_path = domestic_ownership_path
        self.overseas_ownership_path = overseas_ownership_path
        self.land_registry_path = land_registry_path
        self.excluded_owners = [] if excluded_owners is None else excluded_owners
        self.excluded_uprns = [] if excluded_uprns is None else excluded_uprns
        self.run_timestamp = str(datetime.now())
        self.project_name = project_name
        self.bucket = bucket
        self.average_property_value = average_property_value
        self.portfolio_value = portfolio_value

        # Every artefact of a run is stored under the same project/timestamp prefix
        run_prefix = f"ownership/{self.project_name}/{self.run_timestamp}"

        # Data storage paths
        self.epc_data_filepath = f"{run_prefix}/epc_data.xlsx"
        self.filtered_land_registry_filepath = f"{run_prefix}/filtered_land_registry.xlsx"
        self.matched_addresses_pre_filter_filepath = f"{run_prefix}/matched_addresses_pre_filter.xlsx"
        self.combined_matching_lookup_pre_filter_filepath = f"{run_prefix}/combined_matching_lookup_pre_filter.xlsx"

        # Final output paths
        self.portfolio_owners_filepath = f"{run_prefix}/portfolio_owners.xlsx"
        self.portfolio_properties_filepath = f"{run_prefix}/portfolio_properties.xlsx"
        self.portfolio_epc_data_filepath = f"{run_prefix}/portfolio_epc_data.xlsx"

        self.save = save

        # Data
        self.epc_data = None
        self.ownership_data = None
        self.freehold_matching_lookup = None
        self.leasehold_matching_lookup = None
        self.shared_freehold_match = None
        self.shared_leasehold_match = None
        self.land_registry = None

        # Match tables
        self.combined_matching_lookup = None
        self.matched_addresses = None
        self.land_registry_matches = None

        # Final outputs data
        self.portfolio_owners = None
        self.portfolio_properties = None
        self.portfolio_epc_data = None

    def pipeline(self, column_filters=None):
        """
        Runs the full ownership process
        :param column_filters: Dictionary with column names as keys and list of acceptable values as values.
            This dictionary is used to filter the EPC data and should look like this:
            {"column_name": ["value1", "value2", ...]}, where column_name is the name of the column in the
            EPC data and ["value1", "value2", ...] is a list of acceptable values for that column. If a column
            is not found in the EPC data, an exception is raised.
        """
        # Step 1: Get EPC data
        self.source_epc_properties(column_filters=column_filters)

        # Step 2: Get company ownership data
        self.load_company_ownership()

        # Step 3: Prepare data for matching
        self.prepare_for_matching()

        # Step 4: Match EPC data to ownership data
        self.match()

        # Step 5: Match land registry data to existing matches
        self.match_with_land_registry()

        # We store this data in s3 before we perform any filtering
        if self.save:
            save_excel_to_s3(
                df=self.matched_addresses,
                bucket_name=self.bucket,
                file_key=self.matched_addresses_pre_filter_filepath
            )
            save_excel_to_s3(
                df=self.combined_matching_lookup,
                bucket_name=self.bucket,
                file_key=self.combined_matching_lookup_pre_filter_filepath
            )

        # Prepare the final outputs:
        self.create_final_matches()

    def source_epc_properties(self, column_filters=None, postcodes=None):
        """
        This function will filter the epc data as specified by column filters, searching across all of the EPC
        tables. The newest certificate per UPRN is kept within each file.
        :param column_filters: Dictionary with column names as keys and list of acceptable values as values.
            This dictionary is used to filter the EPC data and should look like this:
            {"column_name": ["value1", "value2", ...]}, where column_name is the name of the column in the
            EPC data and ["value1", "value2", ...] is a list of acceptable values for that column. If a column
            is not found in the EPC data, an exception is raised.
        :param postcodes: A list of postcodes to filter the data on (compared case-insensitively)
        :raises ValueError: If no EPC records remain after filtering
        """
        column_filters = {} if column_filters is None else column_filters
        # The EPC postcode is lower-cased before comparison, so the filter values are lower-cased as well
        postcode_filter = None if postcodes is None else {postcode.lower() for postcode in postcodes}

        data = []
        for path in tqdm(self.epc_paths):
            epc_data = pd.read_csv(path, low_memory=False)
            epc_data = epc_data[~pd.isnull(epc_data["UPRN"])].copy()
            epc_data["UPRN"] = epc_data["UPRN"].astype(int).astype(str)

            lodgement_datetime = pd.to_datetime(epc_data["LODGEMENT_DATETIME"], errors="coerce")
            if pd.isnull(lodgement_datetime).sum():
                raise Exception("Lodgement datetime contains invalid data")
            epc_data["LODGEMENT_DATETIME"] = lodgement_datetime

            # Keep the most recently lodged certificate per UPRN
            epc_data = epc_data.sort_values(["LODGEMENT_DATETIME"], ascending=False).drop_duplicates("UPRN")

            # Apply column filters
            for column, values in column_filters.items():
                if column in epc_data.columns:
                    epc_data = epc_data[epc_data[column].isin(values)]
                else:
                    raise Exception(f"Column {column} not found in data. column_filters is malformed")

            if postcode_filter is not None:
                epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcode_filter)]

            if epc_data.empty:
                continue

            data.append(epc_data)

        if not data:
            # pd.concat raises an opaque "No objects to concatenate" ValueError on an empty list
            raise ValueError("No EPC records remain after applying the filters")

        self.epc_data = pd.concat(data, ignore_index=True)

        if self.excluded_uprns:
            self.epc_data = self.epc_data[~self.epc_data["UPRN"].astype(float).isin(self.excluded_uprns)]

        if self.save:
            # We now store the data in s3
            save_excel_to_s3(
                df=self.epc_data,
                bucket_name=self.bucket,
                file_key=self.epc_data_filepath
            )

    def load_company_ownership(self):
        """
        This function reads in the domestic and overseas company ownership data, flags each record with
        is_overseas, keeps only the postcodes present in the EPC data and removes the excluded owners. The
        result is stored on self.ownership_data.
        :return:
        """
        logger.info("Reading in company ownership data")
        self.ownership_data = pd.read_csv(self.domestic_ownership_path)
        self.ownership_data["is_overseas"] = False

        overseas_company_ownership = pd.read_csv(self.overseas_ownership_path)
        overseas_company_ownership["is_overseas"] = True

        self.ownership_data = pd.concat([self.ownership_data, overseas_company_ownership])

        # Filter on relevant postcodes - this is done to reduce the large size of the ownership dataset
        logger.info("Filtering ownership data on EPC postcodes")
        self.ownership_data = self.ownership_data[
            self.ownership_data["Postcode"].str.lower().isin(self.epc_data["POSTCODE"].str.lower().unique())
        ]

        logger.info("Removing excluded owners")
        # Use the company registration number to filter out excluded owners
        self.ownership_data = self.ownership_data[
            ~self.ownership_data["Company Registration No. (1)"].astype(str).isin(self.excluded_owners)
        ]

    def prepare_for_matching(self):
        """
        Given the epc properties and the ownership data, this function performs a number of operations on both
        datasets to prepare them for matching
        """
        logger.info("Preparing data for matching")

        # Now we filter properties the other way around, since the ownership data might not have all of the
        # postcodes that appear in the EPC data
        self.epc_data = self.epc_data[
            self.epc_data["POSTCODE"].str.lower().isin(self.ownership_data["Postcode"].str.lower().unique())
        ]

        # We have some duplicates on UPRN
        # Take the newest record per UPRN
        self.epc_data = self.epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")

        # Remove entries where the address begins with the term "land adjoining", or other records that don't
        # reference the property itself
        for starting_term in self.OWNERSHIP_STARTING_TERMS:
            self.ownership_data = self.ownership_data[
                ~self.ownership_data["Property Address"].str.lower().str.startswith(starting_term)
            ]

    @staticmethod
    def extract_numeric_part(house_number: str) -> str:
        """
        Extracts only the numeric part from a house number that may contain letters.

        Parameters:
        - house_number (str): The house number string possibly containing letters.

        Returns:
        - str: The numeric part of the house number.
        """
        # Use regular expression to replace all non-digit characters with nothing
        return re.sub(r'\D', '', house_number)

    @staticmethod
    def remove_text_in_brackets(address: str) -> str:
        """
        Removes any text within parentheses, including the parentheses themselves.

        Parameters:
        - address (str): The address string to clean.

        Returns:
        - str: The cleaned address with text in parentheses removed.
        """
        # Regex to find and remove content in parentheses (and any whitespace directly before it)
        return re.sub(r'\s*\([^)]*\)', '', address)

    @staticmethod
    def extract_range_from_house_number(house_number_range: str):
        """
        Detects if the house number includes a numeric range (formatted as 'x-y') and extracts all values
        within this range. Non-numeric strings containing hyphens are ignored.

        Parameters:
        - house_number_range (str): The house number string that might contain a range.

        Returns:
        - list of str: A list of all numbers within the range if it is a range; otherwise, returns None.
        """
        if not house_number_range or '-' not in house_number_range:
            # Nothing to parse, or no hyphen present
            return None

        parts = house_number_range.split('-')
        if len(parts) != 2 or not (parts[0].isdigit() and parts[1].isdigit()):
            # Not a valid numeric range
            return None

        # Both parts are numeric, so it's a valid range (inclusive of both ends)
        start, end = map(int, parts)
        return [str(x) for x in range(start, end + 1)]

    @staticmethod
    def is_in_range(row, house_no):
        """
        Check if the house number is within the range provided in the row.

        :param row: The list produced by extract_range_from_house_number, or None when there is no range
        :param house_no: The house number to look for
        :return: True if house_no equals one of the values in row, otherwise False
        """
        if row and any(house_no == num for num in row):
            return True
        return False

    @staticmethod
    def levenstein_match(matching_string, df, address_col):
        """
        Returns the single row of df whose address_col value is closest (Levenshtein distance) to
        matching_string.

        :param matching_string: The string that every candidate is compared against
        :param df: Dataframe holding the candidate addresses
        :param address_col: Name of the column in df that holds the candidate addresses
        :return: A one-row dataframe holding the best match (the first one on ties)
        """
        match_to = df[address_col].tolist()

        # Strip out punctuation and spaces
        # NOTE(review): only the candidates are normalised, matching_string is compared as given - confirm
        # that this asymmetry is intended
        match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
        match_to = [x.replace(" ", "") for x in match_to]

        # Perform matching between full key and match_to
        distances = [Levenshtein.distance(matching_string, s) for s in match_to]
        best_match_index = distances.index(min(distances))

        # We might want to consider a threshold for the distance, however we don't consider this for the moment
        return df.iloc[best_match_index:best_match_index + 1]

    @classmethod
    def _drop_weaker_duplicates(
        cls, matching_lookup, properties, company_ownership, duplicated_col, reference_col, candidate_col
    ):
        """
        Shared implementation of the duplicate removal: for every value of duplicated_col that appears more
        than once in matching_lookup, only the row whose candidate_col address is closest to the reference_col
        address is kept.

        :param duplicated_col: "Title Number" or "UPRN", the key on which duplicates are resolved
        :param reference_col: Address column that is constant within a duplicated group
        :param candidate_col: Address column that differs within a duplicated group
        :return: matching_lookup without the weaker duplicate rows
        """
        # The column that identifies the competing rows within a duplicated group
        other_col = "UPRN" if duplicated_col == "Title Number" else "Title Number"

        duplicated_keys = matching_lookup[matching_lookup[duplicated_col].duplicated()][duplicated_col].unique()

        to_drop = []
        for duplicated_key in duplicated_keys:
            dupe_data = matching_lookup[matching_lookup[duplicated_col] == duplicated_key].copy()
            matched_addresses = dupe_data.merge(
                properties[["UPRN", "ADDRESS"]].rename(columns={"ADDRESS": "epc_address"}),
                how="left",
                on="UPRN"
            ).merge(
                company_ownership[["Title Number", "Property Address"]],
                how="left",
                on="Title Number"
            )

            # We perform levenstein to get the best match
            best_match = cls.levenstein_match(
                matching_string=matched_addresses[reference_col].values[0],
                df=matched_addresses,
                address_col=candidate_col
            )
            matches_to_drop = matched_addresses[~matched_addresses[other_col].isin(best_match[other_col].values)]
            to_drop.append(matches_to_drop[["UPRN", "Title Number"]].copy())

        # pd.concat raises on an empty list, so the "no duplicates" case is handled explicitly
        to_drop = pd.concat(to_drop) if to_drop else pd.DataFrame()
        if to_drop.empty:
            return matching_lookup

        # Anti-join: keep the lookup rows that have no partner in to_drop
        merged = pd.merge(matching_lookup, to_drop, on=['UPRN', 'Title Number'], how='left', indicator=True)
        return merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])

    @classmethod
    def remove_duplicate_matches(cls, matching_lookup, properties, company_ownership):
        """
        Resolves titles that were matched to more than one UPRN. The title's Property Address is compared with
        the EPC address of every competing UPRN and only the closest UPRN is kept.

        :param matching_lookup: Dataframe with at least the columns UPRN and Title Number
        :param properties: EPC data, with the columns UPRN and ADDRESS
        :param company_ownership: Ownership data, with the columns Title Number and Property Address
        :return: matching_lookup with at most one UPRN per title
        """
        return cls._drop_weaker_duplicates(
            matching_lookup=matching_lookup,
            properties=properties,
            company_ownership=company_ownership,
            duplicated_col="Title Number",
            reference_col="Property Address",
            candidate_col="epc_address"
        )

    @classmethod
    def remove_duplicate_uprn_matches(cls, matching_lookup, properties, company_ownership):
        """
        Resolves UPRNs that were matched to more than one title. The EPC address of the UPRN is compared with
        the Property Address of every competing title and only the closest title is kept.

        Within a duplicated UPRN the EPC address is identical on every row, so it has to be the reference and
        the Property Address the candidate - comparing the other way around gives every row the same distance.

        :param matching_lookup: Dataframe with at least the columns UPRN and Title Number
        :param properties: EPC data, with the columns UPRN and ADDRESS
        :param company_ownership: Ownership data, with the columns Title Number and Property Address
        :return: matching_lookup with at most one title per UPRN
        """
        return cls._drop_weaker_duplicates(
            matching_lookup=matching_lookup,
            properties=properties,
            company_ownership=company_ownership,
            duplicated_col="UPRN",
            reference_col="epc_address",
            candidate_col="Property Address"
        )

    @staticmethod
    def is_substring(x, match_string):
        """
        Returns True when x is contained in the lower-cased match_string. A null x never matches.
        """
        if pd.isnull(x):
            return False
        return x in match_string.lower()

    @staticmethod
    def house_number_match(paon, house_number):
        """
        Compares two house numbers, numerically when both can be converted to an integer and by plain equality
        otherwise.
        """
        # Firstly try and convert to numeric
        try:
            return int(paon) == int(house_number)
        except (TypeError, ValueError, OverflowError):
            # If we can't convert both to numeric, we do an equality
            return paon == house_number

    @staticmethod
    def check_equalities(lr_filtered):
        """
        Checks whether every row of lr_filtered shares the paon, saon and street of the first row. A null saon
        on the first row requires every saon to be null.

        :return: Tuple (all_paon_equal, all_saon_equal, all_street_equal)
        """
        all_paon_equal = all(lr_filtered["paon"] == lr_filtered["paon"].values[0])

        if pd.isnull(lr_filtered["saon"].values[0]):
            all_saon_equal = all(pd.isnull(lr_filtered["saon"]))
        else:
            all_saon_equal = all(lr_filtered["saon"] == lr_filtered["saon"].values[0])

        all_street_equal = all(lr_filtered["street"] == lr_filtered["street"].values[0])

        return all_paon_equal, all_saon_equal, all_street_equal

    def match(self):
        """
        Matches every EPC property to the ownership titles in the same postcode, on house number or, for named
        properties, on the full address. Populates the freehold/leasehold lookups, the shared freehold/leasehold
        lists, combined_matching_lookup and matched_addresses.

        :raises ValueError: If the EPC or ownership data has not been loaded
        """
        if (self.epc_data is None) or (self.ownership_data is None):
            raise ValueError("epc_data and ownership_data should not be null")

        logger.info("Matching EPC data to ownership data")

        freehold_matching_lookup = []
        leasehold_matching_lookup = []
        shared_leasehold_match = []
        shared_freehold_match = []

        # Lower-case the ownership postcodes once rather than once per EPC record
        ownership_postcodes = self.ownership_data["Postcode"].str.lower()

        for _, address in tqdm(self.epc_data.iterrows(), total=len(self.epc_data)):
            match_type = "exact"

            filtered = self.ownership_data[ownership_postcodes == address["POSTCODE"].lower()].copy()

            # Remove postcode and remove trailing commas
            filtered["house_number"] = (
                filtered["Property Address"]
                .apply(self.remove_text_in_brackets)
                .apply(SearchEpc.get_house_number)
                .str.lower()
                .str.replace(",", "")
            )

            house_no = SearchEpc.get_house_number(address["ADDRESS1"])
            if house_no is not None:
                house_no = house_no.replace(",", "")

            if house_no is None:
                # If the house number is missing, it means that we usually have a named property so we look
                # for an exact match on that name. The address is a literal, not a pattern, so regex is off
                filtered = filtered[
                    filtered["Property Address"].str.lower().str.contains(
                        address["ADDRESS"].lower(), regex=False, na=False
                    )
                ]
                if filtered.shape[0] != 1:
                    continue
            else:
                if house_no not in filtered["house_number"].values:
                    # If this happens, we check house_number for a x-y range of addresses
                    filtered["house_number_range"] = filtered["house_number"].apply(
                        self.extract_range_from_house_number
                    )
                    # If we have found a house number range, we check if the house number is in the range and
                    # if not, we drop the row
                    filtered['is_in_range'] = filtered['house_number_range'].apply(
                        lambda x: self.is_in_range(x, house_no)
                    )

                    if filtered['is_in_range'].any():
                        # If house_no is found in any range, keep only rows where it is in range
                        # NOTE(review): these rows never survive the house_number equality filter below, so a
                        # range hit currently ends in "no match" - confirm whether that is intended
                        filtered = filtered[filtered['is_in_range']]
                    else:
                        # If house_no is not found in any range, filter out rows where 'house_number_range' is
                        # not None
                        filtered = filtered[filtered['house_number_range'].isnull()]

                    # Strip out letters from house_no and house_number
                    house_no = self.extract_numeric_part(house_no)
                    filtered["house_number"] = filtered["house_number"].astype(str).apply(self.extract_numeric_part)
                    match_type = "approximate"

                filtered = filtered[filtered["house_number"] == house_no]

            if filtered.empty:
                continue

            filtered_freehold = filtered[filtered["Tenure"] == "Freehold"]
            filtered_leasehold = filtered[filtered["Tenure"] == "Leasehold"]

            if filtered_freehold.shape[0] > 1:
                # More than one freehold title for this property: record all of the freehold titles
                matched = filtered_freehold[["Title Number"]].copy()
                matched.insert(0, "UPRN", address["UPRN"])
                shared_freehold_match.append(matched)
            elif not filtered_freehold.empty:
                freehold_matching_lookup.append(
                    {
                        "UPRN": address["UPRN"],
                        "Title Number": filtered_freehold["Title Number"].values[0],
                        "match_type": match_type,
                    }
                )

            if filtered_leasehold.shape[0] > 1:
                matched = filtered_leasehold[["Title Number"]].copy()
                matched.insert(0, "UPRN", address["UPRN"])
                shared_leasehold_match.append(matched)
            elif not filtered_leasehold.empty:
                leasehold_matching_lookup.append(
                    {
                        "UPRN": address["UPRN"],
                        "Title Number": filtered_leasehold["Title Number"].values[0],
                        "match_type": match_type,
                    }
                )

        logger.info("Matching complete - creating lookup tables")
        # The columns are given explicitly so that an empty result still has the columns used below
        lookup_columns = ["UPRN", "Title Number", "match_type"]
        self.freehold_matching_lookup = pd.DataFrame(freehold_matching_lookup, columns=lookup_columns)
        self.leasehold_matching_lookup = pd.DataFrame(leasehold_matching_lookup, columns=lookup_columns)

        # Only exact matches are carried forward
        self.freehold_matching_lookup = self.freehold_matching_lookup[
            self.freehold_matching_lookup["match_type"] == "exact"
        ]
        self.leasehold_matching_lookup = self.leasehold_matching_lookup[
            self.leasehold_matching_lookup["match_type"] == "exact"
        ]

        self.shared_leasehold_match = shared_leasehold_match
        self.shared_freehold_match = shared_freehold_match

        # Finally, we create matched addresses
        self.combined_matching_lookup = pd.concat([self.freehold_matching_lookup, self.leasehold_matching_lookup])

        # Remove duplicates
        self.combined_matching_lookup = self.remove_duplicate_matches(
            matching_lookup=self.combined_matching_lookup,
            properties=self.epc_data,
            company_ownership=self.ownership_data
        )

        # We also have duplicates at a UPRN level
        self.combined_matching_lookup = self.remove_duplicate_uprn_matches(
            matching_lookup=self.combined_matching_lookup,
            properties=self.epc_data,
            company_ownership=self.ownership_data
        )

        self.matched_addresses = self.combined_matching_lookup.merge(
            self.epc_data[
                [
                    "UPRN",
                    "ADDRESS",
                    "ADDRESS1",
                    "CURRENT_ENERGY_EFFICIENCY",
                    "CURRENT_ENERGY_RATING",
                    "POSTCODE",
                    "LODGEMENT_DATE",
                    "TRANSACTION_TYPE",
                    "TENURE",
                ]
            ].rename(
                columns={
                    "ADDRESS": "epc_address",
                    "ADDRESS1": "epc_address1",
                    "POSTCODE": "epc_postcode"
                }
            ),
            how="left",
            on="UPRN"
        ).merge(
            self.ownership_data[
                [
                    "Title Number",
                    "Property Address",
                    "Postcode",
                    "Company Registration No. (1)",
                    "Proprietor Name (1)",
                    "Date Proprietor Added",
                ]
            ],
            how="left",
            on="Title Number"
        )

        # Let's try and get the house number
        self.matched_addresses["house_number"] = (
            self.matched_addresses["epc_address"]
            .apply(self.remove_text_in_brackets)
            .apply(SearchEpc.get_house_number)
            .str.lower()
            .str.replace(",", "")
        )

        logger.info("Successfully completed matching")

    def get_land_registry(self):
        """
        This function reads in the land registry data and filters it on the postcodes found in the EPC data,
        keeping the transfers of the last 5 years
        :return: The filtered land registry dataframe
        """
        land_registry = pd.read_csv(self.land_registry_path, header=None)
        land_registry.columns = self.LAND_REGISTRY_COLUMNS

        land_registry = land_registry[
            land_registry["postcode"].str.lower().isin(self.epc_data["POSTCODE"].str.lower().unique())
        ].copy()

        land_registry["date_of_transfer"] = pd.to_datetime(
            land_registry["date_of_transfer"], format="%Y-%m-%d", errors="coerce"
        )

        # Take data from the last 5 years
        land_registry = land_registry[
            (land_registry["date_of_transfer"] >= datetime.now() - pd.DateOffset(years=5))
        ]

        return land_registry

    @staticmethod
    def _land_registry_record(match, lr_filtered):
        """Builds the land registry match record for match from the first row of lr_filtered."""
        return {
            "uprn": match["UPRN"],
            "transaction_id": lr_filtered['transaction_id'].values[0],
            "price": lr_filtered["price"].values[0],
            "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
        }

    @staticmethod
    def _newest_transfer(lr_filtered):
        """Returns the most recent transfer of lr_filtered as a one-row dataframe."""
        return lr_filtered.sort_values("date_of_transfer", ascending=False).head(1)

    def match_with_land_registry(self):
        """
        This function matches the land registry data to the existing matches, merges the matched transaction
        onto matched_addresses and adds the sold_recently and sale_lodged_recently flags
        :raises ValueError: If match() has not been run
        :raises NotImplementedError: If the candidate transactions cannot be narrowed down to one property
        :return:
        """
        # TODO: Refactor this entire function
        if self.matched_addresses is None:
            raise ValueError("Run match() first!")

        logger.info("Reading land registry data")
        self.land_registry = self.get_land_registry()

        if self.save:
            # Store this filtered version in s3
            save_excel_to_s3(
                df=self.land_registry,
                bucket_name=self.bucket,
                file_key=self.filtered_land_registry_filepath,
            )

        for col in ["postcode", "street", "paon", "saon"]:
            self.land_registry[col] = self.land_registry[col].str.lower().str.strip()

        self.land_registry["date_of_transfer"] = pd.to_datetime(self.land_registry["date_of_transfer"])

        logger.info("Performing land registry matching")
        land_registry_matches = []
        for _, match in tqdm(self.matched_addresses.iterrows(), total=len(self.matched_addresses)):
            # Filter land registry on the postcode
            lr_filtered = self.land_registry[
                (self.land_registry["postcode"] == match["epc_postcode"].lower().strip())
            ].copy()

            if lr_filtered.empty:
                continue

            # Filter further, when the street is in the address
            # street should be contained in epc_address
            lr_filtered = lr_filtered[
                lr_filtered["street"].apply(lambda x: self.is_substring(x, match["epc_address"].lower()))
                | lr_filtered["street"].apply(lambda x: self.is_substring(x, match["Property Address"].lower()))
            ]

            if lr_filtered.empty:
                continue

            # We now check if paon is in address 1
            lr_filtered["paon_match"] = lr_filtered["paon"].apply(
                lambda x: self.house_number_match(x, match["house_number"])
            )

            # We also try the secondary match
            lr_filtered["saon_match"] = (
                lr_filtered["saon"].apply(
                    lambda x: False if pd.isnull(x) else self.is_substring(x, match["epc_address1"])
                )
            )

            # We filter where we have a primary or secondary match
            lr_filtered = lr_filtered[lr_filtered["paon_match"] | lr_filtered["saon_match"]]

            if lr_filtered.empty:
                continue
            elif lr_filtered.shape[0] == 1:
                land_registry_matches.append(self._land_registry_record(match, lr_filtered))
                continue
            elif lr_filtered.shape[0] > 1:
                # We make sure all records are the same and take the newest
                all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)
                has_paon_match = any(lr_filtered["paon_match"])

                if all_paon_equal and all_street_equal and all_saon_equal:
                    # Take the newest record, append and continue
                    lr_filtered = self._newest_transfer(lr_filtered)
                    land_registry_matches.append(self._land_registry_record(match, lr_filtered))
                    continue
                elif has_paon_match and all_street_equal:
                    # Perform filter on paon
                    lr_filtered = lr_filtered[lr_filtered["paon_match"]]
                    # Do an additional equality check
                    all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)
                    if all_paon_equal and all_street_equal and all_saon_equal:
                        lr_filtered = self._newest_transfer(lr_filtered)
                        land_registry_matches.append(self._land_registry_record(match, lr_filtered))
                    else:
                        # We do a match on saon
                        lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
                            lambda x: False if pd.isnull(x) else self.is_substring(x, match["epc_address"])
                        )
                        lr_filtered = lr_filtered[lr_filtered["saon_match2"]]
                        if lr_filtered.empty:
                            continue
                        elif lr_filtered.shape[0] == 1:
                            land_registry_matches.append(self._land_registry_record(match, lr_filtered))
                            continue
                        else:
                            raise NotImplementedError("wtf")
                else:
                    # We have a final check, based on an observed case
                    lr_address_1 = " ".join([x.lower().strip() for x in match["Property Address"].split(",")[0:2]])
                    lr_filtered["paon_match2"] = lr_filtered["paon"].apply(
                        lambda x: False if pd.isnull(x) else self.is_substring(x, lr_address_1)
                    )
                    lr_filtered = lr_filtered[lr_filtered["paon_match2"]]
                    if lr_filtered.empty:
                        continue
                    elif lr_filtered.shape[0] == 1:
                        land_registry_matches.append(self._land_registry_record(match, lr_filtered))
                        continue
                    else:
                        # Check all the same
                        all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)

                        # Check saon is house number with exact match
                        lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
                            lambda x: False if pd.isnull(x) else self.house_number_match(x, match["house_number"])
                        )

                        # We check if we have a flat
                        match_flat_number = re.match(r"flat (\d+)", match["epc_address1"].lower())
                        match_apartment_number = re.match(r"apartment (\d+)", match["epc_address1"].lower())
                        lr_filtered["saon_match3"] = False
                        if match_flat_number is not None:
                            # Get out the match
                            match_flat_number = "flat " + match_flat_number.group(1)
                            lr_filtered["saon_match3"] = lr_filtered["saon"].apply(
                                lambda x: False if pd.isnull(x) else x == match_flat_number
                            )

                        if match_apartment_number is not None:
                            # Get out the match
                            match_apartment_number = "apartment " + match_apartment_number.group(1)
                            lr_filtered["saon_match3"] = lr_filtered["saon"].apply(
                                lambda x: False if pd.isnull(x) else x == match_apartment_number
                            )

                        if all_paon_equal and all_saon_equal and all_street_equal:
                            # Take the newest record
                            lr_filtered = self._newest_transfer(lr_filtered)
                            land_registry_matches.append(self._land_registry_record(match, lr_filtered))
                            continue
                        elif any(lr_filtered["saon_match2"]):
                            lr_filtered = lr_filtered[lr_filtered["saon_match2"]]
                            all_paon_equal, all_saon_equal, all_street_equal = self.check_equalities(lr_filtered)
                            if all_paon_equal and all_saon_equal and all_street_equal:
                                # Filter on the newest record
                                lr_filtered = self._newest_transfer(lr_filtered)

                            if lr_filtered.shape[0] == 1:
                                land_registry_matches.append(self._land_registry_record(match, lr_filtered))
                                continue
                        elif any(lr_filtered["saon_match3"]):
                            lr_filtered = lr_filtered[lr_filtered["saon_match3"]]
                            if lr_filtered.shape[0] == 1:
                                land_registry_matches.append(self._land_registry_record(match, lr_filtered))
                                continue

                        # None of the checks above narrowed the candidates down to a single property
                        raise NotImplementedError("wtf")
            else:
                raise NotImplementedError("What happened here?")

        # The columns are given explicitly so that the merge below also works when nothing was matched
        self.land_registry_matches = pd.DataFrame(
            land_registry_matches, columns=["uprn", "transaction_id", "price", "date_of_transfer"]
        )

        logger.info("Sucessfully completed land registry matching - merging onto matched_addresses")

        # Merge onto the EPC - ownership matches
        self.matched_addresses = self.matched_addresses.merge(
            self.land_registry_matches, how="left", left_on="UPRN", right_on="uprn"
        ).drop(columns=["uprn"])

        # Guarantees a datetime column for the comparison below, also when there were no matches at all
        self.matched_addresses["date_of_transfer"] = pd.to_datetime(self.matched_addresses["date_of_transfer"])

        # Flag anything that sold in the last year. The plural "months" subtracts a number of months, the
        # singular "month" would instead replace the month of the timestamp
        self.matched_addresses["sold_recently"] = (
            self.matched_addresses["date_of_transfer"]
            >= pd.Timestamp.now() - pd.DateOffset(months=self.SOLD_RECENTLY_MONTHS)
        )

        self.matched_addresses["sale_lodged_recently"] = (
            (
                pd.to_datetime(self.matched_addresses["LODGEMENT_DATE"])
                >= pd.Timestamp.now() - pd.DateOffset(months=self.LODGED_RECENTLY_MONTHS)
            )
            & (self.matched_addresses["TRANSACTION_TYPE"].isin(["marketed sale", "non marketed sale"]))
        )

    def aggregate_matches(self, matching_lookup, company_ownership, properties):
        """
        Rolls the matches up to one row per company registration number, with the number of matched properties
        per local authority, the total number of properties, an approximate value and the cumulative value.
        Companies with a single property are dropped.

        :param matching_lookup: Dataframe with the columns UPRN and Title Number
        :param company_ownership: Ownership data, joined on Title Number
        :param properties: EPC data, with the columns UPRN and LOCAL_AUTHORITY_LABEL
        :return: The aggregated dataframe, sorted by total_number_of_properties descending
        """
        df = matching_lookup.merge(
            company_ownership, how="left", on="Title Number"
        ).merge(
            properties[["UPRN", "LOCAL_AUTHORITY_LABEL"]], how="left", on="UPRN"
        )

        counts = (
            df.groupby(["Company Registration No. (1)", "LOCAL_AUTHORITY_LABEL"])["UPRN"]
            .count()
            .reset_index(name="number_of_properties")
        )
        counts = counts.sort_values("number_of_properties", ascending=False)

        pivot_counts = counts.pivot_table(
            index=["Company Registration No. (1)"],  # Rows: companies
            columns="LOCAL_AUTHORITY_LABEL",  # Columns: each local authority
            values="number_of_properties",  # The counts of properties
            fill_value=0  # Fill missing values with 0 (where there are no properties owned)
        ).reset_index()

        total_counts = (
            df.groupby(["Company Registration No. (1)"])["UPRN"]
            .count()
            .reset_index(name="total_number_of_properties")
        )

        # The same company registration number may appear under more than one proprietor name, so we take one
        # name (the first) per company registration number
        best_names = (
            df.groupby(["Company Registration No. (1)"])["Proprietor Name (1)"]
            .first()
            .reset_index()
        )

        total_counts = best_names.merge(total_counts, how="left", on=["Company Registration No. (1)"])

        pivot_counts = pivot_counts.merge(total_counts, how="left", on=["Company Registration No. (1)"])
        pivot_counts = pivot_counts.sort_values("total_number_of_properties", ascending=False)
        pivot_counts = pivot_counts[pivot_counts["total_number_of_properties"] > 1].copy()

        pivot_counts["approx_value"] = self.average_property_value * pivot_counts["total_number_of_properties"]
        pivot_counts["cumulative_value"] = pivot_counts["approx_value"].cumsum()

        return pivot_counts

    @staticmethod
    def _not_flagged(flags):
        """Returns a mask that is True where the flag is missing (None/NaN) or False."""
        return flags.isnull() | flags.isin([False])

    def create_final_matches(self):
        """
        Given the matching to this point, this method creates the final matching tables: properties that sold
        or were lodged for sale recently, and properties flagged by the spatial data, are removed, after which
        the owners are rolled up and cut off at portfolio_value.
        :raises ValueError: If the owners, properties and EPC tables of the portfolio are inconsistent
        :return:
        """
        logger.info("Creating final matches")
        matched_addresses_final = self.matched_addresses[
            ~self.matched_addresses["sold_recently"] & ~self.matched_addresses["sale_lodged_recently"]
        ].copy()

        logger.info("Performing conservation area and listed/heritage building filtering")
        # NOTE(review): the bucket is hard-coded here rather than taken from self.bucket - confirm intended
        portfolio_spatial_data = OpenUprnClient.get_spatial_data(
            matched_addresses_final["UPRN"].unique().tolist(), bucket_name="retrofit-data-dev"
        )
        portfolio_spatial_data = portfolio_spatial_data[
            ["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]
        ].copy()
        portfolio_spatial_data["UPRN"] = portfolio_spatial_data["UPRN"].astype(str)

        # Filter matched_addresses_final and filter combined_matching_lookup_final
        matched_addresses_final = matched_addresses_final.merge(portfolio_spatial_data, how="left", on="UPRN")

        # Properties without spatial data come out of the left merge as missing values and are kept
        matched_addresses_final = matched_addresses_final[
            self._not_flagged(matched_addresses_final["conservation_status"])
            & self._not_flagged(matched_addresses_final["is_listed_building"])
            & self._not_flagged(matched_addresses_final["is_heritage_building"])
        ]

        # Filter combined_matching_lookup accordingly
        combined_matching_lookup_final = self.combined_matching_lookup[
            self.combined_matching_lookup["UPRN"].isin(matched_addresses_final["UPRN"])
        ]

        # Roll up portfolio
        combined_aggregate = self.aggregate_matches(
            matching_lookup=combined_matching_lookup_final,
            company_ownership=self.ownership_data,
            properties=self.epc_data
        )

        self.portfolio_owners = combined_aggregate[combined_aggregate["cumulative_value"] <= self.portfolio_value]
        self.portfolio_properties = matched_addresses_final[
            matched_addresses_final["Company Registration No. (1)"].isin(
                self.portfolio_owners["Company Registration No. (1)"]
            )
        ]

        # We perform some checks
        if self.portfolio_owners["total_number_of_properties"].sum() != self.portfolio_properties["UPRN"].nunique():
            raise ValueError("Portfolio owners and properties don't match")

        self.portfolio_epc_data = self.epc_data[self.epc_data["UPRN"].isin(self.portfolio_properties["UPRN"])]

        # Additional checks
        if self.portfolio_properties["UPRN"].nunique() != self.portfolio_epc_data["UPRN"].nunique():
            raise ValueError("Portfolio properties and epc data don't match")

        if self.save:
            logger.info("Storing final outpus")
            # Store data
            save_excel_to_s3(
                df=self.portfolio_owners,
                bucket_name=self.bucket,
                file_key=self.portfolio_owners_filepath,
            )
            save_excel_to_s3(
                df=self.portfolio_properties,
                bucket_name=self.bucket,
                file_key=self.portfolio_properties_filepath,
            )
            save_excel_to_s3(
                df=self.portfolio_epc_data,
                bucket_name=self.bucket,
                file_key=self.portfolio_epc_data_filepath,
            )

    def get_asset_list(self):
        """
        From the EPC data, creates the asset list
        :return: Dataframe with the columns uprn, address and postcode
        """
        asset_list = self.portfolio_epc_data[["UPRN", "ADDRESS1", "POSTCODE"]].copy().rename(
            columns={
                "UPRN": "uprn",
                "ADDRESS1": "address",
                "POSTCODE": "postcode"
            }
        )
        return asset_list

    def create_final_outputs(self, portfolio_timestamp, storage_date, exclusion_uprns=None):
        """
        Given the completed outputs of the matching process, this function creates the final outputs, after
        matching valuation data, and creates a "working" directory, which is our current view of the sfr
        portfolio. This means that we can iterate on the portfolio without affecting the final outputs, and
        then once we're happy with the new version, we can commit those files to the "working" directory.

        This information shouldn't update very often and so we're ok to store this at a daily level
        :param portfolio_timestamp: Run timestamp that identifies the stored portfolio files to read from s3
        :param storage_date: Not used in the part of the method shown here
        :param exclusion_uprns: UPRNs that should be cut from the stored portfolio
        :return:
        """
        exclusion_uprns = [] if exclusion_uprns is None else exclusion_uprns

        # Step 1: Read in the valuations data
        valuations = read_excel_from_s3(
            bucket_name=self.bucket,
            file_key=f"ownership/{self.project_name}/sfr property valuations.xlsx",
            header_row=0
        )

        # Load in the portfolio data
        # 1) owners
        portfolio_owners = read_excel_from_s3(
            bucket_name=self.bucket,
            file_key=f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_owners.xlsx",
            header_row=0
        )
        # 2) EPC
        portfolio_epc_data = read_excel_from_s3(
            bucket_name=self.bucket,
            file_key=f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_epc_data.xlsx",
            header_row=0
        )
        # 3) properties
        portfolio_properties = read_excel_from_s3(
            bucket_name=self.bucket,
            file_key=f"ownership/{self.project_name}/{portfolio_timestamp}/portfolio_properties.xlsx",
            header_row=0
        )

        # Check they're the right size
        if portfolio_owners["total_number_of_properties"].sum() != portfolio_properties["UPRN"].nunique():
            raise ValueError("Portfolio owners and properties don't match")

        if portfolio_properties["UPRN"].nunique() != portfolio_epc_data["UPRN"].nunique():
            raise ValueError("Portfolio properties and epc data don't match")

        # We make some final cuts based on UPRNs that at a later stage are found to be odd
        if portfolio_properties["UPRN"].isin(exclusion_uprns).sum():
            raise Exception("Implement me!")
            # Identify who the owners are for thes uprns
            # owners = portfolio_properties[portfolio_properties["UPRN"].isin(exclusion_uprns)].groupby(
            #     "Company Registration No. (1)"
            # )["UPRN"].nunique().reset_index().rename(
            #     columns={"UPRN": "number_of_properties_to_exclude"}
            # )
            #
            # min_owners_threshold = portfolio_owners["total_number_of_properties"].min()
            #
            # portfolio_owners = portfolio_owners.merge(
            #     owners, how="left", on="Company Registration No. 
(1)", suffixes=("", "_excluded") # ) # Step 2: Merge in the valuations data portfolio_properties = portfolio_properties.merge( valuations.rename(columns={"uprn": "UPRN"}).drop(columns=['address', 'postcode']), how="left", on="UPRN" ) # Step 3: Store the final outputs save_excel_to_s3( df=portfolio_owners, bucket_name=self.bucket, file_key=f"ownership/{self.project_name}/current/{storage_date}/portfolio_owners.xlsx", ) save_excel_to_s3( df=portfolio_properties, bucket_name=self.bucket, file_key=f"ownership/{self.project_name}/current/{storage_date}/portfolio_properties.xlsx", ) save_excel_to_s3( df=portfolio_epc_data, bucket_name=self.bucket, file_key=f"ownership/{self.project_name}/current/{storage_date}/portfolio_epc_data.xlsx", )