mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Adding company ownership matching code for goldman poc
This commit is contained in:
parent
155a8c568c
commit
cce9c64fdc
3 changed files with 104 additions and 20 deletions
|
|
@ -193,33 +193,31 @@ class SearchEpc:
|
|||
@classmethod
|
||||
def get_house_number(cls, address: str) -> str | None:
|
||||
"""
|
||||
This method will use the usaddress library to parse an address and extract the house number
|
||||
:return:
|
||||
This method uses the usaddress library to parse an address and extract the primary house or flat number.
|
||||
"""
|
||||
try:
|
||||
parsed = usaddress.parse(address)
|
||||
# First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected
|
||||
for part, type_ in parsed:
|
||||
if type_ == 'OccupancyIdentifier':
|
||||
return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
|
||||
# number
|
||||
|
||||
parsed = usaddress.parse(address)
|
||||
parsed_house_number = [x for x in parsed if (x[1] == "AddressNumber")]
|
||||
parsed_house_number = parsed_house_number[0][0] if parsed_house_number else None
|
||||
# Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found
|
||||
address_number = next((part for part, type_ in parsed if type_ == 'AddressNumber'), None)
|
||||
if address_number:
|
||||
return address_number.replace(",", "") # Remove any trailing commas
|
||||
|
||||
if parsed_house_number is None:
|
||||
# Because usaddress isn't optimal for parsing addresses with some prefixes such as 'Flat',
|
||||
# we also add a custom approach
|
||||
|
||||
# Pattern to look for 'Flat' or 'Apartment' followed by a number, or just a number at the beginning
|
||||
# Further fallback to custom regex (in case usaddress completely fails)
|
||||
pattern = r'(?i)(?:flat|apartment)\s*(\d+)|^\s*(\d+)'
|
||||
|
||||
match = re.search(pattern, address)
|
||||
|
||||
if match:
|
||||
# Return the first non-None group found
|
||||
return next(g for g in match.groups() if g is not None)
|
||||
else:
|
||||
return None
|
||||
|
||||
# Remove trailing commas
|
||||
parsed_house_number = parsed_house_number.replace(",", "")
|
||||
except Exception as e:
|
||||
print(f"Error parsing address: {e}")
|
||||
|
||||
return parsed_house_number
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def extract_numeric_housenumber_part(house_number: str | None) -> int | None:
|
||||
|
|
|
|||
87
etl/customers/goldman/property_ownership.py
Normal file
87
etl/customers/goldman/property_ownership.py
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from backend.SearchEpc import SearchEpc
|
||||
|
||||
|
||||
def aggregate_matches(matching_lookup, company_ownership):
    """
    Count how many matched properties each company owns.

    :param matching_lookup: DataFrame with one row per matched property, holding "UPRN" and
        "Title Number" columns. May be empty, including a frame with no columns at all
        (which is what pd.DataFrame([]) produces when nothing was matched).
    :param company_ownership: DataFrame of title records holding "Title Number",
        "Company Registration No. (1)" and "Proprietor Name (1)" columns.
    :return: DataFrame with columns "Company Registration No. (1)", "Proprietor Name (1)" and
        "number_of_properties", sorted by "number_of_properties" in descending order.
    """
    group_columns = ["Company Registration No. (1)", "Proprietor Name (1)"]

    # With no matches there is nothing to merge on (the frame may not even have a
    # "Title Number" column), so return an empty result of the expected shape.
    if matching_lookup.empty:
        return pd.DataFrame(columns=[*group_columns, "number_of_properties"])

    df = matching_lookup.merge(company_ownership, how="left", on="Title Number")

    # NOTE: groupby drops rows whose key is NaN by default, so proprietors without a
    # company registration number are not counted here.
    counts = (
        df.groupby(group_columns)["UPRN"]
        .count()
        .reset_index(name="number_of_properties")
    )

    return counts.sort_values("number_of_properties", ascending=False)
|
||||
|
||||
|
||||
def app(
    properties_path="Birmingham EPC F & G Properties.xlsx",
    company_ownership_path="/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_04.csv",
):
    """
    This script is for scoping property ownership for EPC F & G rated properties in Birmingham, for Goldman Sachs.

    Properties are matched to company-owned titles on postcode and then on house number, separately for
    freehold and leasehold titles, and the matches are aggregated per owning company.

    :param properties_path: Excel file of properties; must hold "POSTCODE", "ADDRESS1" and "UPRN" columns.
    :param company_ownership_path: CSV of company-owned titles; must hold "Postcode", "Title Number",
        "Property Address", "Tenure", "Company Registration No. (1)" and "Proprietor Name (1)" columns.
    :return: Tuple (freehold_aggregate, leasehold_aggregate) as produced by aggregate_matches.
    :raises ValueError: If a property matches more than one freehold or more than one leasehold title.
    """

    properties = pd.read_excel(properties_path)
    company_ownership = pd.read_csv(company_ownership_path)

    # Filter on relevant postcodes. Missing postcodes are dropped from the lookup sets so that
    # NaN never matches NaN (which would later fail when the postcode is lower-cased).
    company_ownership = company_ownership[
        company_ownership["Postcode"].str.lower().isin(properties["POSTCODE"].str.lower().dropna().unique())
    ]

    # Now we filter properties the other way around
    properties = properties[
        properties["POSTCODE"].str.lower().isin(company_ownership["Postcode"].str.lower().dropna().unique())
    ]
    # We end up with 7.4k entries on a postcode match, however we need to now do a direct address match

    ignore_title_numbers = [
        "WM922695",  # Land at the back of 17 Plumstead Road, Birmingham (B44 0EA): relates to WM154788
        "WM426374",  # land on the south side of 15 Carlyle Road, Edgbaston, Birmingham (B16 9BH): relates to WM537591
        "WM44948",
    ]
    company_ownership = company_ownership[~company_ownership["Title Number"].isin(ignore_title_numbers)]

    # Remove entries where the address begins with the term "land adjoining". The comparison is
    # case-insensitive because the source mixes "Land ..." and "land ..."; na=False keeps rows with a
    # missing address and stops NaN from breaking the boolean negation.
    is_adjoining_land = company_ownership["Property Address"].str.lower().str.startswith("land adjoining", na=False)
    company_ownership = company_ownership[~is_adjoining_land]

    # Both of these depend only on the ownership data, so compute them once instead of once per property.
    ownership_postcodes = company_ownership["Postcode"].str.lower()
    ownership_house_numbers = company_ownership["Property Address"].apply(SearchEpc.get_house_number)

    freehold_records = []
    leasehold_records = []
    tenure_records = (("Freehold", freehold_records), ("Leasehold", leasehold_records))

    for _, address in tqdm(properties.iterrows(), total=len(properties)):
        house_no = SearchEpc.get_house_number(address["ADDRESS1"])
        if house_no is None:
            # Without a house number there is nothing to match on
            continue

        filtered = company_ownership[
            (ownership_postcodes == address["POSTCODE"].lower()) & (ownership_house_numbers == house_no)
        ]
        if filtered.empty:
            continue

        for tenure, records in tenure_records:
            tenure_matches = filtered[filtered["Tenure"] == tenure]

            # A property should map to at most one title per tenure; anything else needs manual review
            if tenure_matches.shape[0] > 1:
                raise ValueError(f"Multiple {tenure.lower()} matches")

            if not tenure_matches.empty:
                records.append(
                    {
                        "UPRN": address["UPRN"],
                        "Title Number": tenure_matches["Title Number"].values[0],
                    }
                )

    # Name the columns explicitly so the frames keep their shape even when nothing matched
    lookup_columns = ["UPRN", "Title Number"]
    freehold_matching_lookup = pd.DataFrame(freehold_records, columns=lookup_columns)
    leasehold_matching_lookup = pd.DataFrame(leasehold_records, columns=lookup_columns)

    freehold_aggregate = aggregate_matches(freehold_matching_lookup, company_ownership)
    leasehold_aggregate = aggregate_matches(leasehold_matching_lookup, company_ownership)

    return freehold_aggregate, leasehold_aggregate
|
||||
|
|
@ -22,9 +22,8 @@ def route_march_may_2024():
|
|||
asset_list = read_excel_from_s3(
|
||||
bucket_name="retrofit-datalake-dev",
|
||||
file_key="customers/Livewest/Livewest proposed route march Apr-May 2024.xlsx",
|
||||
header_row=1
|
||||
header_row=0
|
||||
)
|
||||
asset_list = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Livewest proposed route march Apr-May 2024.xlsx")
|
||||
|
||||
epc_data = []
|
||||
for _, unit in tqdm(asset_list.iterrows(), total=len(asset_list)):
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue