diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 44178792..06eea258 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -193,33 +193,31 @@ class SearchEpc: @classmethod def get_house_number(cls, address: str) -> str | None: """ - This method will use the usaddress library to parse an address and extract the house number - :return: + This method uses the usaddress library to parse an address and extract the primary house or flat number. """ + try: + parsed = usaddress.parse(address) + # First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected + for part, type_ in parsed: + if type_ == 'OccupancyIdentifier': + return part.replace(",", "") # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary + # number - parsed = usaddress.parse(address) - parsed_house_number = [x for x in parsed if (x[1] == "AddressNumber")] - parsed_house_number = parsed_house_number[0][0] if parsed_house_number else None + # Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found + address_number = next((part for part, type_ in parsed if type_ == 'AddressNumber'), None) + if address_number: + return address_number.replace(",", "") # Remove any trailing commas - if parsed_house_number is None: - # Because usaddress isn't optimal for parsing addresses with some prefixes such as 'Flat', - # we also add a custom approach - - # Pattern to look for 'Flat' or 'Apartment' followed by a number, or just a number at the beginning + # Further fallback to custom regex (in case usaddress completely fails) pattern = r'(?i)(?:flat|apartment)\s*(\d+)|^\s*(\d+)' - match = re.search(pattern, address) - if match: - # Return the first non-None group found return next(g for g in match.groups() if g is not None) - else: - return None - # Remove training commas - parsed_house_number = parsed_house_number.replace(",", "") + except Exception as e: + print(f"Error parsing address: {e}") - return parsed_house_number + return None @staticmethod def
extract_numeric_housenumber_part(house_number: str | None) -> int | None: diff --git a/etl/customers/goldman/property_ownership.py b/etl/customers/goldman/property_ownership.py new file mode 100644 index 00000000..17db71b2 --- /dev/null +++ b/etl/customers/goldman/property_ownership.py @@ -0,0 +1,87 @@ +import pandas as pd +from tqdm import tqdm +from backend.SearchEpc import SearchEpc + + +def aggregate_matches(matching_lookup, company_ownership): + df = matching_lookup.merge(company_ownership, how="left", on="Title Number") + counts = ( + df.groupby(["Company Registration No. (1)", "Proprietor Name (1)"])["UPRN"] + .count() + .reset_index(name="number_of_properties") + ) + counts = counts.sort_values("number_of_properties", ascending=False) + + return counts + + +def app(): + """ + This script is for scoping property ownership for EPC F & G rated properties in Birmingham, for Goldman Sachs + """ + + properties = pd.read_excel("Birmingham EPC F & G Properties.xlsx") + company_ownership = pd.read_csv("/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_04.csv") + # Filter on relevant postcodes + company_ownership = company_ownership[ + company_ownership["Postcode"].str.lower().isin(properties["POSTCODE"].str.lower().unique())] + + # Now we filter properties the other way around + properties = properties[properties["POSTCODE"].str.lower().isin(company_ownership["Postcode"].str.lower().unique())] + # We end up with 7.4k entries on a postcode match, however we need to now do a direct address match + + ignore_title_numbers = [ + "WM922695", # Land at the back of 17 Plumstead Road, Birmingham (B44 0EA): relates to WM154788 + "WM426374", # land on the south side of 15 Carlyle Road, Edgbaston, Birmingham (B16 9BH): relates to WM537591 + "WM44948", + ] + company_ownership = company_ownership[~company_ownership["Title Number"].isin(ignore_title_numbers)] + # Remove entries where the address begins with the term "land adjoining": + + company_ownership =
company_ownership[~company_ownership["Property Address"].str.lower().str.startswith("land adjoining", na=False)] + + freehold_matching_lookup = [] + leasehold_matching_lookup = [] + for _, address in tqdm(properties.iterrows(), total=len(properties)): + filtered = company_ownership[ + company_ownership["Postcode"].str.lower() == address["POSTCODE"].lower() + ].copy() + + filtered["house_number"] = filtered["Property Address"].apply(SearchEpc.get_house_number) + house_no = SearchEpc.get_house_number(address["ADDRESS1"]) + + filtered = filtered[filtered["house_number"] == house_no] + + if filtered.empty: + continue + + filtered_freehold = filtered[filtered["Tenure"] == "Freehold"] + filtered_leasehold = filtered[filtered["Tenure"] == "Leasehold"] + + if filtered_freehold.shape[0] > 1: + raise ValueError("Multiple freehold matches") + + if filtered_leasehold.shape[0] > 1: + raise ValueError("Multiple leasehold matches") + + if not filtered_leasehold.empty: + leasehold_matching_lookup.append( + { + "UPRN": address["UPRN"], + "Title Number": filtered_leasehold["Title Number"].values[0] + } + ) + + if not filtered_freehold.empty: + freehold_matching_lookup.append( + { + "UPRN": address["UPRN"], + "Title Number": filtered_freehold["Title Number"].values[0] + } + ) + + freehold_matching_lookup = pd.DataFrame(freehold_matching_lookup) + leasehold_matching_lookup = pd.DataFrame(leasehold_matching_lookup) + + freehold_aggregate = aggregate_matches(freehold_matching_lookup, company_ownership) + leasehold_aggregate = aggregate_matches(leasehold_matching_lookup, company_ownership) diff --git a/etl/customers/livewest/route_march.py b/etl/customers/livewest/route_march.py index 713ee56a..9e69fd43 100644 --- a/etl/customers/livewest/route_march.py +++ b/etl/customers/livewest/route_march.py @@ -22,9 +22,8 @@ def route_march_may_2024(): asset_list = read_excel_from_s3( bucket_name="retrofit-datalake-dev", file_key="customers/Livewest/Livewest proposed route march Apr-May 2024.xlsx", - header_row=1 +
header_row=0 ) - asset_list = pd.read_excel("/Users/khalimconn-kowlessar/Downloads/Livewest proposed route march Apr-May 2024.xlsx") epc_data = [] for _, unit in tqdm(asset_list.iterrows(), total=len(asset_list)):