From a2a5094b01a93ef73f68e546549303ea320706c6 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Sat, 27 Jul 2024 22:37:13 +0100
Subject: [PATCH] working on land registry matches

---
 etl/customers/goldman/property_ownership.py | 149 +++++++++++++++++++-
 1 file changed, 143 insertions(+), 6 deletions(-)

diff --git a/etl/customers/goldman/property_ownership.py b/etl/customers/goldman/property_ownership.py
index 1b1cf014..7958e93b 100644
--- a/etl/customers/goldman/property_ownership.py
+++ b/etl/customers/goldman/property_ownership.py
@@ -345,9 +345,6 @@ def app():
         company_ownership["Postcode"].str.lower().isin(properties["POSTCODE"].str.lower().unique())
     ]
 
-    # Read in land registry
-    land_registry = pd.read_csv("/Users/khalimconn-kowlessar/Downloads/land_registry_prices_paid_filtered.csv")
-
     # Now we filter properties the other way around
     properties = properties[properties["POSTCODE"].str.lower().isin(company_ownership["Postcode"].str.lower().unique())]
     # We end up with 7.4k entires on a postcode match, however we need to now do a direct address match
@@ -485,14 +482,154 @@ def app():
     # leasehold_matching_lookup = remove_duplicate_matches(leasehold_matching_lookup, properties, company_ownership)
     matched_addresses = combined_matching_lookup.merge(
-        properties[["UPRN", "ADDRESS", "CURRENT_ENERGY_EFFICIENCY", "CURRENT_ENERGY_RATING"]].rename(
-            columns={"ADDRESS": "epc_address"}),
+        properties[
+            [
+                "UPRN",
+                "ADDRESS",
+                "ADDRESS1",
+                "CURRENT_ENERGY_EFFICIENCY",
+                "CURRENT_ENERGY_RATING",
+                "POSTCODE",
+            ]
+        ].rename(
+            columns={
+                "ADDRESS": "epc_address",
+                "ADDRESS1": "epc_address1",
+                "POSTCODE": "epc_postcode",
+            }
+        ),
         how="left",
         on="UPRN"
     ).merge(
-        company_ownership[["Title Number", "Property Address", "Company Registration No. (1)", "Proprietor Name (1)"]],
+        company_ownership[
+            [
+                "Title Number",
+                "Property Address",
+                "Postcode",
+                "Company Registration No. (1)",
+                "Proprietor Name (1)",
+            ]
+        ],
         how="left",
         on="Title Number"
     )
+
+    # Let's try and get the house number
+    matched_addresses["house_number"] = (
+        matched_addresses["epc_address"]
+        .apply(remove_text_in_brackets)
+        .apply(SearchEpc.get_house_number)
+        .str.lower()
+        .str.replace(",", "")
+    )
+
+    # Read in land registry
+    # TODO: this path only exists on one machine - move it into configuration before this runs anywhere else
+    land_registry = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Downloads/land_registry_prices_paid_filtered.csv",
+    )
+
+    # We now perform a match between the land registry data and the matched address, in an attempt to find
+    # out when these properties last sold. The land registry data has been pre filtered on the postcodes in this
+    # data, and for sales within the last 5 years, to ensure the file isn't too large.
+
+    def normalise_text(value):
+        # Lower-case and strip strings; anything else (e.g. NaN) is passed through untouched
+        return value.lower().strip() if isinstance(value, str) else value
+
+    # saon has to be normalised as well: is_substring lower-cases the EPC address, so a saon that keeps its
+    # original casing could never be found in it
+    for column in ["postcode", "street", "paon", "saon"]:
+        land_registry[column] = land_registry[column].apply(normalise_text)
+    land_registry["date_of_transfer"] = pd.to_datetime(land_registry["date_of_transfer"])
+
+    def is_substring(x, match_string):
+        # Either side can be missing in the source data, and a missing value never matches
+        if pd.isnull(x) or pd.isnull(match_string):
+            return False
+
+        return x in match_string.lower()
+
+    def house_number_match(paon, house_number):
+        # A missing value never matches
+        if pd.isnull(paon) or pd.isnull(house_number):
+            return False
+
+        # Firstly try and compare numerically, so that e.g. "07" matches "7"
+        try:
+            return int(paon) == int(house_number)
+        except (TypeError, ValueError):
+            # If we can't convert both to numeric, we do an equality
+            return paon == house_number
+
+    def check_equalities(lr_filtered):
+        # For paon, saon and street, report whether every candidate row carries the same value
+        all_paon_equal = all(lr_filtered["paon"] == lr_filtered["paon"].values[0])
+        if pd.isnull(lr_filtered["saon"].values[0]):
+            all_saon_equal = all(pd.isnull(lr_filtered["saon"]))
+        else:
+            all_saon_equal = all(lr_filtered["saon"] == lr_filtered["saon"].values[0])
+
+        all_street_equal = all(lr_filtered["street"] == lr_filtered["street"].values[0])
+
+        return all_paon_equal, all_saon_equal, all_street_equal
+
+    def newest_sale(match, candidates):
+        # Build the output record from the most recent sale, keyed so it can be joined back to the property
+        newest = candidates.sort_values("date_of_transfer", ascending=False).head(1)
+        return {
+            "UPRN": match["UPRN"],
+            "Title Number": match["Title Number"],
+            "transaction_id": newest["transaction_id"].values[0],
+            "price": newest["price"].values[0],
+            "date_of_transfer": newest["date_of_transfer"].values[0],
+        }
+
+    land_registry_matches = []
+    for _, match in tqdm(matched_addresses.iterrows(), total=len(matched_addresses)):
+        # The EPC columns come from a left join, so they can be missing; there is nothing to match on then
+        if pd.isnull(match["epc_postcode"]) or pd.isnull(match["epc_address"]):
+            continue
+
+        # Filter land registry on the postcode
+        lr_filtered = land_registry[land_registry["postcode"] == match["epc_postcode"].lower().strip()]
+
+        # Filter further: the street should be contained in epc_address
+        lr_filtered = lr_filtered[lr_filtered["street"].apply(lambda x: is_substring(x, match["epc_address"]))]
+
+        if lr_filtered.empty:
+            continue
+
+        # Work on a copy so that adding the match flags never writes into a slice of land_registry
+        lr_filtered = lr_filtered.copy()
+        # We now check if paon is the house number
+        lr_filtered["paon_match"] = lr_filtered["paon"].apply(lambda x: house_number_match(x, match["house_number"]))
+        # We also try the secondary match against address 1
+        lr_filtered["saon_match"] = lr_filtered["saon"].apply(lambda x: is_substring(x, match["epc_address1"]))
+
+        # We filter where we have a primary or secondary match
+        lr_filtered = lr_filtered[lr_filtered["paon_match"] | lr_filtered["saon_match"]]
+
+        if lr_filtered.empty:
+            continue
+
+        if lr_filtered.shape[0] > 1:
+            # Several sales only count as one property when they agree on paon, saon and street
+            all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered)
+            if not (all_paon_equal and all_saon_equal and all_street_equal):
+                if not (all_street_equal and lr_filtered["paon_match"].any()):
+                    raise NotImplementedError(
+                        f"Ambiguous land registry candidates for UPRN {match['UPRN']}: no usable paon match"
+                    )
+                # Perform a filter on paon, then do an additional equality check
+                lr_filtered = lr_filtered[lr_filtered["paon_match"]]
+                if not all(check_equalities(lr_filtered)):
+                    raise NotImplementedError(
+                        f"Ambiguous land registry candidates for UPRN {match['UPRN']} after filtering on paon"
+                    )
+
+        # Whatever is left describes a single property, so record its newest sale
+        land_registry_matches.append(newest_sale(match, lr_filtered))
+
     # shared_freehold_match = pd.DataFrame(shared_freehold_match)
     # Strore these files
     # freehold_matching_lookup.to_excel("freehold_matching_lookup.xlsx")