From 2174a85a8bc79bd696e1b814c81b7d609d45b680 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 28 Jul 2024 15:21:05 +0100 Subject: [PATCH] adding to land registry matching logic --- etl/customers/goldman/property_ownership.py | 111 +++++++++++++++++--- 1 file changed, 94 insertions(+), 17 deletions(-) diff --git a/etl/customers/goldman/property_ownership.py b/etl/customers/goldman/property_ownership.py index 7958e93b..f1f0de38 100644 --- a/etl/customers/goldman/property_ownership.py +++ b/etl/customers/goldman/property_ownership.py @@ -357,6 +357,8 @@ def app(): properties = properties.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN") # TODO: Do we want to filter properties based on lodgement dates? + # E.g. we might want to filter properties that have had a sale EPC lodged in the last x months, because + # this could be indicative of a sale happening, and the land registry data may not have caught up yet # Remove entries where the address begins with the term "land adjoining", or other records that don't reference the # the property itself @@ -456,13 +458,9 @@ def app(): freehold_matching_lookup = pd.DataFrame(freehold_matching_lookup) leasehold_matching_lookup = pd.DataFrame(leasehold_matching_lookup) - # shared_leasehold_match = pd.concat(shared_leasehold_match) - # shared_freehold_match = pd.concat(shared_freehold_match) - # freehold_matching_lookup.to_excel("freehold_matching_lookup_new.xlsx") - # leasehold_matching_lookup.to_excel("leasehold_matching_lookup_new.xlsx") - # shared_leasehold_match.to_excel("shared_leasehold_match_new.xlsx") - # shared_freehold_match.to_excel("shared_freehold_match_new.xlsx") + # freehold_matching_lookup.to_excel("freehold_matching_lookup V2.xlsx") + # leasehold_matching_lookup.to_excel("leasehold_matching_lookup V2.xlsx") # The approximate matches aren't very good freehold_matching_lookup = freehold_matching_lookup[freehold_matching_lookup["match_type"] == "exact"] @@ -477,10 +475,6 @@ def app(): # We also have duplicates at a UPRN level combined_matching_lookup = remove_duplicate_uprn_matches(combined_matching_lookup, properties, company_ownership) - # There are some cases where we have duplicates - # freehold_matching_lookup = remove_duplicate_matches(freehold_matching_lookup, properties, company_ownership) - # leasehold_matching_lookup = remove_duplicate_matches(leasehold_matching_lookup, properties, company_ownership) - matched_addresses = combined_matching_lookup.merge( properties[ [ @@ -534,6 +528,7 @@ def app(): land_registry["postcode"] = land_registry["postcode"].str.lower().str.strip() land_registry["street"] = land_registry["street"].str.lower().str.strip() land_registry["paon"] = land_registry["paon"].str.lower().str.strip() + land_registry["saon"] = land_registry["saon"].str.lower().str.strip() land_registry["date_of_transfer"] = pd.to_datetime(land_registry["date_of_transfer"]) def is_substring(x, match_string): @@ -576,8 +571,9 @@ def app(): # Filter further, when the street is in in the address # street should be contained in epc_address lr_filtered = lr_filtered[ - lr_filtered["street"].apply(lambda x: is_substring(x, match["epc_address"].lower())) - ] + lr_filtered["street"].apply(lambda x: is_substring(x, match["epc_address"].lower())) | + lr_filtered["street"].apply(lambda x: is_substring(x, match["Property Address"].lower())) + ] if lr_filtered.empty: continue @@ -585,10 +581,11 @@ def app(): # We now check if paon is in address 1 lr_filtered["paon_match"] = lr_filtered["paon"].apply(lambda x: house_number_match(x, match["house_number"])) # We also try the secondary match - lr_filtered["saon_match"] = lr_filtered["saon"].apply( - lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address1"]) + lr_filtered["saon_match"] = ( + lr_filtered["saon"].apply( + lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address1"]) + ) ) - # We fileter where we have a primary or secondary match lr_filtered = lr_filtered[ lr_filtered["paon_match"] | lr_filtered["saon_match"] @@ -599,6 +596,7 @@ def app(): elif lr_filtered.shape[0] == 1: land_registry_matches.append( { + "uprn": match["UPRN"], "transaction_id": lr_filtered['transaction_id'].values[0], "price": lr_filtered["price"].values[0], "date_of_transfer": lr_filtered["date_of_transfer"].values[0], @@ -616,11 +614,13 @@ def app(): lr_filtered = lr_filtered.head(1) land_registry_matches.append( { + "uprn": match["UPRN"], "transaction_id": lr_filtered['transaction_id'].values[0], "price": lr_filtered["price"].values[0], "date_of_transfer": lr_filtered["date_of_transfer"].values[0], } ) + continue elif has_paon_match and all_street_equal: # Peform filter on paon lr_filtered = lr_filtered[lr_filtered["paon_match"]] @@ -631,15 +631,92 @@ def app(): lr_filtered = lr_filtered.head(1) land_registry_matches.append( { + "uprn": match["UPRN"], "transaction_id": lr_filtered['transaction_id'].values[0], "price": lr_filtered["price"].values[0], "date_of_transfer": lr_filtered["date_of_transfer"].values[0], } ) else: - raise NotImplementedError("wtf") + # We do a match on saon + lr_filtered["saon_match2"] = lr_filtered["saon"].apply( + lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address"]) + ) + + lr_filtered = lr_filtered[lr_filtered["saon_match2"]] + + if lr_filtered.empty: + continue + elif lr_filtered.shape[0] == 1: + land_registry_matches.append( + { + "uprn": match["UPRN"], + "transaction_id": lr_filtered['transaction_id'].values[0], + "price": lr_filtered["price"].values[0], + "date_of_transfer": lr_filtered["date_of_transfer"].values[0], + } + ) + continue + else: + raise NotImplementedError("wtf") else: - raise NotImplementedError("wtf") + # We have a final check, based on an observed case + lr_address_1 = " ".join([x.lower().strip() for x in match["Property Address"].split(",")[0:2]]) + + lr_filtered["paon_match2"] = lr_filtered["paon"].apply( + lambda x: False if pd.isnull(x) else is_substring(x, lr_address_1) + ) + + lr_filtered = lr_filtered[lr_filtered["paon_match2"]] + + if lr_filtered.empty: + continue + elif lr_filtered.shape[0] == 1: + land_registry_matches.append( + { + "uprn": match["UPRN"], + "transaction_id": lr_filtered['transaction_id'].values[0], + "price": lr_filtered["price"].values[0], + "date_of_transfer": lr_filtered["date_of_transfer"].values[0], + } + ) + continue + else: + # Check all the same + all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered) + + # Check saon is house number with exact match + lr_filtered["saon_match2"] = lr_filtered["saon"].apply( + lambda x: False if pd.isnull(x) else house_number_match(x, match["house_number"]) + ) + + if all_paon_equal and all_saon_equal and all_street_equal: + # Take the newest record + lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False) + lr_filtered = lr_filtered.head(1) + land_registry_matches.append( + { + "uprn": match["UPRN"], + "transaction_id": lr_filtered['transaction_id'].values[0], + "price": lr_filtered["price"].values[0], + "date_of_transfer": lr_filtered["date_of_transfer"].values[0], + } + ) + continue + elif any(lr_filtered["saon_match2"]): + lr_filtered = lr_filtered[lr_filtered["saon_match2"]] + if lr_filtered.shape[0] == 1: + land_registry_matches.append( + { + "uprn": match["UPRN"], + "transaction_id": lr_filtered['transaction_id'].values[0], + "price": lr_filtered["price"].values[0], + "date_of_transfer": lr_filtered["date_of_transfer"].values[0], + } + ) + continue + + raise NotImplementedError("wtf") else: raise NotImplementedError("What happened here?")