From 2174a85a8bc79bd696e1b814c81b7d609d45b680 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 28 Jul 2024 15:21:05 +0100
Subject: [PATCH] Extend land registry matching logic with saon/paon fallbacks

---
 etl/customers/goldman/property_ownership.py | 111 +++++++++++++++++---
 1 file changed, 94 insertions(+), 17 deletions(-)

diff --git a/etl/customers/goldman/property_ownership.py b/etl/customers/goldman/property_ownership.py
index 7958e93b..f1f0de38 100644
--- a/etl/customers/goldman/property_ownership.py
+++ b/etl/customers/goldman/property_ownership.py
@@ -357,6 +357,8 @@ def app():
     properties = properties.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")
 
     # TODO: Do we want to filter properties based on lodgement dates?
+    #       E.g. we might want to filter properties that have had a sale EPC lodged in the last x months, because
+    #       this could be indicative of a sale happening, and the land registry data may not have caught up yet
 
     # Remove entries where the address begins with the term "land adjoining", or other records that don't reference the
     # the property itself
@@ -456,13 +458,9 @@ def app():
 
     freehold_matching_lookup = pd.DataFrame(freehold_matching_lookup)
     leasehold_matching_lookup = pd.DataFrame(leasehold_matching_lookup)
-    # shared_leasehold_match = pd.concat(shared_leasehold_match)
-    # shared_freehold_match = pd.concat(shared_freehold_match)
 
-    # freehold_matching_lookup.to_excel("freehold_matching_lookup_new.xlsx")
-    # leasehold_matching_lookup.to_excel("leasehold_matching_lookup_new.xlsx")
-    # shared_leasehold_match.to_excel("shared_leasehold_match_new.xlsx")
-    # shared_freehold_match.to_excel("shared_freehold_match_new.xlsx")
+    # freehold_matching_lookup.to_excel("freehold_matching_lookup V2.xlsx")
+    # leasehold_matching_lookup.to_excel("leasehold_matching_lookup V2.xlsx")
 
     # The approximate matches aren't very good
     freehold_matching_lookup = freehold_matching_lookup[freehold_matching_lookup["match_type"] == "exact"]
@@ -477,10 +475,6 @@ def app():
     # We also have duplicates at a UPRN level
     combined_matching_lookup = remove_duplicate_uprn_matches(combined_matching_lookup, properties, company_ownership)
 
-    # There are some cases where we have duplicates
-    # freehold_matching_lookup = remove_duplicate_matches(freehold_matching_lookup, properties, company_ownership)
-    # leasehold_matching_lookup = remove_duplicate_matches(leasehold_matching_lookup, properties, company_ownership)
-
     matched_addresses = combined_matching_lookup.merge(
         properties[
             [
@@ -534,6 +528,7 @@ def app():
     land_registry["postcode"] = land_registry["postcode"].str.lower().str.strip()
     land_registry["street"] = land_registry["street"].str.lower().str.strip()
     land_registry["paon"] = land_registry["paon"].str.lower().str.strip()
+    land_registry["saon"] = land_registry["saon"].str.lower().str.strip()
     land_registry["date_of_transfer"] = pd.to_datetime(land_registry["date_of_transfer"])
 
     def is_substring(x, match_string):
@@ -576,8 +571,9 @@ def app():
         # Filter further, when the street is in in the address
         # street should be contained in epc_address
         lr_filtered = lr_filtered[
-            lr_filtered["street"].apply(lambda x: is_substring(x, match["epc_address"].lower()))
-        ]
+            lr_filtered["street"].apply(lambda x: is_substring(x, match["epc_address"].lower())) |
+            lr_filtered["street"].apply(lambda x: is_substring(x, match["Property Address"].lower()))
+            ]
 
         if lr_filtered.empty:
             continue
@@ -585,10 +581,11 @@ def app():
         # We now check if paon is in address 1
         lr_filtered["paon_match"] = lr_filtered["paon"].apply(lambda x: house_number_match(x, match["house_number"]))
         # We also try the secondary match
-        lr_filtered["saon_match"] = lr_filtered["saon"].apply(
-            lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address1"])
+        lr_filtered["saon_match"] = (
+            lr_filtered["saon"].apply(
+                lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address1"])
+            )
         )
-
         # We fileter where we have a primary or secondary match
         lr_filtered = lr_filtered[
             lr_filtered["paon_match"] | lr_filtered["saon_match"]
@@ -599,6 +596,7 @@ def app():
         elif lr_filtered.shape[0] == 1:
             land_registry_matches.append(
                 {
+                    "uprn": match["UPRN"],
                     "transaction_id": lr_filtered['transaction_id'].values[0],
                     "price": lr_filtered["price"].values[0],
                     "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
@@ -616,11 +614,13 @@ def app():
                 lr_filtered = lr_filtered.head(1)
                 land_registry_matches.append(
                     {
+                        "uprn": match["UPRN"],
                         "transaction_id": lr_filtered['transaction_id'].values[0],
                         "price": lr_filtered["price"].values[0],
                         "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                     }
                 )
+                continue
             elif has_paon_match and all_street_equal:
                 # Peform filter on paon
                 lr_filtered = lr_filtered[lr_filtered["paon_match"]]
@@ -631,15 +631,92 @@ def app():
                     lr_filtered = lr_filtered.head(1)
                     land_registry_matches.append(
                         {
+                            "uprn": match["UPRN"],
                             "transaction_id": lr_filtered['transaction_id'].values[0],
                             "price": lr_filtered["price"].values[0],
                             "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
                         }
                     )
                 else:
-                    raise NotImplementedError("wtf")
+                    # Fall back to saon: keep records whose saon appears in the full EPC address (not just epc_address1)
+                    lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
+                        lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address"])
+                    )
+
+                    lr_filtered = lr_filtered[lr_filtered["saon_match2"]]
+
+                    if lr_filtered.empty:
+                        continue
+                    elif lr_filtered.shape[0] == 1:
+                        land_registry_matches.append(
+                            {
+                                "uprn": match["UPRN"],
+                                "transaction_id": lr_filtered['transaction_id'].values[0],
+                                "price": lr_filtered["price"].values[0],
+                                "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
+                            }
+                        )
+                        continue
+                    else:
+                        raise NotImplementedError("wtf")
             else:
-                raise NotImplementedError("wtf")
+                # Final fallback, based on an observed case: look for the paon in the first two comma-separated
+                # parts of the "Property Address"
+                lr_address_1 = " ".join([x.lower().strip() for x in match["Property Address"].split(",")[0:2]])
+
+                lr_filtered["paon_match2"] = lr_filtered["paon"].apply(
+                    lambda x: False if pd.isnull(x) else is_substring(x, lr_address_1)
+                )
+
+                lr_filtered = lr_filtered[lr_filtered["paon_match2"]]
+
+                if lr_filtered.empty:
+                    continue
+                elif lr_filtered.shape[0] == 1:
+                    land_registry_matches.append(
+                        {
+                            "uprn": match["UPRN"],
+                            "transaction_id": lr_filtered['transaction_id'].values[0],
+                            "price": lr_filtered["price"].values[0],
+                            "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
+                        }
+                    )
+                    continue
+                else:
+                    # Check whether the remaining records all share the same paon, saon and street
+                    all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered)
+
+                    # Flag records whose saon matches the EPC house number (using house_number_match)
+                    lr_filtered["saon_match2"] = lr_filtered["saon"].apply(
+                        lambda x: False if pd.isnull(x) else house_number_match(x, match["house_number"])
+                    )
+
+                    if all_paon_equal and all_saon_equal and all_street_equal:
+                        # Take the newest record
+                        lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
+                        lr_filtered = lr_filtered.head(1)
+                        land_registry_matches.append(
+                            {
+                                "uprn": match["UPRN"],
+                                "transaction_id": lr_filtered['transaction_id'].values[0],
+                                "price": lr_filtered["price"].values[0],
+                                "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
+                            }
+                        )
+                        continue
+                    elif any(lr_filtered["saon_match2"]):
+                        lr_filtered = lr_filtered[lr_filtered["saon_match2"]]
+                        if lr_filtered.shape[0] == 1:
+                            land_registry_matches.append(
+                                {
+                                    "uprn": match["UPRN"],
+                                    "transaction_id": lr_filtered['transaction_id'].values[0],
+                                    "price": lr_filtered["price"].values[0],
+                                    "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
+                                }
+                            )
+                            continue
+
+                    raise NotImplementedError("wtf")
         else:
             raise NotImplementedError("What happened here?")