working on land registry matches

2026-07-27 23:35:01 +00:00 · 2024-07-27 22:37:13 +01:00 · 2024-07-27 22:37:13 +01:00 · a2a5094b01
commit a2a5094b01
parent 971a74017e
1 changed files with 156 additions and 6 deletions
--- a/etl/customers/goldman/property_ownership.py
+++ b/etl/customers/goldman/property_ownership.py
@ -345,9 +345,6 @@ def app():
        company_ownership["Postcode"].str.lower().isin(properties["POSTCODE"].str.lower().unique())
    ]

-    # Read in land registry
-    land_registry = pd.read_csv("/Users/khalimconn-kowlessar/Downloads/land_registry_prices_paid_filtered.csv")
-
    # Now we filter properties the other way around
    properties = properties[properties["POSTCODE"].str.lower().isin(company_ownership["Postcode"].str.lower().unique())]
    # We end up with 7.4k entires on a postcode match, however we need to now do a direct address match
@ -485,14 +482,167 @@ def app():
    # leasehold_matching_lookup = remove_duplicate_matches(leasehold_matching_lookup, properties, company_ownership)

    matched_addresses = combined_matching_lookup.merge(
-        properties[["UPRN", "ADDRESS", "CURRENT_ENERGY_EFFICIENCY", "CURRENT_ENERGY_RATING"]].rename(
-            columns={"ADDRESS": "epc_address"}),
+        properties[
+            [
+                "UPRN",
+                "ADDRESS",
+                "ADDRESS1",
+                "CURRENT_ENERGY_EFFICIENCY",
+                "CURRENT_ENERGY_RATING",
+                "POSTCODE"
+            ]
+        ].rename(
+            columns={
+                "ADDRESS": "epc_address",
+                "ADDRESS1": "epc_address1",
+                "POSTCODE": "epc_postcode"
+            }
+        ),
        how="left", on="UPRN"
    ).merge(
-        company_ownership[["Title Number", "Property Address", "Company Registration No. (1)", "Proprietor Name (1)"]],
+        company_ownership[
+            [
+                "Title Number",
+                "Property Address",
+                "Postcode",
+                "Company Registration No. (1)",
+                "Proprietor Name (1)",
+
+            ]
+        ],
        how="left", on="Title Number"
    )

+    # Let's try and get the house number
+    matched_addresses["house_number"] = (
+        matched_addresses["epc_address"]
+        .apply(remove_text_in_brackets)
+        .apply(SearchEpc.get_house_number)
+        .str.lower()
+        .str.replace(",", "")
+    )
+
+    # Read in land registry
+    land_registry = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Downloads/land_registry_prices_paid_filtered.csv",
+    )
+
+    # We now perform a match between the land registry data and the matched address, in an attempt to find
+    # out when these properties last sold. The land registry data has been pre filtered on the postcodes in this
+    # data, and for sales within the last 5 years, to ensure the file isn't too large.
+
+    land_registry["postcode"] = land_registry["postcode"].str.lower().str.strip()
+    land_registry["street"] = land_registry["street"].str.lower().str.strip()
+    land_registry["paon"] = land_registry["paon"].str.lower().str.strip()
+    land_registry["date_of_transfer"] = pd.to_datetime(land_registry["date_of_transfer"])
+
+    def is_substring(x, match_string):
+        # True when x occurs inside match_string (lower-cased); a missing or empty value never matches
+        if pd.isnull(x) or pd.isnull(match_string) or not x:
+            return False
+
+        return x in match_string.lower()
+
+    def house_number_match(paon, house_number):
+        # A missing value on either side can never be a match
+        if pd.isnull(paon) or pd.isnull(house_number):
+            return False
+        try:
+            # Compare numerically first, so that e.g. "07" matches "7"
+            return int(paon) == int(house_number)
+        except (TypeError, ValueError):
+            # At least one side isn't a plain integer (e.g. "12a"): compare the values as they are
+            return paon == house_number
+
+    def check_equalities(lr_filtered):
+        # Report, for paon / saon / street in turn, whether every row holds one single value.
+        # A missing value counts as a value of its own (dropna=False): a column that is empty
+        # on every row is "all equal", one that is only partly empty is not. All three columns
+        # treat missing values the same way.
+        def all_equal(column):
+            return lr_filtered[column].nunique(dropna=False) <= 1
+
+        # Order matters: callers unpack (paon, saon, street)
+        return all_equal("paon"), all_equal("saon"), all_equal("street")
+
+    land_registry_matches = []
+    for _, match in tqdm(matched_addresses.iterrows(), total=len(matched_addresses)):
+
+        # Filter land registry on the postcode
+        lr_filtered = land_registry[
+            (land_registry["postcode"] == match["epc_postcode"].lower().strip())
+        ]
+
+        # Filter further, keeping rows where the street is in the address
+        # street should be contained in epc_address
+        lr_filtered = lr_filtered[
+            lr_filtered["street"].apply(lambda x: is_substring(x, match["epc_address"].lower()))
+        ]
+
+        if lr_filtered.empty:
+            continue
+
+        # We now check if paon matches the house number extracted from the EPC address
+        lr_filtered["paon_match"] = lr_filtered["paon"].apply(lambda x: house_number_match(x, match["house_number"]))
+        # We also try the secondary match
+        lr_filtered["saon_match"] = lr_filtered["saon"].apply(
+            lambda x: False if pd.isnull(x) else is_substring(x, match["epc_address1"])
+        )
+
+        # We filter where we have a primary or secondary match
+        lr_filtered = lr_filtered[
+            lr_filtered["paon_match"] | lr_filtered["saon_match"]
+            ]
+
+        if lr_filtered.empty:
+            continue
+        elif lr_filtered.shape[0] == 1:
+            land_registry_matches.append(
+                {
+                    "transaction_id": lr_filtered['transaction_id'].values[0],
+                    "price": lr_filtered["price"].values[0],
+                    "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
+                }
+            )
+            continue
+        elif lr_filtered.shape[0] > 1:
+            # We make sure all records are the same and take the newest
+            all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered)
+            has_paon_match = any(lr_filtered["paon_match"])
+
+            if all_paon_equal and all_street_equal and all_saon_equal:
+                # Take the newest record, append and continue
+                lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
+                lr_filtered = lr_filtered.head(1)
+                land_registry_matches.append(
+                    {
+                        "transaction_id": lr_filtered['transaction_id'].values[0],
+                        "price": lr_filtered["price"].values[0],
+                        "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
+                    }
+                )
+            elif has_paon_match and all_street_equal:
+                # Perform filter on paon
+                lr_filtered = lr_filtered[lr_filtered["paon_match"]]
+                # Do an additional equality check
+                all_paon_equal, all_saon_equal, all_street_equal = check_equalities(lr_filtered)
+                if all_paon_equal and all_street_equal and all_saon_equal:
+                    lr_filtered = lr_filtered.sort_values("date_of_transfer", ascending=False)
+                    lr_filtered = lr_filtered.head(1)
+                    land_registry_matches.append(
+                        {
+                            "transaction_id": lr_filtered['transaction_id'].values[0],
+                            "price": lr_filtered["price"].values[0],
+                            "date_of_transfer": lr_filtered["date_of_transfer"].values[0],
+                        }
+                    )
+                else:
+                    raise NotImplementedError("wtf")
+            else:
+                raise NotImplementedError("wtf")
+        else:
+            raise NotImplementedError("What happened here?")
+
    # shared_freehold_match = pd.DataFrame(shared_freehold_match)
    # Strore these files
    # freehold_matching_lookup.to_excel("freehold_matching_lookup.xlsx")