Completed matching for ha107; added Levenshtein method

2026-07-27 23:35:01 +00:00 · 2024-02-23 12:08:44 +00:00 · 2024-02-23 12:08:44 +00:00 · cef20c6e2c
commit cef20c6e2c
parent ccb764d4a9
1 changed files with 64 additions and 0 deletions
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@ -1,6 +1,7 @@
 import os
 import re
 import openpyxl
+import Levenshtein
 from pathlib import Path
 import msgpack
 from datetime import datetime
@ -453,6 +454,41 @@ class DataLoader:
            "Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln"
        )

+        # Replace Reasby Road Snelland, Lincoln with Reasby Road, Snelland, Lincoln
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Reasby Road Snelland, Lincoln", "Reasby Road, Snelland, Lincoln"
+        )
+
+        # Replace Silver Street Bardney, Lincoln with Silver Street, Bardney, Lincoln
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Silver Street Bardney, Lincoln", "Silver Street, Bardney, Lincoln"
+        )
+
+        # Replace Manor Close Bardney, Lincoln with Manor Close, Bardney, Lincoln
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Manor Close Bardney, Lincoln", "Manor Close, Bardney, Lincoln"
+        )
+
+        # Replace Ferry Road Southrey, Lincoln with Ferry Road, Southrey, Lincoln
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Ferry Road Southrey, Lincoln", "Ferry Road, Southrey, Lincoln"
+        )
+
+        # Replace Harvey Kent Gardens Bardney, Lincoln with Harvey Kent Gardens, Bardney, Lincoln
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Harvey Kent Gardens Bardney, Lincoln", "Harvey Kent Gardens, Bardney, Lincoln"
+        )
+
+        # Replace Wragby Road Bardney, Lincoln with Wragby Road, Bardney, Lincoln
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "Wragby Road Bardney, Lincoln", "Wragby Road, Bardney, Lincoln"
+        )
+
+        # Replace SPRINKHILL ROAD with SPINKHILL ROAD
+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "SPRINKHILL ROAD", "SPINKHILL ROAD"
+        )
+
        return survey_list

    def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
@ -481,10 +517,35 @@ class DataLoader:
            ].copy()

            df = df[df["matching_address"].str.contains(str(house_number))]
+
+            if df.empty:
+                print(row["Street / Block Name"])
+                print(house_number)
+                print(row["Post Code"])
+                raise ValueError("Investigate")
+
            if df.shape[0] != 1:
                df = df[df["HouseNo"].astype(str) == str(house_number)]
                if df.shape[0] != 1:
                    df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())]
+
+                    full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + row[
+                        "Town/Area"].lower().strip() + row["Post Code"].lower().strip()
+                    # Remove any spaces from the full key
+                    full_key = full_key.replace(" ", "")
+
+                    match_to = df["matching_address"].tolist()
+                    # Strip out punctuation and spaces
+                    match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
+                    match_to = [x.replace(" ", "") for x in match_to]
+
+                    # Perform matching between full key and match_to
+                    distances = [Levenshtein.distance(full_key, s) for s in match_to]
+                    best_match_index = distances.index(min(distances))
+                    # We might want to apply a threshold to the distance; for the moment we do not,
+                    # so the candidate with the smallest distance is always taken.
+                    df = df.iloc[best_match_index:best_match_index + 1]
+
                    if df.shape[0] != 1:
                        postcode_lower = row["Post Code"].lower()
                        if postcode_lower in missed_postcodes:
@ -510,6 +571,9 @@ class DataLoader:

        matching_lookup = pd.DataFrame(matching_lookup)

+        if matching_lookup.shape[0] != survey_list.shape[0]:
+            raise ValueError("Mismatch in the number of survey rows and matching lookup rows")
+
        # Merge onto the survey list
        survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id")