From 022244377d36557f83081e505b8068ab2bd98004 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 7 Mar 2024 12:26:16 +0000
Subject: [PATCH] working on fixing missed matches in eco3 matching

---
 .../ha_15_32/ha_analysis_batch_3.py           | 84 +++++++++++++++----
 1 file changed, 66 insertions(+), 18 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index ea5b0456..a5845990 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -171,6 +171,10 @@ class DataLoader:
         "HA107": 51,
     }
 
+    UNMATCHED_ECO3 = {
+        "HA25": 94
+    }
+
     def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
         self.directories = directories
         self.use_cache = use_cache
@@ -1458,9 +1462,6 @@ class DataLoader:
 
     def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
 
-        # We add on a matching postcode without spaces for this
-        # asset_list["matching_postcode_no_space"] = asset_list["matching_postcode"].str.lower().str.replace(" ", "")
-
         # May need an eco3 list correction function
 
         # NEADS DRIVE, postcode with bs305dt, is not found in the asset list
@@ -1471,8 +1472,17 @@ class DataLoader:
         eco3_list = eco3_list[
             ~pd.isnull(eco3_list["Post Code"])
         ]
+        # We have a bunch of genuine duplicates
+        eco3_list = eco3_list.drop_duplicates(["NO ", "Street / Block Name", "Post Code"])
+
+        eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
+            "HALWILL MEADOOW", "HALWILL MEADOW"
+        )
+
+        eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
+            "Hall Road", "Hall Rd"
+        )
 
-        missed_postcodes = []
         if ha_name == "HA25":
             missed_postcodes = {
                 postcode.lower() for postcode in eco3_list["Post Code"] if
@@ -1480,10 +1490,18 @@ class DataLoader:
             }
             eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)]
 
+        # For the asset list, we create a matching address without any punctuation
+        # TODO: We should generally just remove punctuation from addresses when matching
+        asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(r'[^\w\s]', '',
+                                                                                                   regex=True)
+        # Remove double spaces
+        asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace(
+            "  ", " "
+        )
+
         matching_lookup = []
         missed = []
         for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)):
-
             postcode = row["Post Code"].lower().strip()
 
             # df will never be empty, since we've already done a check for common postcodes
@@ -1507,24 +1525,20 @@ class DataLoader:
                 if " " in str(house_number):
                     house_number = house_number.split(" ")[0].strip()
 
-            df = df[df["matching_address"].str.contains(str(house_number))]
+            # We must do the house number filter
+            df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
+
+            # Perform a search on streetname
+            # We do this to prevent duplicate matches to properties with the same postcode and house number,
+            # but different streets
+            street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0]
+            street_name_section1 = re.sub(r'[^\w\s]', '', street_name_section1)
+            df = df[df["matching_address_no_punctuation"].str.contains(street_name_section1)]
 
             if df.empty:
                 missed.append(row["eco3_list_row_id"])
                 continue
 
-            if df.shape[0] != 1:
-                df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
-
-            if df.empty:
-                missed.append(row["eco3_list_row_id"])
-                continue
-
-            if df.shape[0] != 1:
-                # Perform a search on streetname
-                street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0]
-                df = df[df["matching_address"].str.contains(street_name_section1)]
-
             if df.shape[0] != 1:
                 print(row["Street / Block Name"])
                 print(house_number)
@@ -1538,6 +1552,40 @@ class DataLoader:
                 }
             )
 
+        # Verify that the number of missed (unmatched) ECO3 entries is as expected
+        # - HA25 contains 88 missed entries. These are actually 8 unique postcodes, where surveys were conducted
+        #   on properties that had house numbers outside of the asset list
+        if len(missed) != self.UNMATCHED_ECO3[ha_name]:
+            raise ValueError(
+                f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched"
+            )
+
+        # TODO: 194 missed
+
+        matching_lookup = pd.DataFrame(matching_lookup)
+        # Check dupes as this will cause problems later on
+        if matching_lookup["asset_list_row_id"].duplicated().any():
+            raise ValueError("Duplicated asset list row ids")
+
+        missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)]
+        missed_df.head(3).tail(1)["eco3_list_row_id"]
+
+        duped_ids = matching_lookup[matching_lookup["asset_list_row_id"].duplicated()]["asset_list_row_id"].tolist()
+        duped_df = matching_lookup[
+            matching_lookup["asset_list_row_id"].isin(duped_ids)
+        ]
+        duped_surveys = eco3_list[
+            eco3_list["eco3_list_row_id"].isin(duped_df["eco3_list_row_id"].values)
+        ].copy()
+
+        duped_surveys = duped_surveys.merge(matching_lookup, how="left", on="eco3_list_row_id")
+
+        duped_surveys[
+            ["NO ", "Street / Block Name", "Post Code", "eco3_list_row_id", "asset_list_row_id"]
+        ].sort_values("asset_list_row_id").head()
+
+        asset_list[asset_list["asset_list_row_id"] == "HA2515145"]["matching_address"].values
+
     @staticmethod
     def extract_streetname(address, house_number=None, postcode=None):
         """