From b09bd63b53c8d9b14f11c1c5b7cb38b28c63afbc Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 7 Mar 2024 12:53:25 +0000
Subject: [PATCH] done with ha25 matching for now

---
 .../ha_15_32/ha_analysis_batch_3.py           | 66 +++++++++++--------
 1 file changed, 38 insertions(+), 28 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index a5845990..f0813aef 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -172,7 +172,7 @@ class DataLoader:
     }
 
     UNMATCHED_ECO3 = {
-        "HA25": 94
+        "HA25": 119
     }
 
     def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
@@ -478,7 +478,7 @@ class DataLoader:
         # For HA1 and HA25, there is an exception in the structure of the data. We don't have any survey or ciga
         # lists, and so
         # we can return the asset list now
-        if ha_name in ["HA1", "HA25"]:
+        if ha_name in ["HA1"]:
             return asset_list, pd.DataFrame(), pd.DataFrame()
 
         # If we have ECO3 surveys, we need to match them, because any properties treated under ECO3 won't be
@@ -1460,10 +1460,8 @@ class DataLoader:
 
         return survey_list
 
-    def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
-
-        # May need an eco3 list correction function
-
+    @staticmethod
+    def correct_ha25_eco3_list(eco3_list):
         # NEADS DRIVE, postcode with bs305dt, is not found in the asset list
         eco3_list = eco3_list[
             ~(eco3_list["Post Code"] == "BS305DT")
@@ -1483,6 +1481,29 @@ class DataLoader:
             "Hall Road", "Hall Rd"
         )
 
+        eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
+            "SPRINGFIELD WAY SAINT DAY", "SPRINGFIELD WAY ST DAY"
+        )
+        eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
+            "BOND SPEAR COURT", "BOND-SPEAR COURT"
+        )
+        eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace(
+            "ST.MARYS HILL", "ST MARYS HILL"
+        )
+        # Correct the postcode for Edmund Road: rows listed under TR14 8QJ are moved to TR15 1BY
+        eco3_list["Post Code"] = np.where(
+            (eco3_list["Street / Block Name"] == "EDMUND ROAD") &
+            (eco3_list["Post Code"] == "TR14 8QJ"),
+            "TR15 1BY",
+            eco3_list["Post Code"]
+        )
+        return eco3_list
+
+    def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
+
+        eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list")
+        eco3_list = eco3_list_correction_function(eco3_list)
+
         if ha_name == "HA25":
             missed_postcodes = {
                 postcode.lower() for postcode in eco3_list["Post Code"] if
@@ -1492,8 +1513,9 @@ class DataLoader:
 
         # For the asset list, we create a matching address without any punctuation
         # TODO: We should generally just remove puncutation from addresses when matching
-        asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(r'[^\w\s]', '',
-                                                                                                   regex=True)
+        asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(
+            r'[^\w\s]', '', regex=True
+        )
         # Remove double spaces
         asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace(
             "  ", " "
@@ -1502,6 +1524,8 @@ class DataLoader:
         matching_lookup = []
         missed = []
         for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)):
+            # if row["eco3_list_row_id"] == "HA25_Eco3_5422":
+            #     raise Exception()
             postcode = row["Post Code"].lower().strip()
 
             # df will never be empty, since we've already done a check for common postcodes
@@ -1553,38 +1577,24 @@ class DataLoader:
             )
 
         # We verify the missed
-        # -HA25 contains 88 missed entries. These are actually 8 unique postcodes, where surveys were conducted
-        # on properties that had house numbers outside of the asset list
+        # HA25 contains 119 missed entries across 24 unique postcodes. The majority belong to just 2 postcodes,
+        # where many surveys were conducted on house numbers that are not in the asset list
         if len(missed) != self.UNMATCHED_ECO3[ha_name]:
             raise ValueError(
                 f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched"
             )
 
-        # TODO: 194 missed
-
         matching_lookup = pd.DataFrame(matching_lookup)
         # Check dupes as this will cause problems later on
         if matching_lookup["asset_list_row_id"].duplicated().any():
             raise ValueError("Duplicated asset list row ids")
 
-        missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)]
-        missed_df.head(3).tail(1)["eco3_list_row_id"]
+        # Merge onto eco3 list
+        eco3_list = eco3_list.merge(matching_lookup, how="left", on="eco3_list_row_id")
 
-        duped_ids = matching_lookup[matching_lookup["asset_list_row_id"].duplicated()]["asset_list_row_id"].tolist()
-        duped_df = matching_lookup[
-            matching_lookup["asset_list_row_id"].isin(duped_ids)
-        ]
-        duped_surveys = eco3_list[
-            eco3_list["eco3_list_row_id"].isin(duped_df["eco3_list_row_id"].values)
-        ].copy()
+        asset_list = asset_list.drop(columns=["matching_address_no_punctuation"])
 
-        duped_surveys = duped_surveys.merge(matching_lookup, how="left", on="eco3_list_row_id")
-
-        duped_surveys[
-            ["NO ", "Street / Block Name", "Post Code", "eco3_list_row_id", "asset_list_row_id"]
-        ].sort_values("asset_list_row_id").head()
-
-        asset_list[asset_list["asset_list_row_id"] == "HA2515145"]["matching_address"].values
+        return eco3_list
 
     @staticmethod
     def extract_streetname(address, house_number=None, postcode=None):