From cf9253d06201bbadca263eef269de973957b9556 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 22 Jan 2024 11:41:52 +0000
Subject: [PATCH] working on matching code for HA6 asset and survey lists

---
 .../ha_15_32/ha_analysis_batch_3.py           | 49 +++++++++++++++++--
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index bd2c6c99..7fbddd54 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -52,7 +52,7 @@ class DataLoader:
 
         rows_data = []
         rows_colors = []
-        for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)):  # Assuming the first row is headers
             row_data = [cell.value for cell in row]  # This will get you the cell values
             row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
             # row_color = COLOR_INDEX[row_color]
@@ -137,7 +137,7 @@ class DataLoader:
         )
 
         # Add in asset_list_row_id
-        survey_list["survey_list_row_id"] = [ha_name + str(i) for i in range(0, len(survey_list))]
+        survey_list["survey_list_row_id"] = [ha_name + "_surveys_" + str(i) for i in range(0, len(survey_list))]
 
         # We now do the matching between the asset list and the survey list.
         # What we'll get from this is a lookup table from the asset list to the survey list
@@ -150,14 +150,53 @@ class DataLoader:
         return survey_list
 
     def merge_ha_6(self, asset_list, survey_list):
-        pass
+        """Build the survey-row to asset-row lookup table for HA 6.
+
+        Each survey row is matched on street name and house number, then narrowed by
+        exact house number and by postcode until exactly one asset row remains.
+
+        :param asset_list: DataFrame with "propertyaddress", "Post Code", "asset_list_row_id"
+        :param survey_list: DataFrame with "NO.", "Street / Block Name", "Post Code",
+            "survey_list_row_id"
+        :return: DataFrame of (survey_list_row_id, asset_list_row_id) pairs
+        :raises ValueError: if a survey row does not resolve to exactly one asset row
+        """
+        # Series has no .strip(); every string op must go through the .str accessor.
+        address = asset_list["propertyaddress"].str.lower().str.strip()
+        asset_list = asset_list.assign(
+            matching_address=address,
+            matching_postcode=asset_list["Post Code"].str.lower().str.strip(),
+            # First word of the first comma-separated part; taking element 0 avoids
+            # assuming a fixed number of commas or words in every address.
+            HouseNo=address.str.split(",").str[0].str.split(" ").str[0],
+        )
+
+        matching_lookup = []
+        for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
+            house_number = str(row["NO."]).lower().strip()
+            street = row["Street / Block Name"].lower().strip()
+            postcode = row["Post Code"].lower().strip()
+
+            # Literal (regex=False) matching so characters such as "(" or "." in the data
+            # are not treated as a pattern; na=False keeps missing addresses out.
+            df = asset_list[address.str.contains(street, regex=False, na=False)]
+            df = df[df["matching_address"].str.contains(house_number, regex=False, na=False)]
+            # Fall back to progressively stricter filters until one candidate is left.
+            if len(df) != 1:
+                df = df[df["HouseNo"] == house_number]
+            if len(df) != 1:
+                df = df[df["matching_postcode"].str.contains(postcode, regex=False, na=False)]
+            if len(df) != 1:
+                raise ValueError(f"Investigate: {house_number!r} {street!r} {postcode!r} -> {len(df)} matches")
+            matching_lookup.append((row["survey_list_row_id"], df["asset_list_row_id"].values[0]))
+        return pd.DataFrame(matching_lookup, columns=["survey_list_row_id", "asset_list_row_id"])
 
     def load(self):
 
         data = {}
         for ha_name, file_config in self.files.items():
             # Load asset list
-            # logger.info("LOading asset list for {}".format(ha_name))
+            logger.info("Loading asset list for {}".format(ha_name))
             asset_list = self.load_asset_list(
                 file_path=file_config["asset_list"]["filepath"],
                 ha_name=ha_name,
@@ -165,6 +204,7 @@ class DataLoader:
             )
 
             if file_config.get("survey_list"):
+                logger.info("Loading survey list for {}".format(ha_name))
                 survey_list = self.load_survey_list(
                     file_path=file_config["survey_list"]["filepath"],
                     ha_name=ha_name,
@@ -209,3 +249,4 @@ def app():
     }
 
     loader = DataLoader(files)
+    loader.load()