29% through matching eco3 ha25

2026-07-27 23:35:01 +00:00 · 2024-03-07 10:42:51 +00:00 · 2024-03-07 10:42:51 +00:00 · 5c3f6320dd
commit 5c3f6320dd
parent 067a66c1b1
1 changed files with 117 additions and 19 deletions
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@ -183,7 +183,7 @@ class DataLoader:

    def create_asset_list_matching_address(self, ha_name, asset_list):

-        if ha_name in ["HA1", "HA6", "HA16", "HA24", "HA25"]:
+        if ha_name in ["HA1", "HA6", "HA16", "HA24"]:
            asset_list["matching_address"] = asset_list[
                self.COLUMN_CONFIG[ha_name]["address"]
            ].astype(str).str.lower().str.strip()
@ -214,6 +214,14 @@ class DataLoader:
                asset_list["Postcode"].astype(str).str.lower().str.strip()
            )
            asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
+        elif ha_name == "HA25":
+            asset_list["matching_address"] = asset_list[
+                self.COLUMN_CONFIG[ha_name]["address"]
+            ].astype(str).str.lower().str.strip()
+
+            asset_list["matching_postcode"] = asset_list['matching_address'].apply(
+                lambda x: ' '.join(x.split()[-2:]) if pd.notnull(x) else x
+            )
        elif ha_name == "HA28":
            asset_list["matching_address"] = (
                asset_list["House Number"].astype(str).str.lower().str.strip() + ", " +
@ -352,6 +360,9 @@ class DataLoader:
            house_numbers = house_numbers.iloc[:, 0:1]
            house_numbers.columns = ['HouseNo']

+            # Remove trailing punctuation such as , or ;
+            house_numbers["HouseNo"] = house_numbers["HouseNo"].str.rstrip(',;')
+
            asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)

        return asset_list
@ -425,27 +436,16 @@ class DataLoader:
        workbook = openpyxl.load_workbook(filepath)
        asset_sheetname = self.get_asset_sheetname(workbook)

-        # TODO: TEMP
-        sheetnames_lower = [x.lower() for x in workbook.sheetnames]
-        if any("eco3" in x for x in sheetnames_lower):
-            raise Exception("REMOVE ME")
-
        asset_sheet = workbook[asset_sheetname]
        asset_sheet_colnames = [cell.value for cell in asset_sheet[1]]
        if ha_name == "HA25":
            asset_sheet_colnames[11] = "matching_postcode"

-        values_only = not ha_name != "HA25"
-
        rows_data = []
-        if not values_only:
-            for row in asset_sheet.iter_rows(min_row=2, values_only=values_only):
-                row_data = [cell.value for cell in row]  # This will get you the cell values
-                rows_data.append(row_data)
-        else:
-            for row in asset_sheet.iter_rows(min_row=2, values_only=values_only):  # use values_only=True to get values
-                row_data = list(row)  # No need for comprehension, values_only=True returns a tuple of values
-                rows_data.append(row_data)
+
+        for row in asset_sheet.iter_rows(min_row=2, values_only=False):
+            row_data = [cell.value for cell in row]  # This will get you the cell values
+            rows_data.append(row_data)

        asset_list = pd.DataFrame(rows_data, columns=asset_sheet_colnames)

@ -477,6 +477,29 @@ class DataLoader:
        if ha_name in ["HA1", "HA25"]:
            return asset_list, pd.DataFrame(), pd.DataFrame()

+        # If we have ECO3 surveys, we need to match them, because any properties treated under ECO3 won't be
+        # suitable under ECO4, since their walls will be filled
+        eco3_list = pd.DataFrame()
+        sheetnames_lower = [x.lower() for x in workbook.sheetnames]
+        eco3_sheetname_index = [i for i, x in enumerate(sheetnames_lower) if "eco3" in x.replace(" ", "")]
+        if eco3_sheetname_index:
+            eco3_sheetname = workbook.sheetnames[eco3_sheetname_index[0]]
+            eco3_sheet = workbook[eco3_sheetname]
+            eco3_rows = []
+            for row in eco3_sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+                row_data = [cell.value for cell in row]  # This will get you the cell values
+                eco3_rows.append(row_data)
+
+            eco3_list = pd.DataFrame(eco3_rows, columns=[cell.value for cell in eco3_sheet[1]])
+            # Remove columns that are None
+            eco3_list = eco3_list.loc[:, eco3_list.columns.notnull()]
+            # Remove rows that are completely empty
+            eco3_list = eco3_list.loc[eco3_list.loc[:, eco3_list.columns].notnull().any(axis=1)]
+            eco3_list["eco3_list_row_id"] = [ha_name + "_Eco3_" + str(i) for i in range(0, len(eco3_list))]
+
+            # Perform the eco3 merge
+            eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name)
+
        # We check if there is a survey list
        survey_sheetname = self.get_survey_sheetname(workbook)
        survey_sheet = workbook[survey_sheetname]
@ -518,7 +541,7 @@ class DataLoader:
            ciga_list = self.dedupe_ciga_list(ciga_list)
            ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)

-        return asset_list, survey_list, ciga_list
+        return asset_list, survey_list, ciga_list, eco3_list

    @staticmethod
    def correct_ha6_asset_list(asset_list):
@ -1433,6 +1456,79 @@ class DataLoader:

        return survey_list

+    def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
+
+        # We add on a matching postcode without spaces for this
+        # asset_list["matching_postcode_no_space"] = asset_list["matching_postcode"].str.lower().str.replace(" ", "")
+
+        # May need an eco3 list correction function
+
+        # NEADS DRIVE (postcode BS305DT) is not present in the asset list, so we exclude it up front
+        eco3_list = eco3_list[
+            ~(eco3_list["Post Code"] == "BS305DT")
+        ]
+        # Drop rows with a missing postcode
+        eco3_list = eco3_list[
+            ~pd.isnull(eco3_list["Post Code"])
+        ]
+
+        missed_postcodes = []
+        if ha_name == "HA25":
+            missed_postcodes = {
+                postcode.lower() for postcode in eco3_list["Post Code"] if
+                postcode.lower() not in asset_list["matching_postcode"].values
+            }
+            eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)]
+
+        matching_lookup = []
+        missed = []
+        for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)):
+
+            postcode = row["Post Code"].lower().strip()
+
+            # For HA25, df will not be empty here, since postcodes missing from the asset list were filtered out above
+            df = asset_list[
+                asset_list["matching_postcode"].str.contains(postcode)
+            ]
+
+            house_number = row["NO "]
+            if isinstance(house_number, str):
+                house_number = house_number.lower().strip()
+
+            if not any(df["matching_address"].str.contains(str(house_number))):
+                if "flat" in str(house_number):
+                    house_number = house_number.split("flat")[1].strip()
+
+                # We check if we had an instance of flat x, y
+                if "," in str(house_number):
+                    house_number = house_number.split(",")[0].strip()
+
+                # We may also have a space for an instance of flat x y
+                if " " in str(house_number):
+                    house_number = house_number.split(" ")[0].strip()
+
+            df = df[df["matching_address"].str.contains(str(house_number))]
+
+            if df.empty:
+                missed.append(row["eco3_list_row_id"])
+                continue
+
+            if df.shape[0] != 1:
+                df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
+
+            if df.shape[0] != 1:
+                print(row["Street / Block Name"])
+                print(house_number)
+                print(row["Post Code"])
+                raise ValueError("Investigate")
+
+            matching_lookup.append(
+                {
+                    "eco3_list_row_id": row["eco3_list_row_id"],
+                    "asset_list_row_id": df["asset_list_row_id"].values[0],
+                }
+            )
+
    @staticmethod
    def extract_streetname(address, house_number=None, postcode=None):
        """
@ -4008,11 +4104,13 @@ def app():
    # Add in: "HA25"
    # TODO: Remove ECO3 sales from HA25
    priority_has = [
-        "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA28", "HA32", "HA38", "HA39", "HA107",
+        "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA20", "HA24", "HA25", "HA28", "HA32", "HA39", "HA107",
    ]
    # Next HAs to do: 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come back on this],
    # Then: 28 [DONE],
-    # 38, 41, 10, 14, 20, 48
+    # 41, 10, 14 [DONE], 20, 48, 50
+    # 38[problematic, but no ECO4]
+    # TODO - do 50 and 25 next
    # Filter down the directories to only the priority HAs
    directories = [d for d in directories if d.split("/")[2] in priority_has]