formatting asset list

2026-06-08 11:17:27 +00:00 · 2023-12-28 10:51:53 +00:00 · 2023-12-28 10:51:53 +00:00 · 0e5c343319
commit 0e5c343319
parent d51e1c913d
1 changed files with 34 additions and 24 deletions
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@@ -26,33 +26,31 @@ load_dotenv(ENV_FILE)


 def load_data():
-    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx')
+    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx', data_only=True)
    sheet = workbook.active
-    # There are no colnames so we create them ourselves
-    sheet_colnames = [
-        "property_reference",
-        "address",
-        "tenure",
-        "property_type",
-        "unknown1",
-        "year_built",
-        "unknown2",
-        "heating_type",
-        "wall_type",
-        "roof_type",
-        "postcode"
-    ]

    rows_data = []
    rows_colors = []
-    for row in sheet.iter_rows(min_row=1, values_only=False):  # Assuming the first row is headers
-        row_data = [cell.value for cell in row]  # This will get you the cell values
-        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
-        # row_color = COLOR_INDEX[row_color]
+    for row in sheet.iter_rows(min_row=1, values_only=True):  # use values_only=True to get values
+
+        row_data = list(row)  # No need for comprehension, values_only=True returns a tuple of values
        rows_data.append(row_data)
+
+    # Headers are on the final row. Pop them off and store them and then remove them from rows_data
+    headers = rows_data.pop()
+    # The postcode header is None, so we replace it with "postcode"
+    headers[-1] = "postcode"
+
+    # Handle colours separately
+    for row in sheet.iter_rows(min_row=1, values_only=False):
+        # Assume first cell color is indicative of entire row
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
        rows_colors.append(row_color)

-    asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
+    # Remove the final row of colours, which is the header
+    rows_colors.pop()
+
+    asset_list = pd.DataFrame(rows_data, columns=headers)
    asset_list['row_color'] = rows_colors

    asset_list["row_colour_name"] = np.where(
@@ -65,6 +63,19 @@ def load_data():
        np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
    )

+    asset_list["address"] = asset_list["T1_Address"].copy().str.lower()
+    asset_list["address"] = asset_list["address"].str.replace("flat", "")
+    asset_list["address"] = asset_list["address"].str.strip()
+
+    split_addresses = asset_list['address'].str.split(' ', expand=True)
+    split_addresses.columns = ['HouseNo', 'address2', 'address3', 'address4', 'address5', 'address6', 'address7',
+                               'address8',
+                               'address9', 'address10', 'address11', 'address12', 'address13']
+    split_addresses["HouseNo"] = split_addresses["HouseNo"].str.replace(";", "")
+
+    # We could re-concatenate but we only care about HouseNo for the moment
+    asset_list = pd.concat([asset_list, split_addresses[["HouseNo"]]], axis=1)
+
    # We analysis historical ECO3 survey list
    eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
    eco3_survey_sheet = eco3_survey_workbook["CAVITY"]
@@ -154,13 +165,12 @@ def load_data():
            house_number = house_number.lower()

        # Filter on the first line of the address
-        df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy()
-        # df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
-        df = df[df["Address"].str.lower().str.contains(str(house_number))]
+        df = asset_list[asset_list["T1_Address"].str.lower().str.contains(row["ADDRESS 1"].lower())].copy()
+        df = df[df["T1_Address"].str.lower().str.contains(str(house_number))]
        if df.shape[0] != 1:
            df = df[df["HouseNo"] == str(house_number)]
            if df.shape[0] != 1:
-                df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+                df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())]
                if df.shape[0] != 1:
                    print(row["Street / Block Name"])
                    print(house_number)