working on loading data for ha25

2026-07-27 23:35:01 +00:00 · 2023-12-27 16:03:31 +00:00 · 2023-12-27 16:03:31 +00:00 · d51e1c913d
commit d51e1c913d
parent f68256ee12
1 changed files with 108 additions and 5 deletions
--- a/etl/eligibility/ha_15_32/ha25_app.py
+++ b/etl/eligibility/ha_15_32/ha25_app.py
@ -28,11 +28,24 @@ load_dotenv(ENV_FILE)
 def load_data():
    workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx')
    sheet = workbook.active
-    sheet_colnames = [cell.value for cell in sheet[1]]
+    # There are no colnames so we create them ourselves
+    sheet_colnames = [
+        "property_reference",
+        "address",
+        "tenure",
+        "property_type",
+        "unknown1",
+        "year_built",
+        "unknown2",
+        "heating_type",
+        "wall_type",
+        "roof_type",
+        "postcode"
+    ]

    rows_data = []
    rows_colors = []
-    for row in sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+    for row in sheet.iter_rows(min_row=1, values_only=False):  # Sheet has no header row, so data starts at row 1
        row_data = [cell.value for cell in row]  # This will get you the cell values
        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
        # row_color = COLOR_INDEX[row_color]
@ -54,8 +67,7 @@ def load_data():

    # We analysis historical ECO3 survey list
    eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
-    dir(eco3_survey_workbook)
-    eco3_survey_sheet = eco3_survey_workbook.active
+    eco3_survey_sheet = eco3_survey_workbook["CAVITY"]

    eco3_survey_rows = []
    eco3_survey_colors = []
@ -72,5 +84,96 @@ def load_data():
    eco3_survey_list["row_colour"] = eco3_survey_colors
    # Remove rows where street name is missing
    eco3_survey_list = eco3_survey_list[~pd.isnull(eco3_survey_list["Street / Block Name"])]
+    # We need to parse the row colours
+    # We have the following mappings:
+    # FF7030A0: purple
+    # FF92D050: green
+    # FFFF0000: red
+    # FFFFFF00: yellow
+    # FF38FD23: green
+    eco3_survey_list["row_colour_name"] = np.where(
+        eco3_survey_list["row_colour"] == "FF7030A0", "purple",
+        np.where(eco3_survey_list["row_colour"] == "FF92D050", "green",
+                 np.where(eco3_survey_list["row_colour"] == "FFFF0000", "red",
+                          np.where(eco3_survey_list["row_colour"] == "FFFFFF00", "yellow",
+                                   np.where(eco3_survey_list["row_colour"] == "FF38FD23", "green", "unknown")
+                                   )
+                          )
+                 )
+    )

-    eco3_survey_list["INSTALLED OR CANCELLED"]
+    # We map the meaning:
+    # red: cancelled
+    # green: installed advised install complete
+    # purple: installer advised install complete + post works EPC
+    # yellow: filler row - drop
+    eco3_survey_list["row_colour_code"] = np.where(
+        eco3_survey_list["row_colour_name"] == "red", "cancelled",
+        np.where(eco3_survey_list["row_colour_name"] == "green", "installed advised install complete",
+                 np.where(eco3_survey_list["row_colour_name"] == "purple",
+                          "installer advised install complete + post works EPC",
+                          np.where(eco3_survey_list["row_colour_name"] == "yellow", "filler row - drop", "unknown")
+                          )
+                 )
+    )
+
+    # This is good enough for the indicative cancellation rates
+
+    # We now read in the indicative survey list which identified prospects for ECO4 works
+    eco4_survey_workbook = openpyxl.load_workbook(
+        f'etl/eligibility/ha_15_32/HESTIA - HA 25 ADHOC ISOLATED IDENTIFIED PROPERTIES FOR CWI.xlsx'
+    )
+    eco4_prospect_survey_sheet = eco4_survey_workbook["LiveWest"]
+
+    eco4_prospects_survey_rows = []
+    eco4_prospects_survey_colors = []
+
+    for row in eco4_prospect_survey_sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        row_data = [cell.value for cell in row]  # This will get you the cell values
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+        # row_color = COLOR_INDEX[row_color]
+        eco4_prospects_survey_rows.append(row_data)
+        eco4_prospects_survey_colors.append(row_color)
+
+    # Build a DataFrame of the ECO4 prospect survey rows, taking column names from the sheet's first row
+    eco4_prospects_survey_list = pd.DataFrame(
+        eco4_prospects_survey_rows, columns=[cell.value for cell in eco4_prospect_survey_sheet[1]]
+    )
+    eco4_prospects_survey_list["row_colour"] = eco4_prospects_survey_colors
+
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.lower()
+    eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.strip()
+
+    eco4_prospects_survey_list = eco4_prospects_survey_list[~pd.isnull(eco4_prospects_survey_list["ADDRESS 1"])]
+    eco4_prospects_survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(eco4_prospects_survey_list))]
+
+    matched = []
+    for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)):
+        house_number = row["NO"]
+        if isinstance(house_number, str):
+            house_number = house_number.lower()
+
+        # Filter on the first line of the address
+        df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy()
+        # df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+        df = df[df["Address"].str.lower().str.contains(str(house_number))]
+        if df.shape[0] != 1:
+            df = df[df["HouseNo"] == str(house_number)]
+            if df.shape[0] != 1:
+                df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
+                if df.shape[0] != 1:
+                    print(row["Street / Block Name"])
+                    print(house_number)
+                    print(row["Post Code"].lower())
+                    raise ValueError("Investigate")
+
+        matched.append(
+            {
+                "survey_key": row["survey_key"],
+                "matched_address": df["Address"].values[0],
+                "survey_house_no": row["NO."],
+                "survey_street_name": row["Street / Block Name"],
+                "survey_postcode": row["Post Code"],
+                "survey_status": row["INSTALLED OR CANCELLED"]
+            }
+        )