diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py index 473ba9de..244bb5fd 100644 --- a/etl/eligibility/ha_15_32/ha25_app.py +++ b/etl/eligibility/ha_15_32/ha25_app.py @@ -28,11 +28,24 @@ load_dotenv(ENV_FILE) def load_data(): workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx') sheet = workbook.active - sheet_colnames = [cell.value for cell in sheet[1]] + # There are no colnames so we create them ourselves + sheet_colnames = [ + "property_reference", + "address", + "tenure", + "property_type", + "unknown1", + "year_built", + "unknown2", + "heating_type", + "wall_type", + "roof_type", + "postcode" + ] rows_data = [] rows_colors = [] - for row in sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + for row in sheet.iter_rows(min_row=1, values_only=False): # The sheet has no header row, so data starts at row 1 row_data = [cell.value for cell in row] # This will get you the cell values row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None # row_color = COLOR_INDEX[row_color] @@ -54,8 +67,7 @@ def load_data(): # We analysis historical ECO3 survey list eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx') - dir(eco3_survey_workbook) - eco3_survey_sheet = eco3_survey_workbook.active + eco3_survey_sheet = eco3_survey_workbook["CAVITY"] eco3_survey_rows = [] eco3_survey_colors = [] @@ -72,5 +84,96 @@ def load_data(): eco3_survey_list["row_colour"] = eco3_survey_colors # Remove rows where street name is missing eco3_survey_list = eco3_survey_list[~pd.isnull(eco3_survey_list["Street / Block Name"])] + # We need to parse the row colours + # We have the following mappings: + # FF7030A0: purple + # FF92D050: green + # FFFF0000: red + # FFFFFF00: yellow + # FF38FD23: green + eco3_survey_list["row_colour_name"] = np.where( + eco3_survey_list["row_colour"] == "FF7030A0", "purple", 
np.where(eco3_survey_list["row_colour"] == "FF92D050", "green", + np.where(eco3_survey_list["row_colour"] == "FFFF0000", "red", + np.where(eco3_survey_list["row_colour"] == "FFFFFF00", "yellow", + np.where(eco3_survey_list["row_colour"] == "FF38FD23", "green", "unknown") + ) + ) + ) + ) - eco3_survey_list["INSTALLED OR CANCELLED"] + # We map the meaning: + # red: cancelled + # green: installed advised install complete + # purple: installer advised install complete + post works EPC + # yellow: filler row - drop + eco3_survey_list["row_colour_code"] = np.where( + eco3_survey_list["row_colour_name"] == "red", "cancelled", + np.where(eco3_survey_list["row_colour_name"] == "green", "installed advised install complete", + np.where(eco3_survey_list["row_colour_name"] == "purple", + "installer advised install complete + post works EPC", + np.where(eco3_survey_list["row_colour_name"] == "yellow", "filler row - drop", "unknown") + ) + ) + ) + + # This is good enough for the indicative cancellation rates + + # We now read in the indicative survey list which identified prospects for ECO4 works + eco4_survey_workbook = openpyxl.load_workbook( + f'etl/eligibility/ha_15_32/HESTIA - HA 25 ADHOC ISOLATED IDENTIFIED PROPERTIES FOR CWI.xlsx' + ) + eco4_prospect_survey_sheet = eco4_survey_workbook["LiveWest"] + + eco4_prospects_survey_rows = [] + eco4_prospects_survey_colors = [] + + for row in eco4_prospect_survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None + # row_color = COLOR_INDEX[row_color] + eco4_prospects_survey_rows.append(row_data) + eco4_prospects_survey_colors.append(row_color) + + # Some adhoc analysis on the eco3 survey list, just to get completion and cancellation rates historically + eco4_prospects_survey_list = pd.DataFrame( + eco4_prospects_survey_rows, 
columns=[cell.value for cell in eco4_prospect_survey_sheet[1]] + ) + eco4_prospects_survey_list["row_colour"] = eco4_prospects_survey_colors + + eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.lower() + eco4_prospects_survey_list["ADDRESS 1"] = eco4_prospects_survey_list["ADDRESS 1"].str.strip() + + eco4_prospects_survey_list = eco4_prospects_survey_list[~pd.isnull(eco4_prospects_survey_list["ADDRESS 1"])] + eco4_prospects_survey_list["survey_key"] = ["survey_" + str(i) for i in range(0, len(eco4_prospects_survey_list))] + + matched = [] + for _, row in tqdm(eco4_prospects_survey_list.iterrows(), total=len(eco4_prospects_survey_list)): + house_number = row["NO"] + if isinstance(house_number, str): + house_number = house_number.lower() + + # Filter on the first line of the address + df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy() + # df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())] + df = df[df["Address"].str.lower().str.contains(str(house_number))] + if df.shape[0] != 1: + df = df[df["HouseNo"] == str(house_number)] + if df.shape[0] != 1: + df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())] + if df.shape[0] != 1: + print(row["Street / Block Name"]) + print(house_number) + print(row["Post Code"].lower()) + raise ValueError("Investigate") + + matched.append( + { + "survey_key": row["survey_key"], + "matched_address": df["Address"].values[0], + "survey_house_no": row["NO."], + "survey_street_name": row["Street / Block Name"], + "survey_postcode": row["Post Code"], + "survey_status": row["INSTALLED OR CANCELLED"] + } + )