diff --git a/etl/eligibility/ha_15_32/ha25_app.py b/etl/eligibility/ha_15_32/ha25_app.py index 244bb5fd..88502e69 100644 --- a/etl/eligibility/ha_15_32/ha25_app.py +++ b/etl/eligibility/ha_15_32/ha25_app.py @@ -26,33 +26,31 @@ load_dotenv(ENV_FILE) def load_data(): - workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx') + workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx', data_only=True) sheet = workbook.active - # There are no colnames so we create them ourselves - sheet_colnames = [ - "property_reference", - "address", - "tenure", - "property_type", - "unknown1", - "year_built", - "unknown2", - "heating_type", - "wall_type", - "roof_type", - "postcode" - ] rows_data = [] rows_colors = [] - for row in sheet.iter_rows(min_row=1, values_only=False): # Assuming the first row is headers - row_data = [cell.value for cell in row] # This will get you the cell values - row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None - # row_color = COLOR_INDEX[row_color] + for row in sheet.iter_rows(min_row=1, values_only=True): # use values_only=True to get values + + row_data = list(row) # No need for comprehension, values_only=True returns a tuple of values rows_data.append(row_data) + + # Headers are on the final row. Pop them off and store them and then remove them from rows_data + headers = rows_data.pop() + # The postcode header is None, so we replace it with "postcode" + headers[-1] = "postcode" + + # Handle colours separately + for row in sheet.iter_rows(min_row=1, values_only=False): + # Assume first cell color is indicative of entire row + row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None rows_colors.append(row_color) - asset_list = pd.DataFrame(rows_data, columns=sheet_colnames) + # Remove the final row of colours, which is the header + rows_colors.pop() + + asset_list = pd.DataFrame(rows_data, columns=headers) asset_list['row_color'] = rows_colors asset_list["row_colour_name"] = np.where( @@ -65,6 +63,19 @@ def load_data(): np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future") ) + asset_list["address"] = asset_list["T1_Address"].copy().str.lower() + asset_list["address"] = asset_list["address"].str.replace("flat", "") + asset_list["address"] = asset_list["address"].str.strip() + + split_addresses = asset_list['address'].str.split(' ', expand=True) + split_addresses.columns = ['HouseNo', 'address2', 'address3', 'address4', 'address5', 'address6', 'address7', + 'address8', + 'address9', 'address10', 'address11', 'address12', 'address13'] + split_addresses["HouseNo"] = split_addresses["HouseNo"].str.replace(";", "") + + # We could re-concatenate but we only care about HouseNo for the moment + asset_list = pd.concat([asset_list, split_addresses[["HouseNo"]]], axis=1) + # We analysis historical ECO3 survey list eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx') eco3_survey_sheet = eco3_survey_workbook["CAVITY"] @@ -154,13 +165,12 @@ def load_data(): house_number = house_number.lower() # Filter on the first line of the address - df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy() - # df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())] - df = df[df["Address"].str.lower().str.contains(str(house_number))] + df = asset_list[asset_list["T1_Address"].str.lower().str.contains(row["ADDRESS 1"].lower())].copy() + df = df[df["T1_Address"].str.lower().str.contains(str(house_number))] if df.shape[0] != 1: df = df[df["HouseNo"] == str(house_number)] if df.shape[0] != 1: - df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())] + df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())] if df.shape[0] != 1: print(row["Street / Block Name"]) print(house_number)