formatting asset list

This commit is contained in:
Khalim Conn-Kowlessar 2023-12-28 10:51:53 +00:00
parent d51e1c913d
commit 0e5c343319

View file

@ -26,33 +26,31 @@ load_dotenv(ENV_FILE)
def load_data():
workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx')
workbook = openpyxl.load_workbook('etl/eligibility/ha_15_32/HESTIA - HA 25 ASSET LIST.xlsx', data_only=True)
sheet = workbook.active
# There are no colnames so we create them ourselves
sheet_colnames = [
"property_reference",
"address",
"tenure",
"property_type",
"unknown1",
"year_built",
"unknown2",
"heating_type",
"wall_type",
"roof_type",
"postcode"
]
rows_data = []
rows_colors = []
for row in sheet.iter_rows(min_row=1, values_only=False): # Assuming the first row is headers
row_data = [cell.value for cell in row] # This will get you the cell values
row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
# row_color = COLOR_INDEX[row_color]
for row in sheet.iter_rows(min_row=1, values_only=True): # use values_only=True to get values
row_data = list(row) # No need for comprehension, values_only=True returns a tuple of values
rows_data.append(row_data)
# Headers are on the final row, so pop them off rows_data and keep them to use as the column names
headers = rows_data.pop()
# The postcode header is None, so we replace it with "postcode"
headers[-1] = "postcode"
# Handle colours separately
for row in sheet.iter_rows(min_row=1, values_only=False):
# Assume first cell color is indicative of entire row
row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
rows_colors.append(row_color)
asset_list = pd.DataFrame(rows_data, columns=sheet_colnames)
# Remove the final row of colours, which is the header
rows_colors.pop()
asset_list = pd.DataFrame(rows_data, columns=headers)
asset_list['row_color'] = rows_colors
asset_list["row_colour_name"] = np.where(
@ -65,6 +63,19 @@ def load_data():
np.where(asset_list["row_colour_name"] == "green", "identified potential eco", "maybe in the future")
)
asset_list["address"] = asset_list["T1_Address"].copy().str.lower()
asset_list["address"] = asset_list["address"].str.replace("flat", "")
asset_list["address"] = asset_list["address"].str.strip()
split_addresses = asset_list['address'].str.split(' ', expand=True)
split_addresses.columns = ['HouseNo', 'address2', 'address3', 'address4', 'address5', 'address6', 'address7',
'address8',
'address9', 'address10', 'address11', 'address12', 'address13']
split_addresses["HouseNo"] = split_addresses["HouseNo"].str.replace(";", "")
# We could re-concatenate but we only care about HouseNo for the moment
asset_list = pd.concat([asset_list, split_addresses[["HouseNo"]]], axis=1)
# We analyse the historical ECO3 survey list
eco3_survey_workbook = openpyxl.load_workbook(f'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx')
eco3_survey_sheet = eco3_survey_workbook["CAVITY"]
@ -154,13 +165,12 @@ def load_data():
house_number = house_number.lower()
# Filter on the first line of the address
df = asset_list[asset_list["Address"].str.lower().str.contains(row["Street / Block Name"].lower())].copy()
# df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
df = df[df["Address"].str.lower().str.contains(str(house_number))]
df = asset_list[asset_list["T1_Address"].str.lower().str.contains(row["ADDRESS 1"].lower())].copy()
df = df[df["T1_Address"].str.lower().str.contains(str(house_number))]
if df.shape[0] != 1:
df = df[df["HouseNo"] == str(house_number)]
if df.shape[0] != 1:
df = df[df["Postcode"].str.lower().str.contains(row["Post Code"].lower())]
df = df[df["postcode"].str.lower().str.contains(row["POSTCODE"].lower())]
if df.shape[0] != 1:
print(row["Street / Block Name"])
print(house_number)