29% through matching eco3 ha25

This commit is contained in:
Khalim Conn-Kowlessar 2024-03-07 10:42:51 +00:00
parent 067a66c1b1
commit 5c3f6320dd

View file

@ -183,7 +183,7 @@ class DataLoader:
def create_asset_list_matching_address(self, ha_name, asset_list):
if ha_name in ["HA1", "HA6", "HA16", "HA24", "HA25"]:
if ha_name in ["HA1", "HA6", "HA16", "HA24"]:
asset_list["matching_address"] = asset_list[
self.COLUMN_CONFIG[ha_name]["address"]
].astype(str).str.lower().str.strip()
@ -214,6 +214,14 @@ class DataLoader:
asset_list["Postcode"].astype(str).str.lower().str.strip()
)
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
elif ha_name == "HA25":
asset_list["matching_address"] = asset_list[
self.COLUMN_CONFIG[ha_name]["address"]
].astype(str).str.lower().str.strip()
asset_list["matching_postcode"] = asset_list['matching_address'].apply(
lambda x: ' '.join(x.split()[-2:]) if pd.notnull(x) else x
)
elif ha_name == "HA28":
asset_list["matching_address"] = (
asset_list["House Number"].astype(str).str.lower().str.strip() + ", " +
@ -352,6 +360,9 @@ class DataLoader:
house_numbers = house_numbers.iloc[:, 0:1]
house_numbers.columns = ['HouseNo']
# Remove trailing punctuation such as , or ;
house_numbers["HouseNo"] = house_numbers["HouseNo"].str.rstrip(',;')
asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
return asset_list
@ -425,27 +436,16 @@ class DataLoader:
workbook = openpyxl.load_workbook(filepath)
asset_sheetname = self.get_asset_sheetname(workbook)
# TODO: TEMP
sheetnames_lower = [x.lower() for x in workbook.sheetnames]
if any("eco3" in x for x in sheetnames_lower):
raise Exception("REMOVE ME")
asset_sheet = workbook[asset_sheetname]
asset_sheet_colnames = [cell.value for cell in asset_sheet[1]]
if ha_name == "HA25":
asset_sheet_colnames[11] = "matching_postcode"
values_only = not ha_name != "HA25"
rows_data = []
if not values_only:
for row in asset_sheet.iter_rows(min_row=2, values_only=values_only):
row_data = [cell.value for cell in row] # This will get you the cell values
rows_data.append(row_data)
else:
for row in asset_sheet.iter_rows(min_row=2, values_only=values_only): # use values_only=True to get values
row_data = list(row) # No need for comprehension, values_only=True returns a tuple of values
rows_data.append(row_data)
for row in asset_sheet.iter_rows(min_row=2, values_only=False):
row_data = [cell.value for cell in row] # This will get you the cell values
rows_data.append(row_data)
asset_list = pd.DataFrame(rows_data, columns=asset_sheet_colnames)
@ -477,6 +477,29 @@ class DataLoader:
if ha_name in ["HA1", "HA25"]:
return asset_list, pd.DataFrame(), pd.DataFrame()
# If we have ECO3 surveys, we need to match them, because any properties treated under ECO3 won't be
# suitable under ECO4, since their walls will be filled
eco3_list = pd.DataFrame()
sheetnames_lower = [x.lower() for x in workbook.sheetnames]
eco3_sheetname_index = [i for i, x in enumerate(sheetnames_lower) if "eco3" in x.replace(" ", "")]
if eco3_sheetname_index:
eco3_sheetname = workbook.sheetnames[eco3_sheetname_index[0]]
eco3_sheet = workbook[eco3_sheetname]
eco3_rows = []
for row in eco3_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers
row_data = [cell.value for cell in row] # This will get you the cell values
eco3_rows.append(row_data)
eco3_list = pd.DataFrame(eco3_rows, columns=[cell.value for cell in eco3_sheet[1]])
# Remove columns that are None
eco3_list = eco3_list.loc[:, eco3_list.columns.notnull()]
# Remove rows that are completely empty
eco3_list = eco3_list.loc[eco3_list.loc[:, eco3_list.columns].notnull().any(axis=1)]
eco3_list["eco3_list_row_id"] = [ha_name + "_Eco3_" + str(i) for i in range(0, len(eco3_list))]
# Perform the eco3 merge
eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name)
# We check if there is a survey list
survey_sheetname = self.get_survey_sheetname(workbook)
survey_sheet = workbook[survey_sheetname]
@ -518,7 +541,7 @@ class DataLoader:
ciga_list = self.dedupe_ciga_list(ciga_list)
ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)
return asset_list, survey_list, ciga_list
return asset_list, survey_list, ciga_list, eco3_list
@staticmethod
def correct_ha6_asset_list(asset_list):
@ -1433,6 +1456,79 @@ class DataLoader:
return survey_list
def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name):
    """
    Match ECO3 rows to asset-list rows by postcode and house number.

    Each ECO3 row is narrowed to the asset rows whose ``matching_postcode``
    contains the row's postcode and whose ``matching_address`` contains the
    row's house number. When several asset rows survive, ``HouseNo`` is used
    as an exact-match tie-breaker.

    :param asset_list: DataFrame read here for the columns
        ``matching_postcode``, ``matching_address``, ``HouseNo`` and
        ``asset_list_row_id``.
    :param eco3_list: DataFrame read here for the columns ``Post Code``,
        ``NO `` (note the trailing space), ``Street / Block Name`` and
        ``eco3_list_row_id``.
    :param ha_name: Housing association identifier; for ``"HA25"`` ECO3 rows
        whose postcode is absent from the asset list are dropped up front.
    :return: The filtered ECO3 DataFrame with an added ``asset_list_row_id``
        column holding the matched asset row id, or NaN where no asset row
        was found.
        NOTE(review): the original body ended without a return even though
        the caller assigns the result to ``eco3_list``; returning the
        annotated list is the assumed intent - confirm.
    :raises ValueError: If an ECO3 row still matches zero or several asset
        rows after the ``HouseNo`` tie-breaker.
    """
    # We add on a matching postcode without spaces for this
    # asset_list["matching_postcode_no_space"] = asset_list["matching_postcode"].str.lower().str.replace(" ", "")

    # May need an eco3 list correction function
    # NEADS DRIVE, postcode with bs305dt, is not found in the asset list
    eco3_list = eco3_list[eco3_list["Post Code"] != "BS305DT"]
    # Drop rows with a missing postcode
    eco3_list = eco3_list[eco3_list["Post Code"].notnull()]

    missed_postcodes = set()
    if ha_name == "HA25":
        # Build the lookup once instead of scanning the column for every row
        known_postcodes = set(asset_list["matching_postcode"].values)
        missed_postcodes = {
            postcode.lower()
            for postcode in eco3_list["Post Code"]
            if postcode.lower() not in known_postcodes
        }

    eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)]

    # eco3_list_row_id -> asset_list_row_id for every row we manage to match.
    # Rows left out of this mapping are the ones with no asset match.
    matched_asset_ids = {}
    for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)):
        postcode = row["Post Code"].lower().strip()
        # Literal (non-regex) matching: postcodes and house numbers are plain
        # text, and na=False keeps missing values out of the boolean mask
        df = asset_list[
            asset_list["matching_postcode"].str.contains(postcode, regex=False, na=False)
        ]

        house_number = row["NO "]
        if isinstance(house_number, str):
            house_number = house_number.lower().strip()

        has_house_number = df["matching_address"].str.contains(
            str(house_number), regex=False, na=False
        ).any()
        if not has_house_number and "flat" in str(house_number):
            # Only strings can contain "flat", so the str methods are safe here
            house_number = house_number.split("flat")[1].strip()
            # We check if we had an instance of flat x, y
            if "," in house_number:
                house_number = house_number.split(",")[0].strip()
            # We may also have a space for an instance of flat x y
            if " " in house_number:
                house_number = house_number.split(" ")[0].strip()

        df = df[df["matching_address"].str.contains(str(house_number), regex=False, na=False)]

        if df.empty:
            # No candidate asset row; the row keeps a NaN asset_list_row_id
            continue

        if df.shape[0] != 1:
            # Several candidates: fall back to an exact house-number comparison
            df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)]
            if df.shape[0] != 1:
                print(row["Street / Block Name"])
                print(house_number)
                print(row["Post Code"])
                raise ValueError("Investigate")

        matched_asset_ids[row["eco3_list_row_id"]] = df["asset_list_row_id"].values[0]

    # Hand the match result back to the caller, which assigns it to eco3_list
    return eco3_list.assign(
        asset_list_row_id=eco3_list["eco3_list_row_id"].map(matched_asset_ids)
    )
@staticmethod
def extract_streetname(address, house_number=None, postcode=None):
"""
@ -4008,11 +4104,13 @@ def app():
# Add in: "HA25"
# TODO: Remove ECO3 sales from HA25
priority_has = [
"HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA28", "HA32", "HA38", "HA39", "HA107",
"HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA20", "HA24", "HA25", "HA28", "HA32", "HA39", "HA107",
]
# Next HAs to do: 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come back on this],
# Then: 28 [DONE],
# 38, 41, 10, 14, 20, 48
# 41, 10, 14 [DONE], 20, 48, 50
# 38[problematic, but no ECO4]
# TODO - do 50 and 25 next
# Filter down the directories to only the priority HAs
directories = [d for d in directories if d.split("/")[2] in priority_has]