Trying to handle streetname extraction and edge case in ciga matching

This commit is contained in:
Khalim Conn-Kowlessar 2024-02-22 16:00:23 +00:00
parent d3bff08df8
commit c6daf52046

View file

@ -1,4 +1,5 @@
import os
import re
import openpyxl
from pathlib import Path
import msgpack
@ -36,6 +37,10 @@ class DataLoader:
}
}
UNMATCHED_CIGA = {
"HA14": 6
}
def __init__(self, directories, use_cache):
self.directories = directories
self.use_cache = use_cache
@ -101,6 +106,9 @@ class DataLoader:
else:
split_addresses = asset_list['matching_address'].str.split(',', expand=True)
house_numbers = split_addresses[0].str.split(' ', expand=True)
# If we have "flat" or "valley" as the house number, then the house number is actually in the second column
house_numbers[0] = np.where(house_numbers[0].isin(["flat", "valley"]), house_numbers[1], house_numbers[0])
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
# many columns there might be
house_numbers = house_numbers.iloc[:, 0:1]
@ -117,7 +125,7 @@ class DataLoader:
:return:
"""
if ha_name in ["HA6"]:
if ha_name in ["HA6", "HA14"]:
split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
house_numbers = split_addresses[0].str.split(' ', expand=True)
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
@ -132,16 +140,23 @@ class DataLoader:
return ciga_list
@staticmethod
def get_sheetname(workbook):
def get_asset_sheetname(workbook):
    """
    Choose which worksheet name to read the asset list from.

    :param workbook: Object exposing a ``sheetnames`` collection (the caller passes an openpyxl workbook).
    :return: "Asset List" when a sheet with that name is present, otherwise "Assets". The fallback name is
        returned without checking that such a sheet exists.
    """
    return "Asset List" if "Asset List" in workbook.sheetnames else "Assets"
@staticmethod
def get_ciga_sheetname(workbook):
if "CIGA Checks" in workbook.sheetnames:
return "CIGA Checks"
else:
return "CIGA"
def load_asset_list(self, filepath, ha_name):
workbook = openpyxl.load_workbook(filepath)
sheetname = self.get_sheetname(workbook)
asset_sheet = workbook[sheetname]
asset_sheetname = self.get_asset_sheetname(workbook)
asset_sheet = workbook[asset_sheetname]
asset_sheet_colnames = [cell.value for cell in asset_sheet[1]]
rows_data = []
@ -165,41 +180,46 @@ class DataLoader:
asset_list = self.append_asset_list_built_form(ha_name=ha_name, asset_list=asset_list)
# We correct the asset list if it needs it
# Correct the asset list
correction_function_name = f"correct_{ha_name.lower()}_asset_list"
if hasattr(self, correction_function_name):
asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list")
asset_list = asset_list_correction_function(asset_list)
# We check if there is a survey list
survey_list = pd.DataFrame()
if "ECO Surveys" in workbook.sheetnames:
survey_sheet = workbook["ECO Surveys"]
survey_rows = []
for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers
row_data = [cell.value for cell in row] # This will get you the cell values
survey_rows.append(row_data)
survey_sheetname = "ECO Surveys"
survey_sheet = workbook[survey_sheetname]
survey_rows = []
for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers
row_data = [cell.value for cell in row] # This will get you the cell values
survey_rows.append(row_data)
survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
# Remove columns that are None
survey_list = survey_list.loc[:, survey_list.columns.notnull()]
survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))]
survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
# Remove columns that are None
survey_list = survey_list.loc[:, survey_list.columns.notnull()]
survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))]
# Perform survey list merge
if not survey_list.empty:
survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name)
# Perform survey list merge
if not survey_list.empty:
survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name)
# We check if there are CIGA checks
ciga_list = pd.DataFrame()
if "CIGA Checks" in workbook.sheetnames:
ciga_sheet = workbook["CIGA Checks"]
ciga_rows = []
for row in ciga_sheet.iter_rows(min_row=2, values_only=False):
row_data = [cell.value for cell in row] # This will get you the cell values
ciga_rows.append(row_data)
ciga_sheetname = self.get_ciga_sheetname(workbook)
ciga_sheet = workbook[ciga_sheetname]
ciga_rows = []
for row in ciga_sheet.iter_rows(min_row=2, values_only=False):
row_data = [cell.value for cell in row] # This will get you the cell values
ciga_rows.append(row_data)
ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]])
# Remove columns that are None
ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()]
survey_list["survey_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(survey_list))]
# Perform ciga list merge
if not ciga_list.empty:
ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list)
ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)
ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]])
# Remove columns that are None
ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()]
ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
# Perform ciga list merge
if not ciga_list.empty:
ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list)
ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)
return asset_list, survey_list, ciga_list
@ -222,6 +242,21 @@ class DataLoader:
@staticmethod
def correct_ha14_asset_list(asset_list):
# For 5 Queens Court, DE72 3NP, the postcode is actually DE72 3QZ
asset_list.loc[
(asset_list["Address 1"] == "5 Queens Court") &
(asset_list["Postcode"].str.strip() == "DE72 3NP"),
"matching_postcode"
] = "DE72 3QZ"
# We then correct the matching_address
asset_list.loc[
(asset_list["Address 1"] == "5 Queens Court") &
(asset_list["Postcode"].str.strip() == "DE72 3NP"),
"matching_address"
] = "5 queens court, garfield avenue, draycott, derby, de72 3qz"
return asset_list
@staticmethod
@ -363,13 +398,22 @@ class DataLoader:
"Oiliver Road", "Oliver Road"
)
# For postcodes DE7 4FB and DE7 4EZ, the street is actually spelled WINDERMERE AVENUE, not WINDEREMERE AVENUE
# (i.e. without the extra E)
survey_list.loc[
(survey_list["Street / Block Name"] == "WINDEREMERE AVENUE") &
(survey_list["Post Code"].isin(["DE7 4FB", "DE7 4EZ"])),
"Street / Block Name"
] = "WINDERMERE AVENUE"
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"MACDONALD SQAURE", "MACDONALD SQUARE"
)
return survey_list
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
# Correct the asset list
asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list")
asset_list = asset_list_correction_function(asset_list)
# Correct the survey list
survey_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_survey_list")
survey_list = survey_list_correction_function(survey_list)
@ -411,7 +455,7 @@ class DataLoader:
print(row["Street / Block Name"])
print(house_number)
print(row["Post Code"].lower())
print(row["Post Code"])
raise ValueError("Investigate")
matching_lookup.append(
@ -428,8 +472,38 @@ class DataLoader:
return survey_list
@staticmethod
def extract_streetname(address, house_number=None, postcode=None):
"""
Cleans an address by removing the house number and postcode, and converts everything to lower case.
:param address: The full address as a string.
:param house_number: The house number to remove, as a string or integer.
:param postcode: The postcode to remove, as a string.
:return: The cleaned address.
"""
# Convert everything to lower case
address = address.lower()
if house_number is not None:
# Remove the house number
address = re.sub(r'\b{}\b'.format(house_number), '', address, flags=re.IGNORECASE).strip()
if postcode is not None:
# Remove the postcode
address = re.sub(r'\b{}\b'.format(re.escape(postcode)), '', address, flags=re.IGNORECASE).strip()
# Get first section before a comma
address = address.split(",")[0]
# Additional cleaning to remove extra spaces and commas left over
address = re.sub(r'\s+', ' ', address) # Replace multiple spaces with a single space
address = re.sub(r'\s*,\s*', ', ', address) # Clean up space around commas
return address
def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name):
matching_lookup = []
unmatched_addresses = []
for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)):
house_number = row["HouseNo"]
@ -442,22 +516,35 @@ class DataLoader:
].copy()
df = df[df["HouseNo"] == str(house_number)]
# For ciga, we skip
if df.empty:
if row["Matched Postcode"] == "LE3 3EE":
dew
unmatched_addresses.append(
{
"ciga_list_row_id": row["ciga_list_row_id"],
"HouseNo": house_number,
"Matched Postcode": row["Matched Postcode"]
}
)
continue
# TODO: Might need to consider street name at some point
if df.shape[0] != 1:
if df.shape[0] != 1:
df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower())]
if df.shape[0] != 1:
postcode_lower = row["Post Code"].lower()
# if postcode_lower in missed_postcodes:
# matching_lookup.append(
# {
# "survey_list_row_id": row["survey_list_row_id"],
# "asset_list_row_id": None,
# }
# )
# continue
# We split house number and postcode out of the matched address for ciga
street_name = self.extract_streetname(
address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"]
)
df = df[df["matching_address"].str.contains(street_name)]
if df.shape[0] != 1:
# The final check we do here is to check for the presence of flat in the address
if "flat" in row["Matched Address"]:
df = df[df["matching_address"].str.contains("flat")]
else:
df = df[df["matching_address"].str.contains("flat") == False]
if df.shape[0] != 1:
print(row["Street / Block Name"])
print(house_number)
print(row["Post Code"].lower())
@ -470,6 +557,13 @@ class DataLoader:
}
)
# We have an acceptable number of ciga failures for each HA
if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
raise ValueError(f"Unmatched addresses for {ha_name} is not as expected")
# In ciga: 35 Valley Drive, Leicester, LE3 3EE
#
matching_lookup = pd.DataFrame(matching_lookup)
# Merge onto the ciga list