mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Trying to handle streetname extraction and edge case in ciga matching
This commit is contained in:
parent
d3bff08df8
commit
c6daf52046
1 changed files with 143 additions and 49 deletions
|
|
@ -1,4 +1,5 @@
|
|||
import os
|
||||
import re
|
||||
import openpyxl
|
||||
from pathlib import Path
|
||||
import msgpack
|
||||
|
|
@ -36,6 +37,10 @@ class DataLoader:
|
|||
}
|
||||
}
|
||||
|
||||
UNMATCHED_CIGA = {
|
||||
"HA14": 6
|
||||
}
|
||||
|
||||
def __init__(self, directories, use_cache):
|
||||
self.directories = directories
|
||||
self.use_cache = use_cache
|
||||
|
|
@ -101,6 +106,9 @@ class DataLoader:
|
|||
else:
|
||||
split_addresses = asset_list['matching_address'].str.split(',', expand=True)
|
||||
house_numbers = split_addresses[0].str.split(' ', expand=True)
|
||||
# If we have "flat" or "valley" as the house number, then the house number is actually in the second column
|
||||
house_numbers[0] = np.where(house_numbers[0].isin(["flat", "valley"]), house_numbers[1], house_numbers[0])
|
||||
|
||||
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
|
||||
# many columns there might be
|
||||
house_numbers = house_numbers.iloc[:, 0:1]
|
||||
|
|
@ -117,7 +125,7 @@ class DataLoader:
|
|||
:return:
|
||||
"""
|
||||
|
||||
if ha_name in ["HA6"]:
|
||||
if ha_name in ["HA6", "HA14"]:
|
||||
split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
|
||||
house_numbers = split_addresses[0].str.split(' ', expand=True)
|
||||
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
|
||||
|
|
@ -132,16 +140,23 @@ class DataLoader:
|
|||
return ciga_list
|
||||
|
||||
@staticmethod
|
||||
def get_sheetname(workbook):
|
||||
def get_asset_sheetname(workbook):
    """
    Pick the name of the worksheet that holds the asset list.

    :param workbook: An object exposing a ``sheetnames`` collection.
    :return: "Asset List" when that sheet is present, otherwise "Assets".
    """
    preferred_name = "Asset List"
    return preferred_name if preferred_name in workbook.sheetnames else "Assets"
|
||||
|
||||
@staticmethod
|
||||
def get_ciga_sheetname(workbook):
    """
    Pick the name of the worksheet that holds the CIGA checks.

    :param workbook: An object exposing a ``sheetnames`` collection.
    :return: "CIGA Checks" when that sheet is present, otherwise "CIGA".
    """
    preferred_name = "CIGA Checks"
    return preferred_name if preferred_name in workbook.sheetnames else "CIGA"
|
||||
|
||||
def load_asset_list(self, filepath, ha_name):
|
||||
workbook = openpyxl.load_workbook(filepath)
|
||||
sheetname = self.get_sheetname(workbook)
|
||||
asset_sheet = workbook[sheetname]
|
||||
asset_sheetname = self.get_asset_sheetname(workbook)
|
||||
asset_sheet = workbook[asset_sheetname]
|
||||
asset_sheet_colnames = [cell.value for cell in asset_sheet[1]]
|
||||
|
||||
rows_data = []
|
||||
|
|
@ -165,41 +180,46 @@ class DataLoader:
|
|||
|
||||
asset_list = self.append_asset_list_built_form(ha_name=ha_name, asset_list=asset_list)
|
||||
|
||||
# We correct the asset list if it needs it
|
||||
# Correct the asset list
|
||||
correction_function_name = f"correct_{ha_name.lower()}_asset_list"
|
||||
if hasattr(self, correction_function_name):
|
||||
asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list")
|
||||
asset_list = asset_list_correction_function(asset_list)
|
||||
|
||||
# We check if there is a survey list
|
||||
survey_list = pd.DataFrame()
|
||||
if "ECO Surveys" in workbook.sheetnames:
|
||||
survey_sheet = workbook["ECO Surveys"]
|
||||
survey_rows = []
|
||||
for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers
|
||||
row_data = [cell.value for cell in row] # This will get you the cell values
|
||||
survey_rows.append(row_data)
|
||||
survey_sheetname = "ECO Surveys"
|
||||
survey_sheet = workbook[survey_sheetname]
|
||||
survey_rows = []
|
||||
for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers
|
||||
row_data = [cell.value for cell in row] # This will get you the cell values
|
||||
survey_rows.append(row_data)
|
||||
|
||||
survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
|
||||
# Remove columns that are None
|
||||
survey_list = survey_list.loc[:, survey_list.columns.notnull()]
|
||||
survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))]
|
||||
survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
|
||||
# Remove columns that are None
|
||||
survey_list = survey_list.loc[:, survey_list.columns.notnull()]
|
||||
survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))]
|
||||
|
||||
# Perform survey list merge
|
||||
if not survey_list.empty:
|
||||
survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name)
|
||||
# Perform survey list merge
|
||||
if not survey_list.empty:
|
||||
survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name)
|
||||
|
||||
# We check if there are CIGA checks
|
||||
ciga_list = pd.DataFrame()
|
||||
if "CIGA Checks" in workbook.sheetnames:
|
||||
ciga_sheet = workbook["CIGA Checks"]
|
||||
ciga_rows = []
|
||||
for row in ciga_sheet.iter_rows(min_row=2, values_only=False):
|
||||
row_data = [cell.value for cell in row] # This will get you the cell values
|
||||
ciga_rows.append(row_data)
|
||||
ciga_sheetname = self.get_ciga_sheetname(workbook)
|
||||
ciga_sheet = workbook[ciga_sheetname]
|
||||
ciga_rows = []
|
||||
for row in ciga_sheet.iter_rows(min_row=2, values_only=False):
|
||||
row_data = [cell.value for cell in row] # This will get you the cell values
|
||||
ciga_rows.append(row_data)
|
||||
|
||||
ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]])
|
||||
# Remove columns that are None
|
||||
ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()]
|
||||
survey_list["survey_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(survey_list))]
|
||||
# Perform ciga list merge
|
||||
if not ciga_list.empty:
|
||||
ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list)
|
||||
ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)
|
||||
ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]])
|
||||
# Remove columns that are None
|
||||
ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()]
|
||||
ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
|
||||
# Perform ciga list merge
|
||||
if not ciga_list.empty:
|
||||
ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list)
|
||||
ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)
|
||||
|
||||
return asset_list, survey_list, ciga_list
|
||||
|
||||
|
|
@ -222,6 +242,21 @@ class DataLoader:
|
|||
|
||||
@staticmethod
|
||||
def correct_ha14_asset_list(asset_list):
    """
    Apply the known manual correction to the HA14 asset list, in place.

    The row whose "Address 1" is "5 Queens Court" and whose stripped "Postcode" is "DE72 3NP" gets its
    "matching_postcode" and "matching_address" overwritten with the DE72 3QZ values.

    :param asset_list: DataFrame with "Address 1" and "Postcode" columns.
    :return: The same DataFrame object, after modification.
    """
    # The selection depends only on the source columns, so one mask serves both assignments
    is_queens_court = asset_list["Address 1"] == "5 Queens Court"
    has_listed_postcode = asset_list["Postcode"].str.strip() == "DE72 3NP"
    rows_to_fix = is_queens_court & has_listed_postcode

    # For 5 Queens Court, DE72 3NP, the postcode is actually DE72 3QZ
    asset_list.loc[rows_to_fix, "matching_postcode"] = "DE72 3QZ"

    # The matching address is corrected to agree with the new postcode
    asset_list.loc[rows_to_fix, "matching_address"] = "5 queens court, garfield avenue, draycott, derby, de72 3qz"

    return asset_list
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -363,13 +398,22 @@ class DataLoader:
|
|||
"Oiliver Road", "Oliver Road"
|
||||
)
|
||||
|
||||
# For postcodes DE7 4FB, DE7 4EZ, it's actually spelled WINDERMERE AVENUE, not WINDEREMERE AVENUE (without the
|
||||
# extra e)
|
||||
survey_list.loc[
|
||||
(survey_list["Street / Block Name"] == "WINDEREMERE AVENUE") &
|
||||
(survey_list["Post Code"].isin(["DE7 4FB", "DE7 4EZ"])),
|
||||
"Street / Block Name"
|
||||
] = "WINDERMERE AVENUE"
|
||||
|
||||
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
|
||||
"MACDONALD SQAURE", "MACDONALD SQUARE"
|
||||
)
|
||||
|
||||
return survey_list
|
||||
|
||||
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
|
||||
|
||||
# Correct the asset list
|
||||
asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list")
|
||||
asset_list = asset_list_correction_function(asset_list)
|
||||
# Correct the survey list
|
||||
survey_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_survey_list")
|
||||
survey_list = survey_list_correction_function(survey_list)
|
||||
|
|
@ -411,7 +455,7 @@ class DataLoader:
|
|||
|
||||
print(row["Street / Block Name"])
|
||||
print(house_number)
|
||||
print(row["Post Code"].lower())
|
||||
print(row["Post Code"])
|
||||
raise ValueError("Investigate")
|
||||
|
||||
matching_lookup.append(
|
||||
|
|
@ -428,8 +472,38 @@ class DataLoader:
|
|||
|
||||
return survey_list
|
||||
|
||||
@staticmethod
|
||||
def extract_streetname(address, house_number=None, postcode=None):
    """
    Reduce a full address to the lower-cased text of its first comma-separated section, with the house
    number and postcode removed.

    :param address: The full address as a string.
    :param house_number: The house number to remove, as a string or integer. Skipped when None.
    :param postcode: The postcode to remove, as a string. Skipped when None.
    :return: The text before the first comma, lower-cased, with runs of whitespace collapsed to a single
        space and no leading or trailing whitespace.
    """
    # Convert everything to lower case
    address = address.lower()

    if house_number is not None:
        # Escape the value so it is matched literally, the same way the postcode is; an unescaped
        # regex metacharacter would raise re.error or match unintended text
        house_number_pattern = r'\b{}\b'.format(re.escape(str(house_number)))
        address = re.sub(house_number_pattern, '', address, flags=re.IGNORECASE).strip()

    if postcode is not None:
        postcode_pattern = r'\b{}\b'.format(re.escape(postcode))
        address = re.sub(postcode_pattern, '', address, flags=re.IGNORECASE).strip()

    # Keep only the first section before a comma
    address = address.split(",")[0]

    # Collapse repeated whitespace and trim the ends, so that a space left before the comma does not
    # end up in the result
    return re.sub(r'\s+', ' ', address).strip()
|
||||
|
||||
def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name):
|
||||
matching_lookup = []
|
||||
unmatched_addresses = []
|
||||
for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)):
|
||||
|
||||
house_number = row["HouseNo"]
|
||||
|
|
@ -442,22 +516,35 @@ class DataLoader:
|
|||
].copy()
|
||||
|
||||
df = df[df["HouseNo"] == str(house_number)]
|
||||
# For ciga, we skip
|
||||
if df.empty:
|
||||
if row["Matched Postcode"] == "LE3 3EE":
|
||||
dew
|
||||
unmatched_addresses.append(
|
||||
{
|
||||
"ciga_list_row_id": row["ciga_list_row_id"],
|
||||
"HouseNo": house_number,
|
||||
"Matched Postcode": row["Matched Postcode"]
|
||||
}
|
||||
)
|
||||
continue
|
||||
# TODO: Might need to consider street name at some point
|
||||
if df.shape[0] != 1:
|
||||
|
||||
if df.shape[0] != 1:
|
||||
df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower())]
|
||||
if df.shape[0] != 1:
|
||||
postcode_lower = row["Post Code"].lower()
|
||||
# if postcode_lower in missed_postcodes:
|
||||
# matching_lookup.append(
|
||||
# {
|
||||
# "survey_list_row_id": row["survey_list_row_id"],
|
||||
# "asset_list_row_id": None,
|
||||
# }
|
||||
# )
|
||||
# continue
|
||||
# We split house number and postcode out of the matched address for ciga
|
||||
street_name = self.extract_streetname(
|
||||
address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"]
|
||||
)
|
||||
df = df[df["matching_address"].str.contains(street_name)]
|
||||
|
||||
if df.shape[0] != 1:
|
||||
# The final check we do here is to check for the presence of flat in the address
|
||||
if "flat" in row["Matched Address"]:
|
||||
df = df[df["matching_address"].str.contains("flat")]
|
||||
else:
|
||||
df = df[df["matching_address"].str.contains("flat") == False]
|
||||
|
||||
if df.shape[0] != 1:
|
||||
print(row["Street / Block Name"])
|
||||
print(house_number)
|
||||
print(row["Post Code"].lower())
|
||||
|
|
@ -470,6 +557,13 @@ class DataLoader:
|
|||
}
|
||||
)
|
||||
|
||||
# We have an acceptable number of ciga failures for each HA
|
||||
if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
|
||||
raise ValueError(f"Unmatched addresses for {ha_name} is not as expected")
|
||||
|
||||
# In ciga: 35 Valley Drive, Leicester, LE3 3EE
|
||||
#
|
||||
|
||||
matching_lookup = pd.DataFrame(matching_lookup)
|
||||
|
||||
# Merge onto the ciga list
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue