working on survey match for ha107

This commit is contained in:
Khalim Conn-Kowlessar 2024-02-22 17:46:11 +00:00
parent 75102704cd
commit 32352bbde1

View file

@ -40,7 +40,9 @@ class DataLoader:
UNMATCHED_CIGA = {
# We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not
# the asset list
"HA14": 4
"HA14": 4,
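# There are just too many unmatched addresses here - if we identify some homes that ... (NOTE: this comment was left unfinished; complete the thought or remove the trailing clause)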
"HA6": 117
}
def __init__(self, directories, use_cache):
@ -78,11 +80,11 @@ class DataLoader:
elif ha_name == "HA107":
# Create matching_address by concatenating House No, Street, Town, District, Postcode
asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
asset_list["Street"].str.lower().str.strip() + ", " + \
asset_list["Town"].str.lower().str.strip() + ", " + \
asset_list["District"].str.lower().str.strip() + ", " + \
asset_list["Postcode"].str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
asset_list["Street"].astype(str).str.lower().str.strip() + ", " + \
asset_list["Town"].astype(str).str.lower().str.strip() + ", " + \
asset_list["District"].astype(str).str.lower().str.strip() + ", " + \
asset_list["Postcode"].astype(str).str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
else:
raise NotImplementedError("implement me")
@ -155,6 +157,13 @@ class DataLoader:
else:
return "CIGA"
@staticmethod
def get_survey_sheetname(workbook):
    """Return the name of the ECO survey sheet in ``workbook``.

    Workbooks are not consistent about the capitalisation of this sheet, so
    the lookup is done in three steps:

    1. the two spellings seen so far ("ECO Surveys", then "ECO surveys"),
       matched exactly;
    2. any sheet whose name equals "eco surveys" once surrounding whitespace
       is stripped and case is ignored;
    3. the historical default "ECO surveys", so that a workbook without a
       survey sheet behaves exactly as before for the caller.

    :param workbook: object exposing a ``sheetnames`` iterable of sheet names
        (e.g. an openpyxl workbook).
    :return: the sheet name to use when indexing the workbook.
    """
    sheetnames = list(workbook.sheetnames)

    # Exact matches first, in the original priority order.
    for candidate in ("ECO Surveys", "ECO surveys"):
        if candidate in sheetnames:
            return candidate

    # Tolerate other capitalisations or stray whitespace in the sheet name.
    for name in sheetnames:
        if isinstance(name, str) and name.strip().casefold() == "eco surveys":
            return name

    # Nothing matched: keep the previous fallback value.
    return "ECO surveys"
def load_asset_list(self, filepath, ha_name):
workbook = openpyxl.load_workbook(filepath)
asset_sheetname = self.get_asset_sheetname(workbook)
@ -189,8 +198,13 @@ class DataLoader:
asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list")
asset_list = asset_list_correction_function(asset_list)
# For HA1, there is an exception in the structure of the data. We don't have any survey or ciga lists, and so
# we can return the asset list now
if ha_name == "HA1":
return asset_list, pd.DataFrame(), pd.DataFrame()
# We check if there is a survey list
survey_sheetname = "ECO Surveys"
survey_sheetname = self.get_survey_sheetname(workbook)
survey_sheet = workbook[survey_sheetname]
survey_rows = []
for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers
@ -217,6 +231,9 @@ class DataLoader:
ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]])
# Remove columns that are None
ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()]
# Remove rows with missing postcode which happens in a small number of cases
ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
# Perform ciga list merge
if not ciga_list.empty:
@ -414,6 +431,10 @@ class DataLoader:
return survey_list
# HA107 survey-list correction hook: currently a no-op that returns the survey
# list it is given, unchanged.
# NOTE(review): presumably looked up by name (correct_<ha>_survey_list) when
# surveys are merged onto assets, mirroring the asset-list correction lookup -
# confirm against the caller.
@staticmethod
def correct_ha107_survey_list(survey_list):
return survey_list
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
# Correct the survey list
@ -441,7 +462,7 @@ class DataLoader:
df = df[df["matching_address"].str.contains(str(house_number))]
if df.shape[0] != 1:
df = df[df["HouseNo"] == str(house_number)]
df = df[df["HouseNo"].astype(str) == str(house_number)]
if df.shape[0] != 1:
df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())]
if df.shape[0] != 1:
@ -506,6 +527,7 @@ class DataLoader:
def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name):
matching_lookup = []
unmatched_addresses = []
for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)):
house_number = row["HouseNo"]
@ -528,7 +550,7 @@ class DataLoader:
}
)
continue
if df.shape[0] != 1:
# We split house number and postcode out of the matched address for ciga
@ -561,9 +583,6 @@ class DataLoader:
if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]:
raise ValueError(f"Unmatched addresses for {ha_name} is not as expected")
# In ciga: 35 Valley Drive, Leicester, LE3 3EE
#
matching_lookup = pd.DataFrame(matching_lookup)
# Merge onto the ciga list
@ -612,7 +631,7 @@ class DataLoader:
for filepath in self.directories:
ha_name = filepath.split("/")[2]
# Load asset list
logger.info("Loading asset list for {}".format(ha_name))
logger.info("Loading data for {}".format(ha_name))
asset_list, survey_list, ciga_list = self.load_asset_list(
filepath=filepath,
ha_name=ha_name,