From 32352bbde145c6a0c76f503c766e7fca80c2af99 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 22 Feb 2024 17:46:11 +0000 Subject: [PATCH] working on survey match for ha107 --- .../ha_15_32/ha_analysis_batch_3.py | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1a28500b..9e850c0e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -40,7 +40,9 @@ class DataLoader: UNMATCHED_CIGA = { # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not # the asset list - "HA14": 4 + "HA14": 4, + # There are just too many unmatched addresses here - if we identify some homes that ... (TODO: this note was left unfinished; complete or remove it) + "HA6": 117 } def __init__(self, directories, use_cache): @@ -78,11 +80,11 @@ class DataLoader: elif ha_name == "HA107": # Create matching_address by concatenating House No, Street, Town, District, Postcode asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \ - asset_list["Street"].str.lower().str.strip() + ", " + \ - asset_list["Town"].str.lower().str.strip() + ", " + \ - asset_list["District"].str.lower().str.strip() + ", " + \ - asset_list["Postcode"].str.lower().str.strip() - asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() + asset_list["Street"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Town"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["District"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() else: raise NotImplementedError("implement me") @@ -155,6 +157,13 @@ class DataLoader: else: return "CIGA" + @staticmethod + def get_survey_sheetname(workbook): + if "ECO Surveys" in 
workbook.sheetnames: + return "ECO Surveys" + else: + return "ECO surveys" + def load_asset_list(self, filepath, ha_name): workbook = openpyxl.load_workbook(filepath) asset_sheetname = self.get_asset_sheetname(workbook) @@ -189,8 +198,13 @@ class DataLoader: asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list") asset_list = asset_list_correction_function(asset_list) + # For HA1, there is an exception in the structure of the data. We don't have any survey or ciga lists, and so + # we can return the asset list now + if ha_name == "HA1": + return asset_list, pd.DataFrame(), pd.DataFrame() + # We check if there is a survey list - survey_sheetname = "ECO Surveys" + survey_sheetname = self.get_survey_sheetname(workbook) survey_sheet = workbook[survey_sheetname] survey_rows = [] for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers @@ -217,6 +231,9 @@ class DataLoader: ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]]) # Remove columns that are None ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()] + # Remove rows with missing postcode which happens in a small number of cases + ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])] + ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))] # Perform ciga list merge if not ciga_list.empty: @@ -414,6 +431,10 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha107_survey_list(survey_list): + return survey_list + def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): # Correct the survey list @@ -441,7 +462,7 @@ class DataLoader: df = df[df["matching_address"].str.contains(str(house_number))] if df.shape[0] != 1: - df = df[df["HouseNo"] == str(house_number)] + df = df[df["HouseNo"].astype(str) == str(house_number)] if df.shape[0] != 1: df = df[df["matching_postcode"].str.lower().str.contains(row["Post 
Code"].lower().strip())] if df.shape[0] != 1: @@ -506,6 +527,7 @@ class DataLoader: def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name): matching_lookup = [] unmatched_addresses = [] + for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)): house_number = row["HouseNo"] @@ -528,7 +550,7 @@ class DataLoader: } ) continue - + if df.shape[0] != 1: # We split house number and postcode out of the matched address for ciga @@ -561,9 +583,6 @@ class DataLoader: if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]: raise ValueError(f"Unmatched addresses for {ha_name} is not as expected") - # In ciga: 35 Valley Drive, Leicester, LE3 3EE - # - matching_lookup = pd.DataFrame(matching_lookup) # Merge onto the ciga list @@ -612,7 +631,7 @@ class DataLoader: for filepath in self.directories: ha_name = filepath.split("/")[2] # Load asset list - logger.info("Loading asset list for {}".format(ha_name)) + logger.info("Loading data for {}".format(ha_name)) asset_list, survey_list, ciga_list = self.load_asset_list( filepath=filepath, ha_name=ha_name,