From cf9253d06201bbadca263eef269de973957b9556 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 22 Jan 2024 11:41:52 +0000 Subject: [PATCH] working on matching code for HA6 asset and survey lists --- .../ha_15_32/ha_analysis_batch_3.py | 49 +++++++++++++++++-- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bd2c6c99..7fbddd54 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -52,7 +52,7 @@ class DataLoader: rows_data = [] rows_colors = [] - for row in sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + for row in tqdm(sheet.iter_rows(min_row=2, values_only=False)): # Assuming the first row is headers row_data = [cell.value for cell in row] # This will get you the cell values row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None # row_color = COLOR_INDEX[row_color] @@ -137,7 +137,7 @@ class DataLoader: ) # Add in asset_list_row_id - survey_list["survey_list_row_id"] = [ha_name + str(i) for i in range(0, len(survey_list))] + survey_list["survey_list_row_id"] = [ha_name + "_surveys_" + str(i) for i in range(0, len(survey_list))] # We now do the matching between the asset list and the survey list. 
def merge_ha_6(self, asset_list, survey_list):
    """Match every HA6 survey row to exactly one asset-list row.

    Matching is textual and progressively narrowed for each survey row:

    1. asset addresses containing the survey street / block name,
    2. of those, addresses containing the survey house number,
    3. if that is not unique, addresses whose first token equals the
       house number,
    4. if still not unique, rows whose postcode contains the survey
       postcode.

    Args:
        asset_list: DataFrame providing the columns ``propertyaddress``,
            ``Post Code`` and ``asset_list_row_id``. It is not modified.
        survey_list: DataFrame providing the columns ``NO.``,
            ``Street / Block Name``, ``Post Code`` and
            ``survey_list_row_id``.

    Returns:
        A DataFrame with one row per survey row and the columns
        ``survey_list_row_id`` and ``asset_list_row_id``.

    Raises:
        ValueError: if a survey row does not resolve to exactly one asset
            row after all narrowing steps.
    """
    # Normalised matching keys, computed once for the whole asset list.
    # Both calls go through the ``.str`` accessor: a Series has no
    # ``.strip`` method of its own.
    addresses = asset_list["propertyaddress"].str.lower().str.strip()
    postcodes = asset_list["Post Code"].str.lower().str.strip()

    # The house number is the first space-separated token of the first
    # comma-separated address part. Taking element 0 directly (rather than
    # expanding into a fixed set of named columns) works however many
    # parts an individual address happens to have.
    house_numbers = addresses.str.split(",").str[0].str.split(" ").str[0]

    # Private lookup frame so the caller's asset list is left untouched.
    keys = pd.DataFrame(
        {
            "matching_address": addresses,
            "matching_postcode": postcodes,
            "HouseNo": house_numbers,
            "asset_list_row_id": asset_list["asset_list_row_id"],
        }
    )

    matching_lookup = []
    for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)):
        house_number = row["NO."]
        # Numeric columns containing blanks are read as floats; "12.0"
        # would never be found inside an address, so restore the integer.
        if isinstance(house_number, float) and house_number.is_integer():
            house_number = int(house_number)
        house_number = str(house_number).lower().strip()
        street = row["Street / Block Name"].lower().strip()
        postcode = row["Post Code"].lower().strip()

        # regex=False: the survey text is literal, not a pattern.
        # na=False: assets with a missing address/postcode never match.
        df = keys[keys["matching_address"].str.contains(street, regex=False, na=False)]
        df = df[df["matching_address"].str.contains(house_number, regex=False, na=False)]
        if df.shape[0] != 1:
            # Substring search lets "1" hit "11" or "21"; require the
            # leading token of the address to equal the house number.
            df = df[df["HouseNo"] == house_number]
        if df.shape[0] != 1:
            df = df[df["matching_postcode"].str.contains(postcode, regex=False, na=False)]
        if df.shape[0] != 1:
            raise ValueError(
                "Investigate: expected exactly one asset match for street {!r}, "
                "house number {!r}, postcode {!r}; found {}".format(
                    street, house_number, postcode, df.shape[0]
                )
            )

        matching_lookup.append(
            {
                "survey_list_row_id": row["survey_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    # Explicit columns keep the schema stable even for an empty survey list.
    return pd.DataFrame(
        matching_lookup, columns=["survey_list_row_id", "asset_list_row_id"]
    )
file_path=file_config["asset_list"]["filepath"], ha_name=ha_name, @@ -165,6 +204,7 @@ class DataLoader: ) if file_config.get("survey_list"): + logger.info("Loading survey list for {}".format(ha_name)) survey_list = self.load_survey_list( file_path=file_config["survey_list"]["filepath"], ha_name=ha_name, @@ -209,3 +249,4 @@ def app(): } loader = DataLoader(files) + loader.load()