diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index c4f6307c..3ea9649e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -183,7 +183,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA16", "HA24", "HA25"]: + if ha_name in ["HA1", "HA6", "HA16", "HA24"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -214,6 +214,14 @@ class DataLoader: asset_list["Postcode"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA25": + asset_list["matching_address"] = asset_list[ + self.COLUMN_CONFIG[ha_name]["address"] + ].astype(str).str.lower().str.strip() + + asset_list["matching_postcode"] = asset_list['matching_address'].apply( + lambda x: ' '.join(x.split()[-2:]) if pd.notnull(x) else x + ) elif ha_name == "HA28": asset_list["matching_address"] = ( asset_list["House Number"].astype(str).str.lower().str.strip() + ", " + @@ -352,6 +360,9 @@ class DataLoader: house_numbers = house_numbers.iloc[:, 0:1] house_numbers.columns = ['HouseNo'] + # Remove trailing punctuation such as , or ; + house_numbers["HouseNo"] = house_numbers["HouseNo"].str.rstrip(',;') + asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1) return asset_list @@ -425,27 +436,16 @@ class DataLoader: workbook = openpyxl.load_workbook(filepath) asset_sheetname = self.get_asset_sheetname(workbook) - # TODO: TEMP - sheetnames_lower = [x.lower() for x in workbook.sheetnames] - if any("eco3" in x for x in sheetnames_lower): - raise Exception("REMOVE ME") - asset_sheet = workbook[asset_sheetname] asset_sheet_colnames = [cell.value for cell in asset_sheet[1]] if ha_name == "HA25": asset_sheet_colnames[11] = "matching_postcode" - values_only = not ha_name != "HA25" - rows_data = [] - if not values_only: - for row in asset_sheet.iter_rows(min_row=2, values_only=values_only): - row_data = [cell.value for cell in row] # This will get you the cell values - rows_data.append(row_data) - else: - for row in asset_sheet.iter_rows(min_row=2, values_only=values_only): # use values_only=True to get values - row_data = list(row) # No need for comprehension, values_only=True returns a tuple of values - rows_data.append(row_data) + + for row in asset_sheet.iter_rows(min_row=2, values_only=False): + row_data = [cell.value for cell in row] # This will get you the cell values + rows_data.append(row_data) asset_list = pd.DataFrame(rows_data, columns=asset_sheet_colnames) @@ -477,6 +477,29 @@ class DataLoader: if ha_name in ["HA1", "HA25"]: return asset_list, pd.DataFrame(), pd.DataFrame() + # If we have ECO3 surveys, we need to match them, because any properties treated under ECO3 won't be + # suitable under ECO4, since their walls will be filled + eco3_list = pd.DataFrame() + sheetnames_lower = [x.lower() for x in workbook.sheetnames] + eco3_sheetname_index = [i for i, x in enumerate(sheetnames_lower) if "eco3" in x.replace(" ", "")] + if eco3_sheetname_index: + eco3_sheetname = workbook.sheetnames[eco3_sheetname_index[0]] + eco3_sheet = workbook[eco3_sheetname] + eco3_rows = [] + for row in eco3_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + eco3_rows.append(row_data) + + eco3_list = pd.DataFrame(eco3_rows, columns=[cell.value for cell in eco3_sheet[1]]) + # Remove columns that are None + eco3_list = eco3_list.loc[:, eco3_list.columns.notnull()] + # Remove rows that are completely empty + eco3_list = eco3_list.loc[eco3_list.loc[:, eco3_list.columns].notnull().any(axis=1)] + eco3_list["eco3_list_row_id"] = [ha_name + "_Eco3_" + str(i) for i in range(0, len(eco3_list))] + + # Perform the eco3 merge + eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name) + # We check if there is a survey list survey_sheetname = self.get_survey_sheetname(workbook) survey_sheet = workbook[survey_sheetname] @@ -518,7 +541,7 @@ class DataLoader: ciga_list = self.dedupe_ciga_list(ciga_list) ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) - return asset_list, survey_list, ciga_list + return asset_list, survey_list, ciga_list, eco3_list @staticmethod def correct_ha6_asset_list(asset_list): @@ -1433,6 +1456,79 @@ class DataLoader: return survey_list + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): + + # We add on a matching postcode without spaces for this + # asset_list["matching_postcode_no_space"] = asset_list["matching_postcode"].str.lower().str.replace(" ", "") + + # May need an eco3 list correction function + + # NEADS DRIVE, postcode with bs305dt, is not found in the asset list + eco3_list = eco3_list[ + ~(eco3_list["Post Code"] == "BS305DT") + ] + # Drop rows with missings postcode + eco3_list = eco3_list[ + ~pd.isnull(eco3_list["Post Code"]) + ] + + missed_postcodes = [] + if ha_name == "HA25": + missed_postcodes = { + postcode.lower() for postcode in eco3_list["Post Code"] if + postcode.lower() not in asset_list["matching_postcode"].values + } + eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)] + + matching_lookup = [] + missed = [] + for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): + + postcode = row["Post Code"].lower().strip() + + # df will never be empty, since we've already done a check for common postcodes + df = asset_list[ + asset_list["matching_postcode"].str.contains(postcode) + ] + + house_number = row["NO "] + if isinstance(house_number, str): + house_number = house_number.lower().strip() + + if not any(df["matching_address"].str.contains(str(house_number))): + if "flat" in str(house_number): + house_number = house_number.split("flat")[1].strip() + + # We check if we had an instance of flat x, y + if "," in str(house_number): + house_number = house_number.split(",")[0].strip() + + # We may also have a space for an instance of flat x y + if " " in str(house_number): + house_number = house_number.split(" ")[0].strip() + + df = df[df["matching_address"].str.contains(str(house_number))] + + if df.empty: + missed.append(row["eco3_list_row_id"]) + continue + + if df.shape[0] != 1: + df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] + + if df.shape[0] != 1: + print(row["Street / Block Name"]) + print(house_number) + print(row["Post Code"]) + raise ValueError("Investigate") + + matching_lookup.append( + { + "eco3_list_row_id": row["eco3_list_row_id"], + "asset_list_row_id": df["asset_list_row_id"].values[0], + } + ) + @staticmethod def extract_streetname(address, house_number=None, postcode=None): """ @@ -4008,11 +4104,13 @@ def app(): # Add in: "HA25" # TODO: Remove ECO3 sales from HA25 priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA28", "HA32", "HA38", "HA39", "HA107", + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA20", "HA24", "HA25", "HA28", "HA32", "HA39", "HA107", ] # Next HAs to do: 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come back on this], # Then: 28 [DONE], - # 38, 41, 10, 14, 20, 48 + # 41, 10, 14 [DONE], 20, 48, 50 + # 38[problematic, but no ECO4] + # TODO - do 50 and 25 next # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has]