diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 95ca3901..2d95a946 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -43,7 +43,8 @@ class DataLoader:
         # the asset list
         "HA14": 4,
         # There's just too many unmatched here - if we identify some homes that
-        "HA6": 117
+        "HA6": 117,
+        "HA107": 52
     }
 
     def __init__(self, directories, use_cache):
@@ -130,7 +131,7 @@ class DataLoader:
         :return:
         """
 
-        if ha_name in ["HA6", "HA14"]:
+        if ha_name in ["HA6", "HA14", "HA107"]:
             split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
             house_numbers = split_addresses[0].str.split(' ', expand=True)
             # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how
@@ -153,8 +154,10 @@ class DataLoader:
 
     @staticmethod
     def get_ciga_sheetname(workbook):
         if "CIGA Checks" in workbook.sheetnames:
             return "CIGA Checks"
+        elif "CIGA checks" in workbook.sheetnames:
+            return "CIGA checks"
         else:
             return "CIGA"
 
@@ -490,6 +493,32 @@ class DataLoader:
 
         return survey_list
 
+    @staticmethod
+    def levenstein_match(matching_string, df):
+        """
+        Reduce df to the single row whose "matching_address" is closest to matching_string
+        :param matching_string: key to match against; callers pass it lower-cased with spaces removed
+        :param df: candidate rows, must contain a "matching_address" column of strings
+        :return: one-row slice of df holding the closest match, or df unchanged when it has no rows
+        """
+        # Nothing to rank - min() below raises ValueError on an empty list
+        if df.empty:
+            return df
+
+        match_to = df["matching_address"].tolist()
+        # Strip out punctuation and spaces
+        match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
+        match_to = [x.replace(" ", "") for x in match_to]
+
+        # Perform matching between the key and match_to; on ties the first candidate wins
+        distances = [Levenshtein.distance(matching_string, s) for s in match_to]
+        best_match_index = distances.index(min(distances))
+        # We might want to consider a threshold for the distance, however for the moment
+        # the closest candidate is always accepted
+        df = df.iloc[best_match_index:best_match_index + 1]
+
+        return df
+
     def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
 
         # Correct the survey list
@@ -544,17 +573,7 @@ class DataLoader:
 
                 # Remove any spaces from the full key
                 full_key = full_key.replace(" ", "")
-                match_to = df["matching_address"].tolist()
-                # Strip out punctuation and spaces
-                match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
-                match_to = [x.replace(" ", "") for x in match_to]
-
-                # Perform matching between full key and match_to
-                distances = [Levenshtein.distance(full_key, s) for s in match_to]
-                best_match_index = distances.index(min(distances))
-                # We might want to consider a threshold for the distance, however for the momeny,
-                # we don't consider this for the moment
-                df = df.iloc[best_match_index:best_match_index + 1]
+                df = self.levenstein_match(full_key, df)
 
             if df.shape[0] != 1:
                 print(row["Street / Block Name"])
@@ -623,7 +642,7 @@ class DataLoader:
                 asset_list["matching_address"].str.contains(row["Matched Postcode"].lower().strip())
             ].copy()
 
-            df = df[df["HouseNo"] == str(house_number)]
+            df = df[df["HouseNo"].astype(str) == str(house_number)]
             # For ciga, we skip
             if df.empty:
                 unmatched_addresses.append(
@@ -641,7 +660,10 @@ class DataLoader:
             street_name = self.extract_streetname(
                 address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"]
             )
-            df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)]
+            # Only filter on the street name when at least one candidate contains it, otherwise keep them all
+            street_mask = df["matching_address"].str.replace(",", "").str.contains(street_name)
+            if street_mask.any():
+                df = df[street_mask]
 
             if df.shape[0] != 1:
                 # The final check we do here is to check for the presence of flat in the address
@@ -650,6 +672,14 @@ class DataLoader:
                 else:
                     df = df[df["matching_address"].str.contains("flat") == False]
 
+            if df.shape[0] != 1:
+                # Still not a unique match - fall back to the closest address by edit distance
+                full_key = str(row["HouseNo"]).lower().strip() + row["Matched Address"].lower().strip() + row[
+                    "Matched Postcode"].lower().strip()
+                # Remove any spaces from the full key
+                full_key = full_key.replace(" ", "")
+                df = self.levenstein_match(full_key, df)
+
             if df.shape[0] != 1:
                 print(row["Street / Block Name"])
                 print(house_number)
@@ -737,6 +767,21 @@ class DataLoader:
             s3_file_name="ha-analysis/batch3-inputs.pickle",
         )
 
+    def ha_facts_and_figures(self):
+        """
+        This function will return a list of facts and figures for each HA
+        NOTE: work in progress - the per-HA lists are read but nothing is appended yet, so the
+        returned list is currently always empty
+        :return: list of facts and figures (currently empty)
+        """
+        ha_facts_and_figures = []
+        for ha_name, data_assets in self.data.items():
+            asset_list = data_assets["asset_list"]
+            survey_list = data_assets["survey_list"]
+            ciga_list = data_assets["ciga_list"]
+
+        return ha_facts_and_figures
+
 
 def get_epc_data(
     loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
@@ -1511,6 +1556,7 @@ def app():
 
     loader = DataLoader(directories, use_cache)
     loader.load()
+    loader.ha_facts_and_figures()
 
     # TODO: We probably need to make sure that we have all of the columns that we need
 