diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 60ef485a..bf3e6d31 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1,6 +1,7 @@ import os import re import openpyxl +import Levenshtein from pathlib import Path import msgpack from datetime import datetime @@ -453,6 +454,41 @@ class DataLoader: "Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln" ) + # Replace Reasby Road Snelland, Lincoln with Reasby Road, Snelland, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Reasby Road Snelland, Lincoln", "Reasby Road, Snelland, Lincoln" + ) + + # Replace Silver Street Bardney, Lincoln with Silver Street, Bardney, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Silver Street Bardney, Lincoln", "Silver Street, Bardney, Lincoln" + ) + + # Replace Manor Close Bardney, Lincoln with Manor Close, Bardney, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Manor Close Bardney, Lincoln", "Manor Close, Bardney, Lincoln" + ) + + # Replace Ferry Road Southrey, Lincoln with Ferry Road, Southrey, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Ferry Road Southrey, Lincoln", "Ferry Road, Southrey, Lincoln" + ) + + # Replace Harvey Kent Gardens Bardney, Lincoln with Harvey Kent Gardens, Bardney, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Harvey Kent Gardens Bardney, Lincoln", "Harvey Kent Gardens, Bardney, Lincoln" + ) + + # Replace Wragby Road Bardney, Lincoln with Wragby Road, Bardney, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Wragby Road Bardney, Lincoln", "Wragby Road, Bardney, Lincoln" + ) + + # Replace SPRINKHILL ROAD with 
SPINKHILL ROAD + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "SPRINKHILL ROAD", "SPINKHILL ROAD" + ) + return survey_list def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): @@ -481,10 +517,35 @@ class DataLoader: ].copy() df = df[df["matching_address"].str.contains(str(house_number))] + + if df.empty: + print(row["Street / Block Name"]) + print(house_number) + print(row["Post Code"]) + raise ValueError("Investigate") + if df.shape[0] != 1: df = df[df["HouseNo"].astype(str) == str(house_number)] if df.shape[0] != 1: df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())] + + full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + row[ + "Town/Area"].lower().strip() + row["Post Code"].lower().strip() + # Remove any spaces from the full key + full_key = full_key.replace(" ", "") + + match_to = df["matching_address"].tolist() + # Strip out punctuation and spaces + match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to] + match_to = [x.replace(" ", "") for x in match_to] + + # Perform matching between full key and match_to + distances = [Levenshtein.distance(full_key, s) for s in match_to] + best_match_index = distances.index(min(distances)) + # We might want to consider a threshold for the distance, however, + # we don't consider this for the moment + df = df.iloc[best_match_index:best_match_index + 1] + if df.shape[0] != 1: postcode_lower = row["Post Code"].lower() if postcode_lower in missed_postcodes: @@ -510,6 +571,9 @@ class DataLoader: matching_lookup = pd.DataFrame(matching_lookup) + if matching_lookup.shape[0] != survey_list.shape[0]: + raise ValueError("Mismatch in the number of survey rows and matching lookup rows") + # Merge onto the survey list survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id")