completed matching for ha107, added Levenshtein method

This commit is contained in:
Khalim Conn-Kowlessar 2024-02-23 12:08:44 +00:00
parent ccb764d4a9
commit cef20c6e2c

View file

@ -1,6 +1,7 @@
import os
import re
import openpyxl
import Levenshtein
from pathlib import Path
import msgpack
from datetime import datetime
@ -453,6 +454,41 @@ class DataLoader:
"Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln"
)
# Replace Reasby Road Snelland, Lincoln with Reasby Road, Snelland, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Reasby Road Snelland, Lincoln", "Reasby Road, Snelland, Lincoln"
)
# Replace Silver Street Bardney, Lincoln with Silver Street, Bardney, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Silver Street Bardney, Lincoln", "Silver Street, Bardney, Lincoln"
)
# Replace Manor Close Bardney, Lincoln with Manor Close, Bardney, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Manor Close Bardney, Lincoln", "Manor Close, Bardney, Lincoln"
)
# Replace Ferry Road Southrey, Lincoln with Ferry Road, Southrey, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Ferry Road Southrey, Lincoln", "Ferry Road, Southrey, Lincoln"
)
# Replace Harvey Kent Gardens Bardney, Lincoln with Harvey Kent Gardens, Bardney, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Harvey Kent Gardens Bardney, Lincoln", "Harvey Kent Gardens, Bardney, Lincoln"
)
# Replace Wragby Road Bardney, Lincoln with Wragby Road, Bardney, Lincoln
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"Wragby Road Bardney, Lincoln", "Wragby Road, Bardney, Lincoln"
)
# Replace SPRINKHILL ROAD with SPINKHILL ROAD
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"SPRINKHILL ROAD", "SPINKHILL ROAD"
)
return survey_list
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
@ -481,10 +517,35 @@ class DataLoader:
].copy()
df = df[df["matching_address"].str.contains(str(house_number))]
if df.empty:
print(row["Street / Block Name"])
print(house_number)
print(row["Post Code"])
raise ValueError("Investigate")
if df.shape[0] != 1:
df = df[df["HouseNo"].astype(str) == str(house_number)]
if df.shape[0] != 1:
df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())]
full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + row[
"Town/Area"].lower().strip() + row["Post Code"].lower().strip()
# Remove any spaces from the full key
full_key = full_key.replace(" ", "")
match_to = df["matching_address"].tolist()
# Strip out punctuation and spaces
match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
match_to = [x.replace(" ", "") for x in match_to]
# Perform matching between full key and match_to
distances = [Levenshtein.distance(full_key, s) for s in match_to]
best_match_index = distances.index(min(distances))
# We might want to consider a threshold for the distance; however, for the moment,
# we do not apply one and simply take the candidate with the smallest distance
df = df.iloc[best_match_index:best_match_index + 1]
if df.shape[0] != 1:
postcode_lower = row["Post Code"].lower()
if postcode_lower in missed_postcodes:
@ -510,6 +571,9 @@ class DataLoader:
matching_lookup = pd.DataFrame(matching_lookup)
if matching_lookup.shape[0] != survey_list.shape[0]:
raise ValueError("Mismatch in the number of survey rows and matching lookup rows")
# Merge onto the survey list
survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id")