mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Completed matching for ha107, added Levenshtein method
This commit is contained in:
parent
ccb764d4a9
commit
cef20c6e2c
1 changed files with 64 additions and 0 deletions
|
|
@ -1,6 +1,7 @@
|
|||
import os
|
||||
import re
|
||||
import openpyxl
|
||||
import Levenshtein
|
||||
from pathlib import Path
|
||||
import msgpack
|
||||
from datetime import datetime
|
||||
|
|
@ -453,6 +454,41 @@ class DataLoader:
|
|||
"Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln"
|
||||
)
|
||||
|
||||
# Replace Reasby Road Snelland, Lincoln with Reasby Road, Snelland, Lincoln
|
||||
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
|
||||
"Reasby Road Snelland, Lincoln", "Reasby Road, Snelland, Lincoln"
|
||||
)
|
||||
|
||||
# Replace Silver Street Bardney, Lincoln with Silver Street, Bardney, Lincoln
|
||||
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
|
||||
"Silver Street Bardney, Lincoln", "Silver Street, Bardney, Lincoln"
|
||||
)
|
||||
|
||||
# Replace Manor Close Bardney, Lincoln with Manor Close, Bardney, Lincoln
|
||||
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
|
||||
"Manor Close Bardney, Lincoln", "Manor Close, Bardney, Lincoln"
|
||||
)
|
||||
|
||||
# Replace Ferry Road Southrey, Lincoln with Ferry Road, Southrey, Lincoln
|
||||
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
|
||||
"Ferry Road Southrey, Lincoln", "Ferry Road, Southrey, Lincoln"
|
||||
)
|
||||
|
||||
# Replace Harvey Kent Gardens Bardney, Lincoln with Harvey Kent Gardens, Bardney, Lincoln
|
||||
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
|
||||
"Harvey Kent Gardens Bardney, Lincoln", "Harvey Kent Gardens, Bardney, Lincoln"
|
||||
)
|
||||
|
||||
# Replace Wragby Road Bardney, Lincoln with Wragby Road, Bardney, Lincoln
|
||||
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
|
||||
"Wragby Road Bardney, Lincoln", "Wragby Road, Bardney, Lincoln"
|
||||
)
|
||||
|
||||
# Replace SPRINKHILL ROAD with SPINKHILL ROAD
|
||||
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
|
||||
"SPRINKHILL ROAD", "SPINKHILL ROAD"
|
||||
)
|
||||
|
||||
return survey_list
|
||||
|
||||
def merge_surveys_to_assets(self, asset_list, survey_list, ha_name):
|
||||
|
|
@ -481,10 +517,35 @@ class DataLoader:
|
|||
].copy()
|
||||
|
||||
df = df[df["matching_address"].str.contains(str(house_number))]
|
||||
|
||||
if df.empty:
|
||||
print(row["Street / Block Name"])
|
||||
print(house_number)
|
||||
print(row["Post Code"])
|
||||
raise ValueError("Investigate")
|
||||
|
||||
if df.shape[0] != 1:
|
||||
df = df[df["HouseNo"].astype(str) == str(house_number)]
|
||||
if df.shape[0] != 1:
|
||||
df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())]
|
||||
|
||||
full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + row[
|
||||
"Town/Area"].lower().strip() + row["Post Code"].lower().strip()
|
||||
# Remove any spaces from the full key
|
||||
full_key = full_key.replace(" ", "")
|
||||
|
||||
match_to = df["matching_address"].tolist()
|
||||
# Strip out punctuation and spaces
|
||||
match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to]
|
||||
match_to = [x.replace(" ", "") for x in match_to]
|
||||
|
||||
# Perform matching between full key and match_to
|
||||
distances = [Levenshtein.distance(full_key, s) for s in match_to]
|
||||
best_match_index = distances.index(min(distances))
|
||||
# We might want to apply a maximum-distance threshold here; however,
|
||||
# for the moment we simply accept the closest match.
|
||||
df = df.iloc[best_match_index:best_match_index + 1]
|
||||
|
||||
if df.shape[0] != 1:
|
||||
postcode_lower = row["Post Code"].lower()
|
||||
if postcode_lower in missed_postcodes:
|
||||
|
|
@ -510,6 +571,9 @@ class DataLoader:
|
|||
|
||||
matching_lookup = pd.DataFrame(matching_lookup)
|
||||
|
||||
if matching_lookup.shape[0] != survey_list.shape[0]:
|
||||
raise ValueError("Mismatch in the number of survey rows and matching lookup rows")
|
||||
|
||||
# Merge onto the survey list
|
||||
survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue