# Model/backend/address2UPRN/scoring.py
#
# 56 lines
# 1.4 KiB
# Python

import pandas as pd
from backend.utils.addressMatch import AddressMatch
def all_uprns_match(
    df: pd.DataFrame,
    target_uprn: str,
    column: str = "uprn",
) -> bool:
    """
    Report whether every non-null UPRN in ``df[column]`` equals ``target_uprn``.

    Values are compared as strings after trimming surrounding whitespace and
    dropping a trailing ``.0``. The same normalisation is applied to
    ``target_uprn``.

    Args:
        df: Frame holding the candidate rows.
        target_uprn: UPRN that every row is expected to carry.
        column: Name of the column holding the UPRNs.

    Returns:
        True only when the column exists, has at least one non-null value,
        and all non-null values normalise to the same string as
        ``target_uprn``. False otherwise.
    """
    if column not in df.columns:
        return False

    # A numeric column containing NaN is stored as float by pandas, so an
    # integer-like UPRN stringifies as "123.0"; remove that artefact so it
    # compares equal to "123".
    uprns = (
        df[column]
        .dropna()
        .astype(str)
        .str.strip()
        .str.replace(r"\.0$", "", regex=True)
        .unique()
    )

    target = str(target_uprn).strip()
    if target.endswith(".0"):
        target = target[:-2]

    # Zero distinct values (empty / all-null column) also yields False here.
    return len(uprns) == 1 and bool(uprns[0] == target)
def rank_by_address_similarity(
    df: pd.DataFrame,
    user_address: str,
    address_column: str = "address",
    uprn_column: str = "uprn",
) -> pd.DataFrame:
    """
    Score and rank candidate rows against a user-supplied address.

    Works on a copy of ``df`` and adds two columns:

    * ``lexiscore`` - the result of ``AddressMatch.levenshtein`` called with
      the normalised user address and the row's address value.
    * ``lexirank`` - dense integer rank of ``lexiscore``, where the highest
      score receives rank 1.

    The UPRN column is converted to strings with any trailing ``.0`` removed.
    The function only annotates and orders rows; it never selects a UPRN.

    NOTE(review): row addresses are handed to ``levenshtein`` as-is, while
    the user address is normalised first - confirm that is intended.

    Raises:
        ValueError: If ``address_column`` or ``uprn_column`` is not in ``df``.

    Returns:
        The annotated copy, ordered by ascending ``lexirank`` and then
        descending ``lexiscore``.
    """
    # Validate in the same order as the parameters so the first missing
    # column is the one reported.
    for required in (address_column, uprn_column):
        if required not in df.columns:
            raise ValueError(f"Missing column: {required}")

    ranked = df.copy()
    normalised_query = AddressMatch.normalise_address(user_address)

    def _score(candidate):
        return AddressMatch.levenshtein(normalised_query, candidate)

    ranked["lexiscore"] = ranked[address_column].apply(_score)

    uprn_as_text = ranked[uprn_column].astype(str)
    ranked[uprn_column] = uprn_as_text.str.replace(r"\.0$", "", regex=True)

    dense_rank = ranked["lexiscore"].rank(method="dense", ascending=False)
    ranked["lexirank"] = dense_rank.astype(int)

    sort_keys = ["lexirank", "lexiscore"]
    sort_ascending = [True, False]
    return ranked.sort_values(sort_keys, ascending=sort_ascending)