Remove the duplicate get_uprn_candidates function and rename the remaining one to the more descriptive rank_by_address_similarity

This commit is contained in:
Jun-te Kim 2026-05-12 13:46:02 +00:00
parent b0e935d497
commit 27f2ef5e83
4 changed files with 7 additions and 35 deletions

View file

@ -14,11 +14,8 @@ from utils.s3 import (
) )
from datetime import datetime from datetime import datetime
from backend.utils.addressMatch import ( from backend.utils.addressMatch import AddressMatch
AddressMatch, from backend.address2UPRN.scoring import all_uprns_match, rank_by_address_similarity
get_uprn_candidates,
)
from backend.address2UPRN.scoring import all_uprns_match
from datatypes.epc.domain.historic_epc_matching import ( from datatypes.epc.domain.historic_epc_matching import (
match_addresses_for_postcode, match_addresses_for_postcode,
) )
@ -82,7 +79,7 @@ def get_uprn_with_epc_df(
if epc_df.empty: if epc_df.empty:
return None return None
scored_df = get_uprn_candidates( scored_df = rank_by_address_similarity(
epc_df, epc_df,
user_address=user_inputed_address, user_address=user_inputed_address,
) )
@ -174,7 +171,7 @@ def resolve_uprns_for_postcode_group(
for _, row in group_df.iterrows(): for _, row in group_df.iterrows():
user_address = str(row[address_col]).strip() user_address = str(row[address_col]).strip()
scored_df = get_uprn_candidates( scored_df = rank_by_address_similarity(
epc_df, epc_df,
user_address=user_address, user_address=user_address,
) )

View file

@ -19,7 +19,7 @@ def all_uprns_match(
return len(uprns) == 1 and uprns[0] == str(target_uprn) return len(uprns) == 1 and uprns[0] == str(target_uprn)
def get_uprn_candidates( def rank_by_address_similarity(
df: pd.DataFrame, df: pd.DataFrame,
user_address: str, user_address: str,
address_column: str = "address", address_column: str = "address",

View file

@ -242,31 +242,6 @@ def score_addresses(
return df[address_column].apply(lambda x: AddressMatch.score(user_address, x)) return df[address_column].apply(lambda x: AddressMatch.score(user_address, x))
def get_uprn_candidates(
df: pd.DataFrame,
user_address: str,
address_column: str = "address",
uprn_column: str = "uprn",
) -> pd.DataFrame:
"""
Annotate EPC results with lexicographical similarity scores and ranks.
Returns a DataFrame sorted by descending lexiscore.
"""
if address_column not in df.columns:
raise ValueError(f"Missing column: {address_column}")
if uprn_column not in df.columns:
raise ValueError(f"Missing column: {uprn_column}")
out = df.copy()
user_norm = AddressMatch.normalise_address(user_address)
out["lexiscore"] = out[address_column].apply(
lambda x: AddressMatch.levenshtein(user_norm, x)
)
out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True)
out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int)
return out.sort_values(["lexirank", "lexiscore"], ascending=[True, False])
def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool:
"""Returns True if all non-null UPRNs in df match the given uprn.""" """Returns True if all non-null UPRNs in df match the given uprn."""
if column not in df.columns: if column not in df.columns:

View file

@ -4,7 +4,7 @@ from typing import Optional
import pandas as pd import pandas as pd
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
from backend.address2UPRN.scoring import get_uprn_candidates from backend.address2UPRN.scoring import rank_by_address_similarity
from backend.utils.addressMatch import AddressMatch from backend.utils.addressMatch import AddressMatch
from datatypes.epc.domain.historic_epc import HistoricEpc from datatypes.epc.domain.historic_epc import HistoricEpc
from utils.pandas_utils import pandas_cell_to_str from utils.pandas_utils import pandas_cell_to_str
@ -85,7 +85,7 @@ def match_addresses_for_postcode(
) from e ) from e
raise raise
scored = get_uprn_candidates( scored = rank_by_address_similarity(
df, df,
user_address=user_address, user_address=user_address,
address_column=address_column, address_column=address_column,