diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 6006fec1..9c19eca9 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -14,11 +14,8 @@ from utils.s3 import ( ) from datetime import datetime -from backend.utils.addressMatch import ( - AddressMatch, - get_uprn_candidates, -) -from backend.address2UPRN.scoring import all_uprns_match +from backend.utils.addressMatch import AddressMatch +from backend.address2UPRN.scoring import all_uprns_match, rank_by_address_similarity from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) @@ -82,7 +79,7 @@ def get_uprn_with_epc_df( if epc_df.empty: return None - scored_df = get_uprn_candidates( + scored_df = rank_by_address_similarity( epc_df, user_address=user_inputed_address, ) @@ -174,7 +171,7 @@ def resolve_uprns_for_postcode_group( for _, row in group_df.iterrows(): user_address = str(row[address_col]).strip() - scored_df = get_uprn_candidates( + scored_df = rank_by_address_similarity( epc_df, user_address=user_address, ) diff --git a/backend/address2UPRN/scoring.py b/backend/address2UPRN/scoring.py index bfda2e71..2a681ad2 100644 --- a/backend/address2UPRN/scoring.py +++ b/backend/address2UPRN/scoring.py @@ -19,7 +19,7 @@ def all_uprns_match( return len(uprns) == 1 and uprns[0] == str(target_uprn) -def get_uprn_candidates( +def rank_by_address_similarity( df: pd.DataFrame, user_address: str, address_column: str = "address", diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index 3a7e7494..81896140 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -242,31 +242,6 @@ def score_addresses( return df[address_column].apply(lambda x: AddressMatch.score(user_address, x)) -def get_uprn_candidates( - df: pd.DataFrame, - user_address: str, - address_column: str = "address", - uprn_column: str = "uprn", -) -> pd.DataFrame: - """ - Annotate EPC results with lexicographical similarity scores and ranks. - Returns a DataFrame sorted by descending lexiscore. - """ - if address_column not in df.columns: - raise ValueError(f"Missing column: {address_column}") - if uprn_column not in df.columns: - raise ValueError(f"Missing column: {uprn_column}") - - out = df.copy() - user_norm = AddressMatch.normalise_address(user_address) - out["lexiscore"] = out[address_column].apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) - out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) - out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) - return out.sort_values(["lexirank", "lexiscore"], ascending=[True, False]) - - def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: """Returns True if all non-null UPRNs in df match the given uprn.""" if column not in df.columns: diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 95ca9d9f..6ea2118b 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -4,7 +4,7 @@ from typing import Optional import pandas as pd from botocore.exceptions import ClientError -from backend.address2UPRN.scoring import get_uprn_candidates +from backend.address2UPRN.scoring import rank_by_address_similarity from backend.utils.addressMatch import AddressMatch from datatypes.epc.domain.historic_epc import HistoricEpc from utils.pandas_utils import pandas_cell_to_str @@ -85,7 +85,7 @@ def match_addresses_for_postcode( ) from e raise - scored = get_uprn_candidates( + scored = rank_by_address_similarity( df, user_address=user_address, address_column=address_column,