mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Remove the duplicate get_uprn_candidates function and rename the remaining one to the more descriptive rank_by_address_similarity
This commit is contained in:
parent
b0e935d497
commit
27f2ef5e83
4 changed files with 7 additions and 35 deletions
|
|
@ -14,11 +14,8 @@ from utils.s3 import (
|
||||||
)
|
)
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from backend.utils.addressMatch import (
|
from backend.utils.addressMatch import AddressMatch
|
||||||
AddressMatch,
|
from backend.address2UPRN.scoring import all_uprns_match, rank_by_address_similarity
|
||||||
get_uprn_candidates,
|
|
||||||
)
|
|
||||||
from backend.address2UPRN.scoring import all_uprns_match
|
|
||||||
from datatypes.epc.domain.historic_epc_matching import (
|
from datatypes.epc.domain.historic_epc_matching import (
|
||||||
match_addresses_for_postcode,
|
match_addresses_for_postcode,
|
||||||
)
|
)
|
||||||
|
|
@ -82,7 +79,7 @@ def get_uprn_with_epc_df(
|
||||||
if epc_df.empty:
|
if epc_df.empty:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
scored_df = get_uprn_candidates(
|
scored_df = rank_by_address_similarity(
|
||||||
epc_df,
|
epc_df,
|
||||||
user_address=user_inputed_address,
|
user_address=user_inputed_address,
|
||||||
)
|
)
|
||||||
|
|
@ -174,7 +171,7 @@ def resolve_uprns_for_postcode_group(
|
||||||
for _, row in group_df.iterrows():
|
for _, row in group_df.iterrows():
|
||||||
user_address = str(row[address_col]).strip()
|
user_address = str(row[address_col]).strip()
|
||||||
|
|
||||||
scored_df = get_uprn_candidates(
|
scored_df = rank_by_address_similarity(
|
||||||
epc_df,
|
epc_df,
|
||||||
user_address=user_address,
|
user_address=user_address,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ def all_uprns_match(
|
||||||
return len(uprns) == 1 and uprns[0] == str(target_uprn)
|
return len(uprns) == 1 and uprns[0] == str(target_uprn)
|
||||||
|
|
||||||
|
|
||||||
def get_uprn_candidates(
|
def rank_by_address_similarity(
|
||||||
df: pd.DataFrame,
|
df: pd.DataFrame,
|
||||||
user_address: str,
|
user_address: str,
|
||||||
address_column: str = "address",
|
address_column: str = "address",
|
||||||
|
|
|
||||||
|
|
@ -242,31 +242,6 @@ def score_addresses(
|
||||||
return df[address_column].apply(lambda x: AddressMatch.score(user_address, x))
|
return df[address_column].apply(lambda x: AddressMatch.score(user_address, x))
|
||||||
|
|
||||||
|
|
||||||
def get_uprn_candidates(
|
|
||||||
df: pd.DataFrame,
|
|
||||||
user_address: str,
|
|
||||||
address_column: str = "address",
|
|
||||||
uprn_column: str = "uprn",
|
|
||||||
) -> pd.DataFrame:
|
|
||||||
"""
|
|
||||||
Annotate EPC results with lexicographical similarity scores and ranks.
|
|
||||||
Returns a DataFrame sorted by descending lexiscore.
|
|
||||||
"""
|
|
||||||
if address_column not in df.columns:
|
|
||||||
raise ValueError(f"Missing column: {address_column}")
|
|
||||||
if uprn_column not in df.columns:
|
|
||||||
raise ValueError(f"Missing column: {uprn_column}")
|
|
||||||
|
|
||||||
out = df.copy()
|
|
||||||
user_norm = AddressMatch.normalise_address(user_address)
|
|
||||||
out["lexiscore"] = out[address_column].apply(
|
|
||||||
lambda x: AddressMatch.levenshtein(user_norm, x)
|
|
||||||
)
|
|
||||||
out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True)
|
|
||||||
out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int)
|
|
||||||
return out.sort_values(["lexirank", "lexiscore"], ascending=[True, False])
|
|
||||||
|
|
||||||
|
|
||||||
def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool:
|
def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool:
|
||||||
"""Returns True if all non-null UPRNs in df match the given uprn."""
|
"""Returns True if all non-null UPRNs in df match the given uprn."""
|
||||||
if column not in df.columns:
|
if column not in df.columns:
|
||||||
|
|
|
||||||
|
|
@ -4,7 +4,7 @@ from typing import Optional
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from botocore.exceptions import ClientError
|
from botocore.exceptions import ClientError
|
||||||
|
|
||||||
from backend.address2UPRN.scoring import get_uprn_candidates
|
from backend.address2UPRN.scoring import rank_by_address_similarity
|
||||||
from backend.utils.addressMatch import AddressMatch
|
from backend.utils.addressMatch import AddressMatch
|
||||||
from datatypes.epc.domain.historic_epc import HistoricEpc
|
from datatypes.epc.domain.historic_epc import HistoricEpc
|
||||||
from utils.pandas_utils import pandas_cell_to_str
|
from utils.pandas_utils import pandas_cell_to_str
|
||||||
|
|
@ -85,7 +85,7 @@ def match_addresses_for_postcode(
|
||||||
) from e
|
) from e
|
||||||
raise
|
raise
|
||||||
|
|
||||||
scored = get_uprn_candidates(
|
scored = rank_by_address_similarity(
|
||||||
df,
|
df,
|
||||||
user_address=user_address,
|
user_address=user_address,
|
||||||
address_column=address_column,
|
address_column=address_column,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue