rank address similarity

This commit is contained in:
Jun-te Kim 2026-05-12 16:02:01 +00:00
parent 8b27a5173b
commit dfc100f78b
4 changed files with 23 additions and 14 deletions

View file

@ -15,7 +15,7 @@ from utils.s3 import (
from datetime import datetime from datetime import datetime
from backend.utils.addressMatch import AddressMatch from backend.utils.addressMatch import AddressMatch
from backend.address2UPRN.scoring import all_uprns_match, rank_by_address_similarity from backend.address2UPRN.scoring import all_uprns_match, rank_address_similarity
from datatypes.epc.domain.historic_epc_matching import ( from datatypes.epc.domain.historic_epc_matching import (
match_addresses_for_postcode, match_addresses_for_postcode,
) )
@ -79,7 +79,7 @@ def get_uprn_with_epc_df(
if epc_df.empty: if epc_df.empty:
return None return None
scored_df = rank_by_address_similarity( scored_df = rank_address_similarity(
epc_df, epc_df,
user_address=user_inputed_address, user_address=user_inputed_address,
) )
@ -171,7 +171,7 @@ def resolve_uprns_for_postcode_group(
for _, row in group_df.iterrows(): for _, row in group_df.iterrows():
user_address = str(row[address_col]).strip() user_address = str(row[address_col]).strip()
scored_df = rank_by_address_similarity( scored_df = rank_address_similarity(
epc_df, epc_df,
user_address=user_address, user_address=user_address,
) )

View file

@ -19,8 +19,8 @@ def all_uprns_match(
return len(uprns) == 1 and uprns[0] == str(target_uprn) return len(uprns) == 1 and uprns[0] == str(target_uprn)
def rank_by_address_similarity( def rank_address_similarity(
df: pd.DataFrame, address_list_df: pd.DataFrame,
user_address: str, user_address: str,
address_column: str = "address", address_column: str = "address",
uprn_column: str = "uprn", uprn_column: str = "uprn",
@ -32,13 +32,13 @@ def rank_by_address_similarity(
DOES NOT choose or return a UPRN. DOES NOT choose or return a UPRN.
""" """
if address_column not in df.columns: if address_column not in address_list_df.columns:
raise ValueError(f"Missing column: {address_column}") raise ValueError(f"Missing column: {address_column}")
if uprn_column not in df.columns: if uprn_column not in address_list_df.columns:
raise ValueError(f"Missing column: {uprn_column}") raise ValueError(f"Missing column: {uprn_column}")
out = df.copy() out = address_list_df.copy()
user_norm = AddressMatch.normalise_address(user_address) user_norm = AddressMatch.normalise_address(user_address)

View file

@ -4,7 +4,7 @@ from typing import Optional
import pandas as pd import pandas as pd
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
from backend.address2UPRN.scoring import rank_by_address_similarity from backend.address2UPRN.scoring import rank_address_similarity
from backend.utils.addressMatch import AddressMatch from backend.utils.addressMatch import AddressMatch
from datatypes.epc.domain.historic_epc import HistoricEpc from datatypes.epc.domain.historic_epc import HistoricEpc
from utils.pandas_utils import pandas_cell_to_str from utils.pandas_utils import pandas_cell_to_str
@ -85,7 +85,7 @@ def match_addresses_for_postcode(
) from e ) from e
raise raise
scored = rank_by_address_similarity( scored = rank_address_similarity(
df, df,
user_address=user_address, user_address=user_address,
address_column=address_column, address_column=address_column,

View file

@ -1,6 +1,7 @@
import os import os
import time import time
from enum import Enum from enum import Enum
from http import HTTPStatus
from typing import Optional, cast, Callable, Any from typing import Optional, cast, Callable, Any
from hubspot.client import Client # type: ignore[reportMissingTypeStubs] from hubspot.client import Client # type: ignore[reportMissingTypeStubs]
@ -86,19 +87,27 @@ class HubspotClient:
def _call_with_retry(self, fn: Callable[[], Any], max_retries: int = 2) -> Any: def _call_with_retry(self, fn: Callable[[], Any], max_retries: int = 2) -> Any:
""" """
Call fn(), retrying up to max_retries times on 429 rate-limit errors. Call fn(), retrying up to max_retries times on 429 rate-limit errors
or transient 5xx server errors.
Waits the minimal amount: the remaining interval window reported by HubSpot headers. Waits the minimal amount: the remaining interval window reported by HubSpot headers.
Falls back to the full interval (10s) if headers are absent. Falls back to the full interval (10s) if headers are absent.
Note: each HubSpot sub-module (deals, companies, etc.) ships its own ApiException Note: each HubSpot sub-module (deals, companies, etc.) ships its own ApiException
class with no shared base beyond Exception, so we detect 429s via duck-typing. class with no shared base beyond Exception, so we detect retryable statuses via duck-typing.
""" """
retryable_statuses = {
HTTPStatus.TOO_MANY_REQUESTS,
HTTPStatus.INTERNAL_SERVER_ERROR,
HTTPStatus.BAD_GATEWAY,
HTTPStatus.SERVICE_UNAVAILABLE,
HTTPStatus.GATEWAY_TIMEOUT,
}
for attempt in range(max_retries + 1): for attempt in range(max_retries + 1):
try: try:
return fn() return fn()
except Exception as e: except Exception as e:
status = getattr(e, "status", None) status = getattr(e, "status", None)
if status != 429 or attempt == max_retries: if status not in retryable_statuses or attempt == max_retries:
raise raise
headers = getattr(e, "headers", None) or {} headers = getattr(e, "headers", None) or {}
interval_ms = int( interval_ms = int(
@ -106,7 +115,7 @@ class HubspotClient:
) )
wait_s = interval_ms / 1000.0 wait_s = interval_ms / 1000.0
self.logger.warning( self.logger.warning(
f"HubSpot 429 (attempt {attempt + 1}/{max_retries}), " f"HubSpot {status} (attempt {attempt + 1}/{max_retries}), "
f"waiting {wait_s:.1f}s before retry." f"waiting {wait_s:.1f}s before retry."
) )
time.sleep(wait_s) time.sleep(wait_s)