rank address similarity

This commit is contained in:
Jun-te Kim 2026-05-12 16:02:01 +00:00
parent 8b27a5173b
commit dfc100f78b
4 changed files with 23 additions and 14 deletions

View file

@ -15,7 +15,7 @@ from utils.s3 import (
from datetime import datetime
from backend.utils.addressMatch import AddressMatch
from backend.address2UPRN.scoring import all_uprns_match, rank_by_address_similarity
from backend.address2UPRN.scoring import all_uprns_match, rank_address_similarity
from datatypes.epc.domain.historic_epc_matching import (
match_addresses_for_postcode,
)
@ -79,7 +79,7 @@ def get_uprn_with_epc_df(
if epc_df.empty:
return None
scored_df = rank_by_address_similarity(
scored_df = rank_address_similarity(
epc_df,
user_address=user_inputed_address,
)
@ -171,7 +171,7 @@ def resolve_uprns_for_postcode_group(
for _, row in group_df.iterrows():
user_address = str(row[address_col]).strip()
scored_df = rank_by_address_similarity(
scored_df = rank_address_similarity(
epc_df,
user_address=user_address,
)

View file

@ -19,8 +19,8 @@ def all_uprns_match(
return len(uprns) == 1 and uprns[0] == str(target_uprn)
def rank_by_address_similarity(
df: pd.DataFrame,
def rank_address_similarity(
address_list_df: pd.DataFrame,
user_address: str,
address_column: str = "address",
uprn_column: str = "uprn",
@ -32,13 +32,13 @@ def rank_by_address_similarity(
DOES NOT choose or return a UPRN.
"""
if address_column not in df.columns:
if address_column not in address_list_df.columns:
raise ValueError(f"Missing column: {address_column}")
if uprn_column not in df.columns:
if uprn_column not in address_list_df.columns:
raise ValueError(f"Missing column: {uprn_column}")
out = df.copy()
out = address_list_df.copy()
user_norm = AddressMatch.normalise_address(user_address)

View file

@ -4,7 +4,7 @@ from typing import Optional
import pandas as pd
from botocore.exceptions import ClientError
from backend.address2UPRN.scoring import rank_by_address_similarity
from backend.address2UPRN.scoring import rank_address_similarity
from backend.utils.addressMatch import AddressMatch
from datatypes.epc.domain.historic_epc import HistoricEpc
from utils.pandas_utils import pandas_cell_to_str
@ -85,7 +85,7 @@ def match_addresses_for_postcode(
) from e
raise
scored = rank_by_address_similarity(
scored = rank_address_similarity(
df,
user_address=user_address,
address_column=address_column,

View file

@ -1,6 +1,7 @@
import os
import time
from enum import Enum
from http import HTTPStatus
from typing import Optional, cast, Callable, Any
from hubspot.client import Client # type: ignore[reportMissingTypeStubs]
@ -86,19 +87,27 @@ class HubspotClient:
def _call_with_retry(self, fn: Callable[[], Any], max_retries: int = 2) -> Any:
"""
Call fn(), retrying up to max_retries times on 429 rate-limit errors.
Call fn(), retrying up to max_retries times on 429 rate-limit errors
or transient 5xx server errors.
Waits the minimal amount: the remaining interval window reported by HubSpot headers.
Falls back to the full interval (10s) if headers are absent.
Note: each HubSpot sub-module (deals, companies, etc.) ships its own ApiException
class with no shared base beyond Exception, so we detect 429s via duck-typing.
class with no shared base beyond Exception, so we detect retryable statuses via duck-typing.
"""
retryable_statuses = {
HTTPStatus.TOO_MANY_REQUESTS,
HTTPStatus.INTERNAL_SERVER_ERROR,
HTTPStatus.BAD_GATEWAY,
HTTPStatus.SERVICE_UNAVAILABLE,
HTTPStatus.GATEWAY_TIMEOUT,
}
for attempt in range(max_retries + 1):
try:
return fn()
except Exception as e:
status = getattr(e, "status", None)
if status != 429 or attempt == max_retries:
if status not in retryable_statuses or attempt == max_retries:
raise
headers = getattr(e, "headers", None) or {}
interval_ms = int(
@ -106,7 +115,7 @@ class HubspotClient:
)
wait_s = interval_ms / 1000.0
self.logger.warning(
f"HubSpot 429 (attempt {attempt + 1}/{max_retries}), "
f"HubSpot {status} (attempt {attempt + 1}/{max_retries}), "
f"waiting {wait_s:.1f}s before retry."
)
time.sleep(wait_s)