rank address similarity

2026-08-02 21:08:24 +00:00 · 2026-05-12 16:02:01 +00:00 · 2026-05-12 16:02:01 +00:00 · dfc100f78b
commit dfc100f78b
parent 8b27a5173b
4 changed files with 23 additions and 14 deletions
--- a/backend/address2UPRN/main.py
+++ b/backend/address2UPRN/main.py
@ -15,7 +15,7 @@ from utils.s3 import (
 from datetime import datetime

 from backend.utils.addressMatch import AddressMatch
-from backend.address2UPRN.scoring import all_uprns_match, rank_by_address_similarity
+from backend.address2UPRN.scoring import all_uprns_match, rank_address_similarity
 from datatypes.epc.domain.historic_epc_matching import (
    match_addresses_for_postcode,
 )
@ -79,7 +79,7 @@ def get_uprn_with_epc_df(
    if epc_df.empty:
        return None

-    scored_df = rank_by_address_similarity(
+    scored_df = rank_address_similarity(
        epc_df,
        user_address=user_inputed_address,
    )
@ -171,7 +171,7 @@ def resolve_uprns_for_postcode_group(
    for _, row in group_df.iterrows():
        user_address = str(row[address_col]).strip()

-        scored_df = rank_by_address_similarity(
+        scored_df = rank_address_similarity(
            epc_df,
            user_address=user_address,
        )
--- a/backend/address2UPRN/scoring.py
+++ b/backend/address2UPRN/scoring.py
@ -19,8 +19,8 @@ def all_uprns_match(
    return len(uprns) == 1 and uprns[0] == str(target_uprn)


-def rank_by_address_similarity(
-    df: pd.DataFrame,
+def rank_address_similarity(
+    address_list_df: pd.DataFrame,
    user_address: str,
    address_column: str = "address",
    uprn_column: str = "uprn",
@ -32,13 +32,13 @@ def rank_by_address_similarity(
    DOES NOT choose or return a UPRN.
    """

-    if address_column not in df.columns:
+    if address_column not in address_list_df.columns:
        raise ValueError(f"Missing column: {address_column}")

-    if uprn_column not in df.columns:
+    if uprn_column not in address_list_df.columns:
        raise ValueError(f"Missing column: {uprn_column}")

-    out = df.copy()
+    out = address_list_df.copy()

    user_norm = AddressMatch.normalise_address(user_address)

--- a/datatypes/epc/domain/historic_epc_matching.py
+++ b/datatypes/epc/domain/historic_epc_matching.py
@ -4,7 +4,7 @@ from typing import Optional
 import pandas as pd
 from botocore.exceptions import ClientError

-from backend.address2UPRN.scoring import rank_by_address_similarity
+from backend.address2UPRN.scoring import rank_address_similarity
 from backend.utils.addressMatch import AddressMatch
 from datatypes.epc.domain.historic_epc import HistoricEpc
 from utils.pandas_utils import pandas_cell_to_str
@ -85,7 +85,7 @@ def match_addresses_for_postcode(
            ) from e
        raise

-    scored = rank_by_address_similarity(
+    scored = rank_address_similarity(
        df,
        user_address=user_address,
        address_column=address_column,
--- a/etl/hubspot/hubspotClient.py
+++ b/etl/hubspot/hubspotClient.py
@ -1,6 +1,7 @@
 import os
 import time
 from enum import Enum
+from http import HTTPStatus
 from typing import Optional, cast, Callable, Any

 from hubspot.client import Client  # type: ignore[reportMissingTypeStubs]
@ -86,19 +87,27 @@ class HubspotClient:

    def _call_with_retry(self, fn: Callable[[], Any], max_retries: int = 2) -> Any:
        """
-        Call fn(), retrying up to max_retries times on 429 rate-limit errors.
+        Call fn(), retrying up to max_retries times on 429 rate-limit errors
+        or transient 5xx server errors.
        Waits the minimal amount: the remaining interval window reported by HubSpot headers.
        Falls back to the full interval (10s) if headers are absent.

        Note: each HubSpot sub-module (deals, companies, etc.) ships its own ApiException
-        class with no shared base beyond Exception, so we detect 429s via duck-typing.
+        class with no shared base beyond Exception, so we detect retryable statuses via duck-typing.
        """
+        retryable_statuses = {
+            HTTPStatus.TOO_MANY_REQUESTS,
+            HTTPStatus.INTERNAL_SERVER_ERROR,
+            HTTPStatus.BAD_GATEWAY,
+            HTTPStatus.SERVICE_UNAVAILABLE,
+            HTTPStatus.GATEWAY_TIMEOUT,
+        }
        for attempt in range(max_retries + 1):
            try:
                return fn()
            except Exception as e:
                status = getattr(e, "status", None)
-                if status != 429 or attempt == max_retries:
+                if status not in retryable_statuses or attempt == max_retries:
                    raise
                headers = getattr(e, "headers", None) or {}
                interval_ms = int(
@ -106,7 +115,7 @@ class HubspotClient:
                )
                wait_s = interval_ms / 1000.0
                self.logger.warning(
-                    f"HubSpot 429 (attempt {attempt + 1}/{max_retries}), "
+                    f"HubSpot {status} (attempt {attempt + 1}/{max_retries}), "
                    f"waiting {wait_s:.1f}s before retry."
                )
                time.sleep(wait_s)