Added logic to deal with flats: historic EPC fallback for UPRN lookup and a flat/house-number mismatch guard in address matching

2026-07-27 23:35:01 +00:00 · 2026-05-11 15:20:17 +00:00 · 2026-05-11 15:20:17 +00:00 · 9aae5bf482
commit 9aae5bf482
parent 6504785e7c
2 changed files with 102 additions and 13 deletions
--- a/backend/address2UPRN/main.py
+++ b/backend/address2UPRN/main.py
@ -24,22 +24,53 @@ from backend.utils.addressMatch import (
 logger = setup_logger()


-OPEN_EPC_API_TOKEN = os.getenv("OPEN_EPC_API_TOKEN")
-
-if OPEN_EPC_API_TOKEN is None:
-    raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env")
-
-
 def get_epc_data_with_postcode(postcode: str) -> pd.DataFrame:
    from backend.epc_client.client import EpcClientService

-    service = EpcClientService(auth_token=OPEN_EPC_API_TOKEN)
+    token = os.getenv("OPEN_EPC_API_TOKEN")  # looked up per call, not at import time
+    if token is None:
+        raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env")
+
+    service = EpcClientService(auth_token=token)
    results = service.search_by_postcode(postcode)
    return pd.DataFrame(
        [{"address": r.address_line_1, "uprn": r.uprn} for r in results]
    )


+def get_uprn_from_historic_epc(
+    user_inputed_address: str,
+    postcode: str,
+) -> Optional[tuple[str, str, float]]:
+    """Resolve a UPRN via historic EPC S3 data.
+
+    Returns (uprn, address, lexiscore) when the historic dataset agrees on a
+    single rank-1 UPRN, None otherwise (missing postcode file, zero score,
+    or ambiguous top rank). The score gate is `unambiguous_uprn`'s own
+    (score > 0); the 0.7 heuristic used for the new-EPC source isn't applied
+    here because historic addresses use a more verbose format that
+    systematically depresses lexiscores.
+    """
+    from datatypes.epc.domain.historic_epc_matching import (
+        match_addresses_for_postcode,
+    )
+
+    try:
+        result = match_addresses_for_postcode(user_inputed_address, postcode)
+    except FileNotFoundError:  # missing postcode file: nothing to match against
+        return None
+
+    uprn = result.unambiguous_uprn()
+    if not uprn or uprn in ("", "nan"):  # "nan" presumably a stringified missing UPRN — TODO confirm
+        return None
+
+    top = result.top()
+    if top is None:  # no top-ranked candidate to take address/score from
+        return None
+
+    return (uprn, top.record.address, top.lexiscore)
+
+
 def get_uprn_with_epc_df(
    user_inputed_address: str,
    epc_df: pd.DataFrame,
@ -95,20 +126,37 @@ def get_uprn(
 ):
    """
    Return uprn (str)
-    Return False if failed to find a sensible matching epc
-    Return None when epc found but no UPRN
+    Return None when no sensible match is found in either EPC source.

-    This function fetches EPC data via API for a single postcode.
-    For processing multiple addresses in the same postcode, use get_uprn_with_epc_df instead.
+    Tries the new EPC API first; if that yields no confident match, falls
+    back to the historic EPC dataset on S3.
+
+    For processing multiple addresses in the same postcode, use
+    get_uprn_with_epc_df instead.
    """
    df = get_epc_data_with_postcode(postcode=postcode)

-    return get_uprn_with_epc_df(
+    result = get_uprn_with_epc_df(
        user_inputed_address=user_inputed_address,
        epc_df=df,
-        verbose=verbose,
+        verbose=True,
    )

+    if not result:
+        result = get_uprn_from_historic_epc(
+            user_inputed_address=user_inputed_address,
+            postcode=postcode,
+        )
+        if result:
+            logger.info(
+                f"Historic EPC matched {user_inputed_address} in {postcode}"
+            )
+
+    if not result:
+        return None
+
+    return result if verbose else result[0]
+

 def resolve_uprns_for_postcode_group(
    group_df: pd.DataFrame,
@ -379,6 +427,7 @@ def handler(event, context, local=False):
                    )
                    continue

+
                # Process each address in this postcode with the same EPC data
                for row in postcode_rows:
                    try:
@ -404,6 +453,23 @@ def handler(event, context, local=False):
                            verbose=True,
                        )

+                        # Fallback to historic EPC if new EPC produced no match
+                        if not result:
+                            try:
+                                result = get_uprn_from_historic_epc(
+                                    user_inputed_address=address2uprn_user_input,
+                                    postcode=postcode,
+                                )
+                            except Exception as e:
+                                logger.error(
+                                    f"Historic EPC lookup failed for {address2uprn_user_input} in {postcode}: {e}"
+                                )
+                                result = None
+                            if result:
+                                logger.info(
+                                    f"Historic EPC matched {address2uprn_user_input} in {postcode}"
+                                )
+
                        # Parse result tuple if successful
                        if result:
                            uprn, found_address, score = result
--- a/backend/utils/addressMatch.py
+++ b/backend/utils/addressMatch.py
@ -178,6 +178,29 @@ class AddressMatch:
            tok in a_norm for tok in ("flat", "apt", "apartment", "unit")
        )
        has_flat_token_epc = "flat" in b_norm
+        # Slash-format like "3/137a" is an implicit flat reference
+        # (flat 3 of 137a) even without a "flat" keyword.
+        has_implicit_flat_user = bool(re.search(r"\d+\s*/\s*\d+", a_norm))
+        # If the user named a street, their leading number is a house number,
+        # not a flat number — so an EPC "Flat N, …" candidate is a wrong unit.
+        # Without a street token (e.g. "2 College House"), the user may be
+        # implicitly naming a flat in a named building; don't apply the guard.
+        STREET_TYPE_TOKENS = {
+            "road", "street", "lane", "avenue", "close", "way",
+            "crescent", "court", "drive", "place", "terrace", "mews",
+            "gardens", "square", "grove", "park", "walk", "row",
+            "green", "hill", "rise", "parade", "broadway",
+        }
+        user_tokens = set(a_norm.split())
+        has_street_type_user = bool(user_tokens & STREET_TYPE_TOKENS)
+
+        if (
+            has_flat_token_epc
+            and not has_flat_token_user
+            and not has_implicit_flat_user
+            and has_street_type_user
+        ):
+            return 0.0

        if (
            len(seq_a) == 2