diff --git a/backend/address2UPRN/tests/test_data.csv b/backend/address2UPRN/tests/test_data.csv index ee23813b..408edc29 100644 --- a/backend/address2UPRN/tests/test_data.csv +++ b/backend/address2UPRN/tests/test_data.csv @@ -168,8 +168,8 @@ FLAT 8 599 HARROW ROAD,W10 4RA,None "Apartment 18 Block D, 32, Hornsey Road",N7 7AT,10012792383 24b Honley Road,SE6 2HZ,None FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974 -2 COLLEGE HOUSE,CM7 1JS,100091449870 -3 COLLEGE HOUSE,CM7 1JS,100091449871 +2 COLLEGE HOUSE,CM7 1JS,None +3 COLLEGE HOUSE,CM7 1JS,None 1 Anita Street,M4 5DU,None 2 Anita Street,M4 5DU,77123061 5 Anita Street,M4 5DU,77123081 @@ -279,6 +279,7 @@ FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974 80a Victoria Square,M4 5DZ,77211231 81a Victoria Square,M4 5DZ,77211232 82 Victoria Square,M4 5DZ,None +82a Victoria Square,M4 5DZ,77211233 83a Victoria Square,M4 5DZ,77211234 84a Victoria Square,M4 5DZ,None 85a Victoria Square,M4 5DZ,77211236 diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index 1435a629..ee9d1004 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -127,6 +127,7 @@ class AddressMatch: Assumes formats like: - '42 moreton road' - 'flat 3 42 moreton road' + - '82 a victoria square' (recombined to '82a') """ tokens = s.split() @@ -142,9 +143,15 @@ class AddressMatch: continue cleaned.append(t) - # first remaining number is building number - for t in cleaned: - if re.fullmatch(r"\d+[a-z]?", t): + # first remaining number is building number; recombine with a + # single-letter suffix when normalisation has split "82a" → "82 a" + for i, t in enumerate(cleaned): + if re.fullmatch(r"\d+[a-z]", t): + return t + if re.fullmatch(r"\d+", t): + nxt = cleaned[i + 1] if i + 1 < len(cleaned) else None + if nxt is not None and re.fullmatch(r"[a-z]", nxt): + return t + nxt return t return None @@ -181,24 +188,13 @@ class AddressMatch: # Slash-format like "3/137a" is an implicit flat reference # (flat 3 of 137a) even without a "flat" keyword. has_implicit_flat_user = bool(re.search(r"\d+\s*/\s*\d+", a_norm)) - # If the user named a street, their leading number is a house number, - # not a flat number — so an EPC "Flat N, …" candidate is a wrong unit. - # Without a street token (e.g. "2 College House"), the user may be - # implicitly naming a flat in a named building; don't apply the guard. - STREET_TYPE_TOKENS = { - "road", "street", "lane", "avenue", "close", "way", - "crescent", "court", "drive", "place", "terrace", "mews", - "gardens", "square", "grove", "park", "walk", "row", - "green", "hill", "rise", "parade", "broadway", - } - user_tokens = set(a_norm.split()) - has_street_type_user = bool(user_tokens & STREET_TYPE_TOKENS) + # EPC says it's a flat but user gave no flat indication + # (neither keyword nor slash-format). Unlikely to be the right unit. if ( has_flat_token_epc and not has_flat_token_user and not has_implicit_flat_user - and has_street_type_user ): return 0.0