Refactored address-matching test data and logic to deal with flats better

This commit is contained in:
Jun-te Kim 2026-05-11 16:23:03 +00:00
parent 9aae5bf482
commit 1934c889b0
2 changed files with 15 additions and 18 deletions

View file

@@ -168,8 +168,8 @@ FLAT 8 599 HARROW ROAD,W10 4RA,None
"Apartment 18 Block D, 32, Hornsey Road",N7 7AT,10012792383
24b Honley Road,SE6 2HZ,None
FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974
2 COLLEGE HOUSE,CM7 1JS,100091449870
3 COLLEGE HOUSE,CM7 1JS,100091449871
2 COLLEGE HOUSE,CM7 1JS,None
3 COLLEGE HOUSE,CM7 1JS,None
1 Anita Street,M4 5DU,None
2 Anita Street,M4 5DU,77123061
5 Anita Street,M4 5DU,77123081
@@ -279,6 +279,7 @@ FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974
80a Victoria Square,M4 5DZ,77211231
81a Victoria Square,M4 5DZ,77211232
82 Victoria Square,M4 5DZ,None
82a Victoria Square,M4 5DZ,77211233
83a Victoria Square,M4 5DZ,77211234
84a Victoria Square,M4 5DZ,None
85a Victoria Square,M4 5DZ,77211236

1 User Input Postcode Manual UPRN Code
168 Apartment 18 Block D, 32, Hornsey Road N7 7AT 10012792383
169 24b Honley Road SE6 2HZ None
170 FLAT B 158 LEAHURST ROAD SE13 5NL 100021976974
171 2 COLLEGE HOUSE CM7 1JS 100091449870 None
172 3 COLLEGE HOUSE CM7 1JS 100091449871 None
173 1 Anita Street M4 5DU None
174 2 Anita Street M4 5DU 77123061
175 5 Anita Street M4 5DU 77123081
279 80a Victoria Square M4 5DZ 77211231
280 81a Victoria Square M4 5DZ 77211232
281 82 Victoria Square M4 5DZ None
282 82a Victoria Square M4 5DZ 77211233
283 83a Victoria Square M4 5DZ 77211234
284 84a Victoria Square M4 5DZ None
285 85a Victoria Square M4 5DZ 77211236

View file

@@ -127,6 +127,7 @@ class AddressMatch:
Assumes formats like:
- '42 moreton road'
- 'flat 3 42 moreton road'
- '82 a victoria square' (recombined to '82a')
"""
tokens = s.split()
@@ -142,9 +143,15 @@ class AddressMatch:
continue
cleaned.append(t)
# first remaining number is building number
for t in cleaned:
if re.fullmatch(r"\d+[a-z]?", t):
# first remaining number is building number; recombine with a
# single-letter suffix when normalisation has split "82a" → "82 a"
for i, t in enumerate(cleaned):
if re.fullmatch(r"\d+[a-z]", t):
return t
if re.fullmatch(r"\d+", t):
nxt = cleaned[i + 1] if i + 1 < len(cleaned) else None
if nxt is not None and re.fullmatch(r"[a-z]", nxt):
return t + nxt
return t
return None
@@ -181,24 +188,13 @@ class AddressMatch:
# Slash-format like "3/137a" is an implicit flat reference
# (flat 3 of 137a) even without a "flat" keyword.
has_implicit_flat_user = bool(re.search(r"\d+\s*/\s*\d+", a_norm))
# If the user named a street, their leading number is a house number,
# not a flat number — so an EPC "Flat N, …" candidate is a wrong unit.
# Without a street token (e.g. "2 College House"), the user may be
# implicitly naming a flat in a named building; don't apply the guard.
STREET_TYPE_TOKENS = {
"road", "street", "lane", "avenue", "close", "way",
"crescent", "court", "drive", "place", "terrace", "mews",
"gardens", "square", "grove", "park", "walk", "row",
"green", "hill", "rise", "parade", "broadway",
}
user_tokens = set(a_norm.split())
has_street_type_user = bool(user_tokens & STREET_TYPE_TOKENS)
# EPC says it's a flat but user gave no flat indication
# (neither keyword nor slash-format). Unlikely to be the right unit.
if (
has_flat_token_epc
and not has_flat_token_user
and not has_implicit_flat_user
and has_street_type_user
):
return 0.0