fix(elmhurst-mapper): strip interleaved Alternative-wall fragments from glazing label

When a property lodges an Alternative Wall, pdftotext interleaves the §11
"Location" column ("Alternative wall 1") into the wrapped glazing-TYPE cell,
producing labels like "Double between 2002 Alternative wall and 2021 1
Alternative wall" (cert 001431 storage-heater variants, simulated case 34).

The existing greedy trailing-suffix strip (\s+Alternative wall.*$) truncates
at the FIRST "Alternative wall", losing "and 2021" and yielding the
unmatchable "Double between 2002". Added a fallback that removes EVERY
"<External|Alternative|Party> wall [n]" fragment and any stray 1-2 digit
location index from the raw label, then retries the lookup. Loss-free: no
glazing-type key contains a wall-location phrase or a bare 1-2 digit number
(install-date years are 4 digits).

Unblocks the Summary cascade for any property with an Alternative Wall;
Summary-path only (the API path receives structured glazing codes, so the
API gauge is unaffected). Regression gate is green (1 pre-existing,
unrelated failure).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-11 07:07:08 +00:00
parent 85d6f8468c
commit b0a47cda05
2 changed files with 44 additions and 0 deletions

View file

@ -1540,6 +1540,23 @@ def test_summary_mapper_raises_on_unmapped_cylinder_insulation_label() -> None:
assert excinfo.value.value == "Polyester wool"
def test_elmhurst_glazing_type_code_strips_interleaved_alternative_wall() -> None:
    # Arrange — a property that lodges an Alternative Wall (cert 001431
    # storage-heater variants, "simulated case 34") makes pdftotext weave
    # the §11 "Alternative wall 1" location column into the wrapped
    # glazing-type cell. The wall-location fragments are not part of the
    # glazing type, so the interleaved label must resolve exactly like the
    # clean "Double between 2002 and 2021" label does.
    interleaved_label = (
        "Double between 2002 Alternative wall and 2021 1 Alternative wall"
    )
    clean_label = "Double between 2002 and 2021"

    # Act
    actual = _elmhurst_glazing_type_code(interleaved_label)
    expected = _elmhurst_glazing_type_code(clean_label)

    # Assert
    assert actual == expected
def test_elmhurst_immersion_type_code_maps_dual_and_single() -> None:
# Arrange — Elmhurst Summary §15.1 "Immersion Heater" lodges "Dual"
# or "Single". RdSAP 10 §10.5 (PDF p.54): an immersion is "assumed

View file

@ -5470,6 +5470,21 @@ _ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE: Final[re.Pattern[str]] = re.compile(
# Matches a trailing token at the end of a glazing label: one or more
# whitespace characters followed by either digits with an ordinal suffix
# ("1st", "2nd", "3rd", "4th", ...) or the literal word "Main".
# NOTE(review): "BP" presumably stands for "building part" — confirm.
_ELMHURST_GLAZING_LABEL_TRAILING_BP_RE: Final[re.Pattern[str]] = re.compile(
r"\s+(?:\d+(?:st|nd|rd|th)|Main)$"
)
# Fallback only: when a property lodges an Alternative Wall, pdftotext
# INTERLEAVES the §11 location column ("Alternative wall 1") into the
# wrapped glazing-TYPE cell, e.g. "Double between 2002 Alternative wall
# and 2021 1 Alternative wall" (cert 001431 storage-heater variants,
# `simulated case 34`). The greedy trailing-suffix strip truncates at the
# first "Alternative wall" (losing "and 2021"), so remove EVERY
# wall-location fragment + any stray 1-2 digit location index globally and
# retry. Loss-free: no glazing-type key contains a wall-location phrase or
# a bare 1-2 digit number (install-date years are 4 digits).
#
# The optional index directly after "wall" is deliberately bounded to a
# standalone 1-2 digit number (`\d{1,2}\b`). An unbounded `\d+` would also
# swallow a 4-digit install year whenever the interleaving places the wall
# phrase right before it ("... Alternative wall 2002 and 2021"), silently
# destroying part of the glazing-type key.
_ELMHURST_GLAZING_LABEL_EMBEDDED_WALL_RE: Final[re.Pattern[str]] = re.compile(
    r"\s*(?:External|Alternative|Party)\s+wall(?:\s+\d{1,2}\b)?"
)
# A bare 1-2 digit number standing on its own: the location index that got
# separated from its "<kind> wall" phrase by the column interleaving.
_ELMHURST_GLAZING_LABEL_STRAY_LOCATION_DIGIT_RE: Final[re.Pattern[str]] = re.compile(
    r"\b\d{1,2}\b"
)
def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
@ -5501,6 +5516,18 @@ def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(debp)
if code is not None:
return code
# Fallback: remove INTERLEAVED wall-location fragments from the raw
# label (Alternative/External/Party wall + stray location index) and
# collapse whitespace. Operates on `label`, not the greedily-truncated
# `cleaned`, so "Double between 2002 ... and 2021" survives.
dewall = _ELMHURST_GLAZING_LABEL_EMBEDDED_WALL_RE.sub(" ", label)
dewall = _ELMHURST_GLAZING_LABEL_STRAY_LOCATION_DIGIT_RE.sub(" ", dewall)
dewall = _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE.sub("", dewall)
dewall = re.sub(r"\s+", " ", dewall).strip()
if dewall != cleaned:
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(dewall)
if code is not None:
return code
raise UnmappedElmhurstLabel("glazing_type", label)