fix(elmhurst-mapper): strip wrapped building-part fragment from glazing label

pdftotext can wrap the §11 building-part column onto the glazing-TYPE
token without an intervening glazing-gap descriptor, e.g. "Double between
2002 and 2021 1st" (the "1st" marks the 1st Extension). The existing
trailing-gap fallback only strips the fragment when preceded by "N mm";
the bare ordinal raised UnmappedElmhurstLabel.

New `_ELMHURST_GLAZING_LABEL_TRAILING_BP_RE` strips a trailing ordinal
("1st"/"2nd"/…) or "Main" and retries the lookup. No glazing-type key
ends in an ordinal or "Main", so it is loss-free. Surfaced by worksheet
`simulated case 33` (direct-acting electric boiler + immersion), which
previously could not be routed through the Summary cascade.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-10 21:25:42 +00:00
parent 3cb2711418
commit 020ac6f220
2 changed files with 30 additions and 0 deletions

View file

@ -1639,6 +1639,20 @@ def test_elmhurst_glazing_label_full_coverage_per_sap10_table_6b() -> None:
)
def test_elmhurst_glazing_label_strips_wrapped_building_part_fragment() -> None:
    # Arrange — pdftotext wraps the §11 building-part column (e.g. "1st"
    # for the 1st Extension) onto the glazing-TYPE token even when no
    # glazing-GAP descriptor ("16 mm") sits between them, so the lodged
    # label reads "Double between 2002 and 2021 1st". The fragment is a
    # building-part marker, not part of the glazing type — it must be
    # stripped so the label resolves to its base code. Worksheet
    # `simulated case 33` (direct-acting electric boiler + immersion)
    # surfaced this.
    # Act / Assert — ordinal fragment: base "Double between 2002 and
    # 2021" → code 3, base "Single glazing" → code 1.
    assert _elmhurst_glazing_type_code("Double between 2002 and 2021 1st") == 3
    assert _elmhurst_glazing_type_code("Single glazing 2nd") == 1
    # Act / Assert — the trailing-fragment pattern also accepts the literal
    # "Main" marker; pin that alternative so it cannot regress unnoticed.
    assert _elmhurst_glazing_type_code("Double between 2002 and 2021 Main") == 3
def test_extension_party_wall_type_read_independently_of_as_main_wall() -> None:
# Arrange — RdSAP 10 §3.3: "As Main Wall: Yes" inherits only the
# external wall CONSTRUCTION; the party wall type is lodged

View file

@ -5435,6 +5435,15 @@ _ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE: Final[re.Pattern[str]] = re.compile(
# Matches a trailing glazing-gap descriptor: whitespace, one or more digits,
# optional whitespace, "mm" at a word boundary, plus whatever follows it.
_ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE: Final[re.Pattern[str]] = re.compile(
r"\s+\d+\s*mm\b.*$"
)
# Fallback only: pdftotext can wrap the §11 building-part column onto the
# glazing-TYPE token WITHOUT an intervening glazing-gap descriptor, e.g.
# "Double between 2002 and 2021 1st" (the "1st" marks the 1st Extension).
# The ordinal / "Main" fragment is a building-part marker, not part of the
# glazing type — strip it and retry. No glazing-type key ends in an ordinal
# or "Main", so this is loss-free. Surfaced by `simulated case 33`.
#
# Match contract: exactly ONE trailing fragment, anchored at the end of the
# label — leading whitespace followed by either digits + "st"/"nd"/"rd"/"th"
# or the literal "Main". No flags are passed, so matching is case-sensitive:
# "main" or "1ST" are left in place.
_ELMHURST_GLAZING_LABEL_TRAILING_BP_RE: Final[re.Pattern[str]] = re.compile(
r"\s+(?:\d+(?:st|nd|rd|th)|Main)$"
)
def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
@ -5459,6 +5468,13 @@ def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(degapped)
if code is not None:
return code
# Fallback: strip a trailing wrapped building-part fragment (ordinal /
# "Main") and retry.
debp = _ELMHURST_GLAZING_LABEL_TRAILING_BP_RE.sub("", cleaned).strip()
if debp != cleaned:
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(debp)
if code is not None:
return code
raise UnmappedElmhurstLabel("glazing_type", label)