S0380.216: extractor — handle wrapped glazing-gap column in §11 labels

pdftotext dumps of hand-entered Elmhurst worksheets wrap the §11 glazing-
GAP column ("16 mm or more") onto the glazing-TYPE token, yielding labels
like "Double between 2002 and 2021 16 mm or [1st]" that
`_elmhurst_glazing_type_code` didn't recognise → UnmappedElmhurstLabel,
blocking the whole Summary from parsing.

Added a fallback: when the lightly-cleaned label isn't a known key, strip a
trailing wrapped gap descriptor (`\s+\d+\s*mm\b.*$`) and retry. The fallback
runs only AFTER the direct lookup has missed, so explicitly-mapped
interleaved variants (e.g. "Double with unknown 16 mm or install date more",
where the gap text lands in the middle of the type text instead of trailing
it) are unaffected. The gap drives the API-path U-value lookup, not the
site-notes glazing-type enum, so dropping it is loss-free for the cascade.

Unblocks running our cascade on hand-entered worksheet Summaries — used to
validate the PV β-split against simulated case 18 (our split matches the
P960 worksheet exactly: gen 2684.17, onsite 970.77, export 1713.40).

Suite: 2391 passed, 1 skipped. Zero new pyright errors (mapper 32=32).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-04 11:03:32 +00:00
parent 2f5ca85854
commit 712cc6f3f8
2 changed files with 68 additions and 3 deletions

View file

@ -4858,6 +4858,18 @@ _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE: Final[re.Pattern[str]] = re.compile(
# Trailing noise removed from a glazing label before lookup: either a
# whitespace-led "Summary Information" at the very end of the label, or
# everything from a whitespace-led "Alternative wall" onwards.
_ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE: Final[re.Pattern[str]] = (
    re.compile(r"\s+Summary Information$|\s+Alternative wall.*$")
)
# Used by the fallback path only. On hand-entered worksheets pdftotext
# wraps the §11 glazing-GAP column ("6 mm" / "12 mm" / "16 mm or more")
# onto the glazing-TYPE token, producing labels such as
# "Double between 2002 and 2021 16 mm or [1st]". The pattern removes the
# first whitespace-led "<digits> mm" token together with everything after
# it (including any building-part fragment), so the remaining text can be
# looked up again. It is tried only once the direct lookup of the
# lightly-cleaned label has missed, which leaves explicitly-mapped
# interleaved variants (e.g. "Double with unknown 16 mm or install date
# more") untouched. The gap feeds the API-path U-value lookup rather than
# the site-notes glazing-type enum, so discarding it here loses nothing
# for the cascade.
_ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE: Final[re.Pattern[str]] = (
    re.compile(r"\s+\d+\s*mm\b.*$")
)
def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
@ -4874,9 +4886,15 @@ def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
cleaned = _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE.sub("", label)
cleaned = _ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE.sub("", cleaned).strip()
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(cleaned)
if code is None:
raise UnmappedElmhurstLabel("glazing_type", label)
return code
if code is not None:
return code
# Fallback: strip a trailing wrapped glazing-gap descriptor and retry.
degapped = _ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE.sub("", cleaned).strip()
if degapped != cleaned:
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(degapped)
if code is not None:
return code
raise UnmappedElmhurstLabel("glazing_type", label)
def _elmhurst_main_heating_category(

View file

@ -748,3 +748,50 @@ class TestApiResolveWallInsulationThickness:
# Assert
assert resolved == "measured"
# ---------------------------------------------------------------------------
# Glazing-type label cleaning — pdftotext gap-column wrap
# ---------------------------------------------------------------------------
class TestElmhurstGlazingTypeWrappedGap:
    """Glazing-type labels polluted by a wrapped glazing-gap column.

    A pdftotext dump of a hand-entered Elmhurst worksheet wraps the
    glazing-GAP column ("16 mm or more") onto the glazing-TYPE token, so the
    extractor sees labels such as "Double between 2002 and 2021 16 mm or",
    sometimes followed by a building-part fragment. These tests pin that
    such labels resolve to the clean type's code rather than raising
    UnmappedElmhurstLabel.
    """

    def test_trailing_gap_descriptor_stripped(self) -> None:
        """A label ending in the wrapped gap text maps to the clean type."""
        # Arrange
        from datatypes.epc.domain.mapper import _elmhurst_glazing_type_code

        wrapped_label = "Double between 2002 and 2021 16 mm or"

        # Act
        sap10_code = _elmhurst_glazing_type_code(wrapped_label)

        # Assert — clean "Double between 2002 and 2021" → SAP10 code 3
        assert sap10_code == 3

    def test_trailing_gap_plus_building_part_fragment_stripped(self) -> None:
        """Text following the gap descriptor is discarded along with it."""
        # Arrange
        from datatypes.epc.domain.mapper import _elmhurst_glazing_type_code

        wrapped_label = "Double between 2002 and 2021 16 mm or 1st"

        # Act
        sap10_code = _elmhurst_glazing_type_code(wrapped_label)

        # Assert
        assert sap10_code == 3

    def test_clean_label_still_maps(self) -> None:
        """Regression guard: a label without wrapped gap text is unaffected."""
        # Arrange
        from datatypes.epc.domain.mapper import _elmhurst_glazing_type_code

        clean_label = "Double pre 2002"

        # Act
        sap10_code = _elmhurst_glazing_type_code(clean_label)

        # Assert
        assert sap10_code == 2