diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py
index 032a881e..41c07a6d 100644
--- a/datatypes/epc/domain/mapper.py
+++ b/datatypes/epc/domain/mapper.py
@@ -4858,6 +4858,18 @@ _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE: Final[re.Pattern[str]] = re.compile(
 _ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE: Final[re.Pattern[str]] = re.compile(
     r"\s+Summary Information$|\s+Alternative wall.*$"
 )
+# Fallback only: pdftotext wraps the §11 glazing-GAP column ("6 mm" /
+# "12 mm" / "16 mm or more") onto the glazing-TYPE token on hand-entered
+# worksheets, e.g. "Double between 2002 and 2021 16 mm or [1st]". When the
+# lightly-cleaned label isn't a known key, strip the trailing gap
+# descriptor (and any building-part fragment after it) and retry. Applied
+# AFTER the direct lookup so explicitly-mapped interleaved variants (e.g.
+# "Double with unknown 16 mm or install date more") are unaffected. The
+# gap drives the API-path U-value lookup, not the site-notes glazing-type
+# enum, so dropping it here is loss-free for the cascade.
+_ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE: Final[re.Pattern[str]] = re.compile(
+    r"\s+\d+\s*mm\b.*$"
+)
 
 
 def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
@@ -4874,9 +4886,15 @@ def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
     cleaned = _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE.sub("", label)
     cleaned = _ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE.sub("", cleaned).strip()
     code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(cleaned)
-    if code is None:
-        raise UnmappedElmhurstLabel("glazing_type", label)
-    return code
+    if code is not None:
+        return code
+    # Fallback: strip a trailing wrapped glazing-gap descriptor and retry.
+    degapped = _ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE.sub("", cleaned).strip()
+    if degapped != cleaned:
+        code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(degapped)
+        if code is not None:
+            return code
+    raise UnmappedElmhurstLabel("glazing_type", label)
 
 
 def _elmhurst_main_heating_category(
diff --git a/datatypes/epc/domain/tests/test_from_rdsap_schema.py b/datatypes/epc/domain/tests/test_from_rdsap_schema.py
index 694726f1..58b9ed1a 100644
--- a/datatypes/epc/domain/tests/test_from_rdsap_schema.py
+++ b/datatypes/epc/domain/tests/test_from_rdsap_schema.py
@@ -748,3 +748,50 @@ class TestApiResolveWallInsulationThickness:
 
         # Assert
         assert resolved == "measured"
+
+
+# ---------------------------------------------------------------------------
+# Glazing-type label cleaning — pdftotext gap-column wrap
+# ---------------------------------------------------------------------------
+
+
+class TestElmhurstGlazingTypeWrappedGap:
+    """When a hand-entered Elmhurst worksheet is dumped via pdftotext, the
+    glazing-GAP column ("16 mm or more") wraps onto the glazing-TYPE token,
+    yielding labels like "Double between 2002 and 2021 16 mm or" (plus a
+    trailing building-part fragment). The extractor must strip the trailing
+    gap descriptor and map the clean type, not raise UnmappedElmhurstLabel."""
+
+    def test_trailing_gap_descriptor_stripped(self) -> None:
+        # Arrange
+        from datatypes.epc.domain.mapper import _elmhurst_glazing_type_code
+
+        # Act
+        code = _elmhurst_glazing_type_code(
+            "Double between 2002 and 2021 16 mm or"
+        )
+
+        # Assert — clean "Double between 2002 and 2021" → SAP10 code 3
+        assert code == 3
+
+    def test_trailing_gap_plus_building_part_fragment_stripped(self) -> None:
+        # Arrange
+        from datatypes.epc.domain.mapper import _elmhurst_glazing_type_code
+
+        # Act
+        code = _elmhurst_glazing_type_code(
+            "Double between 2002 and 2021 16 mm or 1st"
+        )
+
+        # Assert
+        assert code == 3
+
+    def test_clean_label_still_maps(self) -> None:
+        # Arrange — regression guard: an un-wrapped label is unaffected.
+        from datatypes.epc.domain.mapper import _elmhurst_glazing_type_code
+
+        # Act
+        code = _elmhurst_glazing_type_code("Double pre 2002")
+
+        # Assert
+        assert code == 2