diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
index fb5e8af8..5a0d47d2 100644
--- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
+++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
@@ -1639,6 +1639,20 @@ def test_elmhurst_glazing_label_full_coverage_per_sap10_table_6b() -> None:
     )
 
 
+def test_elmhurst_glazing_label_strips_wrapped_building_part_fragment() -> None:
+    # Arrange — pdftotext wraps the §11 building-part column (e.g. "1st"
+    # for the 1st Extension) onto the glazing-TYPE token even when no
+    # glazing-GAP descriptor ("16 mm") sits between them, so the lodged
+    # label reads "Double between 2002 and 2021 1st". The fragment is a
+    # building-part marker, not part of the glazing type — it must be
+    # stripped so the label resolves to its base code. Worksheet
+    # `simulated case 33` (direct-acting electric boiler + immersion)
+    # surfaced this.
+    # Act / Assert — "Double between 2002 and 2021" → 3, "Single glazing" → 1.
+    assert _elmhurst_glazing_type_code("Double between 2002 and 2021 1st") == 3
+    assert _elmhurst_glazing_type_code("Single glazing 2nd") == 1
+
+
 def test_extension_party_wall_type_read_independently_of_as_main_wall() -> None:
     # Arrange — RdSAP 10 §3.3: "As Main Wall: Yes" inherits only the
     # external wall CONSTRUCTION; the party wall type is lodged
diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py
index 32af358d..39c3365b 100644
--- a/datatypes/epc/domain/mapper.py
+++ b/datatypes/epc/domain/mapper.py
@@ -5435,6 +5435,15 @@ _ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE: Final[re.Pattern[str]] = re.compile(
 _ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE: Final[re.Pattern[str]] = re.compile(
     r"\s+\d+\s*mm\b.*$"
 )
+# Fallback only: pdftotext can wrap the §11 building-part column onto the
+# glazing-TYPE token WITHOUT an intervening glazing-gap descriptor, e.g.
+# "Double between 2002 and 2021 1st" (the "1st" marks the 1st Extension).
+# The ordinal / "Main" fragment is a building-part marker, not part of the
+# glazing type — strip it and retry. No glazing-type key ends in an ordinal
+# or "Main", so this is loss-free. Surfaced by `simulated case 33`.
+_ELMHURST_GLAZING_LABEL_TRAILING_BP_RE: Final[re.Pattern[str]] = re.compile(
+    r"\s+(?:\d+(?:st|nd|rd|th)|Main)$"
+)
 
 
 def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
@@ -5459,6 +5468,13 @@ def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
         code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(degapped)
         if code is not None:
             return code
+    # Fallback: strip a trailing wrapped building-part fragment (ordinal /
+    # "Main") from `cleaned` and retry; otherwise fall through to the raise.
+    debp = _ELMHURST_GLAZING_LABEL_TRAILING_BP_RE.sub("", cleaned).strip()
+    if debp != cleaned:
+        code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(debp)
+        if code is not None:
+            return code
     raise UnmappedElmhurstLabel("glazing_type", label)
 
 