diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
index f89c4c72..a495addd 100644
--- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
+++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
@@ -1540,6 +1540,37 @@ def test_summary_mapper_raises_on_unmapped_cylinder_insulation_label() -> None:
     assert excinfo.value.value == "Polyester wool"
 
 
+def test_elmhurst_glazing_type_code_strips_interleaved_alternative_wall() -> None:
+    # Arrange — when a property lodges an Alternative Wall (cert 001431
+    # storage-heater variants, "simulated case 34"), pdftotext interleaves
+    # the §11 "Alternative wall 1" location column into the wrapped
+    # glazing-type cell, e.g. "Double between 2002 Alternative wall and 2021
+    # 1 Alternative wall". The wall-location fragments are not part of the
+    # glazing type — the helper must recover "Double between 2002 and 2021".
+
+    # Act
+    code = _elmhurst_glazing_type_code(
+        "Double between 2002 Alternative wall and 2021 1 Alternative wall"
+    )
+
+    # Assert
+    assert code == _elmhurst_glazing_type_code("Double between 2002 and 2021")
+
+
+def test_elmhurst_glazing_type_code_keeps_year_after_interleaved_wall() -> None:
+    # Arrange — guard case: if the wrap ever places the install-date year
+    # directly after the interleaved wall fragment, the optional
+    # location-index group must not swallow the 4-digit year.
+
+    # Act
+    code = _elmhurst_glazing_type_code(
+        "Double between Alternative wall 2002 and 2021 1"
+    )
+
+    # Assert
+    assert code == _elmhurst_glazing_type_code("Double between 2002 and 2021")
+
+
 def test_elmhurst_immersion_type_code_maps_dual_and_single() -> None:
     # Arrange — Elmhurst Summary §15.1 "Immersion Heater" lodges "Dual"
     # or "Single". RdSAP 10 §10.5 (PDF p.54): an immersion is "assumed
diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py
index e5c794c8..d6573aac 100644
--- a/datatypes/epc/domain/mapper.py
+++ b/datatypes/epc/domain/mapper.py
@@ -5470,6 +5470,21 @@ _ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE: Final[re.Pattern[str]] = re.compile(
 _ELMHURST_GLAZING_LABEL_TRAILING_BP_RE: Final[re.Pattern[str]] = re.compile(
     r"\s+(?:\d+(?:st|nd|rd|th)|Main)$"
 )
+# Fallback only: when a property lodges an Alternative Wall, pdftotext
+# INTERLEAVES the §11 location column ("Alternative wall 1") into the
+# wrapped glazing-TYPE cell, e.g. "Double between 2002 Alternative wall
+# and 2021 1 Alternative wall" (cert 001431 storage-heater variants,
+# `simulated case 34`). The greedy trailing-suffix strip truncates at the
+# first "Alternative wall" (losing "and 2021"), so remove EVERY wall
+# fragment + any stray 1-2 digit location index globally and retry.
+# Loss-free: glazing-type keys hold no wall phrase or bare 1-2 digit
+# number; the index is capped at 2 digits so 4-digit years survive.
+_ELMHURST_GLAZING_LABEL_EMBEDDED_WALL_RE: Final[re.Pattern[str]] = re.compile(
+    r"\s*(?:External|Alternative|Party)\s+wall(?:\s+\d{1,2}\b)?"
+)
+_ELMHURST_GLAZING_LABEL_STRAY_LOCATION_DIGIT_RE: Final[re.Pattern[str]] = re.compile(
+    r"\b\d{1,2}\b"
+)
 
 
 def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
@@ -5501,6 +5516,18 @@ def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
         code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(debp)
         if code is not None:
             return code
+    # Fallback: remove INTERLEAVED wall-location fragments from the raw
+    # label (Alternative/External/Party wall + stray location index) and
+    # collapse whitespace. Operates on `label`, not the greedily-truncated
+    # `cleaned`, so "Double between 2002 ... and 2021" survives.
+    dewall = _ELMHURST_GLAZING_LABEL_EMBEDDED_WALL_RE.sub(" ", label)
+    dewall = _ELMHURST_GLAZING_LABEL_STRAY_LOCATION_DIGIT_RE.sub(" ", dewall)
+    dewall = _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE.sub("", dewall)
+    dewall = re.sub(r"\s+", " ", dewall).strip()
+    if dewall != cleaned:
+        code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(dewall)
+        if code is not None:
+            return code
     raise UnmappedElmhurstLabel("glazing_type", label)
 
 