From b0a47cda05c3ad958929649354931bf20d414c40 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 11 Jun 2026 07:07:08 +0000 Subject: [PATCH] fix(elmhurst-mapper): strip interleaved Alternative-wall fragments from glazing label MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a property lodges an Alternative Wall, pdftotext interleaves the §11 "Location" column ("Alternative wall 1") into the wrapped glazing-TYPE cell, producing labels like "Double between 2002 Alternative wall and 2021 1 Alternative wall" (cert 001431 storage-heater variants, simulated case 34). The existing greedy trailing-suffix strip (\s+Alternative wall.*$) truncates at the FIRST "Alternative wall", losing "and 2021" and yielding the unmatchable "Double between 2002". Added a fallback that removes EVERY "External/Alternative/Party wall [n]" fragment and any stray 1-2 digit location index from the raw label, then retries the lookup. Loss-free: no glazing-type key contains a wall-location phrase or a bare 1-2 digit number (install-date years are 4 digits). Unblocks the Summary cascade for any property with an Alternative Wall; Summary-path only (the API path receives structured glazing codes, so the API gauge is unaffected). Regression gate green (1 pre-existing fail unrelated). 
Co-Authored-By: Claude Opus 4.8 --- .../tests/test_summary_pdf_mapper_chain.py | 17 ++++++++++++ datatypes/epc/domain/mapper.py | 27 +++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index f89c4c72..a495addd 100644 --- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -1540,6 +1540,23 @@ def test_summary_mapper_raises_on_unmapped_cylinder_insulation_label() -> None: assert excinfo.value.value == "Polyester wool" +def test_elmhurst_glazing_type_code_strips_interleaved_alternative_wall() -> None: + # Arrange — when a property lodges an Alternative Wall (cert 001431 + # storage-heater variants, "simulated case 34"), pdftotext interleaves + # the §11 "Alternative wall 1" location column into the wrapped + # glazing-type cell, e.g. "Double between 2002 Alternative wall and 2021 + # 1 Alternative wall". The wall-location fragments are not part of the + # glazing type — the helper must recover "Double between 2002 and 2021". + + # Act + code = _elmhurst_glazing_type_code( + "Double between 2002 Alternative wall and 2021 1 Alternative wall" + ) + + # Assert + assert code == _elmhurst_glazing_type_code("Double between 2002 and 2021") + + def test_elmhurst_immersion_type_code_maps_dual_and_single() -> None: # Arrange — Elmhurst Summary §15.1 "Immersion Heater" lodges "Dual" # or "Single". 
RdSAP 10 §10.5 (PDF p.54): an immersion is "assumed diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index e5c794c8..d6573aac 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -5470,6 +5470,21 @@ _ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE: Final[re.Pattern[str]] = re.compile( _ELMHURST_GLAZING_LABEL_TRAILING_BP_RE: Final[re.Pattern[str]] = re.compile( r"\s+(?:\d+(?:st|nd|rd|th)|Main)$" ) +# Fallback only: when a property lodges an Alternative Wall, pdftotext +# INTERLEAVES the §11 location column ("Alternative wall 1") into the +# wrapped glazing-TYPE cell, e.g. "Double between 2002 Alternative wall +# and 2021 1 Alternative wall" (cert 001431 storage-heater variants, +# `simulated case 34`). The greedy trailing-suffix strip truncates at the +# first "Alternative wall" (losing "and 2021"), so remove EVERY +# wall-location fragment + any stray 1-2 digit location index globally and +# retry. Loss-free: no glazing-type key contains a wall-location phrase or +# a bare 1-2 digit number (install-date years are 4 digits). +_ELMHURST_GLAZING_LABEL_EMBEDDED_WALL_RE: Final[re.Pattern[str]] = re.compile( + r"\s*(?:External|Alternative|Party)\s+wall(?:\s+\d+)?" +) +_ELMHURST_GLAZING_LABEL_STRAY_LOCATION_DIGIT_RE: Final[re.Pattern[str]] = re.compile( + r"\b\d{1,2}\b" +) def _elmhurst_glazing_type_code(label: Optional[str]) -> int: @@ -5501,6 +5516,18 @@ def _elmhurst_glazing_type_code(label: Optional[str]) -> int: code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(debp) if code is not None: return code + # Fallback: remove INTERLEAVED wall-location fragments from the raw + # label (Alternative/External/Party wall + stray location index) and + # collapse whitespace. Operates on `label`, not the greedily-truncated + # `cleaned`, so "Double between 2002 ... and 2021" survives. 
+ dewall = _ELMHURST_GLAZING_LABEL_EMBEDDED_WALL_RE.sub(" ", label) + dewall = _ELMHURST_GLAZING_LABEL_STRAY_LOCATION_DIGIT_RE.sub(" ", dewall) + dewall = _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE.sub("", dewall) + dewall = re.sub(r"\s+", " ", dewall).strip() + if dewall != cleaned: + code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(dewall) + if code is not None: + return code raise UnmappedElmhurstLabel("glazing_type", label)