From ec9ef0e8bb2f1028cdcdd2d974b2afb31743003a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 2 Jun 2026 22:54:49 +0000 Subject: [PATCH] fix(extractor): drop windows-table header remnant from first window glazing type Summary PDFs preprocessed from `pdftotext -layout` wrap the windows-table header across several lines. The third header line's tail ("U value / g value / Draught Proofed / Permanent Shutters") tokenises to "value value Proofed Shutters" and lands directly above the FIRST window's data row. Because the first window in a building part has `before_start = 0`, its prefix block reaches back into that header remnant. The remnant is neither an orientation nor a building-part fragment, so it survived the pops in `_compose_window_descriptors` and leaked into glazing_type as "value value Proofed Shutters Double between 2002 and 2021" (windows 2-3, whose prefix starts after the previous window's manufacturer line, were clean). Fix: the glazing-type phrase always starts with a glazing-start word (Single/Double/Triple/Secondary), so trim any prefix fragments preceding that word before joining the glazing type. Orientation/bp pops still run on the full prefix, so they are unaffected. Reproduced from `sap worksheets/Recommendations Elmhurst Files/cavity_wall_insulation - main wall/before/Summary_001431.pdf`. Added a regression test driving the real `_extract_windows_from_layout` path with the verbatim tokenised header+rows. 2306 passed (+4), pyright net-zero.
Co-Authored-By: Claude Opus 4.8 --- .../documents_parser/elmhurst_extractor.py | 23 +++- .../tests/test_elmhurst_extractor.py | 109 ++++++++++++++++++ 2 files changed, 131 insertions(+), 1 deletion(-) diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index 12c33830..b3fde06b 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -1090,7 +1090,28 @@ class ElmhurstSiteNotesExtractor: if inline_glazing_type is not None: glazing_type = inline_glazing_type else: - glazing_type = " ".join([*prefix, *suffix]).strip() + # The glazing-type phrase always starts with a glazing-start + # word (Single/Double/Triple/Secondary). The FIRST window in + # a building part has `before_start = 0`, so its prefix block + # reaches back into the wrapped windows-table header; the + # third header line's tail tokenises to "value value Proofed + # Shutters" (the "U value / g value / Draught Proofed / + # Permanent Shutters" column titles) and is neither an + # orientation nor a bp fragment, so it survives the pops. + # Drop any prefix fragments preceding the glazing-start word + # so they don't leak into the glazing type. + glazing_start = next( + ( + idx + for idx, frag in enumerate(prefix) + if frag.split(" ", 1)[0] in self._GLAZING_TYPE_PREFIX_WORDS + ), + None, + ) + glazing_prefix = ( + prefix[glazing_start:] if glazing_start is not None else prefix + ) + glazing_type = " ".join([*glazing_prefix, *suffix]).strip() # Building part: inline token wins; otherwise join prefix + suffix. 
if bp_inline is not None: diff --git a/backend/documents_parser/tests/test_elmhurst_extractor.py b/backend/documents_parser/tests/test_elmhurst_extractor.py index e0dca443..62c0e743 100644 --- a/backend/documents_parser/tests/test_elmhurst_extractor.py +++ b/backend/documents_parser/tests/test_elmhurst_extractor.py @@ -513,3 +513,112 @@ class TestLightingLedCflUnknown: def test_cfl_count_zero_when_unknown(self, result2: ElmhurstSiteNotes) -> None: assert result2.lighting.cfl_count == 0 + + +class TestWindowsLayoutHeaderRemnant: + """Regression for the first-window glazing-type header leak. + + Summary PDFs preprocessed from `pdftotext -layout` wrap the windows + table header across several lines. The third header line's tail + ("U value / g value / Draught Proofed / Permanent Shutters") tokenises + to "value value Proofed Shutters" and sits directly above the FIRST + window's data row. Because the first window in a building part has + `before_start = 0`, its prefix block reaches back into that header + remnant, which is neither an orientation nor a building-part fragment + and so survived into `glazing_type` as + "value value Proofed Shutters Double between 2002 and 2021". + + Reproduced from `sap worksheets/Recommendations Elmhurst Files/ + cavity_wall_insulation - main wall/before/Summary_001431.pdf` (3 + Manufacturer-data-source windows; only window 0 was corrupted). + """ + + # Faithful reproduction of the tokenised windows section (one page), + # captured verbatim from the Summary PDF above. The header remnant + # "value value Proofed Shutters" precedes window 0's wrapped glazing + # cell ("Double between 2002" / "and 2021"). + _WINDOWS_PAGE = "\n".join([ + "11.0 Windows:", + "Frame Frame Glazing", + "Building", + "U", + "g Draught Permanent", + "W", + "H", + "Area Glazing Type", + "Location", + "Orient. 
Data-Source", + "Type Factor Gap", + "Part", + "value value Proofed Shutters", + "Double between 2002", + "North", + "0.97 1.00 0.97", + "PVC", + "0.70", + "Main", + "External wall", + "Manufacturer 2.00", + "0.72", + "Yes", + "None", + "and 2021", + "West", + "Double between 2002", + "South", + "2.66 1.00 2.66", + "PVC", + "0.70", + "Main", + "External wall", + "Manufacturer 2.00", + "0.72", + "Yes", + "None", + "and 2021", + "East", + "Double between 2002", + "South", + "2.66 1.00 2.66", + "PVC", + "0.70", + "Main", + "External wall", + "Manufacturer 2.00", + "0.72", + "Yes", + "None", + "and 2021", + "East", + "12.0 Ventilation", + ]) + + @pytest.fixture(scope="class") + def windows(self) -> list[Window]: + return ElmhurstSiteNotesExtractor([self._WINDOWS_PAGE])._extract_windows() + + def test_window_count(self, windows: list[Window]) -> None: + # Arrange / Act / Assert + assert len(windows) == 3 + + def test_first_window_glazing_type_excludes_header_remnant( + self, windows: list[Window] + ) -> None: + # Arrange / Act / Assert — no "value value Proofed Shutters" leak. + assert windows[0].glazing_type == "Double between 2002 and 2021" + + def test_all_windows_share_clean_glazing_type( + self, windows: list[Window] + ) -> None: + # Arrange / Act / Assert — windows 1 and 2 were already clean; + # all three must agree after the fix. + assert [w.glazing_type for w in windows] == [ + "Double between 2002 and 2021" + ] * 3 + + def test_first_window_orientation_unaffected( + self, windows: list[Window] + ) -> None: + # Arrange / Act / Assert — trimming the glazing prefix must not + # disturb orientation extraction (North + West fragments). + assert windows[0].orientation == "North-West"