fix(extractor): drop windows-table header remnant from first window glazing type

Summary PDFs preprocessed with `pdftotext -layout` wrap the windows-table
header across several lines. The tail of the third header line ("U value /
g value / Draught Proofed / Permanent Shutters") tokenises to "value value
Proofed Shutters" and lands directly above the FIRST window's data row.

Because the first window in a building part has `before_start = 0`, its
prefix block reaches back into that header remnant. The remnant is
neither an orientation nor a building-part fragment, so it survived the
pops in `_compose_window_descriptors` and leaked into glazing_type as
"value value Proofed Shutters Double between 2002 and 2021". Windows 2-3,
whose prefixes start after the previous window's manufacturer line, were
clean.

Fix: the glazing-type phrase always starts with a glazing-start word
(Single/Double/Triple/Secondary), so trim any prefix fragments preceding
that word before joining the glazing type. Orientation/bp pops still run
on the full prefix, so they are unaffected.

Reproduced from `sap worksheets/Recommendations Elmhurst Files/
cavity_wall_insulation - main wall/before/Summary_001431.pdf`. Added a
regression test driving the real `_extract_windows_from_layout` path with
the verbatim tokenised header and rows. Test suite: 2306 passed (+4 new
tests); no net change in pyright diagnostics.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-02 22:54:49 +00:00 committed by Jun-te Kim
parent 846952f7cd
commit fc30480506
2 changed files with 131 additions and 1 deletions

View file

@ -1090,7 +1090,28 @@ class ElmhurstSiteNotesExtractor:
if inline_glazing_type is not None:
glazing_type = inline_glazing_type
else:
glazing_type = " ".join([*prefix, *suffix]).strip()
# The glazing-type phrase always starts with a glazing-start
# word (Single/Double/Triple/Secondary). The FIRST window in
# a building part has `before_start = 0`, so its prefix block
# reaches back into the wrapped windows-table header; the
# third header line's tail tokenises to "value value Proofed
# Shutters" (the "U value / g value / Draught Proofed /
# Permanent Shutters" column titles) and is neither an
# orientation nor a bp fragment, so it survives the pops.
# Drop any prefix fragments preceding the glazing-start word
# so they don't leak into the glazing type.
glazing_start = next(
(
idx
for idx, frag in enumerate(prefix)
if frag.split(" ", 1)[0] in self._GLAZING_TYPE_PREFIX_WORDS
),
None,
)
glazing_prefix = (
prefix[glazing_start:] if glazing_start is not None else prefix
)
glazing_type = " ".join([*glazing_prefix, *suffix]).strip()
# Building part: inline token wins; otherwise join prefix + suffix.
if bp_inline is not None:

View file

@ -513,3 +513,112 @@ class TestLightingLedCflUnknown:
def test_cfl_count_zero_when_unknown(self, result2: ElmhurstSiteNotes) -> None:
# The `result2` fixture's lighting section must report a CFL count of zero.
assert result2.lighting.cfl_count == 0
class TestWindowsLayoutHeaderRemnant:
    """Guard against the windows-table header leaking into a glazing type.

    `pdftotext -layout` output for Summary PDFs wraps the windows-table
    header over several lines. The end of the third header line
    ("U value / g value / Draught Proofed / Permanent Shutters") tokenises
    to "value value Proofed Shutters" and sits immediately above the data
    row of the FIRST window. That window has `before_start = 0`, so its
    prefix block extends back into the header remnant; the remnant is
    neither an orientation nor a building-part fragment, and it used to end
    up in `glazing_type` as
    "value value Proofed Shutters Double between 2002 and 2021".

    Source document: `sap worksheets/Recommendations Elmhurst Files/
    cavity_wall_insulation - main wall/before/Summary_001431.pdf`, which
    has three Manufacturer-data-source windows; only window 0 was affected.
    """

    # Wrapped table header exactly as tokenised from the Summary PDF. The
    # final entry is the remnant that precedes window 0's glazing cell.
    _HEADER_LINES = (
        "11.0 Windows:",
        "Frame Frame Glazing",
        "Building",
        "U",
        "g Draught Permanent",
        "W",
        "H",
        "Area Glazing Type",
        "Location",
        "Orient. Data-Source",
        "Type Factor Gap",
        "Part",
        "value value Proofed Shutters",
    )

    # Window 0: wrapped glazing cell ("Double between 2002" / "and 2021")
    # with North + West orientation fragments.
    _FIRST_WINDOW_LINES = (
        "Double between 2002",
        "North",
        "0.97 1.00 0.97",
        "PVC",
        "0.70",
        "Main",
        "External wall",
        "Manufacturer 2.00",
        "0.72",
        "Yes",
        "None",
        "and 2021",
        "West",
    )

    # Windows 1 and 2 are token-for-token identical in the captured page.
    _LATER_WINDOW_LINES = (
        "Double between 2002",
        "South",
        "2.66 1.00 2.66",
        "PVC",
        "0.70",
        "Main",
        "External wall",
        "Manufacturer 2.00",
        "0.72",
        "Yes",
        "None",
        "and 2021",
        "East",
    )

    # Single-page windows section: header, three window rows, next heading.
    _WINDOWS_PAGE = "\n".join(
        (
            *_HEADER_LINES,
            *_FIRST_WINDOW_LINES,
            *_LATER_WINDOW_LINES,
            *_LATER_WINDOW_LINES,
            "12.0 Ventilation",
        )
    )

    # Glazing type every window should report once the remnant is dropped.
    _CLEAN_GLAZING_TYPE = "Double between 2002 and 2021"

    @pytest.fixture(scope="class")
    def windows(self) -> list[Window]:
        """Extract the windows from the single captured page."""
        extractor = ElmhurstSiteNotesExtractor([self._WINDOWS_PAGE])
        return extractor._extract_windows()

    def test_window_count(self, windows: list[Window]) -> None:
        """All three windows on the page are found."""
        assert len(windows) == 3

    def test_first_window_glazing_type_excludes_header_remnant(
        self, windows: list[Window]
    ) -> None:
        """Window 0 carries no "value value Proofed Shutters" leak."""
        first_window = windows[0]
        assert first_window.glazing_type == self._CLEAN_GLAZING_TYPE

    def test_all_windows_share_clean_glazing_type(
        self, windows: list[Window]
    ) -> None:
        """Windows 1 and 2 were already clean; all three must now agree."""
        observed = [window.glazing_type for window in windows]
        assert observed == [self._CLEAN_GLAZING_TYPE] * 3

    def test_first_window_orientation_unaffected(
        self, windows: list[Window]
    ) -> None:
        """Trimming the glazing prefix leaves North + West orientation intact."""
        first_window = windows[0]
        assert first_window.orientation == "North-West"