mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
fix(extractor): drop windows-table header remnant from first window glazing type
Summary PDFs preprocessed from `pdftotext -layout` wrap the windows-table
header across several lines. The third header line's tail ("U value / g
value / Draught Proofed / Permanent Shutters") tokenises to "value value
Proofed Shutters" and lands directly above the FIRST window's data row.
Because the first window in a building part has `before_start = 0`, its
prefix block reaches back into that header remnant. The remnant is
neither an orientation nor a building-part fragment, so it survived the
pops in `_compose_window_descriptors` and leaked into glazing_type as
"value value Proofed Shutters Double between 2002 and 2021" (the second and
third windows, whose prefixes start after the previous window's manufacturer
line, were clean).
Fix: the glazing-type phrase always starts with a glazing-start word
(Single/Double/Triple/Secondary), so trim any prefix fragments preceding
that word before joining the glazing type. Orientation/bp pops still run
on the full prefix, so they are unaffected.
Reproduced from `sap worksheets/Recommendations Elmhurst Files/
cavity_wall_insulation - main wall/before/Summary_001431.pdf`. Added a
regression test driving the real `_extract_windows_from_layout` path with
the verbatim tokenised header+rows. 2306 passed (+4), pyright net-zero.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
846952f7cd
commit
fc30480506
2 changed files with 131 additions and 1 deletions
|
|
@ -1090,7 +1090,28 @@ class ElmhurstSiteNotesExtractor:
|
|||
if inline_glazing_type is not None:
|
||||
glazing_type = inline_glazing_type
|
||||
else:
|
||||
glazing_type = " ".join([*prefix, *suffix]).strip()
|
||||
# The glazing-type phrase always starts with a glazing-start
|
||||
# word (Single/Double/Triple/Secondary). The FIRST window in
|
||||
# a building part has `before_start = 0`, so its prefix block
|
||||
# reaches back into the wrapped windows-table header; the
|
||||
# third header line's tail tokenises to "value value Proofed
|
||||
# Shutters" (the "U value / g value / Draught Proofed /
|
||||
# Permanent Shutters" column titles) and is neither an
|
||||
# orientation nor a bp fragment, so it survives the pops.
|
||||
# Drop any prefix fragments preceding the glazing-start word
|
||||
# so they don't leak into the glazing type.
|
||||
glazing_start = next(
|
||||
(
|
||||
idx
|
||||
for idx, frag in enumerate(prefix)
|
||||
if frag.split(" ", 1)[0] in self._GLAZING_TYPE_PREFIX_WORDS
|
||||
),
|
||||
None,
|
||||
)
|
||||
glazing_prefix = (
|
||||
prefix[glazing_start:] if glazing_start is not None else prefix
|
||||
)
|
||||
glazing_type = " ".join([*glazing_prefix, *suffix]).strip()
|
||||
|
||||
# Building part: inline token wins; otherwise join prefix + suffix.
|
||||
if bp_inline is not None:
|
||||
|
|
|
|||
|
|
@ -513,3 +513,112 @@ class TestLightingLedCflUnknown:
|
|||
|
||||
def test_cfl_count_zero_when_unknown(self, result2: ElmhurstSiteNotes) -> None:
    """Check that the `result2` fixture's lighting reports a CFL count of 0."""
    observed_cfl_count = result2.lighting.cfl_count
    assert observed_cfl_count == 0
|
||||
|
||||
|
||||
class TestWindowsLayoutHeaderRemnant:
    """Guard against the windows-table header leaking into a glazing type.

    `pdftotext -layout` output for Summary PDFs spreads the windows-table
    header over several lines. The tail of the third header line
    ("U value / g value / Draught Proofed / Permanent Shutters") is
    tokenised as "value value Proofed Shutters" and lands immediately
    above the FIRST window's data row. The first window of a building
    part has `before_start = 0`, so its prefix block extends back into
    that remnant; being neither an orientation nor a building-part
    fragment, the remnant used to end up in `glazing_type`, producing
    "value value Proofed Shutters Double between 2002 and 2021".

    Source document: `sap worksheets/Recommendations Elmhurst Files/
    cavity_wall_insulation - main wall/before/Summary_001431.pdf`, which
    has 3 Manufacturer-data-source windows; only window 0 was corrupted.
    """

    # One page of the tokenised windows section, taken verbatim from the
    # Summary PDF named in the class docstring. The header remnant
    # "value value Proofed Shutters" sits directly before window 0's
    # wrapped glazing cell ("Double between 2002" / "and 2021").
    _WINDOWS_PAGE = "\n".join(
        (
            # --- section title and wrapped table header ---
            "11.0 Windows:",
            "Frame Frame Glazing",
            "Building",
            "U",
            "g Draught Permanent",
            "W",
            "H",
            "Area Glazing Type",
            "Location",
            "Orient. Data-Source",
            "Type Factor Gap",
            "Part",
            "value value Proofed Shutters",
            # --- window 0 (North / West orientation fragments) ---
            "Double between 2002",
            "North",
            "0.97 1.00 0.97",
            "PVC",
            "0.70",
            "Main",
            "External wall",
            "Manufacturer 2.00",
            "0.72",
            "Yes",
            "None",
            "and 2021",
            "West",
            # --- window 1 (South / East orientation fragments) ---
            "Double between 2002",
            "South",
            "2.66 1.00 2.66",
            "PVC",
            "0.70",
            "Main",
            "External wall",
            "Manufacturer 2.00",
            "0.72",
            "Yes",
            "None",
            "and 2021",
            "East",
            # --- window 2 (South / East orientation fragments) ---
            "Double between 2002",
            "South",
            "2.66 1.00 2.66",
            "PVC",
            "0.70",
            "Main",
            "External wall",
            "Manufacturer 2.00",
            "0.72",
            "Yes",
            "None",
            "and 2021",
            "East",
            # --- heading of the next section ---
            "12.0 Ventilation",
        )
    )

    @pytest.fixture(scope="class")
    def windows(self) -> list[Window]:
        """Run the extractor's window extraction over the captured page."""
        extractor = ElmhurstSiteNotesExtractor([self._WINDOWS_PAGE])
        return extractor._extract_windows()

    def test_window_count(self, windows: list[Window]) -> None:
        """All three windows on the page are extracted."""
        extracted_count = len(windows)
        assert extracted_count == 3

    def test_first_window_glazing_type_excludes_header_remnant(
        self, windows: list[Window]
    ) -> None:
        """Window 0 carries no "value value Proofed Shutters" leak."""
        first_window = windows[0]
        assert first_window.glazing_type == "Double between 2002 and 2021"

    def test_all_windows_share_clean_glazing_type(
        self, windows: list[Window]
    ) -> None:
        """Windows 1 and 2 were already clean; all three must now agree."""
        expected_types = ["Double between 2002 and 2021"] * 3
        observed_types = [window.glazing_type for window in windows]
        assert observed_types == expected_types

    def test_first_window_orientation_unaffected(
        self, windows: list[Window]
    ) -> None:
        """Trimming the glazing prefix leaves orientation extraction intact.

        Window 0's orientation is composed from the North and West
        fragments.
        """
        first_window = windows[0]
        assert first_window.orientation == "North-West"
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue