mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
fix(extractor): drop windows-table header remnant from first window glazing type
Summary PDFs preprocessed from `pdftotext -layout` wrap the windows-table
header across several lines. The third header line's tail ("U value / g
value / Draught Proofed / Permanent Shutters") tokenises to "value value
Proofed Shutters" and lands directly above the FIRST window's data row.
Because the first window in a building part has `before_start = 0`, its
prefix block reaches back into that header remnant. The remnant is
neither an orientation nor a building-part fragment, so it survived the
pops in `_compose_window_descriptors` and leaked into `glazing_type` as
"value value Proofed Shutters Double between 2002 and 2021". The second
and third windows were clean, because their prefix starts after the
previous window's manufacturer line.
Fix: the glazing-type phrase always starts with a glazing-start word
(Single/Double/Triple/Secondary), so trim any prefix fragments preceding
that word before joining the glazing type. Orientation/bp pops still run
on the full prefix, so they are unaffected.
Reproduced from `sap worksheets/Recommendations Elmhurst Files/
cavity_wall_insulation - main wall/before/Summary_001431.pdf`. Added a
regression test driving the real `_extract_windows_from_layout` path with
the verbatim tokenised header+rows. Test suite: 2306 passed (+4 new); pyright error count unchanged (net-zero).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
896b5740c3
commit
ec9ef0e8bb
2 changed files with 131 additions and 1 deletions
|
|
@@ -1090,7 +1090,28 @@ class ElmhurstSiteNotesExtractor:
|
||||||
if inline_glazing_type is not None:
|
if inline_glazing_type is not None:
|
||||||
glazing_type = inline_glazing_type
|
glazing_type = inline_glazing_type
|
||||||
else:
|
else:
|
||||||
glazing_type = " ".join([*prefix, *suffix]).strip()
|
# The glazing-type phrase always starts with a glazing-start
|
||||||
|
# word (Single/Double/Triple/Secondary). The FIRST window in
|
||||||
|
# a building part has `before_start = 0`, so its prefix block
|
||||||
|
# reaches back into the wrapped windows-table header; the
|
||||||
|
# third header line's tail tokenises to "value value Proofed
|
||||||
|
# Shutters" (the "U value / g value / Draught Proofed /
|
||||||
|
# Permanent Shutters" column titles) and is neither an
|
||||||
|
# orientation nor a bp fragment, so it survives the pops.
|
||||||
|
# Drop any prefix fragments preceding the glazing-start word
|
||||||
|
# so they don't leak into the glazing type.
|
||||||
|
glazing_start = next(
|
||||||
|
(
|
||||||
|
idx
|
||||||
|
for idx, frag in enumerate(prefix)
|
||||||
|
if frag.split(" ", 1)[0] in self._GLAZING_TYPE_PREFIX_WORDS
|
||||||
|
),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
glazing_prefix = (
|
||||||
|
prefix[glazing_start:] if glazing_start is not None else prefix
|
||||||
|
)
|
||||||
|
glazing_type = " ".join([*glazing_prefix, *suffix]).strip()
|
||||||
|
|
||||||
# Building part: inline token wins; otherwise join prefix + suffix.
|
# Building part: inline token wins; otherwise join prefix + suffix.
|
||||||
if bp_inline is not None:
|
if bp_inline is not None:
|
||||||
|
|
|
||||||
|
|
@@ -513,3 +513,112 @@ class TestLightingLedCflUnknown:
|
||||||
|
|
||||||
def test_cfl_count_zero_when_unknown(self, result2: ElmhurstSiteNotes) -> None:
|
def test_cfl_count_zero_when_unknown(self, result2: ElmhurstSiteNotes) -> None:
|
||||||
assert result2.lighting.cfl_count == 0
|
assert result2.lighting.cfl_count == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestWindowsLayoutHeaderRemnant:
|
||||||
|
"""Regression for the first-window glazing-type header leak.
|
||||||
|
|
||||||
|
Summary PDFs preprocessed from `pdftotext -layout` wrap the windows
|
||||||
|
table header across several lines. The third header line's tail
|
||||||
|
("U value / g value / Draught Proofed / Permanent Shutters") tokenises
|
||||||
|
to "value value Proofed Shutters" and sits directly above the FIRST
|
||||||
|
window's data row. Because the first window in a building part has
|
||||||
|
`before_start = 0`, its prefix block reaches back into that header
|
||||||
|
remnant, which is neither an orientation nor a building-part fragment
|
||||||
|
and so survived into `glazing_type` as
|
||||||
|
"value value Proofed Shutters Double between 2002 and 2021".
|
||||||
|
|
||||||
|
Reproduced from `sap worksheets/Recommendations Elmhurst Files/
|
||||||
|
cavity_wall_insulation - main wall/before/Summary_001431.pdf` (3
|
||||||
|
Manufacturer-data-source windows; only window 0 was corrupted).
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Faithful reproduction of the tokenised windows section (one page),
|
||||||
|
# captured verbatim from the Summary PDF above. The header remnant
|
||||||
|
# "value value Proofed Shutters" precedes window 0's wrapped glazing
|
||||||
|
# cell ("Double between 2002" / "and 2021").
|
||||||
|
_WINDOWS_PAGE = "\n".join([
|
||||||
|
"11.0 Windows:",
|
||||||
|
"Frame Frame Glazing",
|
||||||
|
"Building",
|
||||||
|
"U",
|
||||||
|
"g Draught Permanent",
|
||||||
|
"W",
|
||||||
|
"H",
|
||||||
|
"Area Glazing Type",
|
||||||
|
"Location",
|
||||||
|
"Orient. Data-Source",
|
||||||
|
"Type Factor Gap",
|
||||||
|
"Part",
|
||||||
|
"value value Proofed Shutters",
|
||||||
|
"Double between 2002",
|
||||||
|
"North",
|
||||||
|
"0.97 1.00 0.97",
|
||||||
|
"PVC",
|
||||||
|
"0.70",
|
||||||
|
"Main",
|
||||||
|
"External wall",
|
||||||
|
"Manufacturer 2.00",
|
||||||
|
"0.72",
|
||||||
|
"Yes",
|
||||||
|
"None",
|
||||||
|
"and 2021",
|
||||||
|
"West",
|
||||||
|
"Double between 2002",
|
||||||
|
"South",
|
||||||
|
"2.66 1.00 2.66",
|
||||||
|
"PVC",
|
||||||
|
"0.70",
|
||||||
|
"Main",
|
||||||
|
"External wall",
|
||||||
|
"Manufacturer 2.00",
|
||||||
|
"0.72",
|
||||||
|
"Yes",
|
||||||
|
"None",
|
||||||
|
"and 2021",
|
||||||
|
"East",
|
||||||
|
"Double between 2002",
|
||||||
|
"South",
|
||||||
|
"2.66 1.00 2.66",
|
||||||
|
"PVC",
|
||||||
|
"0.70",
|
||||||
|
"Main",
|
||||||
|
"External wall",
|
||||||
|
"Manufacturer 2.00",
|
||||||
|
"0.72",
|
||||||
|
"Yes",
|
||||||
|
"None",
|
||||||
|
"and 2021",
|
||||||
|
"East",
|
||||||
|
"12.0 Ventilation",
|
||||||
|
])
|
||||||
|
|
||||||
|
@pytest.fixture(scope="class")
|
||||||
|
def windows(self) -> list[Window]:
|
||||||
|
return ElmhurstSiteNotesExtractor([self._WINDOWS_PAGE])._extract_windows()
|
||||||
|
|
||||||
|
def test_window_count(self, windows: list[Window]) -> None:
|
||||||
|
# Arrange / Act / Assert
|
||||||
|
assert len(windows) == 3
|
||||||
|
|
||||||
|
def test_first_window_glazing_type_excludes_header_remnant(
|
||||||
|
self, windows: list[Window]
|
||||||
|
) -> None:
|
||||||
|
# Arrange / Act / Assert — no "value value Proofed Shutters" leak.
|
||||||
|
assert windows[0].glazing_type == "Double between 2002 and 2021"
|
||||||
|
|
||||||
|
def test_all_windows_share_clean_glazing_type(
|
||||||
|
self, windows: list[Window]
|
||||||
|
) -> None:
|
||||||
|
# Arrange / Act / Assert — windows 1 and 2 were already clean;
|
||||||
|
# all three must agree after the fix.
|
||||||
|
assert [w.glazing_type for w in windows] == [
|
||||||
|
"Double between 2002 and 2021"
|
||||||
|
] * 3
|
||||||
|
|
||||||
|
def test_first_window_orientation_unaffected(
|
||||||
|
self, windows: list[Window]
|
||||||
|
) -> None:
|
||||||
|
# Arrange / Act / Assert — trimming the glazing prefix must not
|
||||||
|
# disturb orientation extraction (North + West fragments).
|
||||||
|
assert windows[0].orientation == "North-West"
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue