fix(extractor): re-join §11 windows whose Area cell split onto its own line

Sim case 20's §11 lodges 5 windows but only 1 surfaced. The "W H Area"
cells tokenize inconsistently: a narrow Area column keeps all three on one
line ("1.80 2.10 3.78" — matches _WIDTH_HEIGHT_AREA_RE), but a wider Area
column triggers pdftotext's 2+-space split, dropping the Area onto its own
line ("5.79 2.00" then "11.58"). The data anchor, which requires three
decimal numbers on one line, never matched those four rows, so they were
lost — gutting §6 solar gains (5 windows → 1) and dropping continuous SAP
43.05 → 38.32 vs the worksheet's 43.6322.

Pre-merge a "W H" line + a following lone-decimal Area into the canonical
"W H Area" line, gated on Area ≈ W × H (the §11 Area is always the product)
so a frame factor / g-value / U-value below a dimension line is never
absorbed. One-line layouts (3 decimals) are untouched.

Pins via test_summary_001431_case20_extracts_all_five_section11_windows
(Summary_001431_case20.pdf mirrors sap worksheets/golden fixture debugging/
simulated case 20/). 573 documents_parser tests pass; pyright strict net-zero.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-06 10:35:21 +00:00
parent 7e9231b36b
commit 795d36b732
3 changed files with 50 additions and 1 deletions

View file

@ -861,7 +861,7 @@ class ElmhurstSiteNotesExtractor:
)
if not m:
return []
lines = m.group(1).splitlines()
lines = self._merge_split_dimension_lines(m.group(1).splitlines())
# Locate all (data_line, manufacturer_line) pairs in document
# order. Each pair is one window.
@ -911,6 +911,40 @@ class ElmhurstSiteNotesExtractor:
windows.append(window)
return windows
# A "W H" pair on its own line (e.g. "5.79 2.00") whose Area cell the
# layout preprocessor pushed onto the following line as a lone decimal
# ("11.58"). Wider Area columns in the §11 grid trigger the 2+-space
# split; narrower ones keep all three on one line (the 3-decimal anchor).
#
# Matches a whole line made of exactly two decimal numbers separated by
# whitespace: group 1 is the first number (W), group 2 the second (H).
_WIDTH_HEIGHT_RE = re.compile(r"^(\d+\.\d+)\s+(\d+\.\d+)$")
# Matches a whole line made of a single decimal number (the displaced
# Area cell), captured as group 1.
_AREA_ONLY_RE = re.compile(r"^(\d+\.\d+)$")
def _merge_split_dimension_lines(self, lines: List[str]) -> List[str]:
    """Fold a displaced Area cell back onto its "W H" dimension line.

    Walks ``lines`` once. Whenever a line consisting of exactly two
    decimals ("W H") is immediately followed by a line consisting of a
    single decimal, and that decimal is within 0.05 of W × H, the two
    lines are replaced by one canonical "W H Area" line (single spaces,
    original number text preserved).

    The Area ≈ W × H gate keeps an unrelated lone decimal that happens
    to sit below a "W H" line — a frame factor, g-value or U-value —
    from being absorbed. Lines that already carry all three numbers do
    not match the two-decimal pattern and pass through unchanged, as
    does every other line.

    Args:
        lines: The section's lines in document order.

    Returns:
        A new list with qualifying line pairs merged; ``lines`` itself
        is not modified.
    """
    result: List[str] = []
    total = len(lines)
    # Set after a merge so the Area line that was just consumed is not
    # emitted (or re-examined) on the next iteration.
    skip_next = False
    for idx, raw in enumerate(lines):
        if skip_next:
            skip_next = False
            continue
        dims = self._WIDTH_HEIGHT_RE.match(raw.strip())
        has_successor = idx + 1 < total
        lone_area = (
            self._AREA_ONLY_RE.match(lines[idx + 1].strip())
            if dims is not None and has_successor
            else None
        )
        if dims is None or lone_area is None:
            result.append(raw)
            continue
        width_text, height_text = dims.groups()
        area_text = lone_area.group(1)
        product = float(width_text) * float(height_text)
        # Only merge when the lone decimal really is this row's area.
        if abs(product - float(area_text)) <= 0.05:
            result.append(f"{width_text} {height_text} {area_text}")
            skip_next = True
        else:
            result.append(raw)
    return result
def _find_manufacturer_after(self, lines: List[str], data_idx: int) -> Optional[int]:
for j in range(data_idx + 1, min(data_idx + 12, len(lines))):
stripped = lines[j].strip()

Binary file not shown.

View file

@ -78,6 +78,7 @@ _SUMMARY_000884_PDF = _FIXTURES / "Summary_000884.pdf" # cert 9421 (Normal cyli
_SUMMARY_000910_PDF = _FIXTURES / "Summary_000910.pdf" # cert 0036 (Flat, party wall U=0)
_SUMMARY_000890_PDF = _FIXTURES / "Summary_000890.pdf" # cert 7800 (two electric showers)
_SUMMARY_000565_PDF = _FIXTURES / "Summary_000565.pdf" # cert 000565 (5-bp Elmhurst-only)
_SUMMARY_001431_CASE20_PDF = _FIXTURES / "Summary_001431_case20.pdf" # sim case 20 (storage heaters + RR type-2 + wrapped "Double between 2002 and 2021" glazing)
# GOV.UK EPB API JSON for cert 001479 — the API-path counterpart of the
# Summary_001479.pdf fixture. Together they drive the API ≡ Summary
@ -127,6 +128,20 @@ def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
return pages
def test_summary_001431_case20_extracts_all_five_section11_windows() -> None:
    # Arrange — the sim case 20 Summary PDF lodges five §11 windows, all
    # glazed "Double between 2002 and 2021". That label wraps onto a
    # second PDF line, so pdftotext interleaves its tail ("and 2021")
    # with the next row's cells; the window parser has to cope with it.
    textract_pages = _summary_pdf_to_textract_style_pages(_SUMMARY_001431_CASE20_PDF)
    extractor = ElmhurstSiteNotesExtractor(textract_pages)

    # Act
    extracted_survey = extractor.extract()

    # Assert — every lodged window must surface; none may be dropped.
    assert len(extracted_survey.windows) == 5
def test_summary_000474_mapper_produces_three_building_parts() -> None:
# Arrange — cert U985-0001-000474 is a mid-terrace with 3 building
# parts (Main + 2 extensions) per the hand-built worksheet fixture