mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
fix(extractor): re-join §11 windows whose Area cell split onto its own line
Sim case 20's §11 lodges 5 windows but only 1 surfaced. The "W H Area"
cells tokenize inconsistently: a narrow Area column keeps all three on one
line ("1.80 2.10 3.78" — matches _WIDTH_HEIGHT_AREA_RE), but a wider Area
column triggers pdftotext's 2+-space split, dropping the Area onto its own
line ("5.79 2.00" then "11.58"). The 3-decimal data anchor never matched
those four rows, so they were lost — gutting §6 solar gains (5 windows →
1) and dropping continuous SAP 43.05 → 38.32 vs the worksheet's 43.6322.
Pre-merge a "W H" line + a following lone-decimal Area into the canonical
"W H Area" line, gated on Area ≈ W × H (the §11 Area is always the product)
so a frame factor / g-value / U-value below a dimension line is never
absorbed. One-line layouts (3 decimals) are untouched.
Pins via test_summary_001431_case20_extracts_all_five_section11_windows
(Summary_001431_case20.pdf mirrors sap worksheets/golden fixture debugging/
simulated case 20/). 573 documents_parser tests pass; pyright strict net-zero.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
7e9231b36b
commit
795d36b732
3 changed files with 50 additions and 1 deletion
|
|
@ -861,7 +861,7 @@ class ElmhurstSiteNotesExtractor:
|
|||
)
|
||||
if not m:
|
||||
return []
|
||||
lines = m.group(1).splitlines()
|
||||
lines = self._merge_split_dimension_lines(m.group(1).splitlines())
|
||||
|
||||
# Locate all (data_line, manufacturer_line) pairs in document
|
||||
# order. Each pair is one window.
|
||||
|
|
@ -911,6 +911,40 @@ class ElmhurstSiteNotesExtractor:
|
|||
windows.append(window)
|
||||
return windows
|
||||
|
||||
# A "W H" pair on its own line (e.g. "5.79 2.00") whose Area cell the
|
||||
# layout preprocessor pushed onto the following line as a lone decimal
|
||||
# ("11.58"). Wider Area columns in the §11 grid trigger the 2+-space
|
||||
# split; narrower ones keep all three on one line (the 3-decimal anchor).
|
||||
_WIDTH_HEIGHT_RE = re.compile(r"^(\d+\.\d+)\s+(\d+\.\d+)$")
|
||||
_AREA_ONLY_RE = re.compile(r"^(\d+\.\d+)$")
|
||||
|
||||
def _merge_split_dimension_lines(self, lines: List[str]) -> List[str]:
|
||||
"""Re-join a window's "W H" line with a following bare-Area line
|
||||
into the canonical "W H Area" shape the data anchor expects.
|
||||
|
||||
Gated on Area ≈ W × H (the §11 Area is always the product), so an
|
||||
unrelated lone decimal below a "W H" line — a frame factor, g-value
|
||||
or U-value — is never absorbed. Layouts that already lodge all
|
||||
three on one line are untouched (their line has 3 decimals, not 2).
|
||||
"""
|
||||
merged: List[str] = []
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
wh = self._WIDTH_HEIGHT_RE.match(lines[i].strip())
|
||||
area = (
|
||||
self._AREA_ONLY_RE.match(lines[i + 1].strip())
|
||||
if wh is not None and i + 1 < len(lines) else None
|
||||
)
|
||||
if wh is not None and area is not None:
|
||||
w, h, a = float(wh.group(1)), float(wh.group(2)), float(area.group(1))
|
||||
if abs(w * h - a) <= 0.05:
|
||||
merged.append(f"{wh.group(1)} {wh.group(2)} {area.group(1)}")
|
||||
i += 2
|
||||
continue
|
||||
merged.append(lines[i])
|
||||
i += 1
|
||||
return merged
|
||||
|
||||
def _find_manufacturer_after(self, lines: List[str], data_idx: int) -> Optional[int]:
|
||||
for j in range(data_idx + 1, min(data_idx + 12, len(lines))):
|
||||
stripped = lines[j].strip()
|
||||
|
|
|
|||
BIN
backend/documents_parser/tests/fixtures/Summary_001431_case20.pdf
vendored
Normal file
BIN
backend/documents_parser/tests/fixtures/Summary_001431_case20.pdf
vendored
Normal file
Binary file not shown.
|
|
@ -78,6 +78,7 @@ _SUMMARY_000884_PDF = _FIXTURES / "Summary_000884.pdf" # cert 9421 (Normal cyli
|
|||
_SUMMARY_000910_PDF = _FIXTURES / "Summary_000910.pdf" # cert 0036 (Flat, party wall U=0)
|
||||
_SUMMARY_000890_PDF = _FIXTURES / "Summary_000890.pdf" # cert 7800 (two electric showers)
|
||||
_SUMMARY_000565_PDF = _FIXTURES / "Summary_000565.pdf" # cert 000565 (5-bp Elmhurst-only)
|
||||
_SUMMARY_001431_CASE20_PDF = _FIXTURES / "Summary_001431_case20.pdf" # sim case 20 (storage heaters + RR type-2 + wrapped "Double between 2002 and 2021" glazing)
|
||||
|
||||
# GOV.UK EPB API JSON for cert 001479 — the API-path counterpart of the
|
||||
# Summary_001479.pdf fixture. Together they drive the API ≡ Summary
|
||||
|
|
@ -127,6 +128,20 @@ def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
|
|||
return pages
|
||||
|
||||
|
||||
def test_summary_001431_case20_extracts_all_five_section11_windows() -> None:
    # Arrange — sim case 20's §11 lodges 5 windows, each with the glazing
    # label "Double between 2002 and 2021". That phrase wraps to two PDF
    # lines, so pdftotext interleaves its continuation ("and 2021") with
    # the next row's cells — a layout the window parser must survive.
    textract_pages = _summary_pdf_to_textract_style_pages(_SUMMARY_001431_CASE20_PDF)
    extractor = ElmhurstSiteNotesExtractor(textract_pages)

    # Act
    extracted = extractor.extract()

    # Assert — every lodged window surfaces.
    assert len(extracted.windows) == 5
|
||||
|
||||
|
||||
def test_summary_000474_mapper_produces_three_building_parts() -> None:
|
||||
# Arrange — cert U985-0001-000474 is a mid-terrace with 3 building
|
||||
# parts (Main + 2 extensions) per the hand-built worksheet fixture
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue