mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
The legacy `_extract_windows` regex anchors on "Permanent Shutters\n" which is broken across lines by the pdftotext-layout preprocessor. New fallback `_extract_windows_from_layout` anchors on the two stable per-window markers — a "W H Area" data line and the "Manufacturer <U_value>" line a few lines further down — and tolerates the variable-order optional fields (glazing_gap, inline building_part, inline orientation) between them. Prefix/suffix tokens around the data block are re-joined into glazing_type / building_part / orientation strings. Cert U985-0001-000474's 7 windows across Main + 2 extensions now flow through the mapper to EpcPropertyData.sap_windows (was 0). Textract-style extraction (existing fixture) is unchanged — the legacy path runs first and only falls through when its regex misses. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
110 lines
4.5 KiB
Python
110 lines
4.5 KiB
Python
"""End-to-end scaffold for the Elmhurst Summary→EpcPropertyData chain.
|
|
|
|
The 6 Elmhurst worksheet fixtures in `domain.sap.worksheet.tests`
|
|
build their `EpcPropertyData` synthetically — they validate the
|
|
calculator + cascade in isolation from the mapper. This file pins
|
|
the OTHER half of the chain: `from_elmhurst_site_notes` must produce
|
|
a calculator-equivalent `EpcPropertyData` when fed the Summary
|
|
PDF the worksheet was generated from. If the two halves agree, the
|
|
WHOLE pipeline (extractor + mapper + cascade + calculator) is
|
|
validated end-to-end against authoritative Elmhurst documents.
|
|
|
|
Status: xfail. Today's audit (2026-05-24) surfaced a 28-field diff
|
|
between `from_elmhurst_site_notes(Summary_000474)` and the hand-
|
|
built `_elmhurst_worksheet_000474.build_epc()`. The load-bearing
|
|
gaps (calculator-relevant):
|
|
- sap_building_parts: 1 instead of 3 — mapper produces a single
|
|
bp via `[_map_elmhurst_building_part(survey)]` at [mapper.py:288](datatypes/epc/domain/mapper.py#L288)
|
|
- sap_windows: 0 instead of 5 — mapper plumbs no windows
|
|
- renewable_heat_incentive: None instead of RenewableHeatIncentive
|
|
- sap_heating / sap_ventilation differ in details
|
|
|
|
Preprocessing: the existing `ElmhurstSiteNotesExtractor` was written
|
|
against Textract-style output (label\\nvalue pairs in spatial
|
|
reading order). We don't have Textract in the test environment, so
|
|
the `_summary_pdf_to_textract_style_pages` helper below converts
`pdftotext -layout` output (label-whitespace-
|
|
value on a single line) into the Textract-style sequence the
|
|
extractor expects. Test-only preprocessing; production runs through
|
|
Textract directly.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
|
|
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
|
|
|
_FIXTURES = Path(__file__).parent / "fixtures"
|
|
_SUMMARY_000474_PDF = _FIXTURES / "Summary_000474.pdf"
|
|
|
|
|
|
def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
    """Render a Summary PDF as one newline-delimited token stream per page.

    The page count is read from the `Pages:` line of `pdfinfo`'s output,
    then every page is dumped individually with `pdftotext -layout`.
    Each non-blank layout line is trimmed and broken apart wherever two
    or more whitespace characters occur, so a label and its value end
    up as consecutive tokens (label\\nvalue) — the sequence shape the
    existing `ElmhurstSiteNotesExtractor` is fed in this module's tests.
    Blank layout lines are kept as empty tokens.

    Args:
        pdf_path: Location of the PDF to convert.

    Returns:
        One string per page, in page order, with that page's tokens
        joined by newlines.

    Raises:
        RuntimeError: `pdfinfo` printed no parsable `Pages:` line.
        subprocess.CalledProcessError: `pdfinfo` or `pdftotext` exited
            with a non-zero status (both run with `check=True`).
    """
    source = str(pdf_path)

    pdfinfo_result = subprocess.run(
        ["pdfinfo", source], capture_output=True, text=True, check=True
    )
    page_count_match = re.search(r"Pages:\s+(\d+)", pdfinfo_result.stdout)
    if page_count_match is None:
        raise RuntimeError(f"Could not parse page count from {pdf_path}")
    last_page = int(page_count_match.group(1))

    rendered_pages: list[str] = []
    # pdftotext page numbers are 1-based; -f/-l set to the same number
    # restrict each run to exactly one page.
    for page_number in range(1, last_page + 1):
        page_arg = str(page_number)
        pdftotext_result = subprocess.run(
            ["pdftotext", "-layout", "-f", page_arg, "-l", page_arg, source, "-"],
            capture_output=True,
            text=True,
            check=True,
        )

        page_tokens: list[str] = []
        for raw_line in pdftotext_result.stdout.splitlines():
            stripped = raw_line.strip()
            if stripped:
                # Runs of 2+ whitespace characters separate the columns
                # of a layout line; single spaces stay inside a token.
                page_tokens.extend(
                    cell for cell in re.split(r"\s{2,}", stripped) if cell
                )
            else:
                # Keep blank lines as empty tokens in the stream.
                page_tokens.append("")

        rendered_pages.append("\n".join(page_tokens))

    return rendered_pages
|
|
|
|
|
|
def test_summary_000474_mapper_produces_three_building_parts() -> None:
    # Arrange — per the hand-built worksheet fixture at
    # packages/domain/src/domain/sap/worksheet/tests/
    # _elmhurst_worksheet_000474.py, cert U985-0001-000474 is a
    # mid-terrace made of 3 building parts (Main + 2 extensions).
    # Sending the Summary PDF through extractor + mapper has to arrive
    # at that same count.
    textract_pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
    extracted_notes = ElmhurstSiteNotesExtractor(textract_pages).extract()

    # Act
    property_data = EpcPropertyDataMapper.from_elmhurst_site_notes(extracted_notes)

    # Assert
    building_part_count = len(property_data.sap_building_parts)
    assert building_part_count == 3
|
|
|
|
|
|
def test_summary_000474_mapper_extracts_seven_windows() -> None:
    # Arrange — the §11 table of cert U985-0001-000474 lodges 7 windows
    # spread over Main + 1st Extension + 2nd Extension. The legacy
    # Textract-style window parser could not anchor on the Summary
    # PDF's tabular layout; the newer anchor pair (W/H/Area line plus
    # Manufacturer line) is expected to pick up every one of them.
    textract_pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
    extracted_notes = ElmhurstSiteNotesExtractor(textract_pages).extract()

    # Act
    property_data = EpcPropertyDataMapper.from_elmhurst_site_notes(extracted_notes)

    # Assert
    window_count = len(property_data.sap_windows)
    assert window_count == 7
|