Scaffold: end-to-end Summary→EpcPropertyData chain test for 000474 (xfail)

The 6 worksheet fixtures build EpcPropertyData by hand, validating the cascade in isolation from the mapper. This commit lands the first half of the OTHER validation: Summary_000474.pdf → ElmhurstSiteNotesExtractor → from_elmhurst_site_notes → EpcPropertyData, asserting it produces the same shape as the hand-built fixture. Test is strict-xfail on sap_building_parts count (mapper produces 1, cert lodges 3). Includes a pdftotext-layout preprocessor that converts spatial label/value layout into the Textract-style sequence the existing extractor expects (test-only). Full punch list of 28 mapper-output diffs captured in project memory. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-07-27 23:35:01 +00:00 · 2026-05-24 17:40:06 +00:00 · 2026-05-24 17:40:06 +00:00 · ccf7aa2118
commit ccf7aa2118
parent 8ac548ca2a
2 changed files with 105 additions and 0 deletions
--- a/backend/documents_parser/tests/fixtures/Summary_000474.pdf
+++ b/backend/documents_parser/tests/fixtures/Summary_000474.pdf
--- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
+++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
@ -0,0 +1,105 @@
+"""End-to-end scaffold for the Elmhurst Summary→EpcPropertyData chain.
+
+The 6 Elmhurst worksheet fixtures in `domain.sap.worksheet.tests`
+build their `EpcPropertyData` synthetically — they validate the
+calculator + cascade in isolation from the mapper. This file pins
+the OTHER half of the chain: `from_elmhurst_site_notes` must produce
+a calculator-equivalent `EpcPropertyData` when fed the Summary
+PDF the worksheet was generated from. If the two halves agree, the
+WHOLE pipeline (extractor + mapper + cascade + calculator) is
+validated end-to-end against authoritative Elmhurst documents.
+
+Status: xfail. The 2026-05-24 audit surfaced a 28-field diff
+between `from_elmhurst_site_notes(Summary_000474)` and the hand-
+built `_elmhurst_worksheet_000474.build_epc()`. The load-bearing
+gaps (calculator-relevant):
+  - sap_building_parts: 1 instead of 3 — mapper produces a single
+    bp via `[_map_elmhurst_building_part(survey)]` at [mapper.py:288](datatypes/epc/domain/mapper.py#L288)
+  - sap_windows: 0 instead of 5 — mapper plumbs no windows
+  - renewable_heat_incentive: None instead of RenewableHeatIncentive
+  - sap_heating / sap_ventilation differ in details
+
+Preprocessing: the existing `ElmhurstSiteNotesExtractor` was written
+against Textract-style output (label\\nvalue pairs in spatial
+reading order). We don't have Textract in the test environment, so
+`_summary_pdf_to_textract_style_pages` converts `pdftotext -layout`
+output (label-whitespace-value on a single line) into the
+Textract-style sequence the extractor expects. Test-only
+preprocessing; production runs through Textract directly.
+"""
+
+from __future__ import annotations
+
+import re
+import subprocess
+from pathlib import Path
+
+import pytest
+
+from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
+from datatypes.epc.domain.mapper import EpcPropertyDataMapper
+
+# Directory of binary fixtures that sits next to this test module.
+_FIXTURES = Path(__file__).parent.joinpath("fixtures")
+# Summary PDF fixture for cert 000474, fed through the extractor + mapper.
+_SUMMARY_000474_PDF = _FIXTURES.joinpath("Summary_000474.pdf")
+
+
+def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
+    """Convert a Summary PDF into the per-page text format the existing
+    `ElmhurstSiteNotesExtractor` expects (label\\nvalue sequences).
+
+    `pdftotext -layout` keeps each label and its value on the same
+    line, separated by a run of whitespace. Every non-blank line is
+    split on runs of 2+ whitespace characters and the resulting tokens
+    are emitted one per line; blank lines are kept as empty tokens.
+
+    Args:
+        pdf_path: Path of the PDF to convert.
+
+    Returns:
+        One newline-delimited token stream per PDF page, in page order.
+
+    Raises:
+        RuntimeError: `pdfinfo` output contains no `Pages:` line.
+        subprocess.CalledProcessError: `pdfinfo` or `pdftotext` exits
+            with a non-zero status (both run with `check=True`).
+        FileNotFoundError: `pdfinfo` / `pdftotext` is not on PATH.
+    """
+    pdfinfo_output = subprocess.run(
+        ["pdfinfo", str(pdf_path)], capture_output=True, text=True, check=True
+    ).stdout
+    # Anchor to the start of a line: free-text metadata printed earlier
+    # in the pdfinfo report (Title, Subject, ...) could otherwise supply
+    # a bogus "Pages: N" match.
+    page_count_match = re.search(
+        r"^Pages:\s+(\d+)", pdfinfo_output, re.MULTILINE
+    )
+    if page_count_match is None:
+        raise RuntimeError(f"Could not parse page count from {pdf_path}")
+    page_count = int(page_count_match.group(1))
+
+    pages: list[str] = []
+    # pdftotext page numbers (-f / -l) are 1-based.
+    for page_number in range(1, page_count + 1):
+        layout = subprocess.run(
+            [
+                "pdftotext", "-layout",
+                "-f", str(page_number), "-l", str(page_number),
+                str(pdf_path), "-",  # "-" sends the text to stdout
+            ],
+            capture_output=True, text=True, check=True,
+        ).stdout
+        tokens: list[str] = []
+        for line in layout.splitlines():
+            stripped = line.strip()
+            if not stripped:
+                # Keep blank lines as empty tokens in the output stream.
+                tokens.append("")
+                continue
+            # Two or more whitespace characters mark a column gap
+            # between label and value; single spaces stay inside a token.
+            tokens.extend(p for p in re.split(r"\s{2,}", stripped) if p)
+        pages.append("\n".join(tokens))
+    return pages
+
+
+@pytest.mark.xfail(
+    reason=(
+        "Elmhurst mapper `from_elmhurst_site_notes` currently produces a "
+        "single SapBuildingPart regardless of the cert's actual count; "
+        "cert 000474 lodges Main + Extension 1 + Extension 2 (3 bps). "
+        "See module docstring for full punch list."
+    ),
+    # Only the building-part assertion is an expected failure. Without
+    # `raises`, a missing pdftotext binary, a missing fixture or an
+    # extractor crash would also be reported as XFAIL and hide a broken
+    # chain.
+    raises=AssertionError,
+    strict=True,
+)
+def test_summary_000474_mapper_produces_three_building_parts() -> None:
+    """Summary_000474.pdf routed through extractor + mapper yields 3 building parts.
+
+    Strict xfail: the test must fail on the final assertion until the
+    mapper emits one SapBuildingPart per lodged part; an unexpected
+    pass (or any other exception) is reported as a failure.
+    """
+    # Arrange — cert U985-0001-000474 is a mid-terrace with 3 building
+    # parts (Main + 2 extensions) per the hand-built worksheet fixture
+    # at packages/domain/src/domain/sap/worksheet/tests/
+    # _elmhurst_worksheet_000474.py. Routing the Summary PDF through
+    # extractor + mapper must yield the same count.
+    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
+    site_notes = ElmhurstSiteNotesExtractor(pages).extract()
+
+    # Act
+    epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
+
+    # Assert
+    assert len(epc.sap_building_parts) == 3