diff --git a/backend/documents_parser/tests/fixtures/Summary_000474.pdf b/backend/documents_parser/tests/fixtures/Summary_000474.pdf new file mode 100644 index 00000000..be39243b Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_000474.pdf differ diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py new file mode 100644 index 00000000..ed32dafc --- /dev/null +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -0,0 +1,105 @@ +"""End-to-end scaffold for the Elmhurst Summary→EpcPropertyData chain. + +The 6 Elmhurst worksheet fixtures in `domain.sap.worksheet.tests` +build their `EpcPropertyData` synthetically — they validate the +calculator + cascade in isolation from the mapper. This file pins +the OTHER half of the chain: `from_elmhurst_site_notes` must produce +a calculator-equivalent `EpcPropertyData` when fed the Summary +PDF the worksheet was generated from. If the two halves agree, the +WHOLE pipeline (extractor + mapper + cascade + calculator) is +validated end-to-end against authoritative Elmhurst documents. + +Status: xfail. Today's audit (2026-05-24) surfaced a 28-field diff +between `from_elmhurst_site_notes(Summary_000474)` and the hand- +built `_elmhurst_worksheet_000474.build_epc()`. The load-bearing +gaps (calculator-relevant): + - sap_building_parts: 1 instead of 3 — mapper produces a single + bp via `[_map_elmhurst_building_part(survey)]` at [mapper.py:288](datatypes/epc/domain/mapper.py#L288) + - sap_windows: 0 instead of 5 — mapper plumbs no windows + - renewable_heat_incentive: None instead of RenewableHeatIncentive + - sap_heating / sap_ventilation differ in details + +Preprocessing: the existing `ElmhurstSiteNotesExtractor` was written +against Textract-style output (label\\nvalue pairs in spatial +reading order). We don't have Textract in the test environment, so +this helper converts `pdftotext -layout` output (label-whitespace- +value on a single line) into the Textract-style sequence the +extractor expects. Test-only preprocessing; production runs through +Textract directly. +""" + +from __future__ import annotations + +import re +import subprocess +from pathlib import Path + +import pytest + +from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor +from datatypes.epc.domain.mapper import EpcPropertyDataMapper + +_FIXTURES = Path(__file__).parent / "fixtures" +_SUMMARY_000474_PDF = _FIXTURES / "Summary_000474.pdf" + + +def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]: + """Convert a Summary PDF into the per-page text format the existing + `ElmhurstSiteNotesExtractor` expects (label\\nvalue sequences). + + `pdftotext -layout` preserves the spatial pairing of label and value + on each line; we split each line on 2+ spaces to surface the + label/value tokens, then concatenate them back into a single + newline-delimited stream per page. + """ + info = subprocess.run( + ["pdfinfo", str(pdf_path)], capture_output=True, text=True, check=True + ).stdout + m = re.search(r"Pages:\s+(\d+)", info) + if m is None: + raise RuntimeError(f"Could not parse page count from {pdf_path}") + page_count = int(m.group(1)) + + pages: list[str] = [] + for i in range(1, page_count + 1): + layout = subprocess.run( + [ + "pdftotext", "-layout", "-f", str(i), "-l", str(i), + str(pdf_path), "-", + ], + capture_output=True, text=True, check=True, + ).stdout + tokens: list[str] = [] + for line in layout.splitlines(): + if not line.strip(): + tokens.append("") + continue + parts = [p for p in re.split(r"\s{2,}", line.strip()) if p] + tokens.extend(parts) + pages.append("\n".join(tokens)) + return pages + + +@pytest.mark.xfail( + reason=( + "Elmhurst mapper `from_elmhurst_site_notes` currently produces a " + "single SapBuildingPart regardless of the cert's actual count; " + "cert 000474 lodges Main + Extension 1 + Extension 2 (3 bps). " + "See module docstring for full punch list." + ), + strict=True, +) +def test_summary_000474_mapper_produces_three_building_parts() -> None: + # Arrange — cert U985-0001-000474 is a mid-terrace with 3 building + # parts (Main + 2 extensions) per the hand-built worksheet fixture + # at packages/domain/src/domain/sap/worksheet/tests/ + # _elmhurst_worksheet_000474.py. Routing the Summary PDF through + # extractor + mapper must yield the same count. + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + + # Act + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Assert + assert len(epc.sap_building_parts) == 3