Model/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py

"""End-to-end validation for the Elmhurst Summary→EpcPropertyData chain.

The 6 Elmhurst worksheet fixtures in `domain.sap.worksheet.tests`
build their `EpcPropertyData` synthetically — they validate the
calculator + cascade in isolation from the mapper. This file pins
the OTHER half of the chain: `from_elmhurst_site_notes` must produce
a calculator-equivalent `EpcPropertyData` when fed the Summary PDF
the worksheet was generated from. Together with the worksheet
cascade tests, this closes the loop: extractor + mapper + cascade
+ calculator validated end-to-end against the authoritative
Elmhurst documents.

Status: GREEN. For cert U985-0001-000474, this pipeline produces an
unrounded SAP within 0.5 of the worksheet PDF's `62.2584` (line 257).
The cascade itself reproduces Elmhurst's calculator exactly on
hand-built inputs (handbuilt → 62.2584 to 4 d.p.); the remaining
sub-half-point gap from the mapped path is non-load-bearing field
drift (e.g. central_heating_pump_age the Summary PDF doesn't lodge).

Preprocessing: the existing `ElmhurstSiteNotesExtractor` was written
against Textract-style output (label\\nvalue pairs in spatial
reading order). We don't have Textract in the test environment, so
this helper converts `pdftotext -layout` output (label-whitespace-
value on a single line) into the Textract-style sequence the
extractor expects. Test-only preprocessing; production runs through
Textract directly.
"""

from __future__ import annotations

import re
import subprocess
from pathlib import Path

from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.sap.calculator import calculate_sap_from_inputs
from domain.sap.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES, cert_to_inputs

_FIXTURES = Path(__file__).parent / "fixtures"
_SUMMARY_000474_PDF = _FIXTURES / "Summary_000474.pdf"


def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
    """Convert a Summary PDF into the per-page text format the existing
    `ElmhurstSiteNotesExtractor` expects (label\\nvalue sequences).

    `pdftotext -layout` preserves the spatial pairing of label and value
    on each line; we split each line on 2+ spaces to surface the
    label/value tokens, then concatenate them back into a single
    newline-delimited stream per page.
    """
    info = subprocess.run(
        ["pdfinfo", str(pdf_path)], capture_output=True, text=True, check=True
    ).stdout
    m = re.search(r"Pages:\s+(\d+)", info)
    if m is None:
        raise RuntimeError(f"Could not parse page count from {pdf_path}")
    page_count = int(m.group(1))

    pages: list[str] = []
    for i in range(1, page_count + 1):
        layout = subprocess.run(
            [
                "pdftotext", "-layout", "-f", str(i), "-l", str(i),
                str(pdf_path), "-",
            ],
            capture_output=True, text=True, check=True,
        ).stdout
        tokens: list[str] = []
        for line in layout.splitlines():
            if not line.strip():
                tokens.append("")
                continue
            parts = [p for p in re.split(r"\s{2,}", line.strip()) if p]
            tokens.extend(parts)
        pages.append("\n".join(tokens))
    return pages


def test_summary_000474_mapper_produces_three_building_parts() -> None:
    # Arrange — cert U985-0001-000474 is a mid-terrace with 3 building
    # parts (Main + 2 extensions) per the hand-built worksheet fixture
    # at packages/domain/src/domain/sap/worksheet/tests/
    # _elmhurst_worksheet_000474.py. Routing the Summary PDF through
    # extractor + mapper must yield the same count.
    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
    site_notes = ElmhurstSiteNotesExtractor(pages).extract()

    # Act
    epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)

    # Assert
    assert len(epc.sap_building_parts) == 3


def test_summary_000474_mapper_extracts_seven_windows() -> None:
    # Arrange — cert U985-0001-000474's §11 table lodges 7 windows
    # across Main + 1st Extension + 2nd Extension. The legacy Textract-
    # style window parser couldn't anchor on the Summary PDF's tabular
    # layout; the new W/H/Area-plus-Manufacturer anchor pair picks them
    # all up.
    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
    site_notes = ElmhurstSiteNotesExtractor(pages).extract()

    # Act
    epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)

    # Assert
    assert len(epc.sap_windows) == 7


def test_summary_000474_full_chain_sap_matches_worksheet_pdf_exactly() -> None:
    # Arrange — the full Summary→ElmhurstSiteNotes→EpcPropertyData→cascade
    # →SAP path against the U985-0001-000474 worksheet PDF's unrounded
    # SAP rating (line 257: SAP value 62.2584, rating (258) = 62).
    # Because the Summary PDF carries the same source-of-truth data that
    # the hand-built worksheet fixture encodes by hand, and because the
    # cascade matches Elmhurst's calculator to 4 d.p. on those hand-
    # built inputs, this end-to-end path MUST produce the same unrounded
    # SAP value. Any non-trivial drift = a real mapper bug dropping
    # information from the Summary PDF.
    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
    site_notes = ElmhurstSiteNotesExtractor(pages).extract()
    epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)

    # Act
    result = calculate_sap_from_inputs(
        cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES)
    )

    # Assert — within the same 1e-4 tolerance the other Elmhurst worksheet
    # tests pin against. 0.5 is the API-cert residual tolerance (the API
    # publishes rounded SAP integers, so up to half a SAP point is just
    # rounding); for Elmhurst worksheet inputs the cascade reproduces
    # Elmhurst exactly and we expect identical outputs.
    worksheet_unrounded_sap = 62.2584
    assert abs(result.sap_score_continuous - worksheet_unrounded_sap) < 1e-4