test(modelling): Elmhurst before/after cascade pin for cavity wall (#1154)

Closes #1154 — the Package Scorer's Elmhurst cascade pin. Drives
recommend_cavity_wall on the parsed `before` Summary, scores its
Option's overlay through PackageScorer, and asserts delta 0 (abs<=1e-4
on SAP/CO2/PE) vs the calculator's score on the re-lodged `after`
Summary.

Key finding: the handover's stated parser gate (parse_site_notes_pdf
throwing 'Manufacturer' on cert 001431) does NOT block these pins. The
Elmhurst recommendation Summaries route cleanly through the same
ElmhurstSiteNotesExtractor + EpcPropertyDataMapper chain the worksheet
e2e fixtures use (_elmhurst_worksheet_001431.build_epc). The Textract
path's window bug is unrelated and unused here.

The before→after field change is exactly wall_insulation_type 4
(uninsulated) → 2 (filled cavity), which is precisely the overlay
recommend_cavity_wall emits; the cascade closes at delta 0.000000 on
all three metrics. Before/after Summaries mirrored into
tests/domain/modelling/fixtures/ so the pin does not depend on the
unstaged workspace.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-03 09:36:53 +00:00
parent 9ed4ccc28e
commit 4c0a907a54
4 changed files with 157 additions and 0 deletions

View file

@ -0,0 +1,76 @@
"""Parse an Elmhurst *recommendation* Summary PDF into an EpcPropertyData.
The Modelling cascade pins use Elmhurst's own before/after measure
re-lodgements as deterministic test vectors: each measure folder under
`sap worksheets/Recommendations Elmhurst Files/` holds a `before` Summary
(the baseline cert) and an `after` Summary (the same cert re-lodged with the
measure applied). Applying the matching Recommendation Generator's overlay to
the parsed `before` must reproduce the calculator's score on the parsed
`after` at delta 0 — proving the overlay is the exact field change Elmhurst
made.
This routes the Summary PDF through the same extractor + mapper chain the
worksheet e2e fixtures use (`_elmhurst_worksheet_001431.build_epc`), NOT the
Textract `parse_site_notes_pdf` path — that path has an unrelated window
extraction bug on cert 001431. The before/after Summaries are mirrored into
`tests/domain/modelling/fixtures/` so the pins do not depend on the unstaged
workspace.
"""
from __future__ import annotations
import re
import subprocess
from pathlib import Path
from typing import Final
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
# Directory holding the mirrored before/after Summary PDFs. Resolved from this
# module's own location so lookups do not depend on the working directory.
_FIXTURES_DIR: Final[Path] = Path(__file__).resolve().parent / "fixtures"
def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
    """Render a Summary PDF as one newline-delimited token string per page.

    This is the per-page text format the `ElmhurstSiteNotesExtractor`
    expects (label\\nvalue sequences). Mirror of the helper in
    `_elmhurst_worksheet_001431.py`.

    The page count is read from `pdfinfo`; each page is then rendered on its
    own with `pdftotext -layout`, which keeps a label and its value on the
    same line. Every non-blank line is split on runs of two or more
    whitespace characters to surface the individual tokens, and a blank line
    contributes a single empty token.

    Raises:
        RuntimeError: if the `pdfinfo` output has no `Pages:` entry.
        subprocess.CalledProcessError: if `pdfinfo` or `pdftotext` exits
            with a non-zero status (both are run with ``check=True``).
    """
    pdfinfo_output = subprocess.run(
        ["pdfinfo", str(pdf_path)], capture_output=True, text=True, check=True,
    ).stdout
    pages_field = re.search(r"Pages:\s+(\d+)", pdfinfo_output)
    if pages_field is None:
        raise RuntimeError(f"Could not parse page count from {pdf_path}")
    last_page = int(pages_field.group(1))

    def _render_page(page_number: int) -> str:
        # Same value for -f and -l restricts pdftotext to a single page;
        # the trailing "-" sends the text to stdout.
        page_arg = str(page_number)
        rendered = subprocess.run(
            [
                "pdftotext", "-layout", "-f", page_arg, "-l", page_arg,
                str(pdf_path), "-",
            ],
            capture_output=True, text=True, check=True,
        ).stdout
        cells: list[str] = []
        for raw_line in rendered.splitlines():
            stripped = raw_line.strip()
            if stripped:
                cells.extend(
                    cell for cell in re.split(r"\s{2,}", stripped) if cell
                )
            else:
                # Keep blank lines as empty tokens so spacing survives.
                cells.append("")
        return "\n".join(cells)

    return [_render_page(number) for number in range(1, last_page + 1)]
def parse_recommendation_summary(fixture_name: str) -> EpcPropertyData:
    """Turn a before/after recommendation Summary fixture into EpcPropertyData.

    ``fixture_name`` is the PDF's file name inside
    `tests/domain/modelling/fixtures/`. The PDF is converted to per-page
    token text, run through `ElmhurstSiteNotesExtractor`, and the extracted
    site notes are mapped with
    `EpcPropertyDataMapper.from_elmhurst_site_notes`.
    """
    extractor = ElmhurstSiteNotesExtractor(
        _summary_pdf_to_textract_style_pages(_FIXTURES_DIR / fixture_name)
    )
    return EpcPropertyDataMapper.from_elmhurst_site_notes(extractor.extract())

View file

@ -0,0 +1,81 @@
"""Elmhurst before/after cascade pins for the Recommendation Generators.
Each measure has an Elmhurst `before` Summary (baseline cert) and an `after`
Summary (the same cert re-lodged with the measure applied). The pin drives the
matching generator on the parsed `before`, scores its Option's overlay through
the `PackageScorer`, and asserts the result equals the calculator's score on
the parsed `after` at `abs(diff) <= 1e-4` for SAP / CO2 / primary energy.
This is the real cert → generator → overlay → calculator cascade, not a per-section
isolation test (see `[[feedback-cascade-pin-methodology]]`): a non-zero delta
is a named generator/overlay/calculator gap to fix, never a tolerance to widen
(`[[feedback-zero-error-strict]]`).
"""
from __future__ import annotations
from typing import Final
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from domain.modelling.package_scorer import PackageScorer, Score
from domain.modelling.product import Product
from domain.modelling.recommendation import Recommendation
from domain.modelling.simulation import EpcSimulation
from domain.modelling.wall_recommendation import recommend_cavity_wall
from domain.sap10_calculator.calculator import Sap10Calculator, SapResult
from repositories.product.product_repository import ProductRepository
from tests.domain.modelling._elmhurst_recommendation import (
parse_recommendation_summary,
)
# Pin tolerance: the Summary PDFs are deterministic test vectors, so the
# overlay must reproduce the re-lodged cert exactly. Matches the worksheet
# e2e tolerance. Used as an absolute bound on each compared metric.
_PIN_ABS: Final[float] = 1e-4
class _AnyProduct(ProductRepository):
    """In-memory ProductRepository that hands back a fixed Product for any
    Measure Type.

    The pins assert the SAP cascade, not Cost, so the unit cost is
    immaterial — only the generator's overlay is exercised.
    """

    def get(self, measure_type: str) -> Product:
        """Return a placeholder Product tagged with ``measure_type``."""
        placeholder = Product(
            measure_type=measure_type,
            unit_cost_per_m2=1.0,
            contingency_rate=0.0,
        )
        return placeholder
def _assert_overlay_reproduces_after(
    before: EpcPropertyData, after: EpcPropertyData, overlay: EpcSimulation
) -> None:
    """Assert that ``overlay`` applied to ``before`` scores like ``after``.

    The re-lodged ``after`` is run through the calculator directly, while
    ``before`` plus ``overlay`` goes through a `PackageScorer` built on the
    same calculator instance. SAP, CO2 and primary energy must each agree to
    within `_PIN_ABS` (absolute difference).
    """
    engine = Sap10Calculator()
    expected: SapResult = engine.calculate(after)
    actual: Score = PackageScorer(engine).score(before, [overlay])

    sap_gap = abs(actual.sap_continuous - expected.sap_score_continuous)
    assert sap_gap <= _PIN_ABS

    co2_gap = abs(actual.co2_kg_per_yr - expected.co2_kg_per_yr)
    assert co2_gap <= _PIN_ABS

    primary_energy_gap = abs(
        actual.primary_energy_kwh_per_yr - expected.primary_energy_kwh_per_yr
    )
    assert primary_energy_gap <= _PIN_ABS
def test_cavity_wall_overlay_reproduces_the_relodged_after() -> None:
    """Cavity-wall pin: the generator's first Option overlay, scored on the
    parsed `before` Summary, must match the calculator's score on the parsed
    `after` Summary."""
    # Arrange
    baseline: EpcPropertyData = parse_recommendation_summary(
        "cavity_wall_001431_before.pdf"
    )
    relodged: EpcPropertyData = parse_recommendation_summary(
        "cavity_wall_001431_after.pdf"
    )
    generated: Recommendation | None = recommend_cavity_wall(
        baseline, _AnyProduct()
    )
    assert generated is not None
    first_option = generated.options[0]

    # Act / Assert
    _assert_overlay_reproduces_after(baseline, relodged, first_option.overlay)