slice S-B1: parity-validation report aggregator

Pure-function ParityCase / ParityReport / build_parity_report for the Session B 1000-cert parity check (ADR-0009). Aggregates per-cert (predicted, actual) sap pairs into global + typical-subset MAE, RMSE, bias, and the worst-N residuals for spec-iteration. Cert→case mapping (corpus load, calculator run, actual-sap lookup) sits at a higher layer; this module is trivial to test so the harder integration code inherits its testing. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-07-27 23:35:01 +00:00 · 2026-05-18 13:22:45 +00:00 · 2026-05-18 13:22:45 +00:00 · 57f18a8773
commit 57f18a8773
parent a243055de7
4 changed files with 231 additions and 0 deletions
--- a/packages/domain/src/domain/sap/validation/init.py
+++ b/packages/domain/src/domain/sap/validation/init.py
--- a/packages/domain/src/domain/sap/validation/parity_report.py
+++ b/packages/domain/src/domain/sap/validation/parity_report.py
@ -0,0 +1,95 @@
+"""Parity-validation report for the deterministic SAP 10.3 calculator.
+
+ADR-0009 Session B compares `Sap10Calculator.calculate(epc).sap_score`
+to the cert's `energy_rating_current` across a 1000-cert stratified
+sample. The success criterion is MAE ≤ 1.0 SAP-point on the *typical
+subset* (cohort excluding catastrophic-tail certs, multi-heating,
+conservatory, room-in-roof) — those edge cases are themselves the
+backlog Session B iterates against.
+
+This module is the pure aggregation step: given a list of per-cert
+`ParityCase` records, it emits a typed `ParityReport` with global
+MAE/RMSE/bias, the typical-subset MAE, and the worst-N cases by
+|residual| for investigation. The cert→case mapping itself (loading
+from the corpus, running the calculator, looking up the cert's actual
+sap) lives at a higher layer, which keeps this report module trivial
+to test.
+
+Reference: ADR-0009 §"Validation" + Session B plan.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from math import sqrt
+from typing import Final
+
+
+# Default cap on how many highest-|residual| cases a report keeps in
+# its worst-case list when the caller does not pass `worst_n`.
+_DEFAULT_WORST_N: Final[int] = 25
+
+
+@dataclass(frozen=True)
+class ParityCase:
+    """One certificate's calculator-vs-cert SAP comparison.
+
+    Immutable record pairing the SAP score held on a certificate with
+    the score the calculator predicted for that same certificate.
+
+    `is_typical` marks whether the cert belongs to the typical subset
+    the Session B success criterion is measured against. Catastrophic-
+    tail certs (sap ≤ 5 or ≥ 100), multi-heating, conservatory, and
+    room-in-roof cases set this False — they show up in the global
+    aggregate but not the typical-subset MAE.
+    """
+
+    # Identifier of the certificate this comparison belongs to.
+    certificate_number: str
+    # SAP score taken from the certificate (the reference value).
+    actual_sap: int
+    # SAP score the calculator produced for the same certificate.
+    predicted_sap: float
+    # True when the case counts towards the typical-subset MAE.
+    is_typical: bool
+
+
+@dataclass(frozen=True)
+class ParityReport:
+    """Immutable summary of a parity run over a list of `ParityCase`.
+
+    Residuals are predicted − actual, so a positive `global_bias` means
+    the calculator over-predicts on average. All metrics are 0.0 when
+    the population they are computed over is empty.
+    """
+
+    # Number of cases in the whole input.
+    case_count: int
+    # Number of cases whose `is_typical` flag is True.
+    typical_case_count: int
+    # Mean absolute residual over every case.
+    global_mae: float
+    # Mean absolute residual over the typical subset only.
+    typical_mae: float
+    # Root-mean-square residual over every case.
+    global_rmse: float
+    # Mean signed residual over every case.
+    global_bias: float
+    # Cases with the largest |residual|, largest first.
+    worst_cases: tuple[ParityCase, ...]
+
+
+def _residual(case: ParityCase) -> float:
+    """Return the signed error of one case: predicted minus actual.
+
+    A positive result means the calculator scored the cert higher than
+    the cert's own rating; a negative result means it scored lower.
+    """
+    predicted, actual = case.predicted_sap, case.actual_sap
+    return predicted - actual
+
+
+def _mean_abs(cases: list[ParityCase]) -> float:
+    """Return the mean absolute residual, or 0.0 when `cases` is empty."""
+    if cases:
+        absolute_errors = map(abs, map(_residual, cases))
+        return sum(absolute_errors) / len(cases)
+    # Nothing to average: report zero instead of dividing by zero.
+    return 0.0
+
+
+def _rmse(cases: list[ParityCase]) -> float:
+    """Return the root-mean-square residual, or 0.0 when `cases` is empty."""
+    if cases:
+        squared_total = sum(error ** 2 for error in map(_residual, cases))
+        mean_square = squared_total / len(cases)
+        return sqrt(mean_square)
+    # Nothing to average: report zero instead of dividing by zero.
+    return 0.0
+
+
+def _bias(cases: list[ParityCase]) -> float:
+    """Return the mean signed residual, or 0.0 when `cases` is empty.
+
+    The sign follows `_residual`: positive means over-prediction.
+    """
+    if cases:
+        signed_total = sum(map(_residual, cases))
+        return signed_total / len(cases)
+    # Nothing to average: report zero instead of dividing by zero.
+    return 0.0
+
+
+def build_parity_report(
+    cases: list[ParityCase], *, worst_n: int = _DEFAULT_WORST_N
+) -> ParityReport:
+    """Aggregate a list of `ParityCase` into a typed `ParityReport`.
+
+    Args:
+        cases: Per-cert comparisons. May be empty, in which case every
+            metric is 0.0 and `worst_cases` is an empty tuple.
+        worst_n: Maximum number of cases to keep in `worst_cases`,
+            ranked by descending |residual|. Must be non-negative.
+
+    Returns:
+        A `ParityReport` holding MAE/RMSE/bias over all `cases`, MAE
+        over the `is_typical` subset, and up to `worst_n` cases with
+        the largest |residual| (ties keep their input order).
+
+    Raises:
+        ValueError: If `worst_n` is negative.
+    """
+    # A negative slice bound would not cap the list: it would drop the
+    # smallest residuals from the tail and return almost everything.
+    # Reject it up front rather than emit a misleading worst-case list.
+    if worst_n < 0:
+        raise ValueError(f"worst_n must be non-negative, got {worst_n}")
+    typical = [c for c in cases if c.is_typical]
+    # `sorted` is stable even with reverse=True, so equal |residual|
+    # cases stay in the order the caller supplied them.
+    ranked = sorted(cases, key=lambda c: abs(_residual(c)), reverse=True)
+    return ParityReport(
+        case_count=len(cases),
+        typical_case_count=len(typical),
+        global_mae=_mean_abs(cases),
+        typical_mae=_mean_abs(typical),
+        global_rmse=_rmse(cases),
+        global_bias=_bias(cases),
+        worst_cases=tuple(ranked[:worst_n]),
+    )
--- a/packages/domain/src/domain/sap/validation/tests/init.py
+++ b/packages/domain/src/domain/sap/validation/tests/init.py
--- a/packages/domain/src/domain/sap/validation/tests/test_parity_report.py
+++ b/packages/domain/src/domain/sap/validation/tests/test_parity_report.py
@ -0,0 +1,136 @@
+"""Tests for the parity-validation report.
+
+The report aggregates per-cert (predicted, actual) sap-score pairs into
+the audit shape ADR-0009 Session B specifies: global MAE/RMSE/bias, MAE
+on the "typical subset" (excluding catastrophic-tail certs), and the
+worst-N residual cases for spec-interpretation iteration.
+
+Tests use synthetic pair lists so the math is hand-verifiable.
+
+Reference: ADR-0009 (Accepted) §"Validation" and Session B success
+criterion (MAE ≤ 1.0 SAP-point on the typical subset).
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from domain.sap.validation.parity_report import (
+    ParityCase,
+    ParityReport,
+    build_parity_report,
+)
+
+
+def _cases(*pairs: tuple[int, float]) -> list[ParityCase]:
+    """Turn (actual, predicted) pairs into typical ParityCase records.
+
+    Each case's certificate number is the stringified position of its
+    pair, which keeps the test bodies short.
+    """
+    return [
+        ParityCase(str(index), actual, predicted, is_typical=True)
+        for index, (actual, predicted) in enumerate(pairs)
+    ]
+
+
+def test_global_mae_is_mean_absolute_residual_across_all_cases() -> None:
+    # Arrange — absolute residuals of 5, 3 and 4 average to 12 / 3 = 4.
+    pairs = [(60, 55.0), (70, 73.0), (80, 76.0)]
+
+    # Act
+    mae = build_parity_report(_cases(*pairs)).global_mae
+
+    # Assert
+    assert mae == pytest.approx(4.0, abs=1e-6)
+
+
+def test_global_rmse_uses_root_mean_square_of_residuals() -> None:
+    # Arrange — squared residuals 25, 9 and 16 give
+    # RMSE = sqrt(50 / 3) ≈ 4.082.
+    pairs = [(60, 55.0), (70, 73.0), (80, 76.0)]
+
+    # Act
+    rmse = build_parity_report(_cases(*pairs)).global_rmse
+
+    # Assert
+    assert rmse == pytest.approx(4.082, abs=0.01)
+
+
+def test_global_bias_is_signed_mean_residual_predicted_minus_actual() -> None:
+    # Arrange — signed residuals -5, +3 and -4 average to -6 / 3 = -2.0,
+    # i.e. the calculator under-predicts overall.
+    pairs = [(60, 55.0), (70, 73.0), (80, 76.0)]
+
+    # Act
+    bias = build_parity_report(_cases(*pairs)).global_bias
+
+    # Assert
+    assert bias == pytest.approx(-2.0, abs=1e-6)
+
+
+def test_typical_subset_mae_ignores_cases_flagged_not_typical() -> None:
+    # Arrange — three typical certs plus one catastrophic-tail cert
+    # (actual sap 3) that is flagged as not typical.
+    typical_cases = [
+        ParityCase("a", 60, 59.0, is_typical=True),
+        ParityCase("b", 70, 71.0, is_typical=True),
+        ParityCase("c", 80, 78.0, is_typical=True),
+    ]
+    tail_case = ParityCase("d", 3, 35.0, is_typical=False)
+
+    # Act
+    report = build_parity_report([*typical_cases, tail_case])
+
+    # Assert — the tail cert is counted globally but not as typical.
+    assert report.case_count == 4
+    assert report.typical_case_count == 3
+    # Typical |residuals| 1, 1, 2 → MAE = 4/3 ≈ 1.333; adding the tail
+    # cert's 32 gives a global MAE of 36/4 = 9.0.
+    assert report.typical_mae == pytest.approx(1.333, abs=0.01)
+    assert report.global_mae == pytest.approx(9.0, abs=1e-6)
+
+
+def test_worst_cases_returns_largest_absolute_residuals_first() -> None:
+    # Arrange — |residuals| are a=1, b=10, c=3, d=7, e=2 with mixed
+    # signs, so the top three by magnitude are b, d, c in that order.
+    cases = [
+        ParityCase("a", 60, 59.0, is_typical=True),
+        ParityCase("b", 70, 80.0, is_typical=True),
+        ParityCase("c", 80, 77.0, is_typical=True),
+        ParityCase("d", 50, 57.0, is_typical=True),
+        ParityCase("e", 65, 63.0, is_typical=True),
+    ]
+
+    # Act
+    worst = build_parity_report(cases, worst_n=3).worst_cases
+
+    # Assert
+    assert [case.certificate_number for case in worst] == ["b", "d", "c"]
+
+
+def test_empty_case_list_yields_zeroed_report_without_division_error() -> None:
+    # Arrange — an empty cohort (e.g. validation run before any certs
+    # are loaded) must produce an all-zero report, not a crash.
+    expected = ParityReport(
+        case_count=0,
+        typical_case_count=0,
+        global_mae=0.0,
+        typical_mae=0.0,
+        global_rmse=0.0,
+        global_bias=0.0,
+        worst_cases=(),
+    )
+
+    # Act
+    report = build_parity_report([])
+
+    # Assert
+    assert report == expected
+
+
+def test_parity_report_is_immutable_dataclass() -> None:
+    # Imported locally so the narrowed exception type is available
+    # without touching the module-level import block.
+    from dataclasses import FrozenInstanceError
+
+    # Arrange — frozen dataclass guarantees the report's audit values
+    # cannot be retroactively mutated after construction.
+    report = build_parity_report(_cases((60, 60.0)))
+
+    # Act / Assert — expect the specific frozen-dataclass error; a bare
+    # `Exception` would also pass on unrelated failures such as a typo.
+    with pytest.raises(FrozenInstanceError):
+        report.global_mae = 99.9  # type: ignore[misc]
+
+    # The rejected assignment must leave the original value in place.
+    assert report.global_mae == 0.0