slice S-B1: parity-validation report aggregator

Pure-function ParityCase / ParityReport / build_parity_report for the
Session B 1000-cert parity check (ADR-0009). Aggregates per-cert
(predicted, actual) sap pairs into global + typical-subset MAE, RMSE,
bias, and the worst-N residuals for spec-iteration. Cert→case mapping
(corpus load, calculator run, actual-sap lookup) sits at a higher
layer; this module is trivial to test so the harder integration code
inherits its testing.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-18 13:22:45 +00:00
parent a243055de7
commit 57f18a8773
4 changed files with 231 additions and 0 deletions

View file

@ -0,0 +1,95 @@
"""Parity-validation report for the deterministic SAP 10.3 calculator.
ADR-0009 Session B compares `Sap10Calculator.calculate(epc).sap_score`
to the cert's `energy_rating_current` across a 1000-cert stratified
sample. The success criterion is MAE ≤ 1.0 SAP-point on the *typical
subset* (cohort excluding catastrophic-tail certs, multi-heating,
conservatory, room-in-roof) — those edge cases are themselves the
backlog Session B iterates against.
This module is the pure aggregation step: given a list of per-cert
`ParityCase` records, it emits a typed `ParityReport` with global +
typical-subset MAE/RMSE/bias and the worst-N cases by |residual| for
investigation. The cert→case mapping itself (loading from the corpus,
running the calculator, looking up the cert's actual sap) lives at a
higher layer, which keeps this report module trivial to test.
Reference: ADR-0009 §"Validation" + Session B plan.
"""
from __future__ import annotations
from dataclasses import dataclass
from math import sqrt
from typing import Final
# Default length of `ParityReport.worst_cases`: used by
# `build_parity_report` when the caller does not pass `worst_n`.
_DEFAULT_WORST_N: Final[int] = 25
@dataclass(frozen=True)
class ParityCase:
    """Calculator-vs-certificate SAP comparison for a single certificate.

    `is_typical` records whether the cert counts towards the typical
    subset that the Session B success criterion is measured against.
    Catastrophic-tail certs (sap at the 5 / 100 extremes; the exact
    cut-off is decided by whoever builds the case), multi-heating,
    conservatory, and room-in-roof cases carry `is_typical=False` — they
    still contribute to the global aggregates but not to the
    typical-subset MAE.
    """

    # Identifier of the certificate this comparison belongs to.
    certificate_number: str
    # SAP score taken from the certificate itself.
    actual_sap: int
    # SAP score the calculator produced for the same certificate.
    predicted_sap: float
    # True when the cert belongs to the typical subset.
    is_typical: bool
@dataclass(frozen=True)
class ParityReport:
    """Immutable aggregate of a batch of `ParityCase` records.

    Residuals are `predicted_sap - actual_sap`; "global" figures cover
    every case, "typical" figures only the cases with `is_typical=True`.
    """

    # Number of cases aggregated.
    case_count: int
    # Number of those cases flagged `is_typical`.
    typical_case_count: int
    # Mean absolute residual over all cases.
    global_mae: float
    # Mean absolute residual over the typical subset only.
    typical_mae: float
    # Root-mean-square residual over all cases.
    global_rmse: float
    # Mean signed residual over all cases (positive = over-prediction).
    global_bias: float
    # Cases with the largest |residual|, largest first.
    worst_cases: tuple[ParityCase, ...]
def _residual(case: ParityCase) -> float:
    """Return the signed error `predicted_sap - actual_sap` for *case*.

    Positive means the calculator over-predicts the cert's score;
    negative means it under-predicts.
    """
    predicted, actual = case.predicted_sap, case.actual_sap
    return predicted - actual
def _mean_abs(cases: list[ParityCase]) -> float:
    """Return the mean absolute residual of *cases*, or 0.0 when empty."""
    if cases:
        # Built-in `sum` is kept so float accumulation matches exactly.
        return sum(map(abs, map(_residual, cases))) / len(cases)
    return 0.0
def _rmse(cases: list[ParityCase]) -> float:
    """Return the root-mean-square residual of *cases*, or 0.0 when empty."""
    if not cases:
        return 0.0
    mean_square = sum(_residual(case) ** 2 for case in cases) / len(cases)
    return sqrt(mean_square)
def _bias(cases: list[ParityCase]) -> float:
    """Return the mean signed residual of *cases*, or 0.0 when empty.

    Positive bias means the calculator over-predicts on average.
    """
    return sum(map(_residual, cases)) / len(cases) if cases else 0.0
def build_parity_report(
    cases: list[ParityCase], *, worst_n: int = _DEFAULT_WORST_N
) -> ParityReport:
    """Aggregate a list of `ParityCase` into a typed `ParityReport`.

    Args:
        cases: Per-certificate comparisons. An empty list yields a report
            with zero counts, 0.0 metrics and no worst cases.
        worst_n: Maximum number of cases to keep in `worst_cases`,
            ranked by |residual| with the largest first. Cases with
            equal |residual| keep their input order. Must be >= 0.

    Returns:
        A `ParityReport` with global MAE/RMSE/bias over every case, MAE
        over the `is_typical` subset, and the worst-N cases.

    Raises:
        ValueError: If `worst_n` is negative.
    """
    # A negative bound would make the slice below drop items from the END
    # of the ranking (i.e. return everything except the best cases)
    # instead of returning the worst N, so reject it explicitly.
    if worst_n < 0:
        raise ValueError(f"worst_n must be non-negative, got {worst_n}")

    typical = [c for c in cases if c.is_typical]
    ranked = sorted(cases, key=lambda c: abs(_residual(c)), reverse=True)
    return ParityReport(
        case_count=len(cases),
        typical_case_count=len(typical),
        global_mae=_mean_abs(cases),
        typical_mae=_mean_abs(typical),
        global_rmse=_rmse(cases),
        global_bias=_bias(cases),
        worst_cases=tuple(ranked[:worst_n]),
    )

View file

@ -0,0 +1,136 @@
"""Tests for the parity-validation report.
The report aggregates per-cert (predicted, actual) sap-score pairs into
the audit shape ADR-0009 Session B specifies: global MAE/RMSE/bias, MAE
on the "typical subset" (excluding catastrophic-tail certs), and the
worst-N residual cases for spec-interpretation iteration.
Tests use synthetic pair lists so the math is hand-verifiable.
Reference: ADR-0009 (Accepted) §"Validation" and Session B success
criterion (MAE ≤ 1.0 SAP-point on the typical subset).
"""
from __future__ import annotations
import pytest
from domain.sap.validation.parity_report import (
ParityCase,
ParityReport,
build_parity_report,
)
def _cases(*pairs: tuple[int, float]) -> list[ParityCase]:
    """Turn (actual, predicted) pairs into typical ParityCase objects.

    Each case's certificate number is its position in *pairs* as a
    string ("0", "1", ...), and every case is flagged `is_typical=True`,
    which keeps the tests terse.
    """
    built: list[ParityCase] = []
    for index, (actual, predicted) in enumerate(pairs):
        built.append(
            ParityCase(
                certificate_number=str(index),
                actual_sap=actual,
                predicted_sap=predicted,
                is_typical=True,
            )
        )
    return built
def test_global_mae_is_mean_absolute_residual_across_all_cases() -> None:
    # Arrange — three certs with |residual| 5, 3 and 4, so the expected
    # MAE is (5 + 3 + 4) / 3 = 4.
    expected_mae = 4.0
    parity_cases = _cases((60, 55.0), (70, 73.0), (80, 76.0))

    # Act
    result = build_parity_report(parity_cases)

    # Assert
    assert result.global_mae == pytest.approx(expected_mae, abs=1e-6)
def test_global_rmse_uses_root_mean_square_of_residuals() -> None:
    # Arrange — |residuals| 5, 3, 4 give squares 25, 9, 16, so
    # RMSE = sqrt(50 / 3) ≈ 4.082.
    expected_rmse = 4.082
    parity_cases = _cases((60, 55.0), (70, 73.0), (80, 76.0))

    # Act
    result = build_parity_report(parity_cases)

    # Assert
    assert result.global_rmse == pytest.approx(expected_rmse, abs=0.01)
def test_global_bias_is_signed_mean_residual_predicted_minus_actual() -> None:
    # Arrange — signed residuals (predicted - actual) are -5, +3, -4, so
    # the mean is -6 / 3 = -2.0, i.e. net under-prediction.
    expected_bias = -2.0
    parity_cases = _cases((60, 55.0), (70, 73.0), (80, 76.0))

    # Act
    result = build_parity_report(parity_cases)

    # Assert
    assert result.global_bias == pytest.approx(expected_bias, abs=1e-6)
def test_typical_subset_mae_ignores_cases_flagged_not_typical() -> None:
    # Arrange — three typical certs (|residual| 1, 1, 2) plus one
    # catastrophic-tail cert (actual sap 3, |residual| 32) that is flagged
    # not typical and so must be left out of the typical MAE.
    typical_cases = [
        ParityCase("a", 60, 59.0, is_typical=True),
        ParityCase("b", 70, 71.0, is_typical=True),
        ParityCase("c", 80, 78.0, is_typical=True),
    ]
    tail_case = ParityCase("d", 3, 35.0, is_typical=False)

    # Act
    result = build_parity_report([*typical_cases, tail_case])

    # Assert — counts first, then the two MAEs:
    #   typical: (1 + 1 + 2) / 3 ≈ 1.333
    #   global:  (1 + 1 + 2 + 32) / 4 = 9.0 (includes the d-cert blowout)
    assert result.case_count == 4
    assert result.typical_case_count == 3
    assert result.typical_mae == pytest.approx(1.333, abs=0.01)
    assert result.global_mae == pytest.approx(9.0, abs=1e-6)
def test_worst_cases_returns_largest_absolute_residuals_first() -> None:
    # Arrange — (id, actual, predicted) rows whose |residuals| are
    # a=1, b=10, c=3, d=7, e=2 with mixed signs. The three largest are
    # therefore b (10), d (7), c (3), in that order.
    rows = (
        ("a", 60, 59.0),
        ("b", 70, 80.0),
        ("c", 80, 77.0),
        ("d", 50, 57.0),
        ("e", 65, 63.0),
    )
    parity_cases = [
        ParityCase(cert_id, actual, predicted, is_typical=True)
        for cert_id, actual, predicted in rows
    ]

    # Act
    result = build_parity_report(parity_cases, worst_n=3)

    # Assert
    ranked_ids = [case.certificate_number for case in result.worst_cases]
    assert ranked_ids == ["b", "d", "c"]
def test_empty_case_list_yields_zeroed_report_without_division_error() -> None:
    # Arrange — parity validation may run before the cohort is loaded;
    # that must produce an all-zero report rather than a crash.
    no_cases: list[ParityCase] = []

    # Act
    result = build_parity_report(no_cases)

    # Assert — counts, then every metric, then the worst-case tuple.
    assert result.case_count == 0
    assert result.typical_case_count == 0
    for metric in ("global_mae", "typical_mae", "global_rmse", "global_bias"):
        assert getattr(result, metric) == 0.0
    assert result.worst_cases == ()
def test_parity_report_is_immutable_dataclass() -> None:
    # Imported locally so this test carries its own dependency; the
    # module's import block is left untouched.
    from dataclasses import FrozenInstanceError

    # Arrange — a frozen dataclass guarantees the report's audit values
    # cannot be retroactively mutated after construction.
    report = build_parity_report(_cases((60, 60.0)))

    # Act / Assert — expect the specific frozen-dataclass error. A bare
    # `Exception` would also pass on unrelated failures and so would not
    # prove immutability.
    with pytest.raises(FrozenInstanceError):
        report.global_mae = 99.9  # type: ignore[misc]

    # The rejected assignment must leave the original value in place
    # (single case with residual 0 → MAE 0.0).
    assert report.global_mae == 0.0