"""Parity-validation report for the deterministic SAP 10.2 calculator.

ADR-0009 Session B compares `Sap10Calculator.calculate(epc).sap_score`
to the cert's `energy_rating_current` across a 1000-cert stratified
sample. The success criterion is MAE ≤ 1.0 SAP-point on the *typical
subset* (cohort excluding catastrophic-tail certs, multi-heating,
conservatory, room-in-roof) — those edge cases are themselves the
backlog Session B iterates against.

This module is the pure aggregation step: given a list of per-cert
`ParityCase` records, it emits a typed `ParityReport` with the global
MAE/RMSE/bias, the typical-subset MAE, and the worst-N cases by
|residual| for investigation.

The cert→case mapping itself (loading from the corpus, running the
calculator, looking up the cert's actual sap) lives at a higher layer —
keeps this report module trivial to test.

Reference: ADR-0009 §"Validation" + Session B plan.
"""

from __future__ import annotations

from dataclasses import dataclass
from math import sqrt
from typing import Final

# Default length of `ParityReport.worst_cases`.
_DEFAULT_WORST_N: Final[int] = 25


@dataclass(frozen=True)
class ParityCase:
    """One certificate's calculator-vs-cert SAP comparison.

    `is_typical` marks whether the cert belongs to the typical subset
    the Session B success criterion is measured against. Catastrophic-tail
    certs (sap ≤ 5 or ≥ 100), multi-heating, conservatory, and
    room-in-roof cases set this False — they show up in the global
    aggregate but not the typical-subset MAE.
    """

    # Identifier of the certificate this comparison belongs to.
    certificate_number: str
    # SAP score recorded on the certificate.
    actual_sap: int
    # SAP score produced by the calculator.
    predicted_sap: float
    # True when the case counts towards the typical-subset MAE.
    is_typical: bool


@dataclass(frozen=True)
class ParityReport:
    """Aggregated parity metrics over a list of `ParityCase` records.

    Every error metric is derived from the residual predicted − actual.
    A metric computed over an empty set of cases is reported as 0.0, so
    check `typical_case_count` (or `case_count`) before reading a zero
    as "perfect parity".
    """

    # Number of cases aggregated.
    case_count: int
    # Number of those cases with `is_typical` set.
    typical_case_count: int
    # Mean |residual| over all cases.
    global_mae: float
    # Mean |residual| over the typical cases only.
    typical_mae: float
    # Root-mean-square residual over all cases.
    global_rmse: float
    # Mean signed residual over all cases (positive = over-prediction).
    global_bias: float
    # Cases with the largest |residual|, largest first.
    worst_cases: tuple[ParityCase, ...]


def _residual(case: ParityCase) -> float:
    """Predicted − actual. Positive = calculator over-predicts."""
    return case.predicted_sap - case.actual_sap


def _mean_abs(cases: list[ParityCase]) -> float:
    """Return the mean absolute residual, or 0.0 when `cases` is empty."""
    if not cases:
        return 0.0
    return sum(abs(_residual(c)) for c in cases) / len(cases)


def _rmse(cases: list[ParityCase]) -> float:
    """Return the root-mean-square residual, or 0.0 when `cases` is empty."""
    if not cases:
        return 0.0
    return sqrt(sum(_residual(c) ** 2 for c in cases) / len(cases))


def _bias(cases: list[ParityCase]) -> float:
    """Return the mean signed residual, or 0.0 when `cases` is empty."""
    if not cases:
        return 0.0
    return sum(_residual(c) for c in cases) / len(cases)


def build_parity_report(
    cases: list[ParityCase], *, worst_n: int = _DEFAULT_WORST_N
) -> ParityReport:
    """Aggregate a list of `ParityCase` into a typed `ParityReport`.

    Args:
        cases: Per-certificate comparisons; may be empty, in which case
            every metric is 0.0 and `worst_cases` is empty.
        worst_n: Maximum number of cases to keep in `worst_cases`.
            Zero or a negative value keeps none.

    Returns:
        A `ParityReport` with the global MAE/RMSE/bias, the MAE of the
        `is_typical` cases, and up to `worst_n` cases ordered by
        descending |residual|. Cases with equal |residual| keep their
        relative order from `cases`.
    """
    typical = [c for c in cases if c.is_typical]

    # Clamp before slicing: a negative bound would follow slice semantics
    # ("everything except the last |n|") and return nearly the whole
    # cohort instead of no cases at all.
    keep = max(worst_n, 0)
    ranked = sorted(cases, key=lambda c: abs(_residual(c)), reverse=True)

    return ParityReport(
        case_count=len(cases),
        typical_case_count=len(typical),
        global_mae=_mean_abs(cases),
        typical_mae=_mean_abs(typical),
        global_rmse=_rmse(cases),
        global_bias=_bias(cases),
        worst_cases=tuple(ranked[:keep]),
    )