# Model/domain/sap10_calculator/validation/parity_report.py

"""Parity-validation report for the deterministic SAP 10.2 calculator.

ADR-0009 Session B compares `Sap10Calculator.calculate(epc).sap_score`
to the cert's `energy_rating_current` across a 1000-cert stratified
sample. The success criterion is MAE ≤ 1.0 SAP-point on the *typical
subset* (cohort excluding catastrophic-tail certs, multi-heating,
conservatory, room-in-roof) — those edge cases are themselves the
backlog Session B iterates against.

This module is the pure aggregation step: given a list of per-cert
`ParityCase` records, it emits a typed `ParityReport` with global
MAE/RMSE/bias, the typical-subset MAE, and the worst-N cases by
|residual| for investigation. The cert→case mapping itself (loading
from the corpus, running the calculator, looking up the cert's actual
sap) lives at a higher layer, which keeps this report module trivial to
test.

Reference: ADR-0009 §"Validation" + Session B plan.
"""

from __future__ import annotations

from dataclasses import dataclass
from math import sqrt
from typing import Final


# Default cap on how many largest-|residual| cases `build_parity_report`
# keeps in `ParityReport.worst_cases` when the caller passes no `worst_n`.
_DEFAULT_WORST_N: Final[int] = 25


@dataclass(frozen=True)
class ParityCase:
    """One certificate's calculator-vs-cert SAP comparison.

    `is_typical` marks whether the cert belongs to the typical subset
    the Session B success criterion is measured against. Catastrophic-
    tail certs (sap ≤ 5 or ≥ 100), multi-heating, conservatory, and
    room-in-roof cases set this False — they show up in the global
    aggregate but not the typical-subset MAE.

    Instances are immutable (`frozen=True`).
    """

    # Identifier of the certificate this comparison belongs to.
    certificate_number: str
    # SAP score recorded on the certificate (`energy_rating_current`).
    actual_sap: int
    # SAP score the calculator produced for the same certificate.
    predicted_sap: float
    # True when the case counts towards the typical-subset MAE.
    is_typical: bool


@dataclass(frozen=True)
class ParityReport:
    """Aggregated parity metrics over a list of `ParityCase` records.

    Residuals are predicted − actual, so a positive `global_bias` means
    the calculator over-predicts on average. As produced by
    `build_parity_report`, any aggregate computed over an empty set of
    cases is 0.0 — check the matching count before reading a zero as
    perfect parity.
    """

    # Number of cases aggregated (typical and non-typical together).
    case_count: int
    # Number of cases whose `is_typical` flag is set.
    typical_case_count: int
    # Mean absolute residual over all cases.
    global_mae: float
    # Mean absolute residual over the typical subset only.
    typical_mae: float
    # Root-mean-square residual over all cases.
    global_rmse: float
    # Mean signed residual over all cases.
    global_bias: float
    # Cases with the largest |residual|, worst first.
    worst_cases: tuple[ParityCase, ...]


def _residual(case: ParityCase) -> float:
    """Signed error for one case: predicted SAP minus the cert's SAP.

    A positive value means the calculator over-predicts; a negative
    value means it under-predicts.
    """
    predicted, actual = case.predicted_sap, case.actual_sap
    return predicted - actual


def _mean_abs(cases: list[ParityCase]) -> float:
    """Mean absolute residual (MAE) over `cases`; 0.0 for an empty list."""
    if cases:
        absolute_errors = [abs(_residual(item)) for item in cases]
        return sum(absolute_errors) / len(absolute_errors)
    # No cases: report 0.0 rather than dividing by zero.
    return 0.0


def _rmse(cases: list[ParityCase]) -> float:
    """Root-mean-square residual over `cases`; 0.0 for an empty list."""
    if cases:
        squared_errors = [_residual(item) ** 2 for item in cases]
        mean_square = sum(squared_errors) / len(squared_errors)
        return sqrt(mean_square)
    # No cases: report 0.0 rather than dividing by zero.
    return 0.0


def _bias(cases: list[ParityCase]) -> float:
    """Mean signed residual over `cases`; 0.0 for an empty list.

    Positive means the calculator over-predicts on average.
    """
    count = len(cases)
    # No cases: report 0.0 rather than dividing by zero.
    return sum(map(_residual, cases)) / count if cases else 0.0


def build_parity_report(
    cases: list[ParityCase], *, worst_n: int = _DEFAULT_WORST_N
) -> ParityReport:
    """Aggregate a list of `ParityCase` into a typed `ParityReport`.

    Args:
        cases: Per-certificate comparisons. May be empty, in which case
            every aggregate is 0.0 and `worst_cases` is empty.
        worst_n: Maximum number of cases to keep in `worst_cases`.
            0 yields an empty tuple; values larger than `len(cases)`
            keep every case.

    Returns:
        A `ParityReport` with global MAE/RMSE/bias over all cases, MAE
        over the `is_typical` subset, and the `worst_n` cases with the
        largest |residual|, worst first (ties keep input order).

    Raises:
        ValueError: If `worst_n` is negative.

    Note:
        When no case is typical, `typical_mae` is 0.0 — check
        `typical_case_count` before treating it as meeting a threshold.
    """
    # A negative bound would make the slice below drop the *least*-bad
    # cases from the end instead of capping the list, silently returning
    # almost every case.
    if worst_n < 0:
        raise ValueError(f"worst_n must be non-negative, got {worst_n}")

    typical = [c for c in cases if c.is_typical]
    # sorted() is stable, so equal-|residual| cases stay in input order.
    ranked = sorted(cases, key=lambda c: abs(_residual(c)), reverse=True)
    return ParityReport(
        case_count=len(cases),
        typical_case_count=len(typical),
        global_mae=_mean_abs(cases),
        typical_mae=_mean_abs(typical),
        global_rmse=_rmse(cases),
        global_bias=_bias(cases),
        worst_cases=tuple(ranked[:worst_n]),
    )