mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
slice S-B1: parity-validation report aggregator
Pure-function ParityCase / ParityReport / build_parity_report for the Session B 1000-cert parity check (ADR-0009). Aggregates per-cert (predicted, actual) sap pairs into global + typical-subset MAE, RMSE, bias, and the worst-N residuals for spec-iteration. Cert→case mapping (corpus load, calculator run, actual-sap lookup) sits at a higher layer; this module is trivial to test so the harder integration code inherits its testing. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
a243055de7
commit
57f18a8773
4 changed files with 231 additions and 0 deletions
0
packages/domain/src/domain/sap/validation/__init__.py
Normal file
0
packages/domain/src/domain/sap/validation/__init__.py
Normal file
95
packages/domain/src/domain/sap/validation/parity_report.py
Normal file
95
packages/domain/src/domain/sap/validation/parity_report.py
Normal file
|
|
@ -0,0 +1,95 @@
|
|||
"""Parity-validation report for the deterministic SAP 10.3 calculator.
|
||||
|
||||
ADR-0009 Session B compares `Sap10Calculator.calculate(epc).sap_score`
|
||||
to the cert's `energy_rating_current` across a 1000-cert stratified
|
||||
sample. The success criterion is MAE ≤ 1.0 SAP-point on the *typical
|
||||
subset* (cohort excluding catastrophic-tail certs, multi-heating,
|
||||
conservatory, room-in-roof) — those edge cases are themselves the
|
||||
backlog Session B iterates against.
|
||||
|
||||
This module is the pure aggregation step: given a list of per-cert
|
||||
`ParityCase` records, it emits a typed `ParityReport` with global +
|
||||
typical-subset MAE/RMSE/bias and the worst-N cases by |residual| for
|
||||
investigation. The cert→case mapping itself (loading from the corpus,
|
||||
running the calculator, looking up the cert's actual sap) lives at a
|
||||
higher layer — keeps this report module trivial to test.
|
||||
|
||||
Reference: ADR-0009 §"Validation" + Session B plan.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from math import sqrt
|
||||
from typing import Final
|
||||
|
||||
|
||||
# Number of largest-|residual| cases `build_parity_report` keeps when the
# caller does not pass `worst_n`.
_DEFAULT_WORST_N: Final[int] = 25
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ParityCase:
    """Calculator-vs-certificate SAP comparison for a single certificate.

    The `is_typical` flag says whether this cert counts towards the
    typical subset that the Session B success criterion is judged on.
    It is False for catastrophic-tail certs (sap ≤ 5 or ≥ 100) and for
    multi-heating, conservatory and room-in-roof cases; those still feed
    the global aggregate but are left out of the typical-subset MAE.
    """

    # Identifier of the certificate being compared.
    certificate_number: str
    # SAP score taken from the certificate itself.
    actual_sap: int
    # SAP score produced by the calculator for the same certificate.
    predicted_sap: float
    # True when the cert belongs to the typical subset.
    is_typical: bool
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ParityReport:
    """Immutable aggregate of a batch of `ParityCase` records.

    Every residual-based figure uses predicted − actual SAP.
    """

    # Number of cases aggregated, typical or not.
    case_count: int
    # Number of those cases flagged `is_typical`.
    typical_case_count: int
    # Mean absolute residual over every case.
    global_mae: float
    # Mean absolute residual over the typical subset only.
    typical_mae: float
    # Root-mean-square residual over every case.
    global_rmse: float
    # Signed mean residual over every case; positive = over-prediction.
    global_bias: float
    # Cases with the largest |residual|, biggest first.
    worst_cases: tuple[ParityCase, ...]
|
||||
|
||||
|
||||
def _residual(case: ParityCase) -> float:
    """Return the signed error for one case: predicted minus actual SAP.

    A positive value means the calculator over-predicted.
    """
    predicted = case.predicted_sap
    actual = case.actual_sap
    return predicted - actual
|
||||
|
||||
|
||||
def _mean_abs(cases: list[ParityCase]) -> float:
    """Return the mean absolute residual of `cases`, or 0.0 when empty."""
    if cases:
        absolute_errors = (abs(_residual(case)) for case in cases)
        return sum(absolute_errors) / len(cases)
    # No cases: report zero rather than dividing by zero.
    return 0.0
|
||||
|
||||
|
||||
def _rmse(cases: list[ParityCase]) -> float:
    """Return the root-mean-square residual of `cases`, or 0.0 when empty."""
    if not cases:
        # No cases: report zero rather than dividing by zero.
        return 0.0
    squared_total = sum(_residual(case) ** 2 for case in cases)
    mean_square = squared_total / len(cases)
    return sqrt(mean_square)
|
||||
|
||||
|
||||
def _bias(cases: list[ParityCase]) -> float:
    """Return the signed mean residual of `cases`, or 0.0 when empty.

    Residuals are predicted − actual, so a positive bias means the
    calculator over-predicts on average.
    """
    count = len(cases)
    signed_total = sum(_residual(case) for case in cases)
    # With no cases the total is meaningless; report zero instead of
    # dividing by zero.
    return signed_total / count if cases else 0.0
|
||||
|
||||
|
||||
def build_parity_report(
    cases: list[ParityCase], *, worst_n: int = _DEFAULT_WORST_N
) -> ParityReport:
    """Aggregate a list of `ParityCase` into a typed `ParityReport`.

    Args:
        cases: Per-certificate comparisons. May be empty, in which case
            every metric is 0.0 and `worst_cases` is empty.
        worst_n: Maximum number of cases to keep in `worst_cases`.

    Returns:
        A `ParityReport` holding the global MAE/RMSE/bias, the MAE of the
        `is_typical` subset, and up to `worst_n` cases with the largest
        |residual|, biggest first.

    Raises:
        ValueError: If `worst_n` is negative.
    """
    # A negative bound would turn the slice below into "everything except
    # the last N", silently returning the wrong cases — reject it up front.
    if worst_n < 0:
        raise ValueError(f"worst_n must be non-negative, got {worst_n}")

    typical = [c for c in cases if c.is_typical]
    # sorted() is stable, so cases with equal |residual| keep input order.
    ranked = sorted(cases, key=lambda c: abs(_residual(c)), reverse=True)
    return ParityReport(
        case_count=len(cases),
        typical_case_count=len(typical),
        global_mae=_mean_abs(cases),
        typical_mae=_mean_abs(typical),
        global_rmse=_rmse(cases),
        global_bias=_bias(cases),
        worst_cases=tuple(ranked[:worst_n]),
    )
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
"""Tests for the parity-validation report.
|
||||
|
||||
The report aggregates per-cert (predicted, actual) sap-score pairs into
|
||||
the audit shape ADR-0009 Session B specifies: global MAE/RMSE/bias, MAE
|
||||
on the "typical subset" (excluding catastrophic-tail certs), and the
|
||||
worst-N residual cases for spec-interpretation iteration.
|
||||
|
||||
Tests use synthetic pair lists so the math is hand-verifiable.
|
||||
|
||||
Reference: ADR-0009 (Accepted) §"Validation" and Session B success
|
||||
criterion (MAE ≤ 1.0 SAP-point on the typical subset).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from domain.sap.validation.parity_report import (
|
||||
ParityCase,
|
||||
ParityReport,
|
||||
build_parity_report,
|
||||
)
|
||||
|
||||
|
||||
def _cases(*pairs: tuple[int, float]) -> list[ParityCase]:
    """Turn (actual, predicted) pairs into typical ParityCase objects.

    Each case is flagged typical and numbered by its position in `pairs`,
    which keeps the individual tests short.
    """
    return [
        ParityCase(str(index), actual, predicted, is_typical=True)
        for index, (actual, predicted) in enumerate(pairs)
    ]
|
||||
|
||||
|
||||
def test_global_mae_is_mean_absolute_residual_across_all_cases() -> None:
    # Arrange — |residuals| are 5, 3 and 4, so MAE = 12 / 3 = 4.
    three_certs = _cases((60, 55.0), (70, 73.0), (80, 76.0))

    # Act
    mae = build_parity_report(three_certs).global_mae

    # Assert
    assert mae == pytest.approx(4.0, abs=1e-6)
|
||||
|
||||
|
||||
def test_global_rmse_uses_root_mean_square_of_residuals() -> None:
    # Arrange — squared residuals are 25, 9 and 16, so
    # RMSE = sqrt(50 / 3) ≈ 4.082.
    three_certs = _cases((60, 55.0), (70, 73.0), (80, 76.0))

    # Act
    rmse = build_parity_report(three_certs).global_rmse

    # Assert
    assert rmse == pytest.approx(4.082, abs=0.01)
|
||||
|
||||
|
||||
def test_global_bias_is_signed_mean_residual_predicted_minus_actual() -> None:
    # Arrange — signed residuals are -5, +3 and -4; their mean is -2.0,
    # i.e. the calculator under-predicts on average.
    three_certs = _cases((60, 55.0), (70, 73.0), (80, 76.0))

    # Act
    bias = build_parity_report(three_certs).global_bias

    # Assert
    assert bias == pytest.approx(-2.0, abs=1e-6)
|
||||
|
||||
|
||||
def test_typical_subset_mae_ignores_cases_flagged_not_typical() -> None:
    # Arrange — three typical certs plus one catastrophic-tail cert
    # (actual sap 3) that must not count towards the typical MAE.
    typical_certs = [
        ParityCase("a", 60, 59.0, is_typical=True),
        ParityCase("b", 70, 71.0, is_typical=True),
        ParityCase("c", 80, 78.0, is_typical=True),
    ]
    tail_cert = ParityCase("d", 3, 35.0, is_typical=False)

    # Act
    report = build_parity_report([*typical_certs, tail_cert])

    # Assert — typical |residuals| 1, 1, 2 give MAE = 4/3 ≈ 1.333, while
    # the global MAE also absorbs the 32-point blowout: (1+1+2+32)/4 = 9.0.
    assert report.typical_mae == pytest.approx(1.333, abs=0.01)
    assert report.global_mae == pytest.approx(9.0, abs=1e-6)
    assert report.case_count == 4
    assert report.typical_case_count == 3
|
||||
|
||||
|
||||
def test_worst_cases_returns_largest_absolute_residuals_first() -> None:
    # Arrange — |residuals| for a..e are 1, 10, 3, 7, 2 with mixed signs,
    # so the three worst, in order, are b (10), d (7) and c (3).
    mixed_sign_certs = [
        ParityCase("a", 60, 59.0, is_typical=True),
        ParityCase("b", 70, 80.0, is_typical=True),
        ParityCase("c", 80, 77.0, is_typical=True),
        ParityCase("d", 50, 57.0, is_typical=True),
        ParityCase("e", 65, 63.0, is_typical=True),
    ]

    # Act
    report = build_parity_report(mixed_sign_certs, worst_n=3)

    # Assert
    ranked_ids = tuple(case.certificate_number for case in report.worst_cases)
    assert ranked_ids == ("b", "d", "c")
|
||||
|
||||
|
||||
def test_empty_case_list_yields_zeroed_report_without_division_error() -> None:
    # Arrange — parity validation may run before the cohort is loaded; it
    # must then hand back an all-zero report instead of crashing.
    zeroed = ParityReport(
        case_count=0,
        typical_case_count=0,
        global_mae=0.0,
        typical_mae=0.0,
        global_rmse=0.0,
        global_bias=0.0,
        worst_cases=(),
    )

    # Act
    report = build_parity_report([])

    # Assert — dataclass equality compares every field at once.
    assert report == zeroed
|
||||
|
||||
|
||||
def test_parity_report_is_immutable_dataclass() -> None:
    # Imported locally so this test does not depend on the module's import
    # block being edited.
    from dataclasses import FrozenInstanceError

    # Arrange — frozen dataclass guarantees the report's audit values
    # cannot be retroactively mutated after construction.
    report = build_parity_report(_cases((60, 60.0)))

    # Act / Assert — expect the specific frozen-dataclass error; a blanket
    # `Exception` would also pass on unrelated failures.
    with pytest.raises(FrozenInstanceError):
        report.global_mae = 99.9  # type: ignore[misc]

    # The rejected assignment must leave the original value in place.
    assert report.global_mae == 0.0
|
||||
Loading…
Add table
Reference in a new issue