From 57f18a87730ec1951102f9c25fdde9e9f3d030d7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 May 2026 13:22:45 +0000 Subject: [PATCH] slice S-B1: parity-validation report aggregator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure-function ParityCase / ParityReport / build_parity_report for the Session B 1000-cert parity check (ADR-0009). Aggregates per-cert (predicted, actual) sap pairs into global + typical-subset MAE, RMSE, bias, and the worst-N residuals for spec-iteration. Cert→case mapping (corpus load, calculator run, actual-sap lookup) sits at a higher layer; this module is trivial to test so the harder integration code inherits its testing. Co-Authored-By: Claude Opus 4.7 --- .../src/domain/sap/validation/__init__.py | 0 .../domain/sap/validation/parity_report.py | 95 ++++++++++++ .../domain/sap/validation/tests/__init__.py | 0 .../validation/tests/test_parity_report.py | 136 ++++++++++++++++++ 4 files changed, 231 insertions(+) create mode 100644 packages/domain/src/domain/sap/validation/__init__.py create mode 100644 packages/domain/src/domain/sap/validation/parity_report.py create mode 100644 packages/domain/src/domain/sap/validation/tests/__init__.py create mode 100644 packages/domain/src/domain/sap/validation/tests/test_parity_report.py diff --git a/packages/domain/src/domain/sap/validation/__init__.py b/packages/domain/src/domain/sap/validation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/packages/domain/src/domain/sap/validation/parity_report.py b/packages/domain/src/domain/sap/validation/parity_report.py new file mode 100644 index 00000000..9a04a696 --- /dev/null +++ b/packages/domain/src/domain/sap/validation/parity_report.py @@ -0,0 +1,95 @@ +"""Parity-validation report for the deterministic SAP 10.3 calculator. 
"""Parity-validation report for the deterministic SAP 10.3 calculator.

ADR-0009 Session B compares `Sap10Calculator.calculate(epc).sap_score`
to the cert's `energy_rating_current` across a 1000-cert stratified
sample. The success criterion is MAE ≤ 1.0 SAP-point on the *typical
subset* (cohort excluding catastrophic-tail certs, multi-heating,
conservatory, room-in-roof) — those edge cases are themselves the
backlog Session B iterates against.

This module is the pure aggregation step: given per-cert `ParityCase`
records, it emits a typed `ParityReport` with the global MAE/RMSE/bias,
the typical-subset MAE, and the worst-N cases by |residual| for
investigation. The cert→case mapping itself (loading from the corpus,
running the calculator, looking up the cert's actual sap) lives at a
higher layer — keeps this report module trivial to test.

Reference: ADR-0009 §"Validation" + Session B plan.
"""

from __future__ import annotations

from collections.abc import Iterable, Sequence
from dataclasses import dataclass
from math import fsum, sqrt
from typing import Final


_DEFAULT_WORST_N: Final[int] = 25


@dataclass(frozen=True)
class ParityCase:
    """One certificate's calculator-vs-cert SAP comparison.

    `is_typical` marks whether the cert belongs to the typical subset
    the Session B success criterion is measured against. Catastrophic-
    tail certs (sap ≤ 5 or ≥ 100), multi-heating, conservatory, and
    room-in-roof cases set this False — they show up in the global
    aggregate but not the typical-subset MAE.
    """

    certificate_number: str
    actual_sap: int
    predicted_sap: float
    is_typical: bool


@dataclass(frozen=True)
class ParityReport:
    """Aggregated parity metrics produced by `build_parity_report`.

    All error metrics are computed on the residual `predicted − actual`,
    so a positive `global_bias` means the calculator over-predicts on
    average. Every metric is 0.0 when its underlying cohort is empty.
    """

    # Number of cases supplied, typical or not.
    case_count: int
    # Number of supplied cases with `is_typical=True`.
    typical_case_count: int
    # Mean absolute residual over all cases.
    global_mae: float
    # Mean absolute residual over the typical subset only.
    typical_mae: float
    # Root-mean-square residual over all cases.
    global_rmse: float
    # Signed mean residual over all cases.
    global_bias: float
    # Up to `worst_n` cases, largest |residual| first.
    worst_cases: tuple[ParityCase, ...]


def _residual(case: ParityCase) -> float:
    """Predicted − actual. Positive = calculator over-predicts."""
    return case.predicted_sap - case.actual_sap


def _mean_abs(cases: Sequence[ParityCase]) -> float:
    """Return the mean absolute residual, or 0.0 for an empty cohort."""
    if not cases:
        return 0.0
    # fsum keeps the aggregate independent of accumulation order.
    return fsum(abs(_residual(c)) for c in cases) / len(cases)


def _rmse(cases: Sequence[ParityCase]) -> float:
    """Return the root-mean-square residual, or 0.0 for an empty cohort."""
    if not cases:
        return 0.0
    return sqrt(fsum(_residual(c) ** 2 for c in cases) / len(cases))


def _bias(cases: Sequence[ParityCase]) -> float:
    """Return the signed mean residual, or 0.0 for an empty cohort."""
    if not cases:
        return 0.0
    return fsum(_residual(c) for c in cases) / len(cases)


def build_parity_report(
    cases: Iterable[ParityCase], *, worst_n: int = _DEFAULT_WORST_N
) -> ParityReport:
    """Aggregate `ParityCase` records into a typed `ParityReport`.

    Args:
        cases: Per-cert comparisons. Any iterable is accepted; it is
            materialised once, so generators are safe to pass.
        worst_n: Maximum number of cases to keep in `worst_cases`.
            0 yields an empty tuple; values larger than the cohort
            return every case.

    Returns:
        The report. An empty cohort produces zeroed metrics and an
        empty `worst_cases` rather than a division error.

    Raises:
        ValueError: If `worst_n` is negative.
    """
    # A negative slice bound would silently drop the *smallest* residuals
    # instead of limiting the list, so reject it up front.
    if worst_n < 0:
        raise ValueError(f"worst_n must be non-negative, got {worst_n}")

    # Materialise once: the cohort is traversed several times below.
    all_cases = list(cases)
    typical = [c for c in all_cases if c.is_typical]

    # sorted() is stable, so equal |residual| cases keep their input order.
    ranked = sorted(all_cases, key=lambda c: abs(_residual(c)), reverse=True)

    return ParityReport(
        case_count=len(all_cases),
        typical_case_count=len(typical),
        global_mae=_mean_abs(all_cases),
        typical_mae=_mean_abs(typical),
        global_rmse=_rmse(all_cases),
        global_bias=_bias(all_cases),
        worst_cases=tuple(ranked[:worst_n]),
    )
"""Tests for the parity-validation report.

The report aggregates per-cert (predicted, actual) sap-score pairs into
the audit shape ADR-0009 Session B specifies: global MAE/RMSE/bias, MAE
on the "typical subset" (excluding catastrophic-tail certs), and the
worst-N residual cases for spec-interpretation iteration.

Tests use synthetic pair lists so the math is hand-verifiable.

Reference: ADR-0009 (Accepted) §"Validation" and Session B success
criterion (MAE ≤ 1.0 SAP-point on the typical subset).
"""

from __future__ import annotations

from dataclasses import FrozenInstanceError

import pytest

from domain.sap.validation.parity_report import (
    ParityCase,
    ParityReport,
    build_parity_report,
)


def _cases(*pairs: tuple[int, float]) -> list[ParityCase]:
    """Build typical (non-tail) ParityCase objects from (actual, predicted)
    pairs so tests can stay terse. Certificate numbers are the pair's
    positional index rendered as a string."""
    return [
        ParityCase(
            certificate_number=str(i),
            actual_sap=a,
            predicted_sap=p,
            is_typical=True,
        )
        for i, (a, p) in enumerate(pairs)
    ]


def test_global_mae_is_mean_absolute_residual_across_all_cases() -> None:
    # Arrange — three certs, signed residuals -5, +3, -4, so the absolute
    # residuals are 5, 3, 4 → MAE = (5+3+4) / 3 = 4.
    cases = _cases((60, 55.0), (70, 73.0), (80, 76.0))

    # Act
    report = build_parity_report(cases)

    # Assert
    assert report.global_mae == pytest.approx(4.0, abs=1e-6)


def test_global_rmse_uses_root_mean_square_of_residuals() -> None:
    # Arrange — squared residuals 25, 9, 16 → RMSE = sqrt(50/3) ≈ 4.082.
    cases = _cases((60, 55.0), (70, 73.0), (80, 76.0))

    # Act
    report = build_parity_report(cases)

    # Assert
    assert report.global_rmse == pytest.approx(4.082, abs=0.01)


def test_global_bias_is_signed_mean_residual_predicted_minus_actual() -> None:
    # Arrange — residuals -5, +3, -4 → bias = mean = -2.0 (under-prediction).
    cases = _cases((60, 55.0), (70, 73.0), (80, 76.0))

    # Act
    report = build_parity_report(cases)

    # Assert
    assert report.global_bias == pytest.approx(-2.0, abs=1e-6)


def test_typical_subset_mae_ignores_cases_flagged_not_typical() -> None:
    # Arrange — three typical-bucket certs plus one catastrophic-tail cert
    # (actual sap 3, inside the ≤ 5 tail) that is flagged not-typical and
    # must be excluded from the typical MAE.
    cases = [
        ParityCase("a", 60, 59.0, is_typical=True),
        ParityCase("b", 70, 71.0, is_typical=True),
        ParityCase("c", 80, 78.0, is_typical=True),
        ParityCase("d", 3, 35.0, is_typical=False),
    ]

    # Act
    report = build_parity_report(cases)

    # Assert — typical absolute residuals are 1, 1, 2 → MAE = 4/3 ≈ 1.333.
    # Global MAE includes the d-cert blowout: 1, 1, 2, 32 → 36/4 = 9.0.
    assert report.typical_mae == pytest.approx(1.333, abs=0.01)
    assert report.global_mae == pytest.approx(9.0, abs=1e-6)
    assert report.case_count == 4
    assert report.typical_case_count == 3


def test_worst_cases_returns_largest_absolute_residuals_first() -> None:
    # Arrange — signed residuals -1, +10, -3, +7, -2. Worst 3 by |residual|
    # must be 10, 7, 3, i.e. certs b, d, c.
    cases = [
        ParityCase("a", 60, 59.0, is_typical=True),
        ParityCase("b", 70, 80.0, is_typical=True),
        ParityCase("c", 80, 77.0, is_typical=True),
        ParityCase("d", 50, 57.0, is_typical=True),
        ParityCase("e", 65, 63.0, is_typical=True),
    ]

    # Act
    report = build_parity_report(cases, worst_n=3)

    # Assert
    worst_ids = [c.certificate_number for c in report.worst_cases]
    assert worst_ids == ["b", "d", "c"]


def test_empty_case_list_yields_zeroed_report_without_division_error() -> None:
    # Arrange — running parity validation before the cohort is loaded must
    # not crash; the report just reports zeros.

    # Act
    report = build_parity_report([])

    # Assert
    assert report.case_count == 0
    assert report.typical_case_count == 0
    assert report.global_mae == 0.0
    assert report.typical_mae == 0.0
    assert report.global_rmse == 0.0
    assert report.global_bias == 0.0
    assert report.worst_cases == ()


def test_parity_report_is_immutable_dataclass() -> None:
    # Arrange — frozen dataclass guarantees the report's audit values
    # cannot be retroactively mutated after construction.
    report = build_parity_report(_cases((60, 60.0)))

    # Act / Assert — pin the exact error a frozen dataclass raises; a bare
    # `Exception` would also pass on unrelated failures.
    with pytest.raises(FrozenInstanceError):
        report.global_mae = 99.9  # type: ignore[misc]