test(epc): end-to-end SAP-accuracy gauge over the RdSAP-21.0.1 corpus

Adds a committed integration test driving the full API path — raw gov-EPC response → from_api_response → cert_to_inputs → calculate_sap_from_inputs — across all 1000 certs in the in-repo RdSAP-21.0.1 corpus, and pins the aggregate accuracy of our continuous SAP (plus CO2 and primary energy) against each cert's lodged figures. Mirrors scripts/eval_api_sap_accuracy.py but runs in CI off the committed corpus (~2s, no /tmp sample needed). Scoped to RdSAP-21.0.1 — the SAP 10.2-era schema whose lodged rating uses the same methodology we compute (a fair target). Pre-SAP10 schemas (17.x-20.0.0) lodge SAP 2012 ratings and are out of scope (guarded for mapping only by test_mapper_corpus.py). Current: SAP within-0.5 = 65.0%, MAE = 1.174 (tight floor/ceiling — the optimised gauge). CO2 MAE = 0.27 t/yr (bias +0.17) and PE MAE = 14.6 kWh/m2/yr (bias +8.9) are reported + loosely guarded: cost is well-calibrated but CO2/PE both run ~+5-10% high (uniform across fuels — a systematic CO2/PE-factor or scope gap, not yet investigated). Thresholds ratchet as slices tighten each metric. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 13:10:47 +00:00 · 2026-06-13 23:40:05 +00:00 · 2026-06-13 23:40:05 +00:00 · fbe1cb54ad
commit fbe1cb54ad
parent 5317175dd3
1 changed files with 139 additions and 0 deletions
--- a/tests/infrastructure/epc_client/test_sap_accuracy_corpus.py
+++ b/tests/infrastructure/epc_client/test_sap_accuracy_corpus.py
@ -0,0 +1,139 @@
+"""End-to-end SAP-accuracy gauge over the committed RdSAP-21.0.1 corpus.
+
+Drives the full API path — raw gov-EPC response → ``from_api_response`` →
+``cert_to_inputs`` → ``calculate_sap_from_inputs`` — across all 1000 certs in
+``backend/epc_api/json_samples/RdSAP-Schema-21.0.1/corpus.jsonl`` and pins the
+aggregate accuracy of our continuous SAP (and CO2 / PE) against each cert's
+lodged figures. This is the committed regression guard for the headline
+"% within 0.5 SAP of the lodged rating" gauge that the per-cert mapper work
+optimises (mirrors scripts/eval_api_sap_accuracy.py, but on the in-repo
+corpus so it runs in CI without the /tmp sample).
+
+SCOPE — RdSAP-21.0.1 ONLY. It is the RdSAP 10 / SAP 10.2-era schema, so its
+lodged ``energy_rating_current`` was produced by the same SAP methodology we
+compute, making it a fair accuracy target. The pre-SAP10 schemas (17.x-20.0.0)
+lodge SAP 2012 ratings — a different underlying calculation — so they are NOT
+expected to match and are excluded here (their mapper coverage is guarded by
+test_mapper_corpus.py instead).
+
+The asserted thresholds are deterministic floors/ceilings over the fixed
+corpus: tighten them whenever a slice improves the gauge (ratchet, never
+loosen). Run ``pytest -s`` to see the live metrics line.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from datatypes.epc.domain.mapper import EpcPropertyDataMapper
+from domain.sap10_calculator.calculator import calculate_sap_from_inputs
+from domain.sap10_calculator.rdsap.cert_to_inputs import (
+    SAP_10_2_SPEC_PRICES,
+    cert_to_inputs,
+)
+
+_CORPUS = Path(
+    "backend/epc_api/json_samples/RdSAP-Schema-21.0.1/corpus.jsonl"
+)
+
+# Measured floors/ceilings over the fixed corpus at HEAD (1000 certs, 0 skips).
+# Current: SAP within-0.5 = 65.0%, SAP MAE = 1.174.
+# CO2 MAE = 0.27 t/yr (signed +0.17 — a systematic over-estimate, see below).
+# PE  MAE = 14.6 kWh/m2/yr (signed +8.9).
+#
+# The SAP (cost) gauge is the optimised target — its floor/ceiling are TIGHT.
+# CO2 and PE are reported + LOOSELY guarded: cost is well-calibrated but CO2
+# and PE both run ~+5-10% high (a real systematic gap, not yet investigated —
+# uniform across fuels, so a CO2/PE-factor or scope issue, NOT the energy or
+# cost). Their ceilings catch "got worse", not "isn't perfect".
+# RATCHET any of these up when a slice tightens the corresponding metric.
+_MIN_WITHIN_HALF_SAP = 0.62
+_MAX_SAP_MAE = 1.25
+_MAX_CO2_MAE_TONNES = 0.35      # t CO2 / yr vs co2_emissions_current
+_MAX_PE_PER_M2_MAE = 16.0       # kWh / m2 / yr vs energy_consumption_current
+
+
+def _load_corpus() -> list[dict[str, Any]]:
+    if not _CORPUS.exists():
+        return []
+    return [
+        json.loads(line)
+        for line in _CORPUS.read_text().splitlines()
+        if line.strip()
+    ]
+
+
+def test_api_path_sap_accuracy_on_rdsap_21_0_1_corpus(
+    capsys: pytest.CaptureFixture[str],
+) -> None:
+    # Arrange — the full in-repo 21.0.1 corpus.
+    corpus = _load_corpus()
+    if not corpus:
+        pytest.skip(f"no corpus at {_CORPUS}")
+
+    sap_abs_errs: list[float] = []
+    co2_signed_errs_t: list[float] = []      # our − lodged, tonnes/yr
+    pe_signed_errs: list[float] = []          # our − lodged, kWh/m²/yr
+    skipped = 0
+
+    # Act — run the API → EpcPropertyData → calculator pipeline per cert.
+    for doc in corpus:
+        lodged_sap = doc.get("energy_rating_current")
+        if lodged_sap is None:
+            skipped += 1
+            continue
+        try:
+            epc = EpcPropertyDataMapper.from_api_response(doc)
+            result = calculate_sap_from_inputs(
+                cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES)
+            )
+        except Exception:
+            # A mapper / calculator raise is a coverage gap tracked elsewhere
+            # (eval_api_sap_accuracy.py); here we gauge the certs that compute.
+            skipped += 1
+            continue
+
+        sap_abs_errs.append(abs(result.sap_score_continuous - lodged_sap))
+
+        lodged_co2_t = doc.get("co2_emissions_current")  # tonnes/yr
+        if lodged_co2_t is not None:
+            co2_signed_errs_t.append(result.co2_kg_per_yr / 1000.0 - lodged_co2_t)
+        lodged_pe_per_m2 = doc.get("energy_consumption_current")  # kWh/m²/yr (primary)
+        if lodged_pe_per_m2 is not None:
+            pe_signed_errs.append(result.primary_energy_kwh_per_m2 - lodged_pe_per_m2)
+
+    n = len(sap_abs_errs)
+    within_half = sum(1 for e in sap_abs_errs if e < 0.5) / n
+    sap_mae = sum(sap_abs_errs) / n
+    co2_mae = sum(abs(e) for e in co2_signed_errs_t) / len(co2_signed_errs_t)
+    co2_bias = sum(co2_signed_errs_t) / len(co2_signed_errs_t)
+    pe_mae = sum(abs(e) for e in pe_signed_errs) / len(pe_signed_errs)
+    pe_bias = sum(pe_signed_errs) / len(pe_signed_errs)
+
+    with capsys.disabled():
+        print(
+            f"\n[RdSAP-21.0.1 corpus | {n} computed / {skipped} skipped]"
+            f"\n  SAP within-0.5 = {within_half:.1%}   MAE = {sap_mae:.3f}"
+            f"\n  CO2 MAE = {co2_mae:.2f} t/yr   (bias {co2_bias:+.2f} t/yr)"
+            f"\n  PE  MAE = {pe_mae:.1f} kWh/m2/yr (bias {pe_bias:+.1f})"
+        )
+
+    # Assert — SAP (cost) is the optimised gauge: tight floor/ceiling. CO2/PE
+    # are loose "don't regress" guards (see module + threshold notes).
+    assert within_half >= _MIN_WITHIN_HALF_SAP, (
+        f"SAP within-0.5 {within_half:.1%} fell below floor "
+        f"{_MIN_WITHIN_HALF_SAP:.0%}"
+    )
+    assert sap_mae <= _MAX_SAP_MAE, (
+        f"SAP MAE {sap_mae:.3f} exceeded ceiling {_MAX_SAP_MAE}"
+    )
+    assert co2_mae <= _MAX_CO2_MAE_TONNES, (
+        f"CO2 MAE {co2_mae:.2f} t/yr exceeded ceiling {_MAX_CO2_MAE_TONNES}"
+    )
+    assert pe_mae <= _MAX_PE_PER_M2_MAE, (
+        f"PE MAE {pe_mae:.1f} kWh/m2/yr exceeded ceiling {_MAX_PE_PER_M2_MAE}"
+    )