test(epc): end-to-end SAP-accuracy gauge over the RdSAP-21.0.1 corpus

Adds a committed integration test driving the full API path — raw gov-EPC
response → from_api_response → cert_to_inputs → calculate_sap_from_inputs —
across all 1000 certs in the in-repo RdSAP-21.0.1 corpus, and pins the
aggregate accuracy of our continuous SAP (plus CO2 and primary energy)
against each cert's lodged figures. Mirrors scripts/eval_api_sap_accuracy.py
but runs in CI off the committed corpus (~2s, no /tmp sample needed).

Scoped to RdSAP-21.0.1 — the SAP 10.2-era schema whose lodged rating uses the
same methodology we compute (a fair target). Pre-SAP10 schemas (17.x-20.0.0)
lodge SAP 2012 ratings and are out of scope (guarded for mapping only by
test_mapper_corpus.py).

Current: SAP within-0.5 = 65.0%, MAE = 1.174 (tight floor/ceiling — the
optimised gauge). CO2 MAE = 0.27 t/yr (bias +0.17) and PE MAE = 14.6
kWh/m2/yr (bias +8.9) are reported + loosely guarded: cost is well-calibrated
but CO2/PE both run ~+5-10% high (uniform across fuels — a systematic
CO2/PE-factor or scope gap, not yet investigated). Thresholds ratchet as
slices tighten each metric.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-13 23:40:05 +00:00
parent 5317175dd3
commit fbe1cb54ad

View file

@ -0,0 +1,139 @@
"""End-to-end SAP-accuracy gauge over the committed RdSAP-21.0.1 corpus.
Drives the full API path — raw gov-EPC response → ``from_api_response`` →
``cert_to_inputs`` → ``calculate_sap_from_inputs`` — across all 1000 certs in
``backend/epc_api/json_samples/RdSAP-Schema-21.0.1/corpus.jsonl`` and pins the
aggregate accuracy of our continuous SAP (and CO2 / PE) against each cert's
lodged figures. This is the committed regression guard for the headline
"% within 0.5 SAP of the lodged rating" gauge that the per-cert mapper work
optimises (mirrors scripts/eval_api_sap_accuracy.py, but on the in-repo
corpus so it runs in CI without the /tmp sample).
SCOPE — RdSAP-21.0.1 ONLY. It is the RdSAP 10 / SAP 10.2-era schema, so its
lodged ``energy_rating_current`` was produced by the same SAP methodology we
compute, making it a fair accuracy target. The pre-SAP10 schemas (17.x-20.0.0)
lodge SAP 2012 ratings — a different underlying calculation — so they are NOT
expected to match and are excluded here (their mapper coverage is guarded by
test_mapper_corpus.py instead).
The asserted thresholds are deterministic floors/ceilings over the fixed
corpus: tighten them whenever a slice improves the gauge (ratchet, never
loosen). Run ``pytest -s`` to see the live metrics line.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
import pytest
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
from domain.sap10_calculator.rdsap.cert_to_inputs import (
SAP_10_2_SPEC_PRICES,
cert_to_inputs,
)
# Committed RdSAP-21.0.1 corpus, one JSON document per line. The path is
# relative, so it is resolved against the working directory pytest runs in.
_CORPUS = (
    Path("backend")
    / "epc_api"
    / "json_samples"
    / "RdSAP-Schema-21.0.1"
    / "corpus.jsonl"
)
# Regression thresholds for the fixed corpus. Measured at HEAD (1000 certs
# computed, 0 skipped):
#   SAP  within-0.5 = 65.0%, MAE = 1.174
#   CO2  MAE = 0.27 t/yr       (signed bias +0.17 — a systematic over-estimate)
#   PE   MAE = 14.6 kWh/m2/yr  (signed bias +8.9)
#
# SAP (cost) is the gauge being optimised, so its floor and ceiling sit TIGHT
# against the measured values. CO2 and PE are reported but only LOOSELY
# guarded: cost is well-calibrated, whereas CO2 and PE both run ~+5-10% high.
# That gap is real and not yet investigated; it is uniform across fuels, which
# points at a CO2/PE-factor or scope issue rather than the energy or the cost.
# Their ceilings exist to catch "got worse", not "isn't perfect".
#
# RATCHET, never loosen: when a slice improves a metric, raise the floor or
# lower the matching ceiling.
_MIN_WITHIN_HALF_SAP = 0.62  # floor: share of certs with |SAP error| < 0.5
_MAX_SAP_MAE = 1.25  # ceiling: mean |SAP error|, SAP points
_MAX_CO2_MAE_TONNES = 0.35  # ceiling: t CO2 / yr vs co2_emissions_current
_MAX_PE_PER_M2_MAE = 16.0  # ceiling: kWh / m2 / yr vs energy_consumption_current
def _load_corpus() -> list[dict[str, Any]]:
    """Return every JSON document in the corpus file, in file order.

    The file is read as UTF-8 JSON Lines: one document per line, with blank
    or whitespace-only lines ignored.

    Returns:
        The parsed documents, or an empty list when the corpus file does not
        exist (the caller turns that into a skip).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    try:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        handle = _CORPUS.open(encoding="utf-8")
    except FileNotFoundError:
        return []
    with handle:
        # Iterate the file rather than str.splitlines(): splitlines() also
        # breaks on U+2028/U+2029/\x0b/\x0c etc., which may appear unescaped
        # inside a JSON string and would cut a record in half.
        return [json.loads(line) for line in handle if line.strip()]
def _mean(values: list[float]) -> float:
    """Return the arithmetic mean of *values*; the caller guarantees non-empty."""
    return sum(values) / len(values)


def test_api_path_sap_accuracy_on_rdsap_21_0_1_corpus(
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Gauge the API-path SAP / CO2 / PE accuracy against the lodged figures.

    Each corpus row goes through ``EpcPropertyDataMapper.from_api_response``,
    ``cert_to_inputs`` (with ``SAP_10_2_SPEC_PRICES``) and
    ``calculate_sap_from_inputs``. The result is compared with the row's
    ``energy_rating_current``, ``co2_emissions_current`` and
    ``energy_consumption_current``. Rows without a lodged rating, or whose
    pipeline raises, are counted as skipped and left out of the metrics.

    The test skips when the corpus is absent, and fails with an explicit
    message (rather than a ZeroDivisionError) when a metric has no samples.
    """
    # Arrange — the full in-repo 21.0.1 corpus.
    corpus = _load_corpus()
    if not corpus:
        pytest.skip(f"no corpus at {_CORPUS}")

    sap_abs_errs: list[float] = []
    co2_signed_errs_t: list[float] = []  # ours − lodged, tonnes/yr
    pe_signed_errs: list[float] = []  # ours − lodged, kWh/m²/yr
    skipped = 0

    # Act — run the API → EpcPropertyData → calculator pipeline per cert.
    for doc in corpus:
        lodged_sap = doc.get("energy_rating_current")
        if lodged_sap is None:
            skipped += 1
            continue
        try:
            epc = EpcPropertyDataMapper.from_api_response(doc)
            result = calculate_sap_from_inputs(
                cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES)
            )
        except Exception:
            # A mapper / calculator raise is a coverage gap tracked elsewhere
            # (eval_api_sap_accuracy.py); here we gauge the certs that compute.
            skipped += 1
            continue

        sap_abs_errs.append(abs(result.sap_score_continuous - lodged_sap))

        lodged_co2_t = doc.get("co2_emissions_current")  # tonnes/yr
        if lodged_co2_t is not None:
            # The calculator reports kg/yr; the lodged figure is in tonnes.
            co2_signed_errs_t.append(result.co2_kg_per_yr / 1000.0 - lodged_co2_t)

        lodged_pe_per_m2 = doc.get("energy_consumption_current")  # kWh/m²/yr (primary)
        if lodged_pe_per_m2 is not None:
            pe_signed_errs.append(
                result.primary_energy_kwh_per_m2 - lodged_pe_per_m2
            )

    # Every metric below is a mean: an empty sample set would otherwise
    # surface as an opaque ZeroDivisionError instead of a readable failure.
    if not sap_abs_errs:
        pytest.fail(
            f"no cert produced a SAP comparison "
            f"({len(corpus)} in corpus, {skipped} skipped)"
        )
    if not co2_signed_errs_t:
        pytest.fail("no computed cert carried a lodged co2_emissions_current")
    if not pe_signed_errs:
        pytest.fail("no computed cert carried a lodged energy_consumption_current")

    n = len(sap_abs_errs)
    within_half = sum(1 for e in sap_abs_errs if e < 0.5) / n
    sap_mae = _mean(sap_abs_errs)
    co2_mae = _mean([abs(e) for e in co2_signed_errs_t])
    co2_bias = _mean(co2_signed_errs_t)
    pe_mae = _mean([abs(e) for e in pe_signed_errs])
    pe_bias = _mean(pe_signed_errs)

    # Bypass pytest's capture so the metrics line is emitted by this test.
    with capsys.disabled():
        print(
            f"\n[RdSAP-21.0.1 corpus | {n} computed / {skipped} skipped]"
            f"\n SAP within-0.5 = {within_half:.1%} MAE = {sap_mae:.3f}"
            f"\n CO2 MAE = {co2_mae:.2f} t/yr (bias {co2_bias:+.2f} t/yr)"
            f"\n PE MAE = {pe_mae:.1f} kWh/m2/yr (bias {pe_bias:+.1f})"
        )

    # Assert — SAP (cost) is the optimised gauge: tight floor/ceiling. CO2/PE
    # are loose "don't regress" guards (see module + threshold notes).
    assert within_half >= _MIN_WITHIN_HALF_SAP, (
        f"SAP within-0.5 {within_half:.1%} fell below floor "
        f"{_MIN_WITHIN_HALF_SAP:.0%}"
    )
    assert sap_mae <= _MAX_SAP_MAE, (
        f"SAP MAE {sap_mae:.3f} exceeded ceiling {_MAX_SAP_MAE}"
    )
    assert co2_mae <= _MAX_CO2_MAE_TONNES, (
        f"CO2 MAE {co2_mae:.2f} t/yr exceeded ceiling {_MAX_CO2_MAE_TONNES}"
    )
    assert pe_mae <= _MAX_PE_PER_M2_MAE, (
        f"PE MAE {pe_mae:.1f} kWh/m2/yr exceeded ceiling {_MAX_PE_PER_M2_MAE}"
    )