From fbe1cb54adb7b7baff5fc8de782f56adb729e09f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 13 Jun 2026 23:40:05 +0000 Subject: [PATCH] test(epc): end-to-end SAP-accuracy gauge over the RdSAP-21.0.1 corpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a committed integration test driving the full API path — raw gov-EPC response → from_api_response → cert_to_inputs → calculate_sap_from_inputs — across all 1000 certs in the in-repo RdSAP-21.0.1 corpus, and pins the aggregate accuracy of our continuous SAP (plus CO2 and primary energy) against each cert's lodged figures. Mirrors scripts/eval_api_sap_accuracy.py but runs in CI off the committed corpus (~2s, no /tmp sample needed). Scoped to RdSAP-21.0.1 — the SAP 10.2-era schema whose lodged rating uses the same methodology we compute (a fair target). Pre-SAP10 schemas (17.x-20.0.0) lodge SAP 2012 ratings and are out of scope (guarded for mapping only by test_mapper_corpus.py). Current: SAP within-0.5 = 65.0%, MAE = 1.174 (tight floor/ceiling — the optimised gauge). CO2 MAE = 0.27 t/yr (bias +0.17) and PE MAE = 14.6 kWh/m2/yr (bias +8.9) are reported + loosely guarded: cost is well-calibrated but CO2/PE both run ~+5-10% high (uniform across fuels — a systematic CO2/PE-factor or scope gap, not yet investigated). Thresholds ratchet as slices tighten each metric. Co-Authored-By: Claude Opus 4.8 --- .../epc_client/test_sap_accuracy_corpus.py | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 tests/infrastructure/epc_client/test_sap_accuracy_corpus.py diff --git a/tests/infrastructure/epc_client/test_sap_accuracy_corpus.py b/tests/infrastructure/epc_client/test_sap_accuracy_corpus.py new file mode 100644 index 00000000..8fa8b382 --- /dev/null +++ b/tests/infrastructure/epc_client/test_sap_accuracy_corpus.py @@ -0,0 +1,139 @@ +"""End-to-end SAP-accuracy gauge over the committed RdSAP-21.0.1 corpus. 
+ +Drives the full API path — raw gov-EPC response → ``from_api_response`` → +``cert_to_inputs`` → ``calculate_sap_from_inputs`` — across all 1000 certs in +``backend/epc_api/json_samples/RdSAP-Schema-21.0.1/corpus.jsonl`` and pins the +aggregate accuracy of our continuous SAP (and CO2 / PE) against each cert's +lodged figures. This is the committed regression guard for the headline +"% within 0.5 SAP of the lodged rating" gauge that the per-cert mapper work +optimises (mirrors scripts/eval_api_sap_accuracy.py, but on the in-repo +corpus so it runs in CI without the /tmp sample). + +SCOPE — RdSAP-21.0.1 ONLY. It is the RdSAP 10 / SAP 10.2-era schema, so its +lodged ``energy_rating_current`` was produced by the same SAP methodology we +compute, making it a fair accuracy target. The pre-SAP10 schemas (17.x-20.0.0) +lodge SAP 2012 ratings — a different underlying calculation — so they are NOT +expected to match and are excluded here (their mapper coverage is guarded by +test_mapper_corpus.py instead). + +The asserted thresholds are deterministic floors/ceilings over the fixed +corpus: tighten them whenever a slice improves the gauge (ratchet, never +loosen). Run ``pytest -s`` to see the live metrics line. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +import pytest + +from datatypes.epc.domain.mapper import EpcPropertyDataMapper +from domain.sap10_calculator.calculator import calculate_sap_from_inputs +from domain.sap10_calculator.rdsap.cert_to_inputs import ( + SAP_10_2_SPEC_PRICES, + cert_to_inputs, +) + +_CORPUS = Path( + "backend/epc_api/json_samples/RdSAP-Schema-21.0.1/corpus.jsonl" +) + +# Measured floors/ceilings over the fixed corpus at HEAD (1000 certs, 0 skips). +# Current: SAP within-0.5 = 65.0%, SAP MAE = 1.174. +# CO2 MAE = 0.27 t/yr (signed +0.17 — a systematic over-estimate, see below). +# PE MAE = 14.6 kWh/m2/yr (signed +8.9). 
#
# The SAP (cost) gauge is the optimised target — its floor/ceiling are TIGHT.
# CO2 and PE are reported + LOOSELY guarded: cost is well-calibrated but CO2
# and PE both run ~+5-10% high (a real systematic gap, not yet investigated —
# uniform across fuels, so a CO2/PE-factor or scope issue, NOT the energy or
# cost). Their ceilings catch "got worse", not "isn't perfect".
# RATCHET any of these up when a slice tightens the corresponding metric.
_MIN_WITHIN_HALF_SAP = 0.62
_MAX_SAP_MAE = 1.25
_MAX_CO2_MAE_TONNES = 0.35  # t CO2 / yr vs co2_emissions_current
_MAX_PE_PER_M2_MAE = 16.0  # kWh / m2 / yr vs energy_consumption_current


def _load_corpus() -> list[dict[str, Any]]:
    """Parse the JSON-lines corpus at ``_CORPUS`` into one dict per record.

    Whitespace-only lines are ignored.

    Returns:
        The decoded documents in file order, or an empty list when
        ``_CORPUS`` does not exist. ``_CORPUS`` is a relative path, so the
        existence check resolves against the current working directory.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    if not _CORPUS.exists():
        return []
    # Read as UTF-8 explicitly rather than with the platform default encoding,
    # and iterate the file handle instead of ``str.splitlines()``: the latter
    # also splits on U+2028 / U+2029 / \x85 etc., which may legally appear
    # unescaped inside a JSON string and would cut a record in half.
    with _CORPUS.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def test_api_path_sap_accuracy_on_rdsap_21_0_1_corpus(
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Pin aggregate SAP / CO2 / PE accuracy of the API path over the corpus.

    Each corpus document is run through ``from_api_response`` →
    ``cert_to_inputs`` → ``calculate_sap_from_inputs`` and compared with the
    figures lodged on the same document. Documents with no
    ``energy_rating_current``, or for which the pipeline raises, are counted
    as skipped and left out of the metrics. The test is skipped altogether
    when the corpus file yields no documents.
    """
    # Arrange — the full in-repo 21.0.1 corpus.
    corpus = _load_corpus()
    if not corpus:
        pytest.skip(f"no corpus at {_CORPUS}")

    sap_abs_errs: list[float] = []
    co2_signed_errs_t: list[float] = []  # our − lodged, tonnes/yr
    pe_signed_errs: list[float] = []  # our − lodged, kWh/m²/yr
    skipped = 0

    # Act — run the API → EpcPropertyData → calculator pipeline per cert.
    for doc in corpus:
        lodged_sap = doc.get("energy_rating_current")
        if lodged_sap is None:
            skipped += 1
            continue
        try:
            epc = EpcPropertyDataMapper.from_api_response(doc)
            result = calculate_sap_from_inputs(
                cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES)
            )
        except Exception:
            # A mapper / calculator raise is a coverage gap tracked elsewhere
            # (eval_api_sap_accuracy.py); here we gauge the certs that compute.
            skipped += 1
            continue

        sap_abs_errs.append(abs(result.sap_score_continuous - lodged_sap))

        lodged_co2_t = doc.get("co2_emissions_current")  # tonnes/yr
        if lodged_co2_t is not None:
            co2_signed_errs_t.append(result.co2_kg_per_yr / 1000.0 - lodged_co2_t)
        lodged_pe_per_m2 = doc.get("energy_consumption_current")  # kWh/m²/yr (primary)
        if lodged_pe_per_m2 is not None:
            pe_signed_errs.append(result.primary_energy_kwh_per_m2 - lodged_pe_per_m2)

    # Every sample list below is used as a divisor. Because the loop swallows
    # pipeline exceptions, a wholesale breakage would otherwise surface as a
    # bare ZeroDivisionError; fail with an explanation instead.
    assert sap_abs_errs, (
        f"no cert produced a SAP result: all {len(corpus)} corpus docs were "
        f"skipped ({skipped} skipped)"
    )
    assert co2_signed_errs_t, (
        "no computed cert carries a lodged co2_emissions_current to compare"
    )
    assert pe_signed_errs, (
        "no computed cert carries a lodged energy_consumption_current to compare"
    )

    n = len(sap_abs_errs)
    within_half = sum(1 for e in sap_abs_errs if e < 0.5) / n
    sap_mae = sum(sap_abs_errs) / n
    co2_mae = sum(abs(e) for e in co2_signed_errs_t) / len(co2_signed_errs_t)
    co2_bias = sum(co2_signed_errs_t) / len(co2_signed_errs_t)
    pe_mae = sum(abs(e) for e in pe_signed_errs) / len(pe_signed_errs)
    pe_bias = sum(pe_signed_errs) / len(pe_signed_errs)

    # Bypass pytest's output capture so the live metrics line is visible.
    with capsys.disabled():
        print(
            f"\n[RdSAP-21.0.1 corpus | {n} computed / {skipped} skipped]"
            f"\n SAP within-0.5 = {within_half:.1%} MAE = {sap_mae:.3f}"
            f"\n CO2 MAE = {co2_mae:.2f} t/yr (bias {co2_bias:+.2f} t/yr)"
            f"\n PE MAE = {pe_mae:.1f} kWh/m2/yr (bias {pe_bias:+.1f})"
        )

    # Assert — SAP (cost) is the optimised gauge: tight floor/ceiling. CO2/PE
    # are loose "don't regress" guards.
    assert within_half >= _MIN_WITHIN_HALF_SAP, (
        f"SAP within-0.5 {within_half:.1%} fell below floor "
        f"{_MIN_WITHIN_HALF_SAP:.0%}"
    )
    assert sap_mae <= _MAX_SAP_MAE, (
        f"SAP MAE {sap_mae:.3f} exceeded ceiling {_MAX_SAP_MAE}"
    )
    assert co2_mae <= _MAX_CO2_MAE_TONNES, (
        f"CO2 MAE {co2_mae:.2f} t/yr exceeded ceiling {_MAX_CO2_MAE_TONNES}"
    )
    assert pe_mae <= _MAX_PE_PER_M2_MAE, (
        f"PE MAE {pe_mae:.1f} kWh/m2/yr exceeded ceiling {_MAX_PE_PER_M2_MAE}"
    )