mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
test(epc): end-to-end SAP-accuracy gauge over the RdSAP-21.0.1 corpus
Adds a committed integration test driving the full API path — raw gov-EPC response → from_api_response → cert_to_inputs → calculate_sap_from_inputs — across all 1000 certs in the in-repo RdSAP-21.0.1 corpus, and pins the aggregate accuracy of our continuous SAP (plus CO2 and primary energy) against each cert's lodged figures. Mirrors scripts/eval_api_sap_accuracy.py but runs in CI off the committed corpus (~2s, no /tmp sample needed). Scoped to RdSAP-21.0.1 — the SAP 10.2-era schema whose lodged rating uses the same methodology we compute (a fair target). Pre-SAP10 schemas (17.x-20.0.0) lodge SAP 2012 ratings and are out of scope (guarded for mapping only by test_mapper_corpus.py). Current: SAP within-0.5 = 65.0%, MAE = 1.174 (tight floor/ceiling — the optimised gauge). CO2 MAE = 0.27 t/yr (bias +0.17) and PE MAE = 14.6 kWh/m2/yr (bias +8.9) are reported + loosely guarded: cost is well-calibrated but CO2/PE both run ~+5-10% high (uniform across fuels — a systematic CO2/PE-factor or scope gap, not yet investigated). Thresholds ratchet as slices tighten each metric. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
5317175dd3
commit
fbe1cb54ad
1 changed files with 139 additions and 0 deletions
139
tests/infrastructure/epc_client/test_sap_accuracy_corpus.py
Normal file
139
tests/infrastructure/epc_client/test_sap_accuracy_corpus.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
"""End-to-end SAP-accuracy gauge over the committed RdSAP-21.0.1 corpus.
|
||||
|
||||
Drives the full API path — raw gov-EPC response → ``from_api_response`` →
|
||||
``cert_to_inputs`` → ``calculate_sap_from_inputs`` — across all 1000 certs in
|
||||
``backend/epc_api/json_samples/RdSAP-Schema-21.0.1/corpus.jsonl`` and pins the
|
||||
aggregate accuracy of our continuous SAP (and CO2 / PE) against each cert's
|
||||
lodged figures. This is the committed regression guard for the headline
|
||||
"% within 0.5 SAP of the lodged rating" gauge that the per-cert mapper work
|
||||
optimises (mirrors scripts/eval_api_sap_accuracy.py, but on the in-repo
|
||||
corpus so it runs in CI without the /tmp sample).
|
||||
|
||||
SCOPE — RdSAP-21.0.1 ONLY. It is the RdSAP 10 / SAP 10.2-era schema, so its
|
||||
lodged ``energy_rating_current`` was produced by the same SAP methodology we
|
||||
compute, making it a fair accuracy target. The pre-SAP10 schemas (17.x-20.0.0)
|
||||
lodge SAP 2012 ratings — a different underlying calculation — so they are NOT
|
||||
expected to match and are excluded here (their mapper coverage is guarded by
|
||||
test_mapper_corpus.py instead).
|
||||
|
||||
The asserted thresholds are deterministic floors/ceilings over the fixed
|
||||
corpus: tighten them whenever a slice improves the gauge (ratchet, never
|
||||
loosen). Run ``pytest -s`` to see the live metrics line.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
||||
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
|
||||
from domain.sap10_calculator.rdsap.cert_to_inputs import (
|
||||
SAP_10_2_SPEC_PRICES,
|
||||
cert_to_inputs,
|
||||
)
|
||||
|
||||
# Location of the committed RdSAP-21.0.1 corpus (JSON Lines, one cert per line).
# NOTE(review): this is a relative path, so it is resolved against the
# process's current working directory when the test runs — presumably the
# repository root; confirm against the CI invocation.
_CORPUS: Path = Path(
    "backend/epc_api/json_samples/RdSAP-Schema-21.0.1/corpus.jsonl"
)
|
||||
|
||||
# Measured floors/ceilings over the fixed corpus at HEAD (1000 certs, 0 skips).
|
||||
# Current: SAP within-0.5 = 65.0%, SAP MAE = 1.174.
|
||||
# CO2 MAE = 0.27 t/yr (signed +0.17 — a systematic over-estimate, see below).
|
||||
# PE MAE = 14.6 kWh/m2/yr (signed +8.9).
|
||||
#
|
||||
# The SAP (cost) gauge is the optimised target — its floor/ceiling are TIGHT.
|
||||
# CO2 and PE are reported + LOOSELY guarded: cost is well-calibrated but CO2
|
||||
# and PE both run ~+5-10% high (a real systematic gap, not yet investigated —
|
||||
# uniform across fuels, so a CO2/PE-factor or scope issue, NOT the energy or
|
||||
# cost). Their ceilings catch "got worse", not "isn't perfect".
|
||||
# RATCHET these whenever a slice tightens the corresponding metric: raise the
# within-0.5 floor, lower the MAE ceilings (never loosen).
|
||||
# Floor: minimum fraction of computed certs whose absolute SAP error is
# strictly below 0.5 (the test asserts the measured fraction is >= this).
_MIN_WITHIN_HALF_SAP: float = 0.62
# Ceiling: maximum mean absolute SAP error over the computed certs.
_MAX_SAP_MAE: float = 1.25
# Ceilings: maximum mean absolute error for the CO2 and primary-energy figures.
_MAX_CO2_MAE_TONNES: float = 0.35  # t CO2 / yr vs co2_emissions_current
_MAX_PE_PER_M2_MAE: float = 16.0  # kWh / m2 / yr vs energy_consumption_current
|
||||
|
||||
|
||||
def _load_corpus() -> list[dict[str, Any]]:
    """Load the committed corpus as a list of decoded JSON documents.

    The corpus file is JSON Lines: one JSON document per line. Blank and
    whitespace-only lines are ignored.

    Returns:
        One decoded document per non-blank line, in file order, or an empty
        list when the corpus file does not exist.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    if not _CORPUS.exists():
        return []
    # Decode explicitly as UTF-8 so the result does not depend on the locale's
    # default encoding. Iterate the file rather than calling str.splitlines():
    # splitlines() also breaks on U+2028 / U+2029 / U+0085, which may appear
    # unescaped inside a JSON string and would cut a valid record in two.
    with _CORPUS.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]
|
||||
|
||||
|
||||
def _mean_or_fail(values: list[float], label: str) -> float:
    """Return the arithmetic mean of ``values``; fail the test if it is empty.

    An empty sample means the metric cannot be measured at all, so it is
    reported as an explicit test failure naming the metric instead of
    surfacing as a bare ``ZeroDivisionError``.
    """
    if not values:
        pytest.fail(f"no {label} samples to score — cannot evaluate the gauge")
    return sum(values) / len(values)


def test_api_path_sap_accuracy_on_rdsap_21_0_1_corpus(
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Pin aggregate SAP / CO2 / PE accuracy of the API path over the corpus.

    For every corpus document with a lodged ``energy_rating_current`` the test
    runs ``from_api_response`` → ``cert_to_inputs`` →
    ``calculate_sap_from_inputs`` and compares the result with the lodged
    figures. Documents without a lodged rating, or whose pipeline raises, are
    counted as skipped and excluded from the metrics.

    The test is skipped when the corpus is missing or empty, and fails when a
    metric breaches its module-level threshold or has no samples at all.
    """
    # Arrange — the full in-repo 21.0.1 corpus.
    corpus = _load_corpus()
    if not corpus:
        pytest.skip(f"no corpus at {_CORPUS}")

    sap_abs_errs: list[float] = []
    co2_signed_errs_t: list[float] = []  # our − lodged, tonnes/yr
    pe_signed_errs: list[float] = []  # our − lodged, kWh/m²/yr
    skipped = 0
    # Why each cert was skipped, so a non-zero skip count is diagnosable from
    # the report line instead of being silently swallowed.
    skip_reasons: dict[str, int] = {}

    # Act — run the API → EpcPropertyData → calculator pipeline per cert.
    for doc in corpus:
        lodged_sap = doc.get("energy_rating_current")
        if lodged_sap is None:
            skipped += 1
            skip_reasons["no lodged SAP"] = skip_reasons.get("no lodged SAP", 0) + 1
            continue
        try:
            epc = EpcPropertyDataMapper.from_api_response(doc)
            result = calculate_sap_from_inputs(
                cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES)
            )
        except Exception as exc:
            # A mapper / calculator raise is a coverage gap tracked elsewhere
            # (eval_api_sap_accuracy.py); here we gauge the certs that compute.
            skipped += 1
            reason = type(exc).__name__
            skip_reasons[reason] = skip_reasons.get(reason, 0) + 1
            continue

        sap_abs_errs.append(abs(result.sap_score_continuous - lodged_sap))

        lodged_co2_t = doc.get("co2_emissions_current")  # tonnes/yr
        if lodged_co2_t is not None:
            # The calculator reports kg/yr; convert to tonnes before comparing.
            co2_signed_errs_t.append(result.co2_kg_per_yr / 1000.0 - lodged_co2_t)
        lodged_pe_per_m2 = doc.get("energy_consumption_current")  # kWh/m²/yr (primary)
        if lodged_pe_per_m2 is not None:
            pe_signed_errs.append(result.primary_energy_kwh_per_m2 - lodged_pe_per_m2)

    n = len(sap_abs_errs)
    if n == 0:
        # Every cert was skipped: there is nothing to gauge, and the averages
        # below would otherwise divide by zero.
        pytest.fail(
            f"no certs computed ({skipped} of {len(corpus)} skipped: "
            f"{skip_reasons}) — cannot evaluate the SAP gauge"
        )
    within_half = sum(1 for e in sap_abs_errs if e < 0.5) / n
    sap_mae = sum(sap_abs_errs) / n
    co2_mae = _mean_or_fail([abs(e) for e in co2_signed_errs_t], "CO2")
    co2_bias = _mean_or_fail(co2_signed_errs_t, "CO2")
    pe_mae = _mean_or_fail([abs(e) for e in pe_signed_errs], "PE")
    pe_bias = _mean_or_fail(pe_signed_errs, "PE")

    report = (
        f"\n[RdSAP-21.0.1 corpus | {n} computed / {skipped} skipped]"
        f"\n SAP within-0.5 = {within_half:.1%} MAE = {sap_mae:.3f}"
        f"\n CO2 MAE = {co2_mae:.2f} t/yr (bias {co2_bias:+.2f} t/yr)"
        f"\n PE MAE = {pe_mae:.1f} kWh/m2/yr (bias {pe_bias:+.1f})"
    )
    if skip_reasons:
        breakdown = ", ".join(
            f"{reason} x{count}" for reason, count in sorted(skip_reasons.items())
        )
        report += f"\n skipped by reason: {breakdown}"
    # Bypass output capture so the metrics line is always visible.
    with capsys.disabled():
        print(report)

    # Assert — SAP (cost) is the optimised gauge: tight floor/ceiling. CO2/PE
    # are loose "don't regress" guards.
    assert within_half >= _MIN_WITHIN_HALF_SAP, (
        f"SAP within-0.5 {within_half:.1%} fell below floor "
        f"{_MIN_WITHIN_HALF_SAP:.0%}"
    )
    assert sap_mae <= _MAX_SAP_MAE, (
        f"SAP MAE {sap_mae:.3f} exceeded ceiling {_MAX_SAP_MAE}"
    )
    assert co2_mae <= _MAX_CO2_MAE_TONNES, (
        f"CO2 MAE {co2_mae:.2f} t/yr exceeded ceiling {_MAX_CO2_MAE_TONNES}"
    )
    assert pe_mae <= _MAX_PE_PER_M2_MAE, (
        f"PE MAE {pe_mae:.1f} kWh/m2/yr exceeded ceiling {_MAX_PE_PER_M2_MAE}"
    )
|
||||
Loading…
Add table
Reference in a new issue