Model/scripts/eval_api_sap_accuracy.py
Khalim Conn-Kowlessar 3b442f9606 scripts: promote the API SAP-accuracy toolkit from /tmp
Three reusable scripts (each with a purpose/usage docstring) for wide-scale
testing of the calculator's API front-end against the GOV.UK EPB register —
the toolkit behind the 1000-cert study (docs/HANDOVER_API_SAMPLE_ACCURACY.md):

  fetch_2026_epc_sample.py    — sample cert numbers across a date window
                                (random pages) + download full schema-21 JSON
                                to a cache; resumable, 429/5xx backoff.
  eval_api_sap_accuracy.py    — % within 0.5 SAP, error histogram, worst-40,
                                and the mapper/calculator raise breakdown.
  analyse_api_sap_clusters.py — error grouped by property + heating type to
                                locate clusters (electric heating, flats, PV).

Cache dir defaults to /tmp/epc_2026_sample, overridable via EPC_SAMPLE_CACHE.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 16:52:09 +00:00

169 lines
6.6 KiB
Python

"""Score the SAP10 calculator's API path against a cached EPC sample.
WHAT THIS IS FOR
----------------
Measures how well the API front-end (`from_api_response` → `cert_to_inputs`
→ continuous SAP) reproduces each cert's lodged rounded SAP
(`energy_rating_current`) across the sample built by
`fetch_2026_epc_sample.py`. This is the headline accuracy gauge for raw-API
behaviour on an unbiased population.
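Each cert lands in exactly one bucket:
- computed — ran end-to-end; SAP error recorded.
- unsupported_schema — pre-21 schema the mapper doesn't support (skip).
- raise:<Exc> — mapper raised (UnmappedApiCode etc.) — a gap to fix.
- calc_raise:<Exc> — calculator raised (UnmappedSapCode etc.) — a gap.
- bad_json — the cache file could not be read or parsed (skip).
- no_lodged_sap — the cert has no `energy_rating_current` to compare against (skip).
- non_finite — the calculator returned a NaN/infinite SAP (skip).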
OUTPUT
------
- Category counts + the raise breakdown with example certs (what to fix).
- For computed certs: % within 0.5 / 1 / 2 / 5 SAP, median/mean/p90/p99/max
|err|, the signed mean (over- vs under-rating), abs-err histogram.
- The 40 worst offenders with diagnostic columns (to prioritise).
- A full per-cert CSV at <cache>/_results.csv for ad-hoc slicing.
USAGE
-----
PYTHONPATH=/workspaces/model python scripts/eval_api_sap_accuracy.py
Reads the cache written by `fetch_2026_epc_sample.py` (default
`/tmp/epc_2026_sample`, overridable via `EPC_SAMPLE_CACHE`).
"""
import os
import json
import csv
import math
from collections import Counter, defaultdict
from pathlib import Path
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
from domain.sap10_calculator.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES, cert_to_inputs
# Directory holding the cached per-cert JSON files; the EPC_SAMPLE_CACHE
# environment variable overrides the default location.
CACHE = Path(os.getenv("EPC_SAMPLE_CACHE", "/tmp/epc_2026_sample"))
def diag(doc):
    """Return a few raw-JSON fields that help explain a cert's error at a glance.

    Args:
        doc: The cert's decoded JSON document (a dict). Missing or null
            `sap_energy_source` / `sap_heating` sections are treated as empty.

    Returns:
        A flat dict of diagnostic columns (`schema`, `prop_type`,
        `built_form`, `age_band`, `mains_gas`, `main_heat_cat`,
        `main_heat_idx`, `n_bps`, `lodged_band`). Fields absent from `doc`
        come back as None; `n_bps` is 0 when there are no building parts.
    """
    energy_source = doc.get("sap_energy_source") or {}
    heating = doc.get("sap_heating") or {}

    # Only the first main-heating entry is reported. Guard against a missing,
    # empty, or malformed list (e.g. a null first element) so that a purely
    # diagnostic lookup can never abort the evaluation run.
    details = heating.get("main_heating_details")
    first = details[0] if isinstance(details, (list, tuple)) and details else None
    main_heating = first if isinstance(first, dict) else {}

    return {
        "schema": doc.get("schema_type"),
        "prop_type": doc.get("property_type"),
        "built_form": doc.get("built_form"),
        "age_band": doc.get("construction_age_band"),
        "mains_gas": energy_source.get("mains_gas"),
        "main_heat_cat": main_heating.get("main_heating_category"),
        "main_heat_idx": main_heating.get("main_heating_index_number"),
        "n_bps": len(doc.get("sap_building_parts") or []),
        "lodged_band": doc.get("current_energy_efficiency_band"),
    }
def _print_accuracy_report(rows):
    """Print accuracy statistics and the worst-40 table for the computed certs.

    `rows` must be non-empty; each row carries the `err` / `abs_err` values
    and the diagnostic columns assembled in `main`.
    """
    n = len(rows)
    abs_errs = sorted(r["abs_err"] for r in rows)

    def pct(thr):
        # Share of computed certs whose |err| is strictly below `thr`.
        return 100.0 * sum(1 for e in abs_errs if e < thr) / n

    # True median: average the two middle values when n is even (the two
    # indices coincide when n is odd).
    median = (abs_errs[(n - 1) // 2] + abs_errs[n // 2]) / 2

    print("=" * 70)
    print(f"COMPUTED: {n} certs (continuous SAP vs lodged rounded)")
    print(f" % |err| < 0.5 : {pct(0.5):.1f}% <-- headline")
    print(f" % |err| < 1.0 : {pct(1.0):.1f}%")
    print(f" % |err| < 2.0 : {pct(2.0):.1f}%")
    print(f" % |err| < 5.0 : {pct(5.0):.1f}%")
    print(f" median |err| : {median:.3f}")
    print(f" mean |err| : {sum(abs_errs) / n:.3f}")
    # int(n * q) is always <= n - 1 for q < 1, so these indices stay in range.
    print(f" p90 |err| : {abs_errs[int(n * 0.90)]:.3f}")
    print(f" p99 |err| : {abs_errs[int(n * 0.99)]:.3f}")
    print(f" max |err| : {abs_errs[-1]:.3f}")
    signed = [r["err"] for r in rows]
    print(f" mean signed err: {sum(signed) / n:+.3f} (we - lodged; +ve = we over-rate)")
    print(" abs-err buckets:")
    for lo, hi in [(0, 0.5), (0.5, 1), (1, 2), (2, 5), (5, 10), (10, 1e9)]:
        c = sum(1 for e in abs_errs if lo <= e < hi)
        print(f" [{lo:>4}, {hi:>4}) : {c:4d} ({100 * c / n:4.1f}%)")
    print("=" * 70)
    print("TOP 40 LARGEST |err| (prioritise these):")
    worst = sorted(rows, key=lambda r: -r["abs_err"])[:40]
    print(f" {'cert':22s} {'err':>7s} {'our':>6s} {'lodg':>4s} prop bf age gas cat/idx bps")
    for r in worst:
        print(f" {r['cert']:22s} {r['err']:+7.2f} {r['our_cont']:6.1f} {r['lodged']:4d} "
              f"{str(r['prop_type']):>4s} {str(r['built_form']):>2s} {str(r['age_band'])[:3]:>3s} "
              f"{str(r['mains_gas']):>3s} {str(r['main_heat_cat']):>3s}/{str(r['main_heat_idx']):>6s} "
              f"{r['n_bps']}")


def _print_raise_examples(exc_examples):
    """Print the 20 most frequent mapper/calculator failures with one example cert each."""
    if not exc_examples:
        return
    print("=" * 70)
    print("RAISE/ERROR EXAMPLES (mapper/calculator gaps — also prioritise):")
    for k, v in sorted(exc_examples.items(), key=lambda kv: -len(kv[1]))[:20]:
        print(f" [{len(v):3d}] {k} e.g. {v[0]}")


def main():
    """Score every cached cert, write the per-cert CSV and print the report.

    Each `<cert-number>.json` file in `CACHE` is mapped, run through the
    calculator and compared with its lodged `energy_rating_current`. Every
    cert is counted in exactly one category; computed certs additionally
    contribute a row to `<CACHE>/_results.csv`.

    The category counts and the raise/error examples are always printed, even
    when no cert could be computed — that is precisely when the failure
    breakdown is needed. The accuracy statistics and the CSV are produced
    only when at least one cert was computed.
    """
    files = sorted(CACHE.glob("????-????-????-????-????.json"))
    rows = []
    cat = Counter()
    exc_examples = defaultdict(list)

    for f in files:
        cert = f.stem
        try:
            doc = json.loads(f.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable file, undecodable bytes, or invalid JSON
            # (JSONDecodeError and UnicodeDecodeError are ValueErrors).
            cat["bad_json"] += 1
            continue
        if not isinstance(doc, dict):
            # Valid JSON but not an object: nothing below could use it.
            cat["bad_json"] += 1
            continue

        lodged = doc.get("energy_rating_current")
        try:
            epc = EpcPropertyDataMapper.from_api_response(doc)
        except ValueError as e:
            if "Unsupported EPC schema" in str(e):
                cat["unsupported_schema"] += 1
            else:
                cat["raise:ValueError"] += 1
                exc_examples["ValueError:" + str(e)[:60]].append(cert)
            continue
        except Exception as e:
            # Deliberately broad: any mapper failure is a gap to report,
            # not a reason to abort the whole sample.
            ename = type(e).__name__
            cat[f"raise:{ename}"] += 1
            exc_examples[f"{ename}:{str(e)[:60]}"].append(cert)
            continue

        if lodged is None:
            cat["no_lodged_sap"] += 1
            continue

        try:
            cont = calculate_sap_from_inputs(
                cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES)
            ).sap_score_continuous
        except Exception as e:
            # Deliberately broad, as above: record the calculator gap and move on.
            ename = type(e).__name__
            cat[f"calc_raise:{ename}"] += 1
            exc_examples[f"calc:{ename}:{str(e)[:50]}"].append(cert)
            continue

        if not math.isfinite(cont):
            cat["non_finite"] += 1
            continue

        err = cont - lodged
        cat["computed"] += 1
        rows.append({
            "cert": cert, "our_cont": round(cont, 4), "lodged": lodged,
            "err": round(err, 4), "abs_err": round(abs(err), 4), **diag(doc),
        })

    results_path = CACHE / "_results.csv"
    if rows:
        with open(results_path, "w", newline="", encoding="utf-8") as fh:
            w = csv.DictWriter(fh, fieldnames=list(rows[0].keys()))
            w.writeheader()
            w.writerows(rows)

    print("=" * 70)
    print(f"SAMPLE: {len(files)} cached certs | categories:")
    for k, v in cat.most_common():
        print(f" {k:28s} {v}")

    if rows:
        _print_accuracy_report(rows)
    _print_raise_examples(exc_examples)
    if rows:
        print(f"\nFull per-cert CSV -> {results_path}")
# Run the evaluation only when this file is executed as a script, so that
# importing the module does not trigger a full scoring run.
if __name__ == "__main__":
    main()