"""Score the SAP10 calculator's API path against a cached EPC sample. WHAT THIS IS FOR ---------------- Measures how well the API front-end (`from_api_response` → `cert_to_inputs` → continuous SAP) reproduces each cert's lodged rounded SAP (`energy_rating_current`) across the sample built by `fetch_2026_epc_sample.py`. This is the headline accuracy gauge for raw-API behaviour on an unbiased population. Each cert lands in one bucket: - computed — ran end-to-end; SAP error recorded. - unsupported_schema — pre-21 schema the mapper doesn't support (skip). - raise: — mapper raised (UnmappedApiCode etc.) — a gap to fix. - calc_raise: — calculator raised (UnmappedSapCode etc.) — a gap. OUTPUT ------ - Category counts + the raise breakdown with example certs (what to fix). - For computed certs: % within 0.5 / 1 / 2 / 5 SAP, median/mean/p90/p99/max |err|, the signed mean (over- vs under-rating), abs-err histogram. - The 40 worst offenders with diagnostic columns (to prioritise). - A full per-cert CSV at /_results.csv for ad-hoc slicing. USAGE ----- PYTHONPATH=/workspaces/model python scripts/eval_api_sap_accuracy.py Reads the cache written by `fetch_2026_epc_sample.py` (default `/tmp/epc_2026_sample`, overridable via `EPC_SAMPLE_CACHE`). """ import os import json import csv import math from collections import Counter, defaultdict from pathlib import Path from datatypes.epc.domain.mapper import EpcPropertyDataMapper from domain.sap10_calculator.calculator import calculate_sap_from_inputs from domain.sap10_calculator.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES, cert_to_inputs CACHE = Path(os.environ.get("EPC_SAMPLE_CACHE", "/tmp/epc_2026_sample")) def diag(doc): """A few raw-JSON fields that help explain a cert's error at a glance.""" es = doc.get("sap_energy_source", {}) or {} h = doc.get("sap_heating", {}) or {} mh = (h.get("main_heating_details") or [{}]) mh0 = mh[0] if mh else {} pv = es.get("photovoltaic_supply") return { "schema": doc.get("schema_type"), "prop_type": doc.get("property_type"), "built_form": doc.get("built_form"), "age_band": doc.get("construction_age_band"), "mains_gas": es.get("mains_gas"), "main_heat_cat": mh0.get("main_heating_category"), "main_heat_idx": mh0.get("main_heating_index_number"), "n_bps": len(doc.get("sap_building_parts") or []), "lodged_band": doc.get("current_energy_efficiency_band"), } def main(): files = sorted(CACHE.glob("????-????-????-????-????.json")) rows = [] cat = Counter() exc_examples = defaultdict(list) for f in files: cert = f.stem try: doc = json.loads(f.read_text()) except Exception: cat["bad_json"] += 1 continue lodged = doc.get("energy_rating_current") try: epc = EpcPropertyDataMapper.from_api_response(doc) except ValueError as e: if "Unsupported EPC schema" in str(e): cat["unsupported_schema"] += 1 else: cat["raise:ValueError"] += 1 exc_examples["ValueError:" + str(e)[:60]].append(cert) continue except Exception as e: ename = type(e).__name__ cat[f"raise:{ename}"] += 1 exc_examples[f"{ename}:{str(e)[:60]}"].append(cert) continue if lodged is None: cat["no_lodged_sap"] += 1 continue try: cont = calculate_sap_from_inputs( cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES) ).sap_score_continuous except Exception as e: ename = type(e).__name__ cat[f"calc_raise:{ename}"] += 1 exc_examples[f"calc:{ename}:{str(e)[:50]}"].append(cert) continue if not math.isfinite(cont): cat["non_finite"] += 1 continue err = cont - lodged cat["computed"] += 1 rows.append({ "cert": cert, "our_cont": round(cont, 4), "lodged": lodged, "err": round(err, 4), "abs_err": round(abs(err), 4), **diag(doc), }) if rows: keys = list(rows[0].keys()) with open(CACHE / "_results.csv", "w", newline="") as fh: w = csv.DictWriter(fh, fieldnames=keys) w.writeheader() w.writerows(rows) n = len(rows) print("=" * 70) print(f"SAMPLE: {len(files)} cached certs | categories:") for k, v in cat.most_common(): print(f" {k:28s} {v}") if n == 0: return abs_errs = sorted(r["abs_err"] for r in rows) def pct(thr): return 100.0 * sum(1 for r in rows if r["abs_err"] < thr) / n print("=" * 70) print(f"COMPUTED: {n} certs (continuous SAP vs lodged rounded)") print(f" % |err| < 0.5 : {pct(0.5):.1f}% <-- headline") print(f" % |err| < 1.0 : {pct(1.0):.1f}%") print(f" % |err| < 2.0 : {pct(2.0):.1f}%") print(f" % |err| < 5.0 : {pct(5.0):.1f}%") print(f" median |err| : {abs_errs[n // 2]:.3f}") print(f" mean |err| : {sum(abs_errs) / n:.3f}") print(f" p90 |err| : {abs_errs[int(n * 0.90)]:.3f}") print(f" p99 |err| : {abs_errs[int(n * 0.99)]:.3f}") print(f" max |err| : {abs_errs[-1]:.3f}") signed = [r["err"] for r in rows] print(f" mean signed err: {sum(signed) / n:+.3f} (we - lodged; +ve = we over-rate)") print(" abs-err buckets:") for lo, hi in [(0, 0.5), (0.5, 1), (1, 2), (2, 5), (5, 10), (10, 1e9)]: c = sum(1 for r in rows if lo <= r["abs_err"] < hi) print(f" [{lo:>4}, {hi:>4}) : {c:4d} ({100 * c / n:4.1f}%)") print("=" * 70) print("TOP 40 LARGEST |err| (prioritise these):") worst = sorted(rows, key=lambda r: -r["abs_err"])[:40] print(f" {'cert':22s} {'err':>7s} {'our':>6s} {'lodg':>4s} prop bf age gas cat/idx bps") for r in worst: print(f" {r['cert']:22s} {r['err']:+7.2f} {r['our_cont']:6.1f} {r['lodged']:4d} " f"{str(r['prop_type']):>4s} {str(r['built_form']):>2s} {str(r['age_band'])[:3]:>3s} " f"{str(r['mains_gas']):>3s} {str(r['main_heat_cat']):>3s}/{str(r['main_heat_idx']):>6s} " f"{r['n_bps']}") if exc_examples: print("=" * 70) print("RAISE/ERROR EXAMPLES (mapper/calculator gaps — also prioritise):") for k, v in sorted(exc_examples.items(), key=lambda kv: -len(kv[1]))[:20]: print(f" [{len(v):3d}] {k} e.g. {v[0]}") print(f"\nFull per-cert CSV -> {CACHE / '_results.csv'}") if __name__ == "__main__": main()