# Model/scripts/eval_api_sap_accuracy.py

"""Score the SAP10 calculator's API path against a cached EPC sample.

WHAT THIS IS FOR
----------------
Measures how well the API front-end (`from_api_response` → `cert_to_inputs`
→ continuous SAP) reproduces each cert's lodged rounded SAP
(`energy_rating_current`) across the sample built by
`fetch_2026_epc_sample.py`. This is the headline accuracy gauge for raw-API
behaviour on an unbiased population.

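Each cert lands in exactly one bucket:
  - computed            — ran end-to-end; SAP error recorded.
  - bad_json            — cache file could not be read/parsed as JSON (skip).
  - unsupported_schema  — pre-21 schema the mapper doesn't support (skip).
  - raise:<Exc>         — mapper raised (UnmappedApiCode etc.) — a gap to fix.
  - no_lodged_sap       — cert has no `energy_rating_current` to compare (skip).
  - calc_raise:<Exc>    — calculator raised (UnmappedSapCode etc.) — a gap.
  - non_finite          — calculator returned a NaN/inf SAP (skip).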

OUTPUT
------
  - Category counts + the raise breakdown with example certs (what to fix).
  - For computed certs: % within 0.5 / 1 / 2 / 5 SAP, median/mean/p90/p99/max
    |err|, the signed mean (over- vs under-rating), abs-err histogram.
  - The 40 worst offenders with diagnostic columns (to prioritise).
  - A full per-cert CSV at <cache>/_results.csv for ad-hoc slicing.

USAGE
-----
    PYTHONPATH=/workspaces/model python scripts/eval_api_sap_accuracy.py

Reads the cache written by `fetch_2026_epc_sample.py` (default
`/tmp/epc_2026_sample`, overridable via `EPC_SAMPLE_CACHE`).
"""
import os
import json
import csv
import math
from collections import Counter, defaultdict
from pathlib import Path

from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
from domain.sap10_calculator.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES, cert_to_inputs

# Directory holding the cached per-cert JSON files; the EPC_SAMPLE_CACHE
# environment variable overrides the default location.
CACHE = Path(os.getenv("EPC_SAMPLE_CACHE", "/tmp/epc_2026_sample"))


def diag(doc):
    """Pull a few raw-JSON fields that help explain a cert's error at a glance.

    Args:
        doc: Decoded cert JSON document (a dict).

    Returns:
        A flat dict with the keys ``schema``, ``prop_type``, ``built_form``,
        ``age_band``, ``mains_gas``, ``main_heat_cat``, ``main_heat_idx``,
        ``n_bps`` and ``lodged_band``. Fields missing from ``doc`` come back
        as ``None``; ``n_bps`` is 0 when there are no building parts.
    """
    energy_source = doc.get("sap_energy_source") or {}
    heating = doc.get("sap_heating") or {}

    # Only the first main-heating entry is reported. Tolerate a missing,
    # empty, non-list or null-first-entry value: this helper runs outside any
    # try/except in the caller, so one odd cert must not abort the whole run.
    details = heating.get("main_heating_details")
    first = details[0] if isinstance(details, list) and details else None
    main_heating = first if isinstance(first, dict) else {}

    return {
        "schema": doc.get("schema_type"),
        "prop_type": doc.get("property_type"),
        "built_form": doc.get("built_form"),
        "age_band": doc.get("construction_age_band"),
        "mains_gas": energy_source.get("mains_gas"),
        "main_heat_cat": main_heating.get("main_heating_category"),
        "main_heat_idx": main_heating.get("main_heating_index_number"),
        "n_bps": len(doc.get("sap_building_parts") or []),
        "lodged_band": doc.get("current_energy_efficiency_band"),
    }


def _print_accuracy_report(rows):
    """Print error statistics and the 40 worst offenders for computed certs.

    ``rows`` must be non-empty; each row carries ``err`` / ``abs_err`` plus
    the diagnostic columns used in the worst-offender table.
    """
    n = len(rows)
    abs_errs = sorted(r["abs_err"] for r in rows)

    def pct(thr):
        # Share of computed certs whose absolute error is strictly below thr.
        return 100.0 * sum(1 for a in abs_errs if a < thr) / n

    print("=" * 70)
    print(f"COMPUTED: {n} certs (continuous SAP vs lodged rounded)")
    print(f"   % |err| < 0.5  : {pct(0.5):.1f}%   <-- headline")
    print(f"   % |err| < 1.0  : {pct(1.0):.1f}%")
    print(f"   % |err| < 2.0  : {pct(2.0):.1f}%")
    print(f"   % |err| < 5.0  : {pct(5.0):.1f}%")
    # Order statistics are picked by index from the sorted list (no
    # interpolation); int(n * q) is always < n for q < 1, so it is in range.
    print(f"   median |err|   : {abs_errs[n // 2]:.3f}")
    print(f"   mean   |err|   : {sum(abs_errs) / n:.3f}")
    print(f"   p90 |err|      : {abs_errs[int(n * 0.90)]:.3f}")
    print(f"   p99 |err|      : {abs_errs[int(n * 0.99)]:.3f}")
    print(f"   max |err|      : {abs_errs[-1]:.3f}")
    signed = [r["err"] for r in rows]
    print(f"   mean signed err: {sum(signed) / n:+.3f} (we - lodged; +ve = we over-rate)")
    print("   abs-err buckets:")
    for lo, hi in [(0, 0.5), (0.5, 1), (1, 2), (2, 5), (5, 10), (10, 1e9)]:
        c = sum(1 for a in abs_errs if lo <= a < hi)
        print(f"      [{lo:>4}, {hi:>4}) : {c:4d}  ({100 * c / n:4.1f}%)")
    print("=" * 70)
    print("TOP 40 LARGEST |err| (prioritise these):")
    worst = sorted(rows, key=lambda r: -r["abs_err"])[:40]
    print(f"   {'cert':22s} {'err':>7s} {'our':>6s} {'lodg':>4s}  prop bf age  gas cat/idx bps")
    for r in worst:
        print(f"   {r['cert']:22s} {r['err']:+7.2f} {r['our_cont']:6.1f} {r['lodged']:4d}  "
              f"{str(r['prop_type']):>4s} {str(r['built_form']):>2s} {str(r['age_band'])[:3]:>3s} "
              f"{str(r['mains_gas']):>3s} {str(r['main_heat_cat']):>3s}/{str(r['main_heat_idx']):>6s} "
              f"{r['n_bps']}")


def _print_raise_examples(exc_examples):
    """Print the 20 most frequent raise signatures with one example cert each."""
    print("=" * 70)
    print("RAISE/ERROR EXAMPLES (mapper/calculator gaps — also prioritise):")
    for k, v in sorted(exc_examples.items(), key=lambda kv: -len(kv[1]))[:20]:
        print(f"   [{len(v):3d}] {k}   e.g. {v[0]}")


def main():
    """Score every cached cert and print the accuracy report.

    For each ``*.json`` file in ``CACHE`` whose name matches the cert-number
    pattern, the cert is mapped, run through the calculator, and compared
    with its lodged ``energy_rating_current``. Each cert is counted in one
    category (``computed``, ``bad_json``, ``unsupported_schema``,
    ``raise:<Exc>``, ``no_lodged_sap``, ``calc_raise:<Exc>``, ``non_finite``).

    Side effects: writes ``<CACHE>/_results.csv`` when at least one cert was
    computed, and prints the report to stdout. The raise/error breakdown is
    printed even when no cert could be computed.
    """
    files = sorted(CACHE.glob("????-????-????-????-????.json"))
    results_path = CACHE / "_results.csv"
    rows = []
    cat = Counter()
    exc_examples = defaultdict(list)
    for f in files:
        cert = f.stem
        try:
            # JSON is UTF-8 by specification; don't depend on the locale.
            doc = json.loads(f.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            cat["bad_json"] += 1
            continue
        if not isinstance(doc, dict):
            # Valid JSON but not an object: the .get() calls below would crash.
            cat["bad_json"] += 1
            continue
        lodged = doc.get("energy_rating_current")
        try:
            epc = EpcPropertyDataMapper.from_api_response(doc)
        except ValueError as e:
            if "Unsupported EPC schema" in str(e):
                cat["unsupported_schema"] += 1
            else:
                cat["raise:ValueError"] += 1
                exc_examples["ValueError:" + str(e)[:60]].append(cert)
            continue
        except Exception as e:
            # Deliberately broad: any mapper failure is a gap to tabulate,
            # not a reason to abort the evaluation run.
            ename = type(e).__name__
            cat[f"raise:{ename}"] += 1
            exc_examples[f"{ename}:{str(e)[:60]}"].append(cert)
            continue
        if lodged is None:
            cat["no_lodged_sap"] += 1
            continue
        try:
            cont = calculate_sap_from_inputs(
                cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES)
            ).sap_score_continuous
        except Exception as e:
            # Same reasoning as above, for calculator-side failures.
            ename = type(e).__name__
            cat[f"calc_raise:{ename}"] += 1
            exc_examples[f"calc:{ename}:{str(e)[:50]}"].append(cert)
            continue
        if not math.isfinite(cont):
            cat["non_finite"] += 1
            continue
        err = cont - lodged
        cat["computed"] += 1
        rows.append({
            "cert": cert, "our_cont": round(cont, 4), "lodged": lodged,
            "err": round(err, 4), "abs_err": round(abs(err), 4), **diag(doc),
        })

    if rows:
        keys = list(rows[0].keys())
        with open(results_path, "w", newline="", encoding="utf-8") as fh:
            w = csv.DictWriter(fh, fieldnames=keys)
            w.writeheader()
            w.writerows(rows)

    print("=" * 70)
    print(f"SAMPLE: {len(files)} cached certs | categories:")
    for k, v in cat.most_common():
        print(f"   {k:28s} {v}")

    if rows:
        _print_accuracy_report(rows)
    # Shown regardless of whether anything computed: when every cert raises,
    # this breakdown is the only actionable output.
    if exc_examples:
        _print_raise_examples(exc_examples)
    if rows:
        print(f"\nFull per-cert CSV -> {results_path}")


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()