"""Group API-path SAP error by property + heating type to find clusters.

WHAT THIS IS FOR
----------------
The headline number from `eval_api_sap_accuracy.py` tells you HOW accurate the
API path is; this tells you WHERE the error lives so you can prioritise. It
buckets the cached sample's per-cert SAP error (continuous vs lodged) by:

  - property type (house / flat / bungalow / maisonette / park home),
  - real PV presence,
  - heating identity (main_heating_category + whether a PCDB index is lodged),

and prints n / mean|err| / %<0.5 per group, plus red flags (negative or
extreme-low SAP). The load-bearing cut is heating: e.g. electric storage
heaters (cat 7) and room heaters (cat 10) are the worst clusters, which points
the next worksheet-backed fix at those systems.

Certs that cannot be scored are left out of the tables; how many were dropped
(and which exception types caused it) is summarised on stderr so the report on
stdout stays unchanged.

USAGE
-----
    PYTHONPATH=/workspaces/model python scripts/analyse_api_sap_clusters.py

Reads the cache written by `fetch_2026_epc_sample.py` (default
`/tmp/epc_2026_sample`, overridable via `EPC_SAMPLE_CACHE`).
"""

import json
import math
import os
import sys
from collections import Counter, defaultdict
from pathlib import Path

from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
from domain.sap10_calculator.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES, cert_to_inputs

# Directory holding one JSON document per cached certificate.
CACHE = Path(os.environ.get("EPC_SAMPLE_CACHE", "/tmp/epc_2026_sample"))

# Display label for each `property_type` code; unknown codes are shown verbatim.
PROP = {"0": "House", "1": "Bungalow", "2": "Flat", "3": "Maisonette", "4": "Park home"}


def real_pv(doc):
    """Return True only for a genuine PV array.

    A missing or non-dict `photovoltaic_supply` is not PV. When the only key
    present is `none_or_no_details`, the cert counts as PV only if that entry
    carries a truthy `percent_roof_area` (so an absent or 0% figure is not PV).
    Any other key under `photovoltaic_supply` is taken as a real array.
    """
    energy_source = doc.get("sap_energy_source", {}) or {}
    pv = energy_source.get("photovoltaic_supply")
    if not isinstance(pv, dict):
        return False
    if set(pv) <= {"none_or_no_details"}:
        no_details = pv.get("none_or_no_details") or {}
        return bool(no_details.get("percent_roof_area"))
    return True


def heat_identity(doc):
    """Return `(main_heating_index_number, main_heating_category)` for a cert.

    Only the first entry of `sap_heating.main_heating_details` is inspected.
    Either element is None when the field, or the heating section as a whole,
    is absent.
    """
    heating = doc.get("sap_heating", {}) or {}
    # Defaulting to [{}] guarantees a first element, so no emptiness check is
    # needed before indexing.
    details = heating.get("main_heating_details") or [{}]
    first = details[0]
    return first.get("main_heating_index_number"), first.get("main_heating_category")


def _print_group_table(rows, keyfn, label, min_group_size=5):
    """Print n / mean|err| / %<0.5 per group, worst mean error first.

    `keyfn` maps a row to its group key; groups with fewer than
    `min_group_size` rows are omitted from the table.
    """
    errors_by_key = defaultdict(list)
    for row in rows:
        errors_by_key[keyfn(row)].append(row["ae"])

    print(f"\n-- mean|err| by {label} (n, mean|err|, %<0.5) --")
    # Negated mean gives a descending order while keeping ties in first-seen order.
    ranked = sorted(errors_by_key.items(), key=lambda kv: -sum(kv[1]) / len(kv[1]))
    for key, errors in ranked:
        if len(errors) < min_group_size:
            continue
        mean_err = sum(errors) / len(errors)
        pct_close = 100 * sum(1 for err in errors if err < 0.5) / len(errors)
        print(f" {str(key):28s} n={len(errors):4d} mean={mean_err:6.2f} <0.5={pct_close:4.1f}%")


def main():
    """Score every cached cert, then print the grouped error report.

    For each cert file in `CACHE` the continuous SAP score is recomputed via
    the API path and compared with the lodged `energy_rating_current`. Certs
    with no lodged rating, a non-finite score, or a mapper/calculator failure
    are excluded from the tables and tallied in a summary written to stderr.
    """
    paths = sorted(CACHE.glob("????-????-????-????-????.json"))
    if not paths:
        print(f"warning: no cached certs found in {CACHE}", file=sys.stderr)

    rows = []
    failures = Counter()  # exception type name -> number of certs it dropped
    unusable = 0  # certs with no lodged rating or a non-finite computed score
    for path in paths:
        doc = json.loads(path.read_text(encoding="utf-8"))
        lodged = doc.get("energy_rating_current")
        if lodged is None:
            # Nothing to compare against, so skip the calculation entirely.
            unusable += 1
            continue
        try:
            epc = EpcPropertyDataMapper.from_api_response(doc)
            cont = calculate_sap_from_inputs(
                cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES)
            ).sap_score_continuous
        except Exception as exc:
            # Per-cert boundary: one unmappable cert must not abort the run,
            # but it is counted so the sample shrinkage stays visible.
            failures[type(exc).__name__] += 1
            continue
        if not math.isfinite(cont):
            unusable += 1
            continue

        idx, cat = heat_identity(doc)
        prop_code = str(doc.get("property_type"))
        rows.append({
            "cert": path.stem,
            "ae": abs(cont - lodged),
            "cont": cont,
            "lodged": lodged,
            "prop": PROP.get(prop_code, prop_code),
            "pv": real_pv(doc),
            "idx": idx,
            "cat": cat,
            "neg": cont < 0,
            "low_lodged": lodged <= 20,
        })

    if failures or unusable:
        breakdown = ", ".join(f"{name}={count}" for name, count in failures.most_common())
        print(
            f"skipped: {sum(failures.values())} failed to compute"
            f"{' (' + breakdown + ')' if breakdown else ''}; "
            f"{unusable} without a lodged rating or finite score",
            file=sys.stderr,
        )

    print(f"computed n={len(rows)}")
    _print_group_table(rows, lambda r: r["prop"], "property type")
    _print_group_table(rows, lambda r: "PV" if r["pv"] else "no-PV", "real PV presence")
    _print_group_table(
        rows,
        lambda r: f"cat={r['cat']},idx={'Y' if r['idx'] else '-'}",
        "heating identity",
    )

    negative = [r for r in rows if r["neg"]]
    low_lodged = [r for r in rows if r["low_lodged"]]
    print(f"\nRED FLAGS: negative continuous SAP: {len(negative)} | lodged<=20 (extreme): {len(low_lodged)}")
    # Only the first 15 offending cert ids are listed to keep the output short.
    print(" negative-SAP certs:", [r["cert"] for r in negative][:15])


if __name__ == "__main__":
    main()