mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Three reusable scripts (each with a purpose/usage docstring) for wide-scale
testing of the calculator's API front-end against the GOV.UK EPB register —
the toolkit behind the 1000-cert study (docs/HANDOVER_API_SAMPLE_ACCURACY.md):
fetch_2026_epc_sample.py — sample cert numbers across a date window
(random pages) + download full schema-21 JSON
to a cache; resumable, 429/5xx backoff.
eval_api_sap_accuracy.py — % within 0.5 SAP, error histogram, worst-40,
and the mapper/calculator raise breakdown.
analyse_api_sap_clusters.py — error grouped by property + heating type to
locate clusters (electric heating, flats, PV).
Cache dir defaults to /tmp/epc_2026_sample, overridable via EPC_SAMPLE_CACHE.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
102 lines
3.9 KiB
Python
102 lines
3.9 KiB
Python
"""Group API-path SAP error by property + heating type to find clusters.
|
|
|
|
WHAT THIS IS FOR
----------------
The headline number from `eval_api_sap_accuracy.py` tells you HOW accurate the
API path is; this tells you WHERE the error lives so you can prioritise. It
buckets the cached sample's per-cert SAP error (continuous vs lodged) by:
  - property type (house / flat / bungalow / maisonette / park home),
  - real PV presence,
  - heating identity (main_heating_category + whether a PCDB index is lodged),
and prints n / mean|err| / %<0.5 per group, plus red flags (negative or
extreme-low SAP). The load-bearing cut is heating: e.g. electric storage
heaters (cat 7) and room heaters (cat 10) are the worst clusters, which points
the next worksheet-backed fix at those systems.

USAGE
-----
    PYTHONPATH=/workspaces/model python scripts/analyse_api_sap_clusters.py

Reads the cache written by `fetch_2026_epc_sample.py` (default
`/tmp/epc_2026_sample`, overridable via `EPC_SAMPLE_CACHE`).
|
|
"""
|
|
import os
|
|
import json
|
|
import math
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
|
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
|
|
from domain.sap10_calculator.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES, cert_to_inputs
|
|
|
|
# Directory holding the cached per-certificate JSON documents; the
# EPC_SAMPLE_CACHE environment variable overrides the /tmp default.
CACHE = Path(os.getenv("EPC_SAMPLE_CACHE", "/tmp/epc_2026_sample"))

# Display label for each `property_type` code, keyed by the code as a string.
PROP = {
    "0": "House",
    "1": "Bungalow",
    "2": "Flat",
    "3": "Maisonette",
    "4": "Park home",
}
|
|
|
|
|
|
def real_pv(doc):
    """Report whether the certificate document lodges a genuine PV array.

    Inspects ``doc["sap_energy_source"]["photovoltaic_supply"]``:

    * missing, or not a dict -> ``False``;
    * any key other than ``none_or_no_details`` present -> ``True``;
    * otherwise ``True`` only when the ``none_or_no_details`` entry carries a
      truthy ``percent_roof_area`` (so a 0% placeholder is not counted as PV).
    """
    energy_source = doc.get("sap_energy_source") or {}
    supply = energy_source.get("photovoltaic_supply")
    if not isinstance(supply, dict):
        return False
    # Anything beyond the placeholder key means real PV details were lodged.
    if any(key != "none_or_no_details" for key in supply):
        return True
    placeholder = supply.get("none_or_no_details") or {}
    return bool(placeholder.get("percent_roof_area"))
|
|
|
|
|
|
def heat_identity(doc):
    """Return ``(main_heating_index_number, main_heating_category)``.

    Both values are read from the first entry of
    ``doc["sap_heating"]["main_heating_details"]``. When ``sap_heating`` or
    the details list is missing or empty, or the entry lacks a field, the
    corresponding element is ``None``.
    """
    heating = doc.get("sap_heating") or {}
    # The `or [{}]` fallback guarantees there is always a first entry to read.
    details = heating.get("main_heating_details") or [{}]
    first = details[0]
    fields = ("main_heating_index_number", "main_heating_category")
    return tuple(first.get(name) for name in fields)
|
|
|
|
|
|
def main():
    """Print per-cluster SAP error tables for the cached certificate sample.

    Every ``????-????-????-????-????.json`` file under ``CACHE`` is loaded,
    mapped with ``EpcPropertyDataMapper.from_api_response``, converted with
    ``cert_to_inputs`` (using ``SAP_10_2_SPEC_PRICES``) and scored with
    ``calculate_sap_from_inputs``. The absolute difference between the
    resulting ``sap_score_continuous`` and the lodged
    ``energy_rating_current`` is then grouped by property type, real PV
    presence and heating identity.

    A certificate is left out of the tables (and counted as skipped) when its
    cache file cannot be read or parsed as a JSON object, when the mapper or
    calculator raises, when no lodged rating is present, or when the
    continuous score is not finite.

    Output goes to stdout: the number of scored certificates, the skipped
    count if any, one table per grouping (groups with fewer than 5
    certificates are omitted; rows are ordered by descending mean error),
    and a red-flag summary of negative continuous scores and lodged
    ratings of 20 or below.
    """
    rows = []
    skipped = 0
    for path in sorted(CACHE.glob("????-????-????-????-????.json")):
        try:
            # JSON is UTF-8; don't let the platform's locale encoding decide.
            doc = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers both JSONDecodeError and UnicodeDecodeError:
            # one truncated or corrupt cache file must not abort the whole run.
            skipped += 1
            continue
        if not isinstance(doc, dict):
            # Valid JSON but not an object, so there are no fields to look up.
            skipped += 1
            continue

        lodged = doc.get("energy_rating_current")
        try:
            epc = EpcPropertyDataMapper.from_api_response(doc)
            cont = calculate_sap_from_inputs(
                cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES)
            ).sap_score_continuous
        except Exception:
            # Deliberately broad and best-effort: a cert the mapper or the
            # calculator cannot handle is excluded from the tables, not fatal.
            skipped += 1
            continue
        if lodged is None or not math.isfinite(cont):
            skipped += 1
            continue

        idx, cat = heat_identity(doc)
        prop_code = str(doc.get("property_type"))
        rows.append(dict(
            cert=path.stem, ae=abs(cont - lodged), cont=cont, lodged=lodged,
            # Fall back to the raw code when it has no label in PROP.
            prop=PROP.get(prop_code, prop_code),
            pv=real_pv(doc), idx=idx, cat=cat,
            neg=(cont < 0), low_lodged=(lodged <= 20),
        ))
    n = len(rows)

    def grp(keyfn, label):
        """Print n, mean|err| and %<0.5 for each group that ``keyfn`` yields."""
        groups = defaultdict(list)
        for r in rows:
            groups[keyfn(r)].append(r["ae"])
        print(f"\n-- mean|err| by {label} (n, mean|err|, %<0.5) --")
        # Worst groups (largest mean absolute error) first.
        for k, v in sorted(groups.items(), key=lambda kv: -sum(kv[1]) / len(kv[1])):
            if len(v) < 5:
                # Too few certificates for the group's mean to be meaningful.
                continue
            p = 100 * sum(1 for x in v if x < 0.5) / len(v)
            print(f" {str(k):28s} n={len(v):4d} mean={sum(v) / len(v):6.2f} <0.5={p:4.1f}%")

    print(f"computed n={n}")
    if skipped:
        print(
            f"skipped n={skipped} (unreadable cache file, mapper/calculator "
            "raise, no lodged rating, or non-finite score)"
        )
    grp(lambda r: r["prop"], "property type")
    grp(lambda r: "PV" if r["pv"] else "no-PV", "real PV presence")
    grp(lambda r: f"cat={r['cat']},idx={'Y' if r['idx'] else '-'}", "heating identity")

    neg = [r for r in rows if r["neg"]]
    loww = [r for r in rows if r["low_lodged"]]
    print(f"\nRED FLAGS: negative continuous SAP: {len(neg)} | lodged<=20 (extreme): {len(loww)}")
    # Only the first 15 cert numbers are listed to keep the output short.
    print(" negative-SAP certs:", [r["cert"] for r in neg][:15])
|
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|