From dde8ae30fa92499583a74b7e60b5d97d10274f08 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 May 2026 13:59:23 +0000 Subject: [PATCH] S-B2: parity probe + first-pass findings (100-cert baseline) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds services/ml_training_data/src/ml_training_data/sap_parity_probe.py — samples N certs from the v18a corpus, streams them via BulkZipReader, runs Sap10Calculator, prints MAE/RMSE/bias + worst-N residuals. Baseline across 100 certs: MAE 8.41, RMSE 13.98, bias -2.65, 0 errors. docs/sap-spec/PARITY_FINDINGS.md captures the dominant failure pattern (flats + bungalows under-predicted, 10 of the worst-15 are flats whose floor/roof are party with neighbouring dwellings) and the priority- ordered Session B iteration backlog (S-B-flat-surfaces first). Co-Authored-By: Claude Opus 4.7 --- docs/sap-spec/PARITY_FINDINGS.md | 61 ++++++++++ .../src/ml_training_data/sap_parity_probe.py | 111 ++++++++++++++++++ 2 files changed, 172 insertions(+) create mode 100644 docs/sap-spec/PARITY_FINDINGS.md create mode 100644 services/ml_training_data/src/ml_training_data/sap_parity_probe.py diff --git a/docs/sap-spec/PARITY_FINDINGS.md b/docs/sap-spec/PARITY_FINDINGS.md new file mode 100644 index 00000000..2e24969a --- /dev/null +++ b/docs/sap-spec/PARITY_FINDINGS.md @@ -0,0 +1,61 @@ +# Sap10Calculator parity probe — findings as of 2026-05-18 + +100-cert random sample from `data/ml_training/runs/2025_2026_n250000_v18a/data.parquet`, filtered to cert sap-score 20-95 (typical band). 0 errors — calculator runs end-to-end on every cert. 
+ +## Headline + +| Metric | Value | +|---|---| +| MAE | 8.41 SAP-points | +| RMSE | 13.98 | +| Bias | -2.65 (slight under-prediction) | +| Within ±1 | 18.0% | +| Within ±3 | 36.0% | +| Within ±5 | 57.0% | +| Within ±10 | 84.0% | +| Worst residual | -56 SAP-points | + +Session B success criterion is MAE ≤ 1.0 on the typical subset; we're 8× that on the first pass, which roughly matches ADR-0009's expectation that the first run shakes out spec-interpretation gaps. + +## Dominant failure shape: flats and bungalows under-predicted + +14 of the 15 worst residuals are flats, maisonettes, or bungalows (10 flats/maisonettes, 4 bungalows). **Pattern**: calculator charges floor + roof heat loss to dwellings that don't have exposed floor / roof surfaces (mid-floor flats, top-floor flats with a party floor, ground-floor flats with a party ceiling, etc.). + +Worst 15 (residual = predicted − actual): + +| Cert | actual | predicted | residual | TFA | dwelling | +|---|---|---|---|---|---| +| 0320-2756-7670-2196-2035 | 78 | 22 | -56 | 57 | Semi-detached bungalow | +| 0036-1125-8600-0165-2206 | 63 | 18 | -45 | 42 | Mid-floor flat | +| 0340-2394-5510-2925-4421 | 75 | 35 | -40 | 73 | Mid-floor flat | +| 9360-2179-9590-2495-2615 | 78 | 39 | -39 | 54 | Ground-floor flat | +| 0036-0529-1500-0700-8276 | 75 | 36 | -39 | 47 | Top-floor flat | +| 0350-2182-9590-2526-7841 | 43 | 4 | -39 | 119 | Top-floor flat | +| 2148-3061-6204-0016-7204 | 81 | 44 | -37 | 67 | Mid-floor flat | +| 0800-1364-0922-4522-3963 | 71 | 37 | -34 | 70 | Detached bungalow | +| 2110-6453-5050-8205-9605 | 63 | 31 | -32 | 43 | Ground-floor maisonette | +| 2903-8339-6962-6004-0725 | 75 | 47 | -28 | 11 | Top-floor flat | +| 0320-2850-3380-2125-1661 | 70 | 48 | -22 | 45 | Semi-detached bungalow | +| 8035-9023-1500-0237-3226 | 43 | 63 | +20 | 64 | Detached bungalow | +| 9590-7751-0022-0599-3953 | 51 | 69 | +18 | 74 | Detached house | +| 2118-1198-2619-1711-7960 | 62 | 46 | -16 | 42 | Mid-floor flat | +| 3336-3822-5500-0437-9202 | 70 | 59 | -11 | 73 | Mid-floor maisonette | + +## Session B iteration backlog 
(priority order) + +1. **S-B-flat-surfaces** — Map `dwelling_type` to exposed floor/roof flags. Mid/top flats lose their `u_floor × ground_floor_area`; mid/ground flats lose their `u_roof × top_floor_area`. Expected impact: closes most of the −20 to −56 residuals. +2. **S-B-heating-eff-fallback** — When `sap_main_heating_code` is None, fall back through `main_heating_category` + age band to a modern-condensing-boiler efficiency, not the legacy 0.80. ~28% of our 100-cert sample had a null code with category=2. +3. **S-B-electric-storage-tariff** — Electric storage heaters (codes 401-409) should price space-heating fuel at the Economy-7 low rate (Table 32 code 31, ~5.5 p/kWh), not the standard rate (code 30). This is a 2× cost reduction on those certs. +4. **S-B-wall-uvalue-cascade-review** — Worst non-flat residuals suggest the wall U-value cascade is too conservative for recently-built / well-insulated stock. Review `domain.ml.rdsap_uvalues.u_wall` against RdSAP 10 Table 5. +5. **S-B-bungalow-investigation** — Bungalow residuals don't fit the flat-surfaces pattern (bungalows have full floor+roof). Hypothesis: thermal-bridging y-factor + storey-count interaction over-counts envelope. Probe specifically before deciding. +6. **S-B-pump-fan-default** — We default to 130 kWh/yr; SAP 10.3 Table 4f says higher for systems with mechanical ventilation. Marginal but consistent. + +## How to reproduce + +```bash +python -m ml_training_data.sap_parity_probe # 100 certs, seed=7 +python -m ml_training_data.sap_parity_probe 500 13 # bigger sample +python adhoc/sap_calculator/probe_worst.py # detailed cert-by-cert dump +``` + +The default 100-cert probe run takes ~80s. Errors: 0/100. Mapper handles every real cert shape encountered. 
# File: services/ml_training_data/src/ml_training_data/sap_parity_probe.py
"""Sap10Calculator parity probe over N random certs from the corpus.

ADR-0009 Session B exploratory tool. Loads the v18a parquet, samples N
certs from the typical sap-score range, streams them from the bulk JSON
ZIPs, runs the calculator, and prints the residual distribution +
worst-N residuals for spec-iteration triage.

Usage (from repo root, with the workspace venv active):
    python -m ml_training_data.sap_parity_probe # N=100, seed=7
    python -m ml_training_data.sap_parity_probe 500 13 # custom N + seed

Findings get written up in docs/sap-spec/PARITY_FINDINGS.md.
"""
from __future__ import annotations

import json
import sys
import time
from pathlib import Path
from typing import Any, cast

import pandas as pd

from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.sap.calculator import Sap10Calculator
from ml_training_data.bulk_zip_reader import BulkZipReader
from ml_training_data.storage import LocalStorage


# Repo root, resolved relative to this file (four directories above its folder).
_REPO = Path(__file__).resolve().parents[4]
_PARQUET = _REPO / "data" / "ml_training" / "runs" / "2025_2026_n250000_v18a" / "data.parquet"
_BULK = _REPO / "data" / "ml_training" / "bulk"
# Bulk archives are scanned in this order until every sampled cert is found.
_ZIP_KEYS = ("certificates-2025.json.zip", "certificates-2026.json.zip")


def _sample_certs(n: int, seed: int) -> dict[str, int]:
    """Sample ``n`` certificates from the parquet corpus.

    Only rows whose ``sap_score`` lies between 20 and 95 are eligible.

    Args:
        n: Number of certificates to draw.
        seed: Random state passed to ``DataFrame.sample`` for reproducibility.

    Returns:
        Mapping of certificate number to its lodged SAP score (as an int).
    """
    df = pd.read_parquet(_PARQUET, columns=["certificate_number", "sap_score"])
    df = df[df["sap_score"].between(20, 95)]
    s = df.sample(n, random_state=seed)
    return dict(zip(s["certificate_number"], s["sap_score"].astype(int)))


def _evaluate_cert(calc: Sap10Calculator, cert: Any, actual: int) -> dict[str, Any]:
    """Run the calculator on one streamed certificate and build its result row.

    Any exception (bad JSON, mapper failure, calculator failure) propagates
    to the caller, which records it as a per-cert error.
    """
    doc_field = cert.get("document")
    # The document may arrive as a JSON string or as an already-parsed object.
    document = cast(
        dict[str, Any],
        json.loads(doc_field) if isinstance(doc_field, str) else doc_field,
    )
    epc = EpcPropertyDataMapper.from_api_response(document)
    result = calc.calculate(epc)
    return {
        "cert": cert["certificate_number"],
        "actual": actual,
        "predicted": result.sap_score,
        "residual": result.sap_score - actual,
        "ecf": round(result.ecf, 3),
        "tfa": epc.total_floor_area_m2,
        "ext": epc.extensions_count,
        "dwelling": epc.dwelling_type,
    }


def _print_report(
    results: list[dict[str, Any]],
    errors: list[dict[str, Any]],
    not_found: int,
    elapsed: float,
) -> None:
    """Print run counts, residual statistics, the worst 15 rows, and errors."""
    df = pd.DataFrame(results)
    print(f"\nelapsed {elapsed:.1f}s; calculated={len(results)}, errored={len(errors)}, not_found={not_found}")
    if not df.empty:
        df["abs_resid"] = df["residual"].abs()
        print(f"\nMAE: {df['abs_resid'].mean():.2f}")
        print(f"RMSE: {((df['residual'] ** 2).mean()) ** 0.5:.2f}")
        print(f"bias: {df['residual'].mean():.2f}")
        for thr in (1, 3, 5, 10):
            pct = (df["abs_resid"] <= thr).mean() * 100
            print(f"within ±{thr}: {pct:.1f}%")
        print("\nresidual distribution:")
        print(df["residual"].describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95]))
        print("\nworst 15 by |residual|:")
        print(df.nlargest(15, "abs_resid")[
            ["cert", "actual", "predicted", "residual", "ecf", "tfa", "ext", "dwelling"]
        ].to_string(index=False))
    # Reported independently of the stats so a run where every cert failed
    # still shows why.
    if errors:
        print("\nerrors:")
        for e in errors[:10]:
            print(" ", e)


def main(argv: list[str] | None = None) -> None:
    """Run the parity probe and print the residual report.

    Args:
        argv: Optional positional arguments ``[N, seed]``; defaults to
            ``sys.argv[1:]``. ``N`` defaults to 100 and ``seed`` to 7.
    """
    args = argv if argv is not None else sys.argv[1:]
    n = int(args[0]) if args else 100
    seed = int(args[1]) if len(args) > 1 else 7

    targets = _sample_certs(n, seed)
    print(f"Sampling {len(targets)} certs (seed={seed}) ...")
    storage = LocalStorage(_BULK)
    calc = Sap10Calculator()
    results: list[dict[str, Any]] = []
    errors: list[dict[str, Any]] = []
    # Certs still to be located; passed to the reader as its filter and
    # shrunk as each cert is processed.
    remaining = set(targets)
    t0 = time.monotonic()
    for zip_key in _ZIP_KEYS:
        if not remaining:
            break
        if not storage.exists(zip_key):
            print(f"!! missing {zip_key}", file=sys.stderr)
            continue
        reader = BulkZipReader(storage, zip_key)
        for cert in reader.iter_certificates_filtered(remaining):
            cn = cert["certificate_number"]
            actual = targets[cn]
            try:
                # Document parsing happens inside the try so that a single
                # malformed payload is recorded as an error rather than
                # aborting the whole run.
                results.append(_evaluate_cert(calc, cert, actual))
            except Exception as e:  # noqa: BLE001 — exploratory probe
                errors.append({"cert": cn, "actual": actual, "error": f"{type(e).__name__}: {e}"})
            remaining.discard(cn)
    elapsed = time.monotonic() - t0
    _print_report(results, errors, len(remaining), elapsed)


if __name__ == "__main__":
    main()