Add corpus profiler for the ADR-0028 seeing-the-data table

Reusable per-schema profiler: glazed_area band mix, Validation Cohort size, observed-vs-predicted per-band glazing/floor ratio, and the ND/str sentinels that drive schema widening. Regenerates the ADR-0028 transfer-check table from any harvested corpus.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 13:10:47 +00:00 · 2026-06-12 12:36:08 +00:00 · 2026-06-12 12:36:08 +00:00 · 32eef951ee
commit 32eef951ee
parent 99981e07e7
1 changed files with 104 additions and 0 deletions
--- a/scripts/eon/profile_corpus.py
+++ b/scripts/eon/profile_corpus.py
@@ -0,0 +1,104 @@
+"""Profile a harvested RdSAP corpus — the ADR-0028 "seeing the data" table.
+
+For a pre-SAP10 RdSAP corpus this prints the evidence that the inherited
+ADR-0027 coefficients transfer safely to the spec (ADR-0028 §Context):
+
+  * glazed_area band mix — the windowless-majority structure that forces
+    synthesis (the corpus structurally cannot self-fit band-1);
+  * the Validation Cohort — certs that lodge a real per-window `sap_windows`
+    array, used directly rather than synthesised over;
+  * observed glazing/floor ratio per band vs the inherited model's prediction
+    (`0.148 x band_multiplier`) — the per-spec transfer check;
+  * sentinel / shape counts (multiple_glazing_type "ND", dwelling_type as a
+    plain str) that drive the schema's required->optional widening.
+
+Usage (cell-by-cell or standalone):
+
+    python scripts/eon/profile_corpus.py RdSAP-Schema-19.0
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Any, Optional
+
+# Root directory of the harvested corpora. _load() reads
+# SAMPLES / <schema> / "corpus.jsonl"; the path is relative, so it resolves
+# against the current working directory.
+SAMPLES: Path = Path("backend/epc_api/json_samples")
+
+# Inherited ADR-0027 coefficients (the single home is mapper.py; mirrored here
+# read-only for the transfer-check column).
+# profile() predicts a band's glazing/floor ratio as
+# GLAZING_RATIO * BAND_MULTIPLIER[band], using a multiplier of 1.0 for any
+# band that is not listed below.
+GLAZING_RATIO: float = 0.148
+BAND_MULTIPLIER: dict[int, float] = {1: 1.00, 2: 1.25, 3: 0.81, 4: 1.51, 5: 0.62}
+
+
+def _load(schema: str) -> list[dict[str, Any]]:
+    """Read the harvested corpus for *schema* as a list of parsed JSON records.
+
+    The corpus is read from ``SAMPLES / schema / "corpus.jsonl"``: one JSON
+    document per line. Blank and whitespace-only lines are skipped.
+
+    Raises:
+        OSError: if the corpus file cannot be opened (e.g. it does not exist).
+        json.JSONDecodeError: if a non-blank line is not valid JSON.
+    """
+    path = SAMPLES / schema / "corpus.jsonl"
+    # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of the
+    # platform's locale default, so the same corpus loads identically on every
+    # machine.
+    with path.open(encoding="utf-8") as fh:
+        # Iterate the file rather than str.splitlines(): splitlines() also
+        # breaks on U+2028/U+2029/U+0085, which may appear unescaped inside a
+        # JSON string and would cut a valid record in two. File iteration only
+        # splits on real newlines, and avoids holding the raw text in memory.
+        return [json.loads(line) for line in fh if line.strip()]
+
+
+def _measurement_value(raw: Any) -> Optional[float]:
+    """Return a lodged measurement as a float, or None when there is none.
+
+    Window/floor areas lodge as {"value": x, ...} or a bare number.
+
+    * dict: the ``"value"`` entry is converted with ``float()``. A missing or
+      ``None`` value, or one that cannot be converted (a sentinel string such
+      as "ND", an empty string, a nested container), yields None.
+    * bare ``int`` / ``float``: returned as a float.
+    * anything else (including bare strings and None): None.
+    """
+    if isinstance(raw, dict):
+        v = raw.get("value")
+        if v is None:
+            return None
+        try:
+            return float(v)
+        except (TypeError, ValueError):
+            # An unreadable value is "no measurement", not a reason to abort
+            # profiling the rest of the corpus.
+            return None
+    if isinstance(raw, (int, float)):
+        return float(raw)
+    return None
+
+
+def _band_sort_key(band: Any) -> tuple[int, Any, str]:
+    """Sort key for band codes: numbers ascending, then other values, None last."""
+    if band is None:
+        return (2, 0, "")
+    if isinstance(band, (int, float)):
+        return (0, band, "")
+    # Non-numeric codes are ordered by their text so mixed types never compare.
+    return (1, 0, str(band))
+
+
+def _floor_area(raw: Any) -> Optional[float]:
+    """Return a cert's total floor area as a float, or None if unreadable."""
+    area = _measurement_value(raw)
+    if area is None and isinstance(raw, str):
+        # Bare numeric strings were accepted by the previous float(tfa) call;
+        # keep accepting them, but treat non-numeric text as "no area".
+        try:
+            area = float(raw)
+        except ValueError:
+            area = None
+    return area
+
+
+def profile(schema: str) -> None:
+    """Print the ADR-0028 "seeing the data" profile for one harvested corpus.
+
+    Loads the corpus for *schema* with ``_load`` and prints, in order:
+
+    * the ``glazed_area`` band mix (count and percentage of all certs);
+    * the Validation Cohort: certs with a non-empty ``sap_windows`` entry, and
+      the band mix within that cohort;
+    * for each band in the cohort, the mean observed ratio of summed
+      ``window_area`` to ``total_floor_area`` next to the predicted
+      ``GLAZING_RATIO * BAND_MULTIPLIER[band]`` (multiplier 1.0 for bands not
+      in the table). Certs with no readable window area, or with a missing or
+      zero floor area, are left out of the mean;
+    * counts of ``multiple_glazing_type`` integer codes, of the "ND" sentinel,
+      and of certs whose ``dwelling_type`` is a plain ``str``.
+
+    Bands are listed numbers first, then any other value, then None (certs
+    with no ``glazed_area``), so a cert without a band never breaks sorting.
+
+    Args:
+        schema: corpus directory name under ``SAMPLES``.
+
+    Raises:
+        Whatever ``_load`` raises for a missing or malformed corpus file.
+    """
+    certs = _load(schema)
+    n = len(certs)
+    print(f"\n=== {schema} — {n} certs ===\n")
+
+    # glazed_area band mix
+    bands = Counter(c.get("glazed_area") for c in certs)
+    print("glazed_area band mix:")
+    for band, count in sorted(bands.items(), key=lambda kv: _band_sort_key(kv[0])):
+        print(f"  band {band}: {count:4d}  ({100 * count / n:.1f}%)")
+
+    # Validation Cohort — certs with a lodged per-window array
+    cohort = [c for c in certs if c.get("sap_windows")]
+    cohort_bands = Counter(c.get("glazed_area") for c in cohort)
+    print(f"\nValidation Cohort (lodged sap_windows): {len(cohort)}/{n}")
+    # Same None-tolerant ordering as the band mix above; a plain sorted()
+    # raises TypeError once a cohort cert has no glazed_area.
+    ordered_cohort_bands = dict(
+        sorted(cohort_bands.items(), key=lambda kv: _band_sort_key(kv[0]))
+    )
+    print(f"  cohort bands: {ordered_cohort_bands}")
+
+    # observed glazing/floor ratio per band (cohort only) vs inherited prediction
+    by_band: dict[Any, list[float]] = defaultdict(list)
+    for c in cohort:
+        # Floor areas may lodge as {"value": x, ...} as well as a bare number.
+        floor_area = _floor_area(c.get("total_floor_area"))
+        areas = [
+            _measurement_value(w.get("window_area"))
+            for w in c["sap_windows"]
+            if isinstance(w, dict)  # skip malformed window entries
+        ]
+        areas = [a for a in areas if a is not None]
+        if floor_area and areas:
+            by_band[c.get("glazed_area")].append(sum(areas) / floor_area)
+    print("\nobserved glazing/floor ratio vs inherited 0.148 x multiplier:")
+    print("  band      observed (n)     predicted")
+    for band in sorted(by_band, key=_band_sort_key):
+        obs = by_band[band]
+        mean = sum(obs) / len(obs)
+        pred = GLAZING_RATIO * BAND_MULTIPLIER.get(band, 1.0)
+        # str(band): None does not accept a "<4" format spec; for numeric
+        # bands the rendered text is unchanged.
+        print(f"  {str(band):<4}   {mean:.3f} (n={len(obs):>2})       {pred:.3f}")
+
+    # sentinels / shapes driving the schema widening
+    mgt_int = Counter(
+        c["multiple_glazing_type"]
+        for c in certs
+        if isinstance(c.get("multiple_glazing_type"), int)
+    )
+    mgt_nd = sum(1 for c in certs if c.get("multiple_glazing_type") == "ND")
+    dt_str = sum(1 for c in certs if isinstance(c.get("dwelling_type"), str))
+    print("\nsentinels / shapes:")
+    print(f"  multiple_glazing_type int codes: {dict(sorted(mgt_int.items()))}")
+    print(f"  multiple_glazing_type 'ND': {mgt_nd}/{n}")
+    print(f"  dwelling_type as plain str: {dt_str}/{n}")
+
+
+if __name__ == "__main__":
+    # Profile the schema named by the first command-line argument; with no
+    # argument, fall back to the RdSAP 19.0 corpus.
+    try:
+        schema = sys.argv[1]
+    except IndexError:
+        schema = "RdSAP-Schema-19.0"
+    profile(schema)