Add corpus profiler for the ADR-0028 seeing-the-data table

Reusable per-schema profiler: glazed_area band mix, Validation Cohort size,
observed-vs-predicted band glazing/floor ratio, and the ND/str sentinels that
drive schema widening. Regenerates the ADR-0028 transfer-check table from any
harvested corpus.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jun-te Kim 2026-06-12 12:36:08 +00:00
parent 99981e07e7
commit 32eef951ee

View file

@ -0,0 +1,104 @@
"""Profile a harvested RdSAP corpus — the ADR-0028 "seeing the data" table.
For a pre-SAP10 RdSAP corpus this prints the evidence that the inherited
ADR-0027 coefficients transfer safely to the spec (ADR-0028 §Context):
* glazed_area band mix — the windowless-majority structure that forces
  synthesis (the corpus structurally cannot self-fit band-1);
* the Validation Cohort — certs that lodge a real per-window `sap_windows`
  array, used directly rather than synthesised over;
* observed glazing/floor ratio per band vs the inherited model's prediction
  (`0.148 x band_multiplier`) — the per-spec transfer check;
* sentinel / shape counts (multiple_glazing_type "ND", dwelling_type as a
  plain str) that drive the schema's required->optional widening.
Usage (cell-by-cell or standalone):
python scripts/eon/profile_corpus.py RdSAP-Schema-19.0
"""
from __future__ import annotations
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any, Optional
# Root directory holding one sub-directory of harvested samples per schema.
SAMPLES = Path("backend") / "epc_api" / "json_samples"

# Inherited ADR-0027 coefficients. mapper.py is their single home; the copies
# below are a read-only mirror used only for the "predicted" transfer-check
# column (predicted ratio = GLAZING_RATIO x BAND_MULTIPLIER[band]).
GLAZING_RATIO = 0.148
BAND_MULTIPLIER = {
    1: 1.00,
    2: 1.25,
    3: 0.81,
    4: 1.51,
    5: 0.62,
}
def _load(schema: str) -> list[dict[str, Any]]:
    """Load every record of a schema's harvested corpus.

    Reads ``SAMPLES / schema / "corpus.jsonl"`` as JSON Lines: one JSON
    document per line, with blank (whitespace-only) lines skipped.

    Args:
        schema: Name of the schema sub-directory under ``SAMPLES``.

    Returns:
        The decoded documents, in file order.

    Raises:
        OSError: If the corpus file cannot be opened (e.g. it is missing).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = SAMPLES / schema / "corpus.jsonl"
    # Decode explicitly as UTF-8 rather than the locale default, and iterate
    # the handle so records are split on newlines only. str.splitlines() also
    # breaks on characters such as U+2028 that may appear unescaped inside a
    # JSON string, which would cut a record in half.
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]
def _measurement_value(raw: Any) -> Optional[float]:
    """Return a lodged measurement as a float, or None when unusable.

    Window/floor areas lodge as ``{"value": x, ...}`` or a bare number.

    Args:
        raw: The lodged measurement: a dict carrying a ``"value"`` key, a
            bare ``int``/``float``, or anything else.

    Returns:
        ``float(value)`` for a dict whose ``"value"`` converts to a float,
        ``float(raw)`` for a bare number, and ``None`` otherwise (missing or
        ``None`` value, non-numeric value, or an unsupported ``raw`` type).
    """
    if isinstance(raw, dict):
        value = raw.get("value")
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            # A non-numeric "value" (e.g. a string sentinel or a nested
            # structure) is treated as "no measurement" instead of aborting
            # the whole profile run.
            return None
    if isinstance(raw, (int, float)):
        return float(raw)
    return None
def _band_sort_key(band: Any) -> tuple[bool, bool, Any]:
    """Sort key for band labels: numbers ascending, other values, None last."""
    is_number = isinstance(band, (int, float))
    return (band is None, not is_number, band if is_number else str(band))


def _floor_area(raw: Any) -> Optional[float]:
    """Return a positive floor area from a bare value or {"value": x}, else None."""
    if isinstance(raw, dict):
        raw = raw.get("value")
    try:
        area = float(raw)
    except (TypeError, ValueError):
        return None
    # Zero, negative and NaN areas cannot serve as a ratio denominator.
    return area if area > 0 else None


def profile(schema: str) -> None:
    """Print the corpus profile for *schema* to stdout.

    Loads the corpus with ``_load`` and prints four sections:

    * the ``glazed_area`` band mix (count and percentage per band);
    * the Validation Cohort: certs with a truthy ``sap_windows`` entry, and
      the band mix within that cohort;
    * per band, the mean observed glazing/floor ratio of the cohort next to
      the prediction ``GLAZING_RATIO * BAND_MULTIPLIER[band]`` (multiplier
      1.0 for a band not in the table);
    * counts of ``multiple_glazing_type`` int codes and ``"ND"`` values, and
      of ``dwelling_type`` values that are plain strings.

    Args:
        schema: Name of the schema sub-directory to profile.
    """
    certs = _load(schema)
    n = len(certs)
    print(f"\n=== {schema}: {n} certs ===\n")

    # glazed_area band mix
    bands = Counter(c.get("glazed_area") for c in certs)
    print("glazed_area band mix:")
    for band, count in sorted(
        bands.items(), key=lambda item: _band_sort_key(item[0])
    ):
        print(f" band {band}: {count:4d} ({100 * count / n:.1f}%)")

    # Validation Cohort — certs with a lodged per-window array
    cohort = [c for c in certs if c.get("sap_windows")]
    cohort_bands = Counter(c.get("glazed_area") for c in cohort)
    # Same ordering as the band mix above, so a cohort cert without a
    # glazed_area (None key) cannot break the sort.
    ordered_cohort_bands = dict(
        sorted(cohort_bands.items(), key=lambda item: _band_sort_key(item[0]))
    )
    print(f"\nValidation Cohort (lodged sap_windows): {len(cohort)}/{n}")
    print(f" cohort bands: {ordered_cohort_bands}")

    # observed glazing/floor ratio per band (cohort only) vs inherited prediction
    by_band: dict[Any, list[float]] = defaultdict(list)
    for c in cohort:
        # Floor area may be a bare number or a {"value": x} mapping.
        tfa = _floor_area(c.get("total_floor_area"))
        areas = [
            _measurement_value(w.get("window_area")) for w in c["sap_windows"]
        ]
        areas = [a for a in areas if a is not None]
        if tfa is not None and areas:
            by_band[c.get("glazed_area")].append(sum(areas) / tfa)
    print("\nobserved glazing/floor ratio vs inherited 0.148 x multiplier:")
    print(" band observed (n) predicted")
    for band in sorted(by_band, key=_band_sort_key):
        obs = by_band[band]
        mean = sum(obs) / len(obs)
        pred = GLAZING_RATIO * BAND_MULTIPLIER.get(band, 1.0)
        # str() because None does not accept a "<4" format spec.
        print(f" {str(band):<4} {mean:.3f} (n={len(obs):>2}) {pred:.3f}")

    # sentinels / shapes driving the schema widening
    mgt_int = Counter(
        c["multiple_glazing_type"]
        for c in certs
        if isinstance(c.get("multiple_glazing_type"), int)
    )
    mgt_nd = sum(1 for c in certs if c.get("multiple_glazing_type") == "ND")
    dt_str = sum(1 for c in certs if isinstance(c.get("dwelling_type"), str))
    print("\nsentinels / shapes:")
    print(f" multiple_glazing_type int codes: {dict(sorted(mgt_int.items()))}")
    print(f" multiple_glazing_type 'ND': {mgt_nd}/{n}")
    print(f" dwelling_type as plain str: {dt_str}/{n}")
if __name__ == "__main__":
    # Profile the schema named by the first command-line argument; with no
    # argument, fall back to the default corpus.
    schema = "RdSAP-Schema-19.0"
    if len(sys.argv) > 1:
        schema = sys.argv[1]
    profile(schema)