mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
Add corpus profiler for the ADR-0028 seeing-the-data table
Reusable per-schema profiler: glazed_area band mix, Validation Cohort size, observed-vs-predicted band glazing/floor ratio, and the ND/str sentinels that drive schema widening. Regenerates the ADR-0028 transfer-check table from any harvested corpus. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
99981e07e7
commit
32eef951ee
1 changed files with 104 additions and 0 deletions
104
scripts/eon/profile_corpus.py
Normal file
104
scripts/eon/profile_corpus.py
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
"""Profile a harvested RdSAP corpus — the ADR-0028 "seeing the data" table.
|
||||
|
||||
For a pre-SAP10 RdSAP corpus this prints the evidence that the inherited
|
||||
ADR-0027 coefficients transfer safely to the spec (ADR-0028 §Context):
|
||||
|
||||
* glazed_area band mix — the windowless-majority structure that forces
|
||||
synthesis (the corpus structurally cannot self-fit band-1);
|
||||
* the Validation Cohort — certs that lodge a real per-window `sap_windows`
|
||||
array, used directly rather than synthesised over;
|
||||
* observed glazing/floor ratio per band vs the inherited model's prediction
|
||||
(`0.148 x band_multiplier`) — the per-spec transfer check;
|
||||
* sentinel / shape counts (multiple_glazing_type "ND", dwelling_type as a
|
||||
plain str) that drive the schema's required->optional widening.
|
||||
|
||||
Usage (cell-by-cell or standalone):
|
||||
|
||||
python scripts/eon/profile_corpus.py RdSAP-Schema-19.0
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from collections import Counter, defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
# Root directory of the harvested corpora; `_load` reads
# SAMPLES/<schema>/corpus.jsonl. The path is relative, so it resolves against
# the current working directory.
SAMPLES = Path("backend/epc_api/json_samples")

# Inherited ADR-0027 coefficients (the single home is mapper.py; mirrored here
# read-only for the transfer-check column).
# Baseline glazing/floor ratio; `profile` multiplies it by the band multiplier
# to get the "predicted" column.
GLAZING_RATIO = 0.148
# Multiplier keyed by the cert's `glazed_area` band code; `profile` falls back
# to 1.0 for a band that is not listed here.
BAND_MULTIPLIER = {1: 1.00, 2: 1.25, 3: 0.81, 4: 1.51, 5: 0.62}
|
||||
|
||||
|
||||
def _load(schema: str) -> list[dict[str, Any]]:
    """Read every cert from ``SAMPLES/<schema>/corpus.jsonl``.

    The corpus is JSON Lines: one JSON document per line. Blank and
    whitespace-only lines are skipped.

    Args:
        schema: Name of the schema sub-directory under ``SAMPLES``.

    Returns:
        The decoded JSON documents in file order. Callers treat each one as a
        dict; that shape is not checked here.

    Raises:
        OSError: If the corpus file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    path = SAMPLES / schema / "corpus.jsonl"
    # JSON is UTF-8, so do not depend on the platform's locale encoding.
    # Iterating the handle splits on newlines only; str.splitlines() would also
    # split on U+2028/U+2029/U+0085, which may appear unescaped inside a JSON
    # string and would cut a valid record in two.
    with path.open(encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]
|
||||
|
||||
|
||||
def _measurement_value(raw: Any) -> Optional[float]:
    """Coerce a lodged measurement to a float.

    Window/floor areas lodge as {"value": x, ...} or a bare number.

    Args:
        raw: The lodged measurement: a dict carrying a ``"value"`` entry, a
            bare ``int``/``float``, or anything else.

    Returns:
        The measurement as a float, or ``None`` when ``raw`` is neither shape,
        the dict has no (or a ``None``) ``"value"``, or the value cannot be
        converted to a float.
    """
    if isinstance(raw, dict):
        v = raw.get("value")
        if v is None:
            return None
        try:
            return float(v)
        except (TypeError, ValueError):
            # A non-numeric lodged value (e.g. a sentinel string) is treated
            # as "no measurement" rather than aborting the whole profile run.
            return None
    if isinstance(raw, (int, float)):
        return float(raw)
    return None
|
||||
|
||||
|
||||
def _band_sort_key(band: Any) -> tuple:
    """Sort key for band codes: numbers ascending, then other values, None last."""
    is_number = isinstance(band, (int, float))
    return (band is None, not is_number, band if is_number else 0, str(band))


def _floor_area(raw: Any) -> Optional[float]:
    """Coerce a lodged total_floor_area to a float, or None if unusable."""
    area = _measurement_value(raw)
    if area is None and isinstance(raw, str):
        # Numeric strings were accepted before (via float()); keep accepting them.
        try:
            area = float(raw)
        except ValueError:
            area = None
    return area


def profile(schema: str) -> None:
    """Print the corpus profile for one schema to stdout.

    Loads the corpus with ``_load(schema)`` and prints four sections:

    * the ``glazed_area`` band mix, with counts and percentages;
    * the Validation Cohort, i.e. certs with a non-empty ``sap_windows``, and
      its band mix;
    * for cohort certs, the mean observed glazing/floor ratio per band next to
      ``GLAZING_RATIO * BAND_MULTIPLIER[band]`` (multiplier 1.0 for a band not
      in the table);
    * counts of ``multiple_glazing_type`` int codes and ``"ND"`` values, and of
      ``dwelling_type`` lodged as a plain ``str``.

    Certs with no ``glazed_area`` are reported under band ``None``, which sorts
    last in every section.

    Args:
        schema: Name of the schema sub-directory to profile.

    Returns:
        None. All output goes to stdout.

    Raises:
        OSError, json.JSONDecodeError: Propagated from ``_load``.
    """
    certs = _load(schema)
    n = len(certs)
    print(f"\n=== {schema} — {n} certs ===\n")

    # glazed_area band mix
    bands = Counter(c.get("glazed_area") for c in certs)
    print("glazed_area band mix:")
    for band in sorted(bands, key=_band_sort_key):
        count = bands[band]
        print(f" band {band}: {count:4d} ({100 * count / n:.1f}%)")

    # Validation Cohort — certs with a lodged per-window array
    cohort = [c for c in certs if c.get("sap_windows")]
    cohort_bands = Counter(c.get("glazed_area") for c in cohort)
    print(f"\nValidation Cohort (lodged sap_windows): {len(cohort)}/{n}")
    # Same None-tolerant ordering as the band mix above; a plain sorted() on
    # the items raises TypeError once a cohort cert lacks glazed_area.
    ordered_cohort = {
        band: cohort_bands[band]
        for band in sorted(cohort_bands, key=_band_sort_key)
    }
    print(f" cohort bands: {ordered_cohort}")

    # observed glazing/floor ratio per band (cohort only) vs inherited prediction
    by_band: dict[Any, list[float]] = defaultdict(list)
    for c in cohort:
        # Floor area may lodge as {"value": x} as well as a bare number.
        tfa = _floor_area(c.get("total_floor_area"))
        areas = [
            _measurement_value(w.get("window_area")) for w in c["sap_windows"]
        ]
        areas = [a for a in areas if a is not None]
        # Skip certs with a missing/zero floor area or no usable window area.
        if tfa and areas:
            by_band[c.get("glazed_area")].append(sum(areas) / tfa)
    print("\nobserved glazing/floor ratio vs inherited 0.148 x multiplier:")
    print(" band observed (n) predicted")
    for band in sorted(by_band, key=_band_sort_key):
        obs = by_band[band]
        mean = sum(obs) / len(obs)
        pred = GLAZING_RATIO * BAND_MULTIPLIER.get(band, 1.0)
        # str() so a None band can take the width spec (None rejects "<4").
        print(f" {str(band):<4} {mean:.3f} (n={len(obs):>2}) {pred:.3f}")

    # sentinels / shapes driving the schema widening
    mgt_int = Counter(
        c["multiple_glazing_type"]
        for c in certs
        if isinstance(c.get("multiple_glazing_type"), int)
    )
    mgt_nd = sum(1 for c in certs if c.get("multiple_glazing_type") == "ND")
    dt_str = sum(1 for c in certs if isinstance(c.get("dwelling_type"), str))
    print("\nsentinels / shapes:")
    print(f" multiple_glazing_type int codes: {dict(sorted(mgt_int.items()))}")
    print(f" multiple_glazing_type 'ND': {mgt_nd}/{n}")
    print(f" dwelling_type as plain str: {dt_str}/{n}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The first CLI argument names the schema directory; with no argument,
    # profile the RdSAP 19.0 corpus.
    cli_args = sys.argv[1:]
    schema = cli_args[0] if cli_args else "RdSAP-Schema-19.0"
    profile(schema)
|
||||
Loading…
Add table
Reference in a new issue