feat(modelling): turnkey offline cohort script (tables + CSV)

CertResult now carries its Plan (with flat baseline/post-SAP/measures
properties), and `format_cohort_csv` renders one browsable row per cert
(SAP transition, band, measures, cost, bill saving, valuation %, error).
`scripts/run_modelling_cohort.py` is turnkey: no args runs the committed
golden cohort, prints a sense-check table for the first measure-bearing
certs (a capped preview so a large dump doesn't flood the terminal), the
summary, and writes modelling_cohort.csv (gitignored). Point it at the
EPC dump when it lands.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-04 09:30:53 +00:00
parent d8ef40c745
commit 8b5ab1c59e
4 changed files with 126 additions and 47 deletions

2
.gitignore vendored
View file

@ -298,4 +298,4 @@ pyrightconfig.json
backlog/*
# Local Claude config files
.claude/*
.claude/*
modelling_cohort.csv

View file

@ -4,7 +4,7 @@ Parses each file with `EpcPropertyDataMapper.from_api_response` (the EPC-API
shape) and runs it through `run_modelling` — no database, no network, no
Baseline gate. A cert that raises (e.g. an unpriced fuel, an unmapped code) is
captured as an error rather than aborting the sweep, so one bad cert never
stops the inspection. Point it at your EPC dump and read the summary.
stops the inspection. Point it at your EPC dump and read the summary / CSV.
"""
from __future__ import annotations
@ -15,19 +15,30 @@ from pathlib import Path
from typing import Iterable, Optional
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.modelling.plan import Plan
from harness.console import DEFAULT_CATALOGUE, run_modelling
@dataclass(frozen=True)
class CertResult:
"""The outcome of modelling one cert: its measure count and SAP transition,
or the error it raised (then `measures` is 0 and the SAPs are None)."""
"""The outcome of modelling one cert: its `Plan` (for full inspection), or
the error it raised. The flat properties summarise the Plan for tables/CSV."""
name: str
measures: int
baseline_sap: Optional[float]
post_sap: Optional[float]
error: Optional[str]
plan: Optional[Plan] = None
error: Optional[str] = None
@property
def measures(self) -> int:
return 0 if self.plan is None else len(self.plan.measures)
@property
def baseline_sap(self) -> Optional[float]:
return None if self.plan is None else self.plan.baseline.sap_continuous
@property
def post_sap(self) -> Optional[float]:
return None if self.plan is None else self.plan.post_sap_continuous
def run_cohort(
@ -48,24 +59,10 @@ def run_cohort(
catalogue_path=catalogue_path,
print_table=False,
)
results.append(
CertResult(
name=path.stem,
measures=len(plan.measures),
baseline_sap=plan.baseline.sap_continuous,
post_sap=plan.post_sap_continuous,
error=None,
)
)
results.append(CertResult(name=path.stem, plan=plan))
except Exception as error: # noqa: BLE001 — one bad cert must not stop the sweep
results.append(
CertResult(
name=path.stem,
measures=0,
baseline_sap=None,
post_sap=None,
error=f"{type(error).__name__}: {error}",
)
CertResult(name=path.stem, error=f"{type(error).__name__}: {error}")
)
return results
@ -100,3 +97,46 @@ def format_cohort_summary(results: list[CertResult]) -> str:
for kind, count in sorted(error_kinds.items(), key=lambda item: -item[1])
)
return "\n".join(lines)
_CSV_HEADER = (
"cert,baseline_sap,post_sap,post_band,measures,measure_types,"
"cost_of_works,bill_savings,valuation_avg_pct,error"
)
def _csv_cell(value: object) -> str:
"""Render a CSV cell, rounding floats and keeping the row comma-safe
(measure types are ';'-joined; an error message's commas are stripped)."""
if value is None:
return ""
if isinstance(value, float):
return f"{value:.2f}"
return str(value).replace(",", ";")
def format_cohort_csv(results: list[CertResult]) -> str:
    """Render the cohort as CSV text: the header line, then one line per cert.

    Column order follows `_CSV_HEADER`. A cert without a Plan (it errored)
    leaves every plan-derived column empty and carries only its name, a zero
    measure count and the error text. The result has no trailing newline.
    """
    lines = [_CSV_HEADER]
    for cert in results:
        modelled = cert.plan
        if modelled is None:
            # Nothing was modelled: the plan-derived cells stay blank.
            types = ""
            band = cost = saving = valuation_pct = None
        else:
            types = ";".join(item.measure_type for item in modelled.measures)
            band = modelled.post_epc_rating.value
            cost = modelled.cost_of_works
            saving = modelled.energy_bill_savings
            valuation_pct = modelled.valuation.average_pct
        row = (
            cert.name,
            cert.baseline_sap,
            cert.post_sap,
            band,
            cert.measures,
            types,
            cost,
            saving,
            valuation_pct,
            cert.error,
        )
        lines.append(",".join(map(_csv_cell, row)))
    return "\n".join(lines)

View file

@ -1,14 +1,18 @@
"""Run an EPC-JSON dump through Modelling offline and print a summary.
"""Run an EPC-JSON dump through Modelling offline — print tables + write a CSV.
The files must be API-shaped EPC JSON (identical to the EPC API response — what
`from_api_response` parses). No database, no network. Run from the worktree root
so imports resolve to this checkout, not /workspaces/model:
so imports resolve to this checkout, not /workspaces/model.
python -m scripts.run_modelling_cohort <dir-of-api-json> [goal_band]
# no args -> the committed golden cohort (57 real API certs)
python -m scripts.run_modelling_cohort
e.g. against the committed golden cohort:
# your dump, optional goal band (default C)
python -m scripts.run_modelling_cohort path/to/dump C
python -m scripts.run_modelling_cohort tests/domain/sap10_calculator/rdsap/fixtures/golden
Prints a sense-check table for the first measure-bearing certs (a preview, so a
huge dump doesn't flood the terminal), the cohort summary, and writes the full
per-cert results to modelling_cohort.csv for browsing in a spreadsheet.
"""
from __future__ import annotations
@ -19,33 +23,48 @@ from pathlib import Path
_REPO_ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(_REPO_ROOT)) # worktree root first — avoid the import trap
from harness.cohort import format_cohort_summary, run_cohort # noqa: E402
from harness.cohort import ( # noqa: E402
format_cohort_csv,
format_cohort_summary,
run_cohort,
)
from harness.plan_table import format_plan_table # noqa: E402
_DEFAULT_DIR = _REPO_ROOT / "tests/domain/sap10_calculator/rdsap/fixtures/golden"
_PREVIEW_TABLES = 10
_CSV_PATH = Path("modelling_cohort.csv")
def main() -> None:
if len(sys.argv) < 2:
print(
"usage: python -m scripts.run_modelling_cohort "
"<dir-of-api-json> [goal_band]"
)
raise SystemExit(2)
directory = Path(sys.argv[1])
goal_band = sys.argv[2] if len(sys.argv) > 2 else "C"
args = sys.argv[1:]
directory = Path(args[0]) if args else _DEFAULT_DIR
goal_band = args[1] if len(args) > 1 else "C"
paths = sorted(directory.glob("*.json"))
if not paths:
print(f"no *.json files under {directory}")
raise SystemExit(1)
print(
f"modelling {len(paths)} EPC JSON(s) from {directory} "
f"(goal band {goal_band}), offline — no database...\n"
)
results = run_cohort(paths, goal_band=goal_band)
print(format_cohort_summary(results))
print("\ncerts with measures:")
shown = 0
for result in results:
if result.measures and result.baseline_sap is not None and result.post_sap is not None:
print(
f" {result.name} SAP {result.baseline_sap:.1f} -> "
f"{result.post_sap:.1f} ({result.measures} measures)"
)
if result.plan is not None and result.measures and shown < _PREVIEW_TABLES:
print(f"=== {result.name} ===")
print(format_plan_table(result.plan))
print()
shown += 1
measure_bearing = sum(1 for result in results if result.measures)
if measure_bearing > shown:
print(f"... and {measure_bearing - shown} more measure-bearing certs (see CSV)\n")
print(format_cohort_summary(results))
_CSV_PATH.write_text(format_cohort_csv(results) + "\n", encoding="utf-8")
print(f"\nwrote per-cert CSV -> {_CSV_PATH.resolve()}")
if __name__ == "__main__":

View file

@ -4,7 +4,12 @@ from __future__ import annotations
from pathlib import Path
from harness.cohort import CertResult, format_cohort_summary, run_cohort
from harness.cohort import (
CertResult,
format_cohort_csv,
format_cohort_summary,
run_cohort,
)
_GOLDEN = (
Path(__file__).resolve().parents[1]
@ -28,3 +33,18 @@ def test_run_cohort_models_each_api_json_offline() -> None:
# The summary renders without raising and counts the cohort.
summary: str = format_cohort_summary(results)
assert "2" in summary
def test_cohort_carries_each_plan_and_renders_a_csv() -> None:
    """Each result holds exactly one of Plan/error, and the CSV has one row per cert."""
    # Arrange / Act — the first three golden certs keep the test quick.
    sample: list[Path] = sorted(_GOLDEN.glob("*.json"))[:3]
    outcomes: list[CertResult] = run_cohort(sample)

    # Assert — a cert is either modelled (has a Plan) or errored, never both
    # and never neither.
    for outcome in outcomes:
        has_plan: bool = outcome.plan is not None
        has_error: bool = outcome.error is not None
        assert has_plan != has_error

    # CSV: the header line first, then exactly one line per cert.
    rendered: str = format_cohort_csv(outcomes)
    rows: list[str] = rendered.splitlines()
    assert rows[0].startswith("cert,")
    assert len(rows) == len(outcomes) + 1