feat(modelling): turnkey offline cohort script (tables + CSV)

CertResult now carries its Plan (with flat baseline/post-SAP/measures properties), and `format_cohort_csv` renders one browsable row per cert (SAP transition, band, measures, cost, bill saving, valuation %, error). `scripts/run_modelling_cohort.py` is turnkey: no args runs the committed golden cohort, prints a sense-check table for the first measure-bearing certs (a capped preview so a large dump doesn't flood the terminal), the summary, and writes modelling_cohort.csv (gitignored). Point it at the EPC dump when it lands. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-27 23:35:01 +00:00 · 2026-06-04 09:30:53 +00:00 · 2026-06-04 09:30:53 +00:00 · 8b5ab1c59e
commit 8b5ab1c59e
parent d8ef40c745
4 changed files with 127 additions and 47 deletions
--- a/.gitignore
+++ b/.gitignore
@ -298,4 +298,5 @@ pyrightconfig.json
 backlog/*

 # Local Claude config files
-.claude/*
+.claude/*
+modelling_cohort.csv
--- a/harness/cohort.py
+++ b/harness/cohort.py
@ -4,7 +4,7 @@ Parses each file with `EpcPropertyDataMapper.from_api_response` (the EPC-API
 shape) and runs it through `run_modelling` — no database, no network, no
 Baseline gate. A cert that raises (e.g. an unpriced fuel, an unmapped code) is
 captured as an error rather than aborting the sweep, so one bad cert never
-stops the inspection. Point it at your EPC dump and read the summary.
+stops the inspection. Point it at your EPC dump and read the summary / CSV.
 """

 from __future__ import annotations
@ -15,19 +15,30 @@ from pathlib import Path
 from typing import Iterable, Optional

 from datatypes.epc.domain.mapper import EpcPropertyDataMapper
+from domain.modelling.plan import Plan
 from harness.console import DEFAULT_CATALOGUE, run_modelling


@dataclass(frozen=True)
 class CertResult:
-    """The outcome of modelling one cert: its measure count and SAP transition,
-    or the error it raised (then `measures` is 0 and the SAPs are None)."""
+    """The outcome of modelling one cert: its `Plan` (for full inspection), or
+    the error it raised. The flat properties summarise the Plan for tables/CSV."""

    name: str
-    measures: int
-    baseline_sap: Optional[float]
-    post_sap: Optional[float]
-    error: Optional[str]
+    plan: Optional[Plan] = None
+    error: Optional[str] = None
+
+    @property
+    def measures(self) -> int:
+        return 0 if self.plan is None else len(self.plan.measures)
+
+    @property
+    def baseline_sap(self) -> Optional[float]:
+        return None if self.plan is None else self.plan.baseline.sap_continuous
+
+    @property
+    def post_sap(self) -> Optional[float]:
+        return None if self.plan is None else self.plan.post_sap_continuous


 def run_cohort(
@ -48,24 +59,10 @@ def run_cohort(
                catalogue_path=catalogue_path,
                print_table=False,
            )
-            results.append(
-                CertResult(
-                    name=path.stem,
-                    measures=len(plan.measures),
-                    baseline_sap=plan.baseline.sap_continuous,
-                    post_sap=plan.post_sap_continuous,
-                    error=None,
-                )
-            )
+            results.append(CertResult(name=path.stem, plan=plan))
        except Exception as error:  # noqa: BLE001 — one bad cert must not stop the sweep
            results.append(
-                CertResult(
-                    name=path.stem,
-                    measures=0,
-                    baseline_sap=None,
-                    post_sap=None,
-                    error=f"{type(error).__name__}: {error}",
-                )
+                CertResult(name=path.stem, error=f"{type(error).__name__}: {error}")
            )
    return results

@ -100,3 +97,46 @@ def format_cohort_summary(results: list[CertResult]) -> str:
            for kind, count in sorted(error_kinds.items(), key=lambda item: -item[1])
        )
    return "\n".join(lines)
+
+
+_CSV_HEADER = (
+    "cert,baseline_sap,post_sap,post_band,measures,measure_types,"
+    "cost_of_works,bill_savings,valuation_avg_pct,error"
+)
+
+
+def _csv_cell(value: object) -> str:
+    """Render a CSV cell: None -> empty, floats to 2 d.p., commas replaced by
+    ';' so the row stays comma-safe (newlines/quotes are not escaped)."""
+    if value is None:
+        return ""
+    if isinstance(value, float):
+        return f"{value:.2f}"
+    return str(value).replace(",", ";")
+
+
+def format_cohort_csv(results: list[CertResult]) -> str:
+    """One header row plus one row per cert — browsable/sortable in a
+    spreadsheet for a large dump."""
+    rows = [_CSV_HEADER]
+    for result in results:
+        plan = result.plan
+        measure_types = (
+            ";".join(measure.measure_type for measure in plan.measures)
+            if plan is not None
+            else ""
+        )
+        cells = [
+            result.name,
+            result.baseline_sap,
+            result.post_sap,
+            plan.post_epc_rating.value if plan is not None else None,
+            result.measures,
+            measure_types,
+            plan.cost_of_works if plan is not None else None,
+            plan.energy_bill_savings if plan is not None else None,
+            plan.valuation.average_pct if plan is not None else None,
+            result.error,
+        ]
+        rows.append(",".join(_csv_cell(cell) for cell in cells))
+    return "\n".join(rows)
--- a/scripts/run_modelling_cohort.py
+++ b/scripts/run_modelling_cohort.py
@ -1,14 +1,18 @@
-"""Run an EPC-JSON dump through Modelling offline and print a summary.
+"""Run an EPC-JSON dump through Modelling offline — print tables + write a CSV.

 The files must be API-shaped EPC JSON (identical to the EPC API response — what
 `from_api_response` parses). No database, no network. Run from the worktree root
-so imports resolve to this checkout, not /workspaces/model:
+so imports resolve to this checkout, not /workspaces/model.

-    python -m scripts.run_modelling_cohort <dir-of-api-json> [goal_band]
+    # no args -> the committed golden cohort (57 real API certs)
+    python -m scripts.run_modelling_cohort

-e.g. against the committed golden cohort:
+    # your dump, optional goal band (default C)
+    python -m scripts.run_modelling_cohort path/to/dump C

-    python -m scripts.run_modelling_cohort tests/domain/sap10_calculator/rdsap/fixtures/golden
+Prints a sense-check table for the first measure-bearing certs (a preview, so a
+huge dump doesn't flood the terminal), the cohort summary, and writes the full
+per-cert results to modelling_cohort.csv for browsing in a spreadsheet.
 """

 from __future__ import annotations
@ -19,33 +23,48 @@ from pathlib import Path
 _REPO_ROOT = Path(__file__).resolve().parents[1]
 sys.path.insert(0, str(_REPO_ROOT))  # worktree root first — avoid the import trap

-from harness.cohort import format_cohort_summary, run_cohort  # noqa: E402
+from harness.cohort import (  # noqa: E402
+    format_cohort_csv,
+    format_cohort_summary,
+    run_cohort,
+)
+from harness.plan_table import format_plan_table  # noqa: E402
+
+_DEFAULT_DIR = _REPO_ROOT / "tests/domain/sap10_calculator/rdsap/fixtures/golden"
+_PREVIEW_TABLES = 10
+_CSV_PATH = Path("modelling_cohort.csv")


 def main() -> None:
-    if len(sys.argv) < 2:
-        print(
-            "usage: python -m scripts.run_modelling_cohort "
-            "<dir-of-api-json> [goal_band]"
-        )
-        raise SystemExit(2)
-
-    directory = Path(sys.argv[1])
-    goal_band = sys.argv[2] if len(sys.argv) > 2 else "C"
+    args = sys.argv[1:]
+    directory = Path(args[0]) if args else _DEFAULT_DIR
+    goal_band = args[1] if len(args) > 1 else "C"
    paths = sorted(directory.glob("*.json"))
    if not paths:
        print(f"no *.json files under {directory}")
        raise SystemExit(1)

+    print(
+        f"modelling {len(paths)} EPC JSON(s) from {directory} "
+        f"(goal band {goal_band}), offline — no database...\n"
+    )
    results = run_cohort(paths, goal_band=goal_band)
-    print(format_cohort_summary(results))
-    print("\ncerts with measures:")
+
+    shown = 0
    for result in results:
-        if result.measures and result.baseline_sap is not None and result.post_sap is not None:
-            print(
-                f"  {result.name}  SAP {result.baseline_sap:.1f} -> "
-                f"{result.post_sap:.1f}  ({result.measures} measures)"
-            )
+        if result.plan is not None and result.measures and shown < _PREVIEW_TABLES:
+            print(f"=== {result.name} ===")
+            print(format_plan_table(result.plan))
+            print()
+            shown += 1
+    measure_bearing = sum(1 for result in results if result.measures)
+    if measure_bearing > shown:
+        print(f"... and {measure_bearing - shown} more measure-bearing certs (see CSV)\n")
+
+    print(format_cohort_summary(results))
+
+    _CSV_PATH.write_text(format_cohort_csv(results) + "\n", encoding="utf-8")
+    print(f"\nwrote per-cert CSV -> {_CSV_PATH.resolve()}")


 if __name__ == "__main__":
--- a/tests/harness/test_cohort.py
+++ b/tests/harness/test_cohort.py
@ -4,7 +4,12 @@ from __future__ import annotations

 from pathlib import Path

-from harness.cohort import CertResult, format_cohort_summary, run_cohort
+from harness.cohort import (
+    CertResult,
+    format_cohort_csv,
+    format_cohort_summary,
+    run_cohort,
+)

 _GOLDEN = (
    Path(__file__).resolve().parents[1]
@ -28,3 +33,18 @@ def test_run_cohort_models_each_api_json_offline() -> None:
    # The summary renders without raising and counts the cohort.
    summary: str = format_cohort_summary(results)
    assert "2" in summary
+
+
+def test_cohort_carries_each_plan_and_renders_a_csv() -> None:
+    # Arrange / Act
+    paths: list[Path] = sorted(_GOLDEN.glob("*.json"))[:3]
+    results: list[CertResult] = run_cohort(paths)
+
+    # Assert — each cert either modelled (carries its Plan) or errored.
+    for result in results:
+        assert (result.plan is not None) != (result.error is not None)
+    # CSV: a header row plus one row per cert, browsable in a spreadsheet.
+    csv: str = format_cohort_csv(results)
+    lines: list[str] = csv.splitlines()
+    assert lines[0].startswith("cert,")
+    assert len(lines) == len(results) + 1