Model/harness/cohort.py
Khalim Conn-Kowlessar 8b5ab1c59e feat(modelling): turnkey offline cohort script (tables + CSV)
CertResult now carries its Plan (with flat baseline/post-SAP/measures
properties), and `format_cohort_csv` renders one browsable row per cert
(SAP transition, band, measures, cost, bill saving, valuation %, error).
`scripts/run_modelling_cohort.py` is turnkey: no args runs the committed
golden cohort, prints a sense-check table for the first measure-bearing
certs (a capped preview so a large dump doesn't flood the terminal), the
summary, and writes modelling_cohort.csv (gitignored). Point it at the
EPC dump when it lands.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 09:30:53 +00:00

142 lines
4.9 KiB
Python

"""Run a cohort of API-shaped EPC JSONs through Modelling, offline.
Parses each file with `EpcPropertyDataMapper.from_api_response` (the EPC-API
shape) and runs it through `run_modelling` — no database, no network, no
Baseline gate. A cert that raises (e.g. an unpriced fuel, an unmapped code) is
captured as an error rather than aborting the sweep, so one bad cert never
stops the inspection. Point it at your EPC dump and read the summary / CSV.
"""
from __future__ import annotations
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.modelling.plan import Plan
from harness.console import DEFAULT_CATALOGUE, run_modelling
@dataclass(frozen=True)
class CertResult:
    """Result of modelling a single cert.

    Holds either the `Plan` that was produced (kept whole so it can be
    inspected in full) or the error text captured in its place. The read-only
    properties flatten the plan into plain values for tables/CSV and fall back
    to 0 / None when there is no plan.
    """

    name: str
    plan: Optional[Plan] = None
    error: Optional[str] = None

    @property
    def measures(self) -> int:
        """Number of measures in the plan; 0 when there is no plan."""
        if self.plan is None:
            return 0
        return len(self.plan.measures)

    @property
    def baseline_sap(self) -> Optional[float]:
        """The plan's baseline continuous SAP; None when there is no plan."""
        if self.plan is None:
            return None
        return self.plan.baseline.sap_continuous

    @property
    def post_sap(self) -> Optional[float]:
        """The plan's post-works continuous SAP; None when there is no plan."""
        if self.plan is None:
            return None
        return self.plan.post_sap_continuous
def run_cohort(
    json_paths: Iterable[Path],
    *,
    goal_band: str = "C",
    catalogue_path: Path = DEFAULT_CATALOGUE,
) -> list[CertResult]:
    """Model every API-JSON path in `json_paths`, one `CertResult` per path.

    Each file is parsed with `EpcPropertyDataMapper.from_api_response` and fed
    to `run_modelling` with `goal_band` and `catalogue_path` (table printing
    off). Any exception raised while reading, parsing or modelling a cert is
    recorded on that cert's result as ``"<ExceptionType>: <message>"`` instead
    of propagating, so one bad cert does not stop the sweep.

    Results are returned in the same order as `json_paths`, named after each
    path's stem.
    """
    results: list[CertResult] = []
    for path in json_paths:
        try:
            # Give json the raw bytes: it detects UTF-8 (with or without BOM),
            # UTF-16 and UTF-32 itself, so the parse does not depend on the
            # platform's default text encoding the way read_text() does.
            payload = json.loads(path.read_bytes())
            epc = EpcPropertyDataMapper.from_api_response(payload)
            plan = run_modelling(
                epc,
                goal_band=goal_band,
                catalogue_path=catalogue_path,
                print_table=False,
            )
            results.append(CertResult(name=path.stem, plan=plan))
        except Exception as error:  # noqa: BLE001 — one bad cert must not stop the sweep
            results.append(
                CertResult(name=path.stem, error=f"{type(error).__name__}: {error}")
            )
    return results
def format_cohort_summary(results: list[CertResult]) -> str:
    """Summarise a cohort run as a short multi-line report.

    Reports the cohort size, how many certs ran without error, how many of
    those produced at least one measure, the error count, the distribution of
    measure counts among the certs that ran, and (when there are errors) each
    distinct error message with its frequency, most frequent first.
    """
    # One pass: tally measure counts for clean runs, messages for failures.
    measure_tally: dict[int, int] = {}
    error_tally: dict[str, int] = {}
    for entry in results:
        if entry.error is None:
            found = entry.measures
            measure_tally[found] = measure_tally.get(found, 0) + 1
        else:
            error_tally[entry.error] = error_tally.get(entry.error, 0) + 1

    ran_total = sum(measure_tally.values())
    error_total = sum(error_tally.values())
    productive = sum(
        certs for found, certs in measure_tally.items() if found > 0
    )

    report = [
        f"cohort size : {len(results)}",
        f"ran offline : {ran_total}",
        f"w/ measures : {productive}",
        f"errors : {error_total}",
        f"measure-count distribution: {dict(sorted(measure_tally.items()))}",
    ]
    if error_tally:
        report.append("error kinds:")
        # Stable sort: equally frequent errors keep first-seen order.
        ranked = sorted(
            error_tally.items(), key=lambda pair: pair[1], reverse=True
        )
        for message, occurrences in ranked:
            report.append(f" {occurrences:3d} {message}")
    return "\n".join(report)
# Header row of the cohort CSV: one column name per cell, ','-joined.
_CSV_HEADER = ",".join(
    (
        "cert",
        "baseline_sap",
        "post_sap",
        "post_band",
        "measures",
        "measure_types",
        "cost_of_works",
        "bill_savings",
        "valuation_avg_pct",
        "error",
    )
)
def _csv_cell(value: object) -> str:
    """Render one value as a CSV cell that cannot break its row.

    ``None`` becomes an empty cell and floats are fixed to two decimals. Any
    other value is stringified and then made row-safe, because this function
    applies no CSV quoting: line breaks (e.g. from a multi-line exception
    message) are collapsed to single spaces, and commas are replaced with ';'.
    """
    if value is None:
        return ""
    if isinstance(value, float):
        return f"{value:.2f}"
    # A raw line break would split the record across physical lines.
    single_line = " ".join(str(value).splitlines())
    return single_line.replace(",", ";")
def format_cohort_csv(results: list[CertResult]) -> str:
    """Render the cohort as CSV text: the header row, then one row per cert.

    Plan-derived columns are left empty for a cert that has no plan; the
    measure types of a plan are ';'-joined into a single cell. Rows are joined
    with newlines and there is no trailing newline.
    """
    lines = [_CSV_HEADER]
    for entry in results:
        modelled = entry.plan
        if modelled is None:
            # No plan: every plan-derived column renders as an empty cell.
            band, kinds, cost, saving, uplift = None, "", None, None, None
        else:
            band = modelled.post_epc_rating.value
            kinds = ";".join(item.measure_type for item in modelled.measures)
            cost = modelled.cost_of_works
            saving = modelled.energy_bill_savings
            uplift = modelled.valuation.average_pct
        # Field order must match the header's column order.
        fields = (
            entry.name,
            entry.baseline_sap,
            entry.post_sap,
            band,
            entry.measures,
            kinds,
            cost,
            saving,
            uplift,
            entry.error,
        )
        lines.append(",".join(map(_csv_cell, fields)))
    return "\n".join(lines)