Model/harness/cohort.py

"""Run a cohort of API-shaped EPC JSONs through Modelling, offline.

Parses each file with `EpcPropertyDataMapper.from_api_response` (the EPC-API
shape) and runs it through `run_modelling` — no database, no network, no
Baseline gate. A cert that raises (e.g. an unpriced fuel, an unmapped code) is
captured as an error rather than aborting the sweep, so one bad cert never
stops the inspection. Point it at your EPC dump and read the summary / CSV.
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional

from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.modelling.plan import Plan
from harness.console import DEFAULT_CATALOGUE, run_modelling


@dataclass(frozen=True)
class CertResult:
    """Immutable record of what happened when one cert was modelled.

    A result carries either the `Plan` that modelling produced (kept whole so
    it can be inspected in full) or the error text captured in its place. The
    read-only properties flatten the Plan into scalar values for tables/CSV,
    and fall back to a neutral value when there is no Plan.
    """

    # Label for the cert.
    name: str
    # The modelling output, or None when no Plan is attached.
    plan: Optional[Plan] = None
    # The captured error text, or None when no error was recorded.
    error: Optional[str] = None

    @property
    def measures(self) -> int:
        """Number of measures in the Plan; 0 when there is no Plan."""
        if self.plan is None:
            return 0
        return len(self.plan.measures)

    @property
    def baseline_sap(self) -> Optional[float]:
        """The Plan's baseline continuous SAP; None when there is no Plan."""
        if self.plan is None:
            return None
        return self.plan.baseline.sap_continuous

    @property
    def post_sap(self) -> Optional[float]:
        """The Plan's post-works continuous SAP; None when there is no Plan."""
        if self.plan is None:
            return None
        return self.plan.post_sap_continuous


def run_cohort(
    json_paths: Iterable[Path],
    *,
    goal_band: str = "C",
    catalogue_path: Path = DEFAULT_CATALOGUE,
) -> list[CertResult]:
    """Model every API-JSON path in `json_paths` offline.

    Each file is parsed with `EpcPropertyDataMapper.from_api_response` and run
    through `run_modelling` with table printing switched off.

    Args:
        json_paths: Paths of the API-shaped EPC JSON files to model.
        goal_band: Passed through to `run_modelling` as `goal_band`.
        catalogue_path: Passed through to `run_modelling` as `catalogue_path`.

    Returns:
        One `CertResult` per path, in iteration order, named after the file's
        stem. A cert that raised carries `"<ExceptionType>: <message>"` in
        `error` instead of a plan; errors are captured, never raised.
    """
    results: list[CertResult] = []
    for path in json_paths:
        try:
            # Hand json.loads the raw bytes so it detects the JSON encoding
            # (UTF-8/16/32) itself; read_text() would decode with the
            # platform's locale encoding, which is not UTF-8 everywhere.
            payload = json.loads(path.read_bytes())
            epc = EpcPropertyDataMapper.from_api_response(payload)
            plan = run_modelling(
                epc,
                goal_band=goal_band,
                catalogue_path=catalogue_path,
                print_table=False,
            )
        except Exception as error:  # noqa: BLE001 — one bad cert must not stop the sweep
            results.append(
                CertResult(name=path.stem, error=f"{type(error).__name__}: {error}")
            )
        else:
            results.append(CertResult(name=path.stem, plan=plan))
    return results


def format_cohort_summary(results: list[CertResult]) -> str:
    """Render a short plain-text digest of a cohort run.

    Reports the cohort size, how many certs ran without error, how many of
    those produced at least one measure, how many errored, the distribution
    of measure counts among the certs that ran, and — when there are errors —
    each distinct error message with its count, most frequent first.
    """
    succeeded: list[CertResult] = []
    failure_counts: dict[str, int] = {}
    failed_total = 0
    for outcome in results:
        message = outcome.error
        if message is None:
            succeeded.append(outcome)
            continue
        failed_total += 1
        failure_counts[message] = failure_counts.get(message, 0) + 1

    # Tally measure counts over the certs that ran.
    histogram: dict[int, int] = {}
    productive = 0
    for outcome in succeeded:
        measure_count = outcome.measures
        histogram[measure_count] = histogram.get(measure_count, 0) + 1
        if measure_count > 0:
            productive += 1
    ordered_histogram = {key: histogram[key] for key in sorted(histogram)}

    summary = [
        f"cohort size : {len(results)}",
        f"ran offline : {len(succeeded)}",
        f"w/ measures : {productive}",
        f"errors      : {failed_total}",
        f"measure-count distribution: {ordered_histogram}",
    ]
    if failure_counts:
        summary.append("error kinds:")
        # The sort is stable, so equally frequent errors keep first-seen order.
        by_frequency = sorted(
            failure_counts.items(), key=lambda pair: pair[1], reverse=True
        )
        for kind, count in by_frequency:
            summary.append(f"  {count:3d}  {kind}")
    return "\n".join(summary)


# Column order of the cohort CSV; format_cohort_csv emits cells in this order.
_CSV_HEADER = (
    "cert,baseline_sap,post_sap,post_band,measures,measure_types,"
    "cost_of_works,bill_savings,valuation_avg_pct,error"
)


def _csv_cell(value: object) -> str:
    """Render one CSV cell so that it can never break its row.

    `None` becomes an empty cell and floats are rounded to two decimals. Any
    other value is stringified, its commas are replaced with ';' (so a naive
    split on ',' stays correct), and its line breaks are flattened to single
    spaces (so a multi-line exception message stays on one row).
    """
    if value is None:
        return ""
    if isinstance(value, float):
        return f"{value:.2f}"
    text = str(value).replace(",", ";")
    # splitlines() leaves text without line breaks untouched, so single-line
    # cells render exactly as before.
    return " ".join(text.splitlines())


def format_cohort_csv(results: list[CertResult]) -> str:
    """Render the cohort as CSV text: one header row plus one row per cert.

    Intended to be browsable/sortable in a spreadsheet for a large dump. A
    result without a Plan leaves its plan-derived columns empty; multiple
    measure types share one cell, joined with ';'.

    Args:
        results: The per-cert outcomes to tabulate, in output order.

    Returns:
        The CSV rows joined with newlines (no trailing newline).
    """
    rows = [_CSV_HEADER]
    for result in results:
        plan = result.plan
        if plan is None:
            post_band, measure_types = None, ""
            cost_of_works = bill_savings = valuation_avg_pct = None
        else:
            post_band = plan.post_epc_rating.value
            measure_types = ";".join(
                measure.measure_type for measure in plan.measures
            )
            cost_of_works = plan.cost_of_works
            bill_savings = plan.energy_bill_savings
            valuation_avg_pct = plan.valuation.average_pct
        cells = [
            result.name,
            result.baseline_sap,
            result.post_sap,
            post_band,
            result.measures,
            measure_types,
            cost_of_works,
            bill_savings,
            valuation_avg_pct,
            result.error,
        ]
        rows.append(",".join(_csv_cell(cell) for cell in cells))
    return "\n".join(rows)