feat(epc-prediction): leave-one-out validation harness (ADR-0029)

Pure compare_prediction (TDD): wall-construction classification hit + signed residuals on floor area, window count, total window area, building-parts count. Plus validate_epc_prediction.py (IO plumbing): drops each cert from its postcode cohort, predicts from the rest on guaranteed inputs only, aggregates the metrics, and reports SAP three ways (pred-calc vs lodged / vs calc-on-actual / vs the neighbour-mean baseline). Smoke run: wall 90.9%, floor-area mean|·| 42.6 m2 (a real signal — template-copied floor area is noisy), SAP pred-calc edges baseline. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 13:10:47 +00:00 · 2026-06-13 23:55:05 +00:00 · 2026-06-13 23:55:05 +00:00 · f3ad6343a3
commit f3ad6343a3
parent 5e6d2cff16
3 changed files with 326 additions and 0 deletions
--- a/domain/epc_prediction/prediction_comparison.py
+++ b/domain/epc_prediction/prediction_comparison.py
@ -0,0 +1,60 @@
+"""Per-Property prediction comparison for the EPC Prediction validation harness
+(ADR-0029).
+
+`compare_prediction` scores a predicted `EpcPropertyData` against the actual one
+on the accuracy signals the leave-one-out harness aggregates: a classification
+match on the main wall construction (the only categorical compared so far —
+roof / floor construction + insulation and construction age band are not yet
+scored) and signed residuals on the geometry (floor area, building-parts
+count, window count + total window area). Pure — the SAP residual is computed
+in the runner, which has the calculator and the lodged SAP.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from datatypes.epc.domain.epc_property_data import EpcPropertyData, SapBuildingPart
+
+
+@dataclass(frozen=True)
+class PredictionComparison:
+    """One Property's prediction accuracy: classification hits + geometry
+    residuals (predicted − actual).
+
+    Frozen, so a comparison cannot be modified once it has been scored.
+    """
+
+    # True when predicted and actual agree on the wall_construction of their
+    # first building part.
+    wall_construction_correct: bool
+    # predicted.total_floor_area_m2 − actual.total_floor_area_m2.
+    floor_area_residual: float
+    # Difference in the number of sap_building_parts.
+    building_parts_residual: int
+    # Difference in the number of sap_windows.
+    window_count_residual: int
+    # Difference in the summed window_width × window_height over sap_windows.
+    total_window_area_residual: float
+
+
+def _main(epc: EpcPropertyData) -> SapBuildingPart:
+    """Return the first entry of `epc.sap_building_parts`, which the comparison
+    treats as the main building part. An empty list raises IndexError."""
+    parts = epc.sap_building_parts
+    return parts[0]
+
+
+def _total_window_area(epc: EpcPropertyData) -> float:
+    """Return the summed area (window_width × window_height) of every entry in
+    `epc.sap_windows`.
+
+    The sum starts from 0.0 so a picture with no windows yields the float 0.0
+    promised by the return annotation, not the int 0 that a bare `sum` returns
+    for an empty iterable.
+    """
+    return sum((w.window_width * w.window_height for w in epc.sap_windows), 0.0)
+
+
+def compare_prediction(
+    predicted: EpcPropertyData, actual: EpcPropertyData
+) -> PredictionComparison:
+    """Score `predicted` against `actual` and return one `PredictionComparison`.
+
+    The wall check compares `wall_construction` on each side's main (first)
+    building part; every residual is signed, predicted − actual. Raises
+    IndexError if either picture has an empty `sap_building_parts`, because
+    the main part is looked up by index.
+    """
+    same_wall = _main(predicted).wall_construction == _main(actual).wall_construction
+    floor_area_delta = predicted.total_floor_area_m2 - actual.total_floor_area_m2
+    parts_delta = len(predicted.sap_building_parts) - len(actual.sap_building_parts)
+    window_count_delta = len(predicted.sap_windows) - len(actual.sap_windows)
+    window_area_delta = _total_window_area(predicted) - _total_window_area(actual)
+    return PredictionComparison(
+        wall_construction_correct=same_wall,
+        floor_area_residual=floor_area_delta,
+        building_parts_residual=parts_delta,
+        window_count_residual=window_count_delta,
+        total_window_area_residual=window_area_delta,
+    )
--- a/scripts/validate_epc_prediction.py
+++ b/scripts/validate_epc_prediction.py
@ -0,0 +1,165 @@
+"""Leave-one-out accuracy harness for EPC Prediction (ADR-0029).
+
+Runs entirely against the frozen postcode-clustered corpus
+(`fetch_epc_prediction_corpus.py`). For every cert that has neighbours, it
+drops that cert from its postcode cohort, predicts it from the rest using only
+its *guaranteed* inputs (property type + built form), and compares the predicted
+`EpcPropertyData` to the actual one.
+
+Reports the ADR-0029 metrics:
+  - classification rate: main wall construction (extend as coverage grows);
+  - geometry residuals: floor area, window count + total window area, building
+    parts (mean signed + mean absolute);
+  - SAP reported three ways — predicted-then-calculated vs (a) the actual lodged
+    SAP, (b) the calculator on the actual components, (c) the neighbour-mean SAP
+    baseline (the number predict-then-calculate must beat).
+
+USAGE
+-----
+    PYTHONPATH=. python scripts/validate_epc_prediction.py
+
+Corpus dir: $EPC_PREDICTION_CORPUS (default /tmp/epc_prediction_corpus).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import statistics
+from pathlib import Path
+from typing import Optional
+
+from datatypes.epc.domain.epc_property_data import EpcPropertyData
+from datatypes.epc.domain.mapper import EpcPropertyDataMapper
+from domain.epc_prediction.comparable_properties import (
+    Comparable,
+    PredictionTarget,
+    select_comparables,
+)
+from domain.epc_prediction.epc_prediction import EpcPrediction
+from domain.epc_prediction.prediction_comparison import compare_prediction
+from domain.sap10_calculator.calculator import Sap10Calculator
+
+# Root directory of the frozen corpus: $EPC_PREDICTION_CORPUS when set,
+# otherwise the /tmp default.
+CORPUS = Path(os.environ.get("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpus"))
+
+
+def _load_cohort(postcode: str, certs: list[str]) -> list[Comparable]:
+    """Map a postcode's cached cert payloads to Comparables.
+
+    Reads `CORPUS/<postcode>/<cert>.json` for each certificate number, in
+    order. A cert is skipped (never raised) when its file is missing or when
+    reading, decoding or mapping it fails (unsupported schema, malformed
+    payload).
+
+    Returns the Comparables that loaded, each tagged with its certificate
+    number; the list may be empty.
+    """
+    cohort: list[Comparable] = []
+    for cert in certs:
+        path = CORPUS / postcode / f"{cert}.json"
+        if not path.exists():
+            continue
+        try:
+            # Decode as UTF-8 explicitly: the default is the platform's locale
+            # encoding, which would make the sweep machine-dependent.
+            payload = json.loads(path.read_text(encoding="utf-8"))
+            epc = EpcPropertyDataMapper.from_api_response(payload)
+        except Exception:  # noqa: BLE001 — a bad cert must not abort the sweep
+            continue
+        cohort.append(Comparable(epc=epc, certificate_number=cert))
+    return cohort
+
+
+def _sap(calculator: Sap10Calculator, epc: EpcPropertyData) -> Optional[float]:
+    """Return the continuous SAP score the calculator gives `epc`, or None when
+    the calculation (or reading its score) raises."""
+    try:
+        result = calculator.calculate(epc)
+        score = result.sap_score_continuous
+    except Exception:  # noqa: BLE001 — some pictures don't score; count as misses
+        score = None
+    return score
+
+
+def main() -> None:
+    """Run the leave-one-out sweep over the corpus and print the report.
+
+    Loads `_index.json` from CORPUS (postcode → certificate numbers), holds out
+    each loadable cert in turn, predicts it from the rest of its postcode
+    cohort, and accumulates the wall-construction hit rate, the geometry
+    residuals and three sets of absolute SAP errors before printing them.
+
+    Raises SystemExit when the corpus index file is missing.
+    """
+    index_path = CORPUS / "_index.json"
+    if not index_path.exists():
+        raise SystemExit(f"no corpus at {CORPUS} — run fetch_epc_prediction_corpus.py")
+    index: dict[str, list[str]] = json.loads(index_path.read_text())
+
+    calculator = Sap10Calculator()
+    predictor = EpcPrediction()
+
+    # Accumulators: the geometry lists get one entry per predicted cert; the
+    # SAP lists only get an entry when both sides of the difference exist.
+    wall_hits = wall_total = 0
+    floor_res: list[float] = []
+    window_count_res: list[int] = []
+    window_area_res: list[float] = []
+    parts_res: list[int] = []
+    sap_vs_lodged: list[float] = []
+    sap_vs_calc_actual: list[float] = []
+    sap_vs_neighbour_mean: list[float] = []
+    predicted_n = skipped_no_cohort = 0
+
+    for postcode, certs in index.items():
+        cohort = _load_cohort(postcode, certs)
+        # Leave-one-out needs at least one neighbour besides the held-out cert.
+        if len(cohort) < 2:
+            skipped_no_cohort += len(cohort)
+            continue
+        for i, held_out in enumerate(cohort):
+            # Everyone in the postcode cohort except the held-out cert.
+            others = [c for j, c in enumerate(cohort) if j != i]
+            actual = held_out.epc
+            # Only the guaranteed inputs go into the target; a falsy
+            # property_type is passed as "".
+            target = PredictionTarget(
+                postcode=postcode,
+                property_type=actual.property_type or "",
+                built_form=actual.built_form,
+            )
+            comparables = select_comparables(target, others)
+            # NOTE(review): certs dropped here are not counted in either of the
+            # totals printed at the end.
+            if not comparables.members:
+                continue
+            # NOTE(review): predict and compare_prediction are not guarded, so
+            # an exception from either ends the whole run — confirm intended.
+            predicted = predictor.predict(target, comparables)
+            predicted_n += 1
+
+            cmp = compare_prediction(predicted, actual)
+            wall_total += 1
+            wall_hits += int(cmp.wall_construction_correct)
+            floor_res.append(cmp.floor_area_residual)
+            window_count_res.append(cmp.window_count_residual)
+            window_area_res.append(cmp.total_window_area_residual)
+            parts_res.append(cmp.building_parts_residual)
+
+            # (a) predicted-then-calculated SAP vs the lodged rating.
+            sap_pred = _sap(calculator, predicted)
+            lodged = actual.energy_rating_current
+            if sap_pred is not None and lodged is not None:
+                sap_vs_lodged.append(abs(sap_pred - lodged))
+            # (b) predicted-then-calculated SAP vs the calculator run on the
+            # actual picture.
+            sap_actual = _sap(calculator, actual)
+            if sap_pred is not None and sap_actual is not None:
+                sap_vs_calc_actual.append(abs(sap_pred - sap_actual))
+            # (c) baseline: mean lodged rating of the comparables vs the lodged
+            # rating. NOTE(review): appended even when sap_pred is None, so its
+            # sample can differ from (a)'s — the two MAEs are then not
+            # like-for-like.
+            neighbour_lodged = [
+                c.epc.energy_rating_current
+                for c in comparables.members
+                if c.epc.energy_rating_current is not None
+            ]
+            if neighbour_lodged and lodged is not None:
+                baseline = statistics.mean(neighbour_lodged)
+                sap_vs_neighbour_mean.append(abs(baseline - lodged))
+
+    print(f"corpus: {CORPUS}")
+    print(f"predicted {predicted_n} held-out certs ({skipped_no_cohort} had no cohort)\n")
+    if wall_total:
+        print(f"CLASSIFICATION  wall_construction: {wall_hits}/{wall_total} = "
+              f"{wall_hits / wall_total:.1%}")
+    # _residual is annotated for floats, so the int residual lists are
+    # converted before printing.
+    _residual("floor_area (m2)", floor_res)
+    _residual("window_count", [float(x) for x in window_count_res])
+    _residual("total_window_area (m2)", window_area_res)
+    _residual("building_parts", [float(x) for x in parts_res])
+    print()
+    _sap_line("SAP |pred-calc − lodged|", sap_vs_lodged)
+    _sap_line("SAP |pred-calc − calc(actual)|", sap_vs_calc_actual)
+    _sap_line("SAP |neighbour-mean − lodged| (baseline)", sap_vs_neighbour_mean)
+
+
+def _residual(label: str, values: list[float]) -> None:
+    """Print one RESIDUAL line for `label`: the mean signed residual, the mean
+    absolute residual and the sample size, or "(none)" when `values` is empty."""
+    if values:
+        signed_mean = statistics.mean(values)
+        abs_mean = statistics.mean([abs(v) for v in values])
+        count = len(values)
+        print(f"RESIDUAL  {label}: mean {signed_mean:+.2f} | mean|·| {abs_mean:.2f} "
+              f"(n={count})")
+    else:
+        print(f"RESIDUAL  {label}: (none)")
+
+
+def _sap_line(label: str, values: list[float]) -> None:
+    """Print `label` with the mean (reported as MAE) and the median of `values`
+    plus the sample size, or "(none)" when `values` is empty."""
+    if values:
+        mae = statistics.mean(values)
+        mid = statistics.median(values)
+        print(f"{label}: MAE {mae:.2f} | "
+              f"median {mid:.2f} (n={len(values)})")
+    else:
+        print(f"{label}: (none)")
+
+
+# Run the harness only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- a/tests/domain/epc_prediction/test_prediction_comparison.py
+++ b/tests/domain/epc_prediction/test_prediction_comparison.py
@ -0,0 +1,101 @@
+"""Behaviour of the per-Property prediction comparison (ADR-0029): given a
+predicted EpcPropertyData and the actual one, report the accuracy signals the
+validation harness aggregates — classification matches on the key categoricals
+and residuals on the geometry. Pure; SAP residual is computed in the runner
+(it needs the calculator + lodged SAP).
+"""
+
+from typing import Optional
+
+from datatypes.epc.domain.epc_property_data import (
+    EpcPropertyData,
+    SapBuildingPart,
+    SapWindow,
+)
+from domain.epc_prediction.prediction_comparison import compare_prediction
+
+
+def _epc(
+    *,
+    wall_construction: int = 1,
+    floor_area: float = 80.0,
+    building_parts: int = 1,
+    windows: Optional[list[tuple[float, float]]] = None,
+) -> EpcPropertyData:
+    """Build a bare EpcPropertyData carrying only the fields the comparison
+    reads.
+
+    Objects are created with `object.__new__`, so no constructor runs and only
+    the attributes set here exist. Every building part gets the same
+    `wall_construction`; `windows` is a list of (width, height) pairs.
+    """
+
+    def _part() -> SapBuildingPart:
+        part: SapBuildingPart = object.__new__(SapBuildingPart)
+        part.wall_construction = wall_construction
+        return part
+
+    def _window(width: float, height: float) -> SapWindow:
+        window: SapWindow = object.__new__(SapWindow)
+        window.window_width = width
+        window.window_height = height
+        return window
+
+    epc: EpcPropertyData = object.__new__(EpcPropertyData)
+    epc.total_floor_area_m2 = floor_area
+    epc.sap_building_parts = [_part() for _ in range(building_parts)]
+    epc.sap_windows = [_window(width, height) for width, height in windows or []]
+    return epc
+
+
+def test_flags_a_correct_main_wall_construction_classification() -> None:
+    """Matching main-wall codes are reported as a classification hit."""
+    # Arrange — both sides carry cavity (1).
+    cavity = 1
+    predicted_epc = _epc(wall_construction=cavity)
+    actual_epc = _epc(wall_construction=cavity)
+
+    # Act
+    result = compare_prediction(predicted_epc, actual_epc)
+
+    # Assert
+    assert result.wall_construction_correct is True
+
+
+def test_flags_an_incorrect_main_wall_construction_classification() -> None:
+    """Differing main-wall codes are reported as a classification miss."""
+    # Arrange — cavity (1) predicted against an actual solid brick (2).
+    cavity, solid_brick = 1, 2
+    predicted_epc = _epc(wall_construction=cavity)
+    actual_epc = _epc(wall_construction=solid_brick)
+
+    # Act
+    result = compare_prediction(predicted_epc, actual_epc)
+
+    # Assert
+    assert result.wall_construction_correct is False
+
+
+def test_reports_the_floor_area_residual_as_predicted_minus_actual() -> None:
+    """A 90 m² prediction against a 100 m² actual gives a residual of −10."""
+    # Arrange — a 10 m² under-prediction.
+    predicted_epc = _epc(floor_area=90.0)
+    actual_epc = _epc(floor_area=100.0)
+    expected = -10.0
+
+    # Act
+    result = compare_prediction(predicted_epc, actual_epc)
+
+    # Assert — signed residual, predicted − actual, within float tolerance.
+    assert abs(result.floor_area_residual - expected) <= 1e-9
+
+
+def test_reports_the_building_parts_count_residual() -> None:
+    """One predicted part against two actual parts gives a residual of −1."""
+    # Arrange — a single predicted part; the actual has a main + an extension.
+    predicted_epc = _epc(building_parts=1)
+    actual_epc = _epc(building_parts=2)
+
+    # Act
+    result = compare_prediction(predicted_epc, actual_epc)
+
+    # Assert — predicted − actual.
+    assert result.building_parts_residual == -1
+
+
+def test_reports_window_count_and_total_area_residuals() -> None:
+    """Window count and summed window area are both reported as predicted −
+    actual."""
+    # Arrange — predicted 2 windows (3 m² total); actual 1 window (1 m²).
+    predicted_windows = [(1.0, 1.0), (2.0, 1.0)]
+    actual_windows = [(1.0, 1.0)]
+    predicted_epc = _epc(windows=predicted_windows)
+    actual_epc = _epc(windows=actual_windows)
+
+    # Act
+    result = compare_prediction(predicted_epc, actual_epc)
+
+    # Assert
+    assert result.window_count_residual == 1
+    assert abs(result.total_window_area_residual - 2.0) <= 1e-9