feat(epc-prediction): leave-one-out validation harness (ADR-0029)

Pure compare_prediction (TDD): wall-construction classification hit + signed
residuals on floor area, window count, total window area, building-parts count.
Plus validate_epc_prediction.py (IO plumbing): drops each cert from its postcode
cohort, predicts from the rest on guaranteed inputs only, aggregates the metrics,
and reports SAP three ways (pred-calc vs lodged / vs calc-on-actual / vs the
neighbour-mean baseline). Smoke run: wall 90.9%, floor-area mean|·| 42.6 m2 (a
real signal — template-copied floor area is noisy), SAP pred-calc edges baseline.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-13 23:55:05 +00:00
parent 5e6d2cff16
commit f3ad6343a3
3 changed files with 326 additions and 0 deletions

View file

@ -0,0 +1,60 @@
"""Per-Property prediction comparison for the EPC Prediction validation harness
(ADR-0029).
`compare_prediction` scores a predicted `EpcPropertyData` against the actual one
on the accuracy signals the leave-one-out harness aggregates: classification
matches on the key categoricals (wall / roof / floor construction + insulation,
construction age band; only main wall construction is scored so far) and
residuals on the geometry (window area + count, building-parts count, floor
area). Pure — the SAP residual is computed in the runner, which has the
calculator and the lodged SAP.
"""
from __future__ import annotations
from dataclasses import dataclass
from datatypes.epc.domain.epc_property_data import EpcPropertyData, SapBuildingPart
@dataclass(frozen=True)
class PredictionComparison:
    """One Property's prediction accuracy: classification hits + geometry
    residuals.

    Every residual is signed and, as populated by `compare_prediction`, is
    computed as predicted minus actual, so a negative value means the
    prediction came in under the actual figure.
    """

    # True when the first building part of the predicted and actual pictures
    # carry the same `wall_construction` value.
    wall_construction_correct: bool
    # Difference in `total_floor_area_m2`.
    floor_area_residual: float
    # Difference in the number of `sap_building_parts` entries.
    building_parts_residual: int
    # Difference in the number of `sap_windows` entries.
    window_count_residual: int
    # Difference in the summed `window_width * window_height` over `sap_windows`.
    total_window_area_residual: float
def _main(epc: EpcPropertyData) -> SapBuildingPart:
    """Return the first entry of `epc.sap_building_parts`.

    The comparison uses this first part as the Property's main building part.
    Raises `IndexError` when the list is empty.
    """
    parts = epc.sap_building_parts
    return parts[0]
def _total_window_area(epc: EpcPropertyData) -> float:
    """Return the summed `window_width * window_height` over `epc.sap_windows`.

    An empty window list sums to 0.
    """
    per_window_areas = [
        window.window_width * window.window_height for window in epc.sap_windows
    ]
    return sum(per_window_areas)
def compare_prediction(
    predicted: EpcPropertyData, actual: EpcPropertyData
) -> PredictionComparison:
    """Score a predicted picture against the actual one, field by field.

    The wall-construction hit compares the first building part of each
    picture. Every residual is signed and computed as predicted minus actual.

    Raises `IndexError` if either picture has no building parts, because the
    first part is read from both.
    """
    wall_matches = (
        _main(predicted).wall_construction == _main(actual).wall_construction
    )
    floor_area_delta = predicted.total_floor_area_m2 - actual.total_floor_area_m2
    parts_delta = len(predicted.sap_building_parts) - len(actual.sap_building_parts)
    window_count_delta = len(predicted.sap_windows) - len(actual.sap_windows)
    window_area_delta = _total_window_area(predicted) - _total_window_area(actual)
    return PredictionComparison(
        wall_construction_correct=wall_matches,
        floor_area_residual=floor_area_delta,
        building_parts_residual=parts_delta,
        window_count_residual=window_count_delta,
        total_window_area_residual=window_area_delta,
    )

View file

@ -0,0 +1,165 @@
"""Leave-one-out accuracy harness for EPC Prediction (ADR-0029).
Runs entirely against the frozen postcode-clustered corpus
(`fetch_epc_prediction_corpus.py`). For every cert that has neighbours, it
drops that cert from its postcode cohort, predicts it from the rest using only
its *guaranteed* inputs (property type + built form), and compares the predicted
`EpcPropertyData` to the actual one.
Reports the ADR-0029 metrics:
- classification rate: main wall construction (extend as coverage grows);
- geometry residuals: floor area, window count + total window area, building
parts (mean signed + mean absolute);
- SAP, reported three ways: predicted-then-calculated vs (a) the actual lodged
SAP and (b) the calculator on the actual components, plus (c) the
neighbour-mean SAP baseline vs the lodged SAP (the number
predict-then-calculate must beat).
USAGE
-----
PYTHONPATH=. python scripts/validate_epc_prediction.py
Corpus dir: $EPC_PREDICTION_CORPUS (default /tmp/epc_prediction_corpus).
"""
from __future__ import annotations
import json
import os
import statistics
from pathlib import Path
from typing import Optional
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.epc_prediction.comparable_properties import (
Comparable,
PredictionTarget,
select_comparables,
)
from domain.epc_prediction.epc_prediction import EpcPrediction
from domain.epc_prediction.prediction_comparison import compare_prediction
from domain.sap10_calculator.calculator import Sap10Calculator
# Corpus directory: $EPC_PREDICTION_CORPUS when set, otherwise the /tmp default.
CORPUS = Path(os.getenv("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpus"))
def _load_cohort(postcode: str, certs: list[str]) -> list[Comparable]:
    """Build the Comparables for one postcode from its cached cert payloads.

    Each cert is read from `CORPUS/<postcode>/<cert>.json`. Certs whose file is
    missing, or whose payload fails to load or map (unsupported schema,
    malformed), are left out of the returned list.
    """
    loaded: list[Comparable] = []
    for certificate_number in certs:
        payload_path = CORPUS / postcode / f"{certificate_number}.json"
        if payload_path.exists():
            try:
                payload = json.loads(payload_path.read_text())
                mapped = EpcPropertyDataMapper.from_api_response(payload)
            except Exception:  # noqa: BLE001 — a bad cert must not abort the sweep
                continue
            loaded.append(
                Comparable(epc=mapped, certificate_number=certificate_number)
            )
    return loaded
def _sap(calculator: Sap10Calculator, epc: EpcPropertyData) -> Optional[float]:
    """Return the calculator's continuous SAP score for `epc`.

    Returns `None` when the calculation raises any `Exception`.
    """
    try:
        result = calculator.calculate(epc)
        score = result.sap_score_continuous
    except Exception:  # noqa: BLE001 — some pictures don't score; count as misses
        return None
    return score
def main() -> None:
    """Run the leave-one-out sweep over the corpus and print the report.

    For every postcode in `_index.json` the cohort is loaded, each cert is held
    out in turn, predicted from the remaining cohort members, and scored with
    `compare_prediction`. SAP is then compared three ways: predicted-then-
    calculated vs lodged, vs the calculator run on the actual picture, and the
    neighbour-mean lodged SAP vs lodged as a baseline.

    Held-out certs that are dropped, and pictures the calculator cannot score,
    are tallied and printed so the `n` behind each metric can be reconciled
    with the corpus. The three SAP series are filtered independently, so their
    `n` values may differ.

    Raises:
        SystemExit: if `_index.json` is missing from the corpus directory.
    """
    index_path = CORPUS / "_index.json"
    if not index_path.exists():
        raise SystemExit(f"no corpus at {CORPUS} — run fetch_epc_prediction_corpus.py")
    index: dict[str, list[str]] = json.loads(index_path.read_text())
    calculator = Sap10Calculator()
    predictor = EpcPrediction()

    wall_hits = wall_total = 0
    floor_res: list[float] = []
    window_count_res: list[int] = []
    window_area_res: list[float] = []
    parts_res: list[int] = []
    sap_vs_lodged: list[float] = []
    sap_vs_calc_actual: list[float] = []
    sap_vs_neighbour_mean: list[float] = []
    predicted_n = skipped_no_cohort = skipped_no_comparables = 0
    sap_pred_misses = sap_actual_misses = 0

    for postcode, certs in index.items():
        cohort = _load_cohort(postcode, certs)
        if len(cohort) < 2:
            # Nothing to predict from once the only cert is held out.
            skipped_no_cohort += len(cohort)
            continue
        for i, held_out in enumerate(cohort):
            others = [c for j, c in enumerate(cohort) if j != i]
            actual = held_out.epc
            # Only the guaranteed inputs of the held-out cert are passed on.
            target = PredictionTarget(
                postcode=postcode,
                property_type=actual.property_type or "",
                built_form=actual.built_form,
            )
            comparables = select_comparables(target, others)
            if not comparables.members:
                # Previously dropped without trace; tally it for the report.
                skipped_no_comparables += 1
                continue
            predicted = predictor.predict(target, comparables)
            predicted_n += 1

            comparison = compare_prediction(predicted, actual)
            wall_total += 1
            wall_hits += int(comparison.wall_construction_correct)
            floor_res.append(comparison.floor_area_residual)
            window_count_res.append(comparison.window_count_residual)
            window_area_res.append(comparison.total_window_area_residual)
            parts_res.append(comparison.building_parts_residual)

            sap_pred = _sap(calculator, predicted)
            if sap_pred is None:
                sap_pred_misses += 1
            lodged = actual.energy_rating_current
            if sap_pred is not None and lodged is not None:
                sap_vs_lodged.append(abs(sap_pred - lodged))

            sap_actual = _sap(calculator, actual)
            if sap_actual is None:
                sap_actual_misses += 1
            if sap_pred is not None and sap_actual is not None:
                sap_vs_calc_actual.append(abs(sap_pred - sap_actual))

            neighbour_lodged = [
                c.epc.energy_rating_current
                for c in comparables.members
                if c.epc.energy_rating_current is not None
            ]
            if neighbour_lodged and lodged is not None:
                baseline = statistics.mean(neighbour_lodged)
                sap_vs_neighbour_mean.append(abs(baseline - lodged))

    print(f"corpus: {CORPUS}")
    print(f"predicted {predicted_n} held-out certs ({skipped_no_cohort} had no cohort)")
    print(f"skipped {skipped_no_comparables} held-out certs with no comparables\n")
    if wall_total:
        print(f"CLASSIFICATION wall_construction: {wall_hits}/{wall_total} = "
              f"{wall_hits / wall_total:.1%}")
    _residual("floor_area (m2)", floor_res)
    _residual("window_count", [float(x) for x in window_count_res])
    _residual("total_window_area (m2)", window_area_res)
    _residual("building_parts", [float(x) for x in parts_res])
    print()
    _sap_line("SAP |pred-calc lodged|", sap_vs_lodged)
    _sap_line("SAP |pred-calc calc(actual)|", sap_vs_calc_actual)
    _sap_line("SAP |neighbour-mean lodged| (baseline)", sap_vs_neighbour_mean)
    print(f"SAP calculator misses: {sap_pred_misses} predicted, "
          f"{sap_actual_misses} actual (of {predicted_n} each)")
def _residual(label: str, values: list[float]) -> None:
    """Print one RESIDUAL report line for `label`.

    Shows the mean signed residual, the mean absolute residual and the sample
    size, or `(none)` when `values` is empty.
    """
    if values:
        signed = statistics.mean(values)
        magnitude = statistics.mean([abs(value) for value in values])
        print(
            f"RESIDUAL {label}: mean {signed:+.2f} | mean|·| {magnitude:.2f} "
            f"(n={len(values)})"
        )
    else:
        print(f"RESIDUAL {label}: (none)")
def _sap_line(label: str, values: list[float]) -> None:
    """Print one SAP report line for `label`.

    Shows the mean (labelled MAE), the median and the sample size of `values`,
    or `(none)` when `values` is empty.
    """
    if values:
        mean_error = statistics.mean(values)
        median_error = statistics.median(values)
        print(
            f"{label}: MAE {mean_error:.2f} | "
            f"median {median_error:.2f} (n={len(values)})"
        )
    else:
        print(f"{label}: (none)")
# Run the harness only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,101 @@
"""Behaviour of the per-Property prediction comparison (ADR-0029): given a
predicted EpcPropertyData and the actual one, report the accuracy signals the
validation harness aggregates — classification matches on the key categoricals
and residuals on the geometry. Pure; SAP residual is computed in the runner
(it needs the calculator + lodged SAP).
"""
from typing import Optional
from datatypes.epc.domain.epc_property_data import (
EpcPropertyData,
SapBuildingPart,
SapWindow,
)
from domain.epc_prediction.prediction_comparison import compare_prediction
def _epc(
    *,
    wall_construction: int = 1,
    floor_area: float = 80.0,
    building_parts: int = 1,
    windows: Optional[list[tuple[float, float]]] = None,
) -> EpcPropertyData:
    """Build a bare `EpcPropertyData` carrying only what the comparison reads.

    Instances are created with `object.__new__`, so no constructor runs and
    only the attributes assigned here exist. Every building part gets the same
    `wall_construction`; `windows` is a list of (width, height) pairs, and
    `None` means no windows.
    """

    def _bare_part() -> SapBuildingPart:
        built: SapBuildingPart = object.__new__(SapBuildingPart)
        built.wall_construction = wall_construction
        return built

    def _bare_window(width: float, height: float) -> SapWindow:
        built: SapWindow = object.__new__(SapWindow)
        built.window_width = width
        built.window_height = height
        return built

    picture: EpcPropertyData = object.__new__(EpcPropertyData)
    picture.total_floor_area_m2 = floor_area
    picture.sap_building_parts = [_bare_part() for _ in range(building_parts)]
    picture.sap_windows = [
        _bare_window(width, height) for width, height in windows or []
    ]
    return picture
def test_flags_a_correct_main_wall_construction_classification() -> None:
    """Matching wall construction on both pictures is reported as a hit."""
    # Arrange — both pictures carry cavity (1).
    cavity = 1
    # Act
    result = compare_prediction(
        predicted=_epc(wall_construction=cavity),
        actual=_epc(wall_construction=cavity),
    )
    # Assert
    assert result.wall_construction_correct is True
def test_flags_an_incorrect_main_wall_construction_classification() -> None:
    """Differing wall construction is reported as a miss."""
    # Arrange — predicted is cavity (1), actual is solid brick (2).
    cavity, solid_brick = 1, 2
    # Act
    result = compare_prediction(
        predicted=_epc(wall_construction=cavity),
        actual=_epc(wall_construction=solid_brick),
    )
    # Assert
    assert result.wall_construction_correct is False
def test_reports_the_floor_area_residual_as_predicted_minus_actual() -> None:
    """Floor-area residual is signed: predicted minus actual."""
    # Arrange — predicted 90 m², actual 100 m² (a 10 m² under-prediction).
    under_predicted = _epc(floor_area=90.0)
    lodged = _epc(floor_area=100.0)
    # Act
    residual = compare_prediction(under_predicted, lodged).floor_area_residual
    # Assert — signed residual, predicted minus actual, i.e. -10 within 1e-9.
    assert -1e-9 <= residual + 10.0 <= 1e-9
def test_reports_the_building_parts_count_residual() -> None:
    """Building-parts residual is the signed difference in part counts."""
    # Arrange — predicted a single part; the actual has a main + an extension.
    single_part = _epc(building_parts=1)
    main_plus_extension = _epc(building_parts=2)
    # Act
    result = compare_prediction(single_part, main_plus_extension)
    # Assert — predicted minus actual.
    assert result.building_parts_residual == -1
def test_reports_window_count_and_total_area_residuals() -> None:
    """Window count and total window area residuals are both reported."""
    # Arrange — predicted 2 windows (3 m² total); actual 1 window (1 m²).
    two_windows = _epc(windows=[(1.0, 1.0), (2.0, 1.0)])
    one_window = _epc(windows=[(1.0, 1.0)])
    # Act
    result = compare_prediction(two_windows, one_window)
    # Assert — one extra window and 2 m² extra area, within 1e-9.
    area_error = result.total_window_area_residual - 2.0
    assert result.window_count_residual == 1
    assert -1e-9 <= area_error <= 1e-9