From 65cb094abeb24871d42a86016b25777ed62366d3 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 14 Jun 2026 09:04:24 +0000
Subject: [PATCH] feat(epc-prediction): SAP-10.2 target filter + carbon/PE
 end-to-end (ADR-0030)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the leave-one-out runner ADR-0030-compliant:
- Hold out only SAP 10.2 targets (sap_version == 10.2) — the source cohort
  keeps every vintage (components are methodology-agnostic).
- Label Component Accuracy as the PRIMARY, calculator-independent section.
- End-to-end vs API-lodged (SECONDARY, calculator-FLOORED): add CO2 (tonnes)
  and PEI (kWh/m2) alongside SAP, using the canonical performance.py mapping
  (co2_kg/1000; primary_energy_kwh_per_m2).
- Replace the pred vs calc(actual) SAP readout with an attribution readout,
  calc(actual) vs lodged SAP — the calculator-only error, i.e. the floor the
  end-to-end SAP error can reach.
- Drop the neighbour-mean-of-lodged-SAP baseline (mixes SAP versions —
  rejected by ADR-0030).

On the 181 SAP-10.2 targets, component accuracy is better than in the
all-vintage view (age band hit rate 60.9% -> 78.5%; floor_area mean absolute
residual 12.7 -> 8.4). End-to-end SAP MAE is 6.34 against a calc(actual)
floor of 3.25, so roughly half of the end-to-end error is the known API-path
calculator residual, not prediction error.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 scripts/validate_epc_prediction.py | 87 +++++++++++++++++++++---------
 1 file changed, 62 insertions(+), 25 deletions(-)

diff --git a/scripts/validate_epc_prediction.py b/scripts/validate_epc_prediction.py
index 1bc97c4e..18ee4bbb 100644
--- a/scripts/validate_epc_prediction.py
+++ b/scripts/validate_epc_prediction.py
@@ -39,7 +39,13 @@ from domain.epc_prediction.comparable_properties import (
 )
 from domain.epc_prediction.epc_prediction import EpcPrediction
 from domain.epc_prediction.prediction_comparison import compare_prediction
-from domain.sap10_calculator.calculator import Sap10Calculator
+from domain.sap10_calculator.calculator import Sap10Calculator, SapResult
+
+# Target-cert spec gate: only SAP 10.2 certs (schema 21.0.x) carry full-fidelity
+# lodged components + a same-spec lodged figure to check against (ADR-0030). The
+# source cohort keeps all vintages — components are methodology-agnostic.
+_SAP_10_2: float = 10.2
+_KG_PER_TONNE: float = 1000.0
 
 CORPUS = Path(os.environ.get("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpus"))
 
@@ -101,13 +107,21 @@ def _recency(comparable: Comparable) -> tuple[date, str]:
     )
 
 
-def _sap(calculator: Sap10Calculator, epc: EpcPropertyData) -> Optional[float]:
+def _result(
+    calculator: Sap10Calculator, epc: EpcPropertyData
+) -> Optional[SapResult]:
     try:
-        return calculator.calculate(epc).sap_score_continuous
+        return calculator.calculate(epc)
     except Exception:  # noqa: BLE001 — some pictures don't score; count as misses
         return None
 
 
+def _co2_tonnes(result: SapResult) -> float:
+    """Calculated annual CO2 in tonnes, matching the lodged `co2_emissions_current`
+    scale (see domain/property_baseline/performance.py)."""
+    return result.co2_kg_per_yr / _KG_PER_TONNE
+
+
 def main() -> None:
     index_path = CORPUS / "_index.json"
     if not index_path.exists():
@@ -126,10 +140,14 @@ def main() -> None:
     window_area_res: list[float] = []
     parts_res: list[int] = []
     door_res: list[int] = []
+    # End-to-end (calculator-FLOORED) vs API-lodged — secondary guard, ADR-0030.
     sap_vs_lodged: list[float] = []
-    sap_vs_calc_actual: list[float] = []
-    sap_vs_neighbour_mean: list[float] = []
-    predicted_n = skipped_no_cohort = 0
+    co2_vs_lodged: list[float] = []
+    pei_vs_lodged: list[float] = []
+    # Attribution readout: how far the calculator alone is from lodged on the
+    # ACTUAL components — the floor the end-to-end numbers can reach.
+    sap_calc_actual_vs_lodged: list[float] = []
+    predicted_n = skipped_non_102 = skipped_no_cohort = 0
 
     for postcode, certs in index.items():
         cohort = _load_cohort(postcode, certs)
@@ -138,6 +156,11 @@ def main() -> None:
             skipped_no_cohort += len(targets)
             continue
         for held_out in targets:
+            # Only SAP 10.2 certs are valid validation targets (ADR-0030); the
+            # source cohort (`others`) keeps every vintage.
+            if held_out.epc.sap_version != _SAP_10_2:
+                skipped_non_102 += 1
+                continue
             # Exclude every cert of the held-out address (not just the held cert)
             # so a re-lodgement of the same property cannot leak into the cohort.
             others = [
@@ -166,24 +189,36 @@ def main() -> None:
             parts_res.append(cmp.building_parts_residual)
             door_res.append(cmp.door_count_residual)
 
-            sap_pred = _sap(calculator, predicted)
-            lodged = actual.energy_rating_current
-            if sap_pred is not None and lodged is not None:
-                sap_vs_lodged.append(abs(sap_pred - lodged))
-            sap_actual = _sap(calculator, actual)
-            if sap_pred is not None and sap_actual is not None:
-                sap_vs_calc_actual.append(abs(sap_pred - sap_actual))
-            neighbour_lodged = [
-                c.epc.energy_rating_current
-                for c in comparables.members
-                if c.epc.energy_rating_current is not None
-            ]
-            if neighbour_lodged and lodged is not None:
-                baseline = statistics.mean(neighbour_lodged)
-                sap_vs_neighbour_mean.append(abs(baseline - lodged))
+            pred_result = _result(calculator, predicted)
+            actual_result = _result(calculator, actual)
+            lodged_sap = actual.energy_rating_current
+            lodged_co2 = actual.co2_emissions_current
+            lodged_pei = actual.energy_consumption_current
+            if pred_result is not None:
+                if lodged_sap is not None:
+                    sap_vs_lodged.append(
+                        abs(pred_result.sap_score_continuous - lodged_sap)
+                    )
+                if lodged_co2 is not None:
+                    co2_vs_lodged.append(
+                        abs(_co2_tonnes(pred_result) - lodged_co2)
+                    )
+                if lodged_pei is not None:
+                    pei_vs_lodged.append(
+                        abs(pred_result.primary_energy_kwh_per_m2 - lodged_pei)
+                    )
+            if actual_result is not None and lodged_sap is not None:
+                sap_calc_actual_vs_lodged.append(
+                    abs(actual_result.sap_score_continuous - lodged_sap)
+                )
 
     print(f"corpus: {CORPUS}")
-    print(f"predicted {predicted_n} held-out certs ({skipped_no_cohort} had no cohort)\n")
+    print(
+        f"predicted {predicted_n} SAP-10.2 held-out targets "
+        f"({skipped_non_102} non-10.2 targets skipped, "
+        f"{skipped_no_cohort} had no cohort)\n"
+    )
+    print("--- Component Accuracy (PRIMARY, calculator-independent) ---")
     for name, (hits, total) in categoricals.items():
         if total:
             print(f"CLASSIFICATION  {name}: {hits}/{total} = {hits / total:.1%}")
@@ -194,9 +229,11 @@ def main() -> None:
     _residual("building_parts", [float(x) for x in parts_res])
     _residual("door_count", [float(x) for x in door_res])
     print()
-    _sap_line("SAP |pred-calc − lodged|", sap_vs_lodged)
-    _sap_line("SAP |pred-calc − calc(actual)|", sap_vs_calc_actual)
-    _sap_line("SAP |neighbour-mean − lodged| (baseline)", sap_vs_neighbour_mean)
+    print("--- End-to-end vs API-lodged (SECONDARY, calculator-FLOORED) ---")
+    _sap_line("SAP |pred − lodged|", sap_vs_lodged)
+    _sap_line("CO2 (t) |pred − lodged|", co2_vs_lodged)
+    _sap_line("PEI (kWh/m2) |pred − lodged|", pei_vs_lodged)
+    _sap_line("  floor: SAP |calc(actual) − lodged|", sap_calc_actual_vs_lodged)
 
 
 def _tally(counter: list[int], hit: Optional[bool]) -> None: