From 0c70280dea0bd2d2dec0a21fe6ed85605a885bd9 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 24 Jun 2026 09:07:24 +0000
Subject: [PATCH] guard(modelling_e2e): quarantine predicted Properties the
 calculator mis-scores
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TEMPORARY guard — remove once the SAP calculator's oil-heating under-score is
fixed. A predicted oil-boiler picture currently scores SAP 13/G against its own
synthesised recorded SAP of 50/E, so the optimiser, planning from that
under-scored baseline, overshoots goal C all the way to band A and publishes
nonsense.

A predicted EpcPropertyData carries its recorded SAP (energy_rating_current).
When the calculator baseline diverges from it by more than 20 SAP points
(roughly one EPC band), withhold the Plan: raise ImplausiblePredictedBaseline
inside the per-property loop so the existing failure isolation drops just that
property into `failures` and fails the subtask, while every other property
still models and persists. A predicted picture with no recorded SAP has no
reference to contradict and is never quarantined. Lodged Properties are
untouched — they have a real recorded cert and the Rebaseliner already owns
this check.

Verified end-to-end against property 713406 (UPRN 100061849247): baseline 13.2
vs recorded 50 -> quarantined, no Plan written.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 applications/modelling_e2e/handler.py         | 45 +++++++++++++++++++
 .../modelling_e2e/test_handler.py             | 29 ++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/applications/modelling_e2e/handler.py b/applications/modelling_e2e/handler.py
index c5aa189b..f9c55696 100644
--- a/applications/modelling_e2e/handler.py
+++ b/applications/modelling_e2e/handler.py
@@ -212,6 +212,35 @@ def _predict_epc(
     return predicted
 
 
+# --- TEMPORARY GUARD: remove once the SAP calculator's oil-heating under-score
+# is fixed (predicted oil-boiler picture scores SAP 13/G vs a recorded 50/E). ---
+# A predicted EpcPropertyData carries its own recorded SAP (energy_rating_current,
+# synthesised from the cohort). When the calculator's baseline score contradicts
+# that by more than ~one EPC band the picture is being mis-scored, so any Plan
+# built on it overshoots (e.g. goal C lands at band A). Quarantine the property —
+# skip its Plan — rather than ship nonsense. Lodged properties are unaffected:
+# they have a real recorded cert and the Rebaseliner already owns this check.
+_PREDICTED_BASELINE_DIVERGENCE_GUARD = 20.0  # SAP points (~one EPC band)
+
+
+class ImplausiblePredictedBaseline(Exception):
+    """A predicted Property's calculator baseline contradicts its recorded SAP by
+    more than a band — the calculator is mis-scoring the synthesised picture, so
+    the Plan is untrustworthy and is withheld (caught per-property as a failure)."""
+
+
+def _predicted_baseline_is_implausible(
+    baseline_sap: float, recorded_sap: Optional[int]
+) -> bool:
+    """True when a predicted Property's calculator baseline diverges from the
+    picture's own recorded SAP by more than the guard band. A missing recorded
+    SAP (no reference) is never implausible — the guard only fires on a concrete
+    contradiction."""
+    if recorded_sap is None:
+        return False
+    return abs(baseline_sap - recorded_sap) > _PREDICTED_BASELINE_DIVERGENCE_GUARD
+
+
 @task_handler(task_source="modelling_e2e", source=Source.PROPERTY)
 def handler(body: dict[str, Any], context: Any) -> Optional[dict[str, Any]]:
     trigger = ModellingE2ETriggerBody.model_validate(body)
@@ -389,6 +418,22 @@ def handler(body: dict[str, Any], context: Any) -> Optional[dict[str, Any]]:
                     f"measures={len(plan.measures)}"
                 )
 
+                # Quarantine a predicted Property whose calculator baseline
+                # contradicts its synthesised recorded SAP (TEMPORARY guard —
+                # see _predicted_baseline_is_implausible). Raising drops this one
+                # property into `failures` and skips its Plan/Baseline; the rest
+                # of the batch is unaffected.
+                if predicted_epc is not None and _predicted_baseline_is_implausible(
+                    plan.baseline.sap_continuous, effective_epc.energy_rating_current
+                ):
+                    raise ImplausiblePredictedBaseline(
+                        f"property={property_id}: predicted baseline SAP "
+                        f"{plan.baseline.sap_continuous:.1f} diverges from the "
+                        f"picture's recorded SAP {effective_epc.energy_rating_current} "
+                        f"by > {_PREDICTED_BASELINE_DIVERGENCE_GUARD:.0f} points — "
+                        f"likely a calculator mis-score; withholding the plan"
+                    )
+
                 if dry_run:
                     measure_types = (
                         ", ".join(m.measure_type for m in plan.measures) or "none"
diff --git a/tests/applications/modelling_e2e/test_handler.py b/tests/applications/modelling_e2e/test_handler.py
index baa20103..18939029 100644
--- a/tests/applications/modelling_e2e/test_handler.py
+++ b/tests/applications/modelling_e2e/test_handler.py
@@ -68,6 +68,9 @@ def _plan_mock() -> MagicMock:
     plan = MagicMock()
     plan.measures = []
     plan.cost_of_works = 0.0
+    # A plausible baseline so the predicted-baseline guard stays silent (it
+    # compares this against the picture's recorded SAP).
+    plan.baseline.sap_continuous = 50.0
     return plan
 
 
@@ -330,6 +333,7 @@ def test_prediction_path_saves_predicted_epc_plan_and_baseline(
     mock_part = MagicMock()
     mock_part.identifier = BuildingPartIdentifier.MAIN
     mock_predicted_epc.sap_building_parts = [mock_part]
+    mock_predicted_epc.energy_rating_current = 50  # matches plan baseline -> guard silent
 
     mock_comparables = MagicMock()
     mock_comparables.members = [MagicMock()]  # non-empty cohort
@@ -534,6 +538,7 @@ def test_empty_own_postcode_broadens_to_nearby_and_predicts() -> None:
     mock_part = MagicMock()
     mock_part.identifier = BuildingPartIdentifier.MAIN
     mock_predicted_epc.sap_building_parts = [mock_part]
+    mock_predicted_epc.energy_rating_current = 50  # matches plan baseline -> guard silent
 
     # First select_comparables (own postcode) is empty → broaden; the second
     # (nearby cohort) finds comparables.
@@ -757,6 +762,7 @@ def test_cohort_cache_prevents_duplicate_candidates_for_calls() -> None:
     mock_part = MagicMock()
     mock_part.identifier = BuildingPartIdentifier.MAIN
     mock_predicted_epc.sap_building_parts = [mock_part]
+    mock_predicted_epc.energy_rating_current = 50  # matches plan baseline -> guard silent
 
     mock_comparables = MagicMock()
     mock_comparables.members = [MagicMock()]
@@ -919,3 +925,26 @@ def test_dry_run_skips_all_db_writes() -> None:
 
     # Assert — UoW never entered
     MockUoW.return_value.__enter__.assert_not_called()
+
+
+def test_predicted_baseline_within_band_is_plausible() -> None:
+    # A predicted picture whose calculator baseline tracks its recorded SAP
+    # (baseline 47 vs recorded 50) is trusted — the guard does not fire.
+    from applications.modelling_e2e.handler import _predicted_baseline_is_implausible
+
+    assert _predicted_baseline_is_implausible(47.0, 50) is False
+
+
+def test_predicted_baseline_beyond_band_is_implausible() -> None:
+    # The 713406 case: calculator scores the oil-boiler picture at 13 while the
+    # synthesised cert records 50 — a >20-point contradiction the guard rejects.
+    from applications.modelling_e2e.handler import _predicted_baseline_is_implausible
+
+    assert _predicted_baseline_is_implausible(13.2, 50) is True
+
+
+def test_predicted_baseline_without_a_recorded_sap_is_not_judged() -> None:
+    # No recorded SAP means no reference to contradict, so the guard stays silent.
+    from applications.modelling_e2e.handler import _predicted_baseline_is_implausible
+
+    assert _predicted_baseline_is_implausible(13.2, None) is False