mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
guard(modelling_e2e): quarantine predicted Properties the calculator mis-scores
TEMPORARY guard (remove once the SAP calculator's oil-heating under-score is fixed): a predicted oil-boiler picture scores SAP 13/G against its own synthesised recorded SAP of 50/E, so the optimiser overshoots goal C all the way to band A and publishes nonsense. A predicted EpcPropertyData carries its recorded SAP (energy_rating_current). When the calculator baseline diverges from it by more than ~one band (20 SAP points), withhold the Plan: raise inside the per-property loop so the existing failure isolation drops just that property into `failures` and fails the subtask, while every other property still models and persists. Lodged Properties are untouched — they have a real recorded cert and the Rebaseliner already owns this check. Verified end-to-end against property 713406 (UPRN 100061849247): baseline 13.2 vs recorded 50 -> quarantined, no Plan written. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
01bc93ed33
commit
0c70280dea
2 changed files with 74 additions and 0 deletions
|
|
@ -212,6 +212,35 @@ def _predict_epc(
|
||||||
return predicted
|
return predicted
|
||||||
|
|
||||||
|
|
||||||
|
# --- TEMPORARY GUARD: remove once the SAP calculator's oil-heating under-score
|
||||||
|
# is fixed (predicted oil-boiler picture scores SAP 13/G vs a recorded 50/E). ---
|
||||||
|
# A predicted EpcPropertyData carries its own recorded SAP (energy_rating_current,
|
||||||
|
# synthesised from the cohort). When the calculator's baseline score contradicts
|
||||||
|
# that by more than ~one EPC band the picture is being mis-scored, so any Plan
|
||||||
|
# built on it overshoots (e.g. goal C lands at band A). Quarantine the property —
|
||||||
|
# skip its Plan — rather than ship nonsense. Lodged properties are unaffected:
|
||||||
|
# they have a real recorded cert and the Rebaseliner already owns this check.
|
||||||
|
# Largest tolerated gap between the calculator baseline and the recorded SAP.
# The comparison that uses this value is strict (`>`), so a gap of exactly
# this many points is still treated as plausible.
_PREDICTED_BASELINE_DIVERGENCE_GUARD = 20.0 # SAP points (~one EPC band)
|
||||||
|
|
||||||
|
|
||||||
|
class ImplausiblePredictedBaseline(Exception):
    """Signal that a predicted Property's Plan must be withheld.

    Raised when the calculator baseline of a predicted Property disagrees
    with the picture's own recorded SAP by more than a band. That gap means
    the calculator is mis-scoring the synthesised picture, so any Plan built
    on the baseline cannot be trusted. The exception is meant to be caught
    per property and recorded as a failure for that property.
    """
|
||||||
|
|
||||||
|
|
||||||
|
def _predicted_baseline_is_implausible(
|
||||||
|
baseline_sap: float, recorded_sap: Optional[int]
|
||||||
|
) -> bool:
|
||||||
|
"""True when a predicted Property's calculator baseline diverges from the
|
||||||
|
picture's own recorded SAP by more than the guard band. A missing recorded
|
||||||
|
SAP (no reference) is never implausible — the guard only fires on a concrete
|
||||||
|
contradiction."""
|
||||||
|
if recorded_sap is None:
|
||||||
|
return False
|
||||||
|
return abs(baseline_sap - recorded_sap) > _PREDICTED_BASELINE_DIVERGENCE_GUARD
|
||||||
|
|
||||||
|
|
||||||
@task_handler(task_source="modelling_e2e", source=Source.PROPERTY)
|
@task_handler(task_source="modelling_e2e", source=Source.PROPERTY)
|
||||||
def handler(body: dict[str, Any], context: Any) -> Optional[dict[str, Any]]:
|
def handler(body: dict[str, Any], context: Any) -> Optional[dict[str, Any]]:
|
||||||
trigger = ModellingE2ETriggerBody.model_validate(body)
|
trigger = ModellingE2ETriggerBody.model_validate(body)
|
||||||
|
|
@ -389,6 +418,22 @@ def handler(body: dict[str, Any], context: Any) -> Optional[dict[str, Any]]:
|
||||||
f"measures={len(plan.measures)}"
|
f"measures={len(plan.measures)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Quarantine a predicted Property whose calculator baseline
|
||||||
|
# contradicts its synthesised recorded SAP (TEMPORARY guard —
|
||||||
|
# see _predicted_baseline_is_implausible). Raising drops this one
|
||||||
|
# property into `failures` and skips its Plan/Baseline; the rest
|
||||||
|
# of the batch is unaffected.
|
||||||
|
if predicted_epc is not None and _predicted_baseline_is_implausible(
|
||||||
|
plan.baseline.sap_continuous, effective_epc.energy_rating_current
|
||||||
|
):
|
||||||
|
raise ImplausiblePredictedBaseline(
|
||||||
|
f"property={property_id}: predicted baseline SAP "
|
||||||
|
f"{plan.baseline.sap_continuous:.1f} diverges from the "
|
||||||
|
f"picture's recorded SAP {effective_epc.energy_rating_current} "
|
||||||
|
f"by > {_PREDICTED_BASELINE_DIVERGENCE_GUARD:.0f} points — "
|
||||||
|
f"likely a calculator mis-score; withholding the plan"
|
||||||
|
)
|
||||||
|
|
||||||
if dry_run:
|
if dry_run:
|
||||||
measure_types = (
|
measure_types = (
|
||||||
", ".join(m.measure_type for m in plan.measures) or "none"
|
", ".join(m.measure_type for m in plan.measures) or "none"
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,9 @@ def _plan_mock() -> MagicMock:
|
||||||
plan = MagicMock()
|
plan = MagicMock()
|
||||||
plan.measures = []
|
plan.measures = []
|
||||||
plan.cost_of_works = 0.0
|
plan.cost_of_works = 0.0
|
||||||
|
# A plausible baseline so the predicted-baseline guard stays silent (it
|
||||||
|
# compares this against the picture's recorded SAP).
|
||||||
|
plan.baseline.sap_continuous = 50.0
|
||||||
return plan
|
return plan
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -330,6 +333,7 @@ def test_prediction_path_saves_predicted_epc_plan_and_baseline(
|
||||||
mock_part = MagicMock()
|
mock_part = MagicMock()
|
||||||
mock_part.identifier = BuildingPartIdentifier.MAIN
|
mock_part.identifier = BuildingPartIdentifier.MAIN
|
||||||
mock_predicted_epc.sap_building_parts = [mock_part]
|
mock_predicted_epc.sap_building_parts = [mock_part]
|
||||||
|
mock_predicted_epc.energy_rating_current = 50 # matches plan baseline -> guard silent
|
||||||
|
|
||||||
mock_comparables = MagicMock()
|
mock_comparables = MagicMock()
|
||||||
mock_comparables.members = [MagicMock()] # non-empty cohort
|
mock_comparables.members = [MagicMock()] # non-empty cohort
|
||||||
|
|
@ -534,6 +538,7 @@ def test_empty_own_postcode_broadens_to_nearby_and_predicts() -> None:
|
||||||
mock_part = MagicMock()
|
mock_part = MagicMock()
|
||||||
mock_part.identifier = BuildingPartIdentifier.MAIN
|
mock_part.identifier = BuildingPartIdentifier.MAIN
|
||||||
mock_predicted_epc.sap_building_parts = [mock_part]
|
mock_predicted_epc.sap_building_parts = [mock_part]
|
||||||
|
mock_predicted_epc.energy_rating_current = 50 # matches plan baseline -> guard silent
|
||||||
|
|
||||||
# First select_comparables (own postcode) is empty → broaden; the second
|
# First select_comparables (own postcode) is empty → broaden; the second
|
||||||
# (nearby cohort) finds comparables.
|
# (nearby cohort) finds comparables.
|
||||||
|
|
@ -757,6 +762,7 @@ def test_cohort_cache_prevents_duplicate_candidates_for_calls() -> None:
|
||||||
mock_part = MagicMock()
|
mock_part = MagicMock()
|
||||||
mock_part.identifier = BuildingPartIdentifier.MAIN
|
mock_part.identifier = BuildingPartIdentifier.MAIN
|
||||||
mock_predicted_epc.sap_building_parts = [mock_part]
|
mock_predicted_epc.sap_building_parts = [mock_part]
|
||||||
|
mock_predicted_epc.energy_rating_current = 50 # matches plan baseline -> guard silent
|
||||||
|
|
||||||
mock_comparables = MagicMock()
|
mock_comparables = MagicMock()
|
||||||
mock_comparables.members = [MagicMock()]
|
mock_comparables.members = [MagicMock()]
|
||||||
|
|
@ -919,3 +925,26 @@ def test_dry_run_skips_all_db_writes() -> None:
|
||||||
|
|
||||||
# Assert — UoW never entered
|
# Assert — UoW never entered
|
||||||
MockUoW.return_value.__enter__.assert_not_called()
|
MockUoW.return_value.__enter__.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
def test_predicted_baseline_within_band_is_plausible() -> None:
    """A baseline of 47.0 against a recorded SAP of 50 must not trip the guard."""
    from applications.modelling_e2e.handler import (
        _predicted_baseline_is_implausible as is_implausible,
    )

    # Baseline tracks the recorded SAP closely, so the picture is trusted.
    verdict = is_implausible(47.0, 50)

    assert verdict is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_predicted_baseline_beyond_band_is_implausible() -> None:
    """The 713406 case: baseline 13.2 against a recorded SAP of 50 is rejected."""
    from applications.modelling_e2e.handler import (
        _predicted_baseline_is_implausible as is_implausible,
    )

    # The calculator scores the oil-boiler picture at 13 while the synthesised
    # cert records 50: a contradiction of more than 20 points.
    verdict = is_implausible(13.2, 50)

    assert verdict is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_predicted_baseline_without_a_recorded_sap_is_not_judged() -> None:
    """With no recorded SAP there is nothing to contradict, so the guard is silent."""
    from applications.modelling_e2e.handler import (
        _predicted_baseline_is_implausible as is_implausible,
    )

    # Same low baseline as the rejected case, but no reference value to compare.
    verdict = is_implausible(13.2, None)

    assert verdict is False
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue