feat(epc-prediction): Tier-1 ratcheting Component Accuracy gate (ADR-0030)

The committed CI gate: run the calculator-free leave-one-out scorer over the frozen anonymised fixture (36 SAP-10.2 targets) and assert each per-component classification rate / geometry residual is no worse than a committed baseline. Prediction is deterministic and the fixture is frozen, so the numbers reproduce exactly — a failure is a real regression, never sample noise. - 19 rate floors + 5 residual ceilings, seeded at the currently-measured values; they only ever tighten (no-widening ethos on an aggregate). - Calculator-FREE — component floors are the real gate; the end-to-end SAP/carbon/PE guards stay out (their floor is the separate API-path calculator workstream). - Skips with a message when the fixture is absent. 25 test cases (24 parametrized + 1 target-count check), all green. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 13:10:47 +00:00 · 2026-06-14 09:19:39 +00:00 · 2026-06-14 09:19:39 +00:00 · e3a2720e5c
commit e3a2720e5c
parent 008c1922c4
1 changed files with 106 additions and 0 deletions
--- a/tests/domain/epc_prediction/test_component_accuracy_gate.py
+++ b/tests/domain/epc_prediction/test_component_accuracy_gate.py
@ -0,0 +1,106 @@
+"""Tier-1 ratcheting Component Accuracy gate (ADR-0030).
+
+Runs the calculator-free leave-one-out scorer over the committed, anonymised
+fixture and asserts each per-component classification rate / geometry residual is
+no worse than a committed baseline. Because the prediction is deterministic and
+the fixture is frozen, every run reproduces the same numbers exactly — so a
+failure means a real *regression* in prediction quality, never sample noise.
+
+The floors / ceilings are the currently-measured values and only ever **tighten**
+(the repo's no-tolerance-widening ethos applied to an aggregate): when prediction
+improves, ratchet the relevant floor up in the same change. The end-to-end
+SAP / carbon / PE guards are deliberately *not* here — they need the calculator,
+whose API-path residual is a separate workstream; the component floors are the
+real gate (ADR-0030).
+"""
+
+from pathlib import Path
+
+import pytest
+
+from domain.epc_prediction.validation import (
+    ComponentAccuracy,
+    evaluate_component_accuracy,
+)
+from harness.epc_prediction_corpus import load_corpus
+
+# Directory holding the committed fixture. Per this file's path
+# (tests/domain/epc_prediction/), parents[3] is the directory that contains
+# tests/ — i.e. the repository root.
+_FIXTURE = Path(__file__).parents[3] / "tests" / "fixtures" / "epc_prediction"
+
+# Minimum classification hit-rate per component (ratchet floors). Tighten — never
+# loosen — as prediction improves. Values are the measured rates over the frozen
+# 36-target fixture; the `_TOLERANCE` slack (1e-3) absorbs float rounding only.
+# Keys are the component names passed to `ComponentAccuracy.rate`.
+_RATE_FLOORS: dict[str, float] = {
+    "wall_construction": 0.8889,
+    "wall_insulation_type": 0.7778,
+    "construction_age_band": 0.6389,
+    "roof_construction": 0.7222,
+    "floor_construction": 0.7500,
+    "heating_main_fuel": 0.9722,
+    "heating_main_category": 0.8889,
+    "heating_main_control": 0.7500,
+    "water_heating_fuel": 0.9167,
+    "water_heating_code": 0.8889,
+    "has_hot_water_cylinder": 0.8889,
+    "cylinder_insulation_type": 0.1667,
+    # A 0.0 floor cannot fail the rate comparison for a non-negative rate, so this
+    # entry effectively only guards that the component still reports a rate.
+    "secondary_heating_type": 0.0000,
+    "roof_insulation_thickness": 0.1471,
+    "floor_insulation": 0.9062,
+    "has_room_in_roof": 0.8333,
+    "modal_glazing_type": 0.5000,
+    "has_pv": 1.0000,
+    "solar_water_heating": 1.0000,
+}
+
+# Maximum mean absolute residual per numeric component (ratchet ceilings).
+# Tighten — never loosen — as prediction improves. Keys are the component names
+# passed to `ComponentAccuracy.mean_abs_residual`.
+_RESIDUAL_CEILINGS: dict[str, float] = {
+    "floor_area": 12.2175,
+    "window_count": 3.8889,
+    "total_window_area": 4.4067,
+    "building_parts": 0.3333,
+    "door_count": 0.6389,
+}
+
+# Absolute slack applied to every floor and ceiling comparison in this module.
+_TOLERANCE = 1e-3
+
+
+@pytest.fixture(scope="module")
+def accuracy() -> ComponentAccuracy:
+    """Score the committed fixture once per module, skipping when it is absent."""
+    index_file = _FIXTURE / "_index.json"
+    if index_file.exists():
+        return evaluate_component_accuracy(load_corpus(_FIXTURE))
+    # pytest.skip raises, so nothing after it runs and no return is needed.
+    pytest.skip(f"no EPC Prediction fixture at {_FIXTURE}")
+
+
+def test_fixture_yields_the_expected_target_count(
+    accuracy: ComponentAccuracy,
+) -> None:
+    """The frozen fixture must still yield at least its 36 SAP-10.2 targets."""
+    # Fewer targets means the fixture or the target filter changed.
+    minimum_targets = 36
+    assert accuracy.targets >= minimum_targets
+
+
+@pytest.mark.parametrize("component,floor", sorted(_RATE_FLOORS.items()))
+def test_classification_rate_does_not_regress(
+    accuracy: ComponentAccuracy, component: str, floor: float
+) -> None:
+    """Each component's hit-rate must exist and stay at or above its floor."""
+    # Arrange / Act — the floor is relaxed by the shared rounding tolerance.
+    measured = accuracy.rate(component)
+    lowest_allowed = floor - _TOLERANCE
+
+    # Assert — applicability first, then the ratchet itself.
+    assert measured is not None, f"{component} had no applicable targets"
+    assert measured >= lowest_allowed, (
+        f"{component} classification regressed: {measured:.4f} < floor {floor:.4f}"
+    )
+
+
+@pytest.mark.parametrize("component,ceiling", sorted(_RESIDUAL_CEILINGS.items()))
+def test_residual_does_not_regress(
+    accuracy: ComponentAccuracy, component: str, ceiling: float
+) -> None:
+    """Each numeric component's mean absolute residual must not exceed its ceiling."""
+    # Arrange / Act — the ceiling is relaxed by the shared rounding tolerance.
+    residual = accuracy.mean_abs_residual(component)
+    highest_allowed = ceiling + _TOLERANCE
+
+    # Assert — a residual must exist, then it must sit at or below the ceiling.
+    assert residual is not None, f"{component} had no residuals"
+    assert residual <= highest_allowed, (
+        f"{component} residual regressed: {residual:.4f} > ceiling {ceiling:.4f}"
+    )