From e3a2720e5cb1eb1a1ca94167d65c9e0dce4c6d83 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Sun, 14 Jun 2026 09:19:39 +0000
Subject: [PATCH] feat(epc-prediction): Tier-1 ratcheting Component Accuracy gate (ADR-0030)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The committed CI gate: run the calculator-free leave-one-out scorer over the
frozen anonymised fixture (36 SAP-10.2 targets) and assert each per-component
classification rate / geometry residual is no worse than a committed baseline.
Prediction is deterministic + the fixture frozen, so the numbers reproduce
exactly — a failure is a real regression, never sample noise.

- 19 rate floors + 5 residual ceilings, seeded at the currently-measured
  values; they only ever tighten (no-widening ethos on an aggregate).
- Calculator-FREE — component floors are the real gate; the end-to-end
  SAP/carbon/PE guards stay out (their floor is the separate API-path
  calculator workstream).
- Skips with a message when the fixture is absent.

25 tests (24 parametrized cases + 1 target-count check), all green.

Co-Authored-By: Claude Opus 4.8
---
 .../test_component_accuracy_gate.py           | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 tests/domain/epc_prediction/test_component_accuracy_gate.py

diff --git a/tests/domain/epc_prediction/test_component_accuracy_gate.py b/tests/domain/epc_prediction/test_component_accuracy_gate.py
new file mode 100644
index 00000000..20897e97
--- /dev/null
+++ b/tests/domain/epc_prediction/test_component_accuracy_gate.py
@@ -0,0 +1,106 @@
+"""Tier-1 ratcheting Component Accuracy gate (ADR-0030).
+
+Runs the calculator-free leave-one-out scorer over the committed, anonymised
+fixture and asserts each per-component classification rate / geometry residual is
+no worse than a committed baseline. Because the prediction is deterministic and
+the fixture is frozen, every run reproduces the same numbers exactly — so a
+failure means a real *regression* in prediction quality, never sample noise.
+
+The floors / ceilings are the currently-measured values and only ever **tighten**
+(the repo's no-tolerance-widening ethos applied to an aggregate): when prediction
+improves, ratchet the relevant floor up in the same change. The end-to-end
+SAP / carbon / PE guards are deliberately *not* here — they need the calculator,
+whose API-path residual is a separate workstream; the component floors are the
+real gate (ADR-0030).
+"""
+
+from pathlib import Path
+
+import pytest
+
+from domain.epc_prediction.validation import (
+    ComponentAccuracy,
+    evaluate_component_accuracy,
+)
+from harness.epc_prediction_corpus import load_corpus
+
+_FIXTURE = Path(__file__).parents[3] / "tests" / "fixtures" / "epc_prediction"
+
+# Minimum classification hit-rate per component (ratchet floors). Tighten — never
+# loosen — as prediction improves. Values are the measured rates over the frozen
+# 36-target fixture; a 1e-3 tolerance absorbs float rounding only.
+_RATE_FLOORS: dict[str, float] = {
+    "wall_construction": 0.8889,
+    "wall_insulation_type": 0.7778,
+    "construction_age_band": 0.6389,
+    "roof_construction": 0.7222,
+    "floor_construction": 0.7500,
+    "heating_main_fuel": 0.9722,
+    "heating_main_category": 0.8889,
+    "heating_main_control": 0.7500,
+    "water_heating_fuel": 0.9167,
+    "water_heating_code": 0.8889,
+    "has_hot_water_cylinder": 0.8889,
+    "cylinder_insulation_type": 0.1667,
+    "secondary_heating_type": 0.0000,
+    "roof_insulation_thickness": 0.1471,
+    "floor_insulation": 0.9062,
+    "has_room_in_roof": 0.8333,
+    "modal_glazing_type": 0.5000,
+    "has_pv": 1.0000,
+    "solar_water_heating": 1.0000,
+}
+
+# Maximum mean absolute residual per numeric component (ratchet ceilings).
+_RESIDUAL_CEILINGS: dict[str, float] = {
+    "floor_area": 12.2175,
+    "window_count": 3.8889,
+    "total_window_area": 4.4067,
+    "building_parts": 0.3333,
+    "door_count": 0.6389,
+}
+
+_TOLERANCE = 1e-3
+
+
+@pytest.fixture(scope="module")
+def accuracy() -> ComponentAccuracy:
+    if not (_FIXTURE / "_index.json").exists():
+        pytest.skip(f"no EPC Prediction fixture at {_FIXTURE}")
+    return evaluate_component_accuracy(load_corpus(_FIXTURE))
+
+
+def test_fixture_yields_the_expected_target_count(
+    accuracy: ComponentAccuracy,
+) -> None:
+    # The frozen fixture must still produce its full set of SAP-10.2 targets — a
+    # drop means the fixture or the target filter changed.
+    assert accuracy.targets >= 36
+
+
+@pytest.mark.parametrize("component,floor", sorted(_RATE_FLOORS.items()))
+def test_classification_rate_does_not_regress(
+    accuracy: ComponentAccuracy, component: str, floor: float
+) -> None:
+    # Arrange / Act
+    rate = accuracy.rate(component)
+
+    # Assert — the component is still applicable and at or above its floor.
+    assert rate is not None, f"{component} had no applicable targets"
+    assert rate >= floor - _TOLERANCE, (
+        f"{component} classification regressed: {rate:.4f} < floor {floor:.4f}"
+    )
+
+
+@pytest.mark.parametrize("component,ceiling", sorted(_RESIDUAL_CEILINGS.items()))
+def test_residual_does_not_regress(
+    accuracy: ComponentAccuracy, component: str, ceiling: float
+) -> None:
+    # Arrange / Act
+    mean_abs = accuracy.mean_abs_residual(component)
+
+    # Assert — the mean absolute residual is at or below its ceiling.
+    assert mean_abs is not None, f"{component} had no residuals"
+    assert mean_abs <= ceiling + _TOLERANCE, (
+        f"{component} residual regressed: {mean_abs:.4f} > ceiling {ceiling:.4f}"
+    )