feat(epc-prediction): Tier-1 ratcheting Component Accuracy gate (ADR-0030)

The committed CI gate: run the calculator-free leave-one-out scorer over the
frozen anonymised fixture (36 SAP-10.2 targets) and assert each per-component
classification rate / geometry residual is no worse than a committed baseline.
Prediction is deterministic + the fixture frozen, so the numbers reproduce
exactly — a failure is a real regression, never sample noise.

- 19 rate floors + 5 residual ceilings, seeded at the currently-measured
  values; they only ever tighten (no-widening ethos on an aggregate).
- Calculator-FREE — component floors are the real gate; the end-to-end
  SAP/carbon/PE guards stay out (their floor is the separate API-path
  calculator workstream).
- Skips with a message when the fixture is absent.

25 tests (24 parametrized cases plus the target-count check), all green.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-14 09:19:39 +00:00
parent 008c1922c4
commit e3a2720e5c

View file

@ -0,0 +1,106 @@
"""Tier-1 ratcheting Component Accuracy gate (ADR-0030).
Runs the calculator-free leave-one-out scorer over the committed, anonymised
fixture and asserts each per-component classification rate / geometry residual is
no worse than a committed baseline. Because the prediction is deterministic and
the fixture is frozen, every run reproduces the same numbers exactly — so a
failure means a real *regression* in prediction quality, never sample noise.
The floors / ceilings are the currently-measured values and only ever **tighten**
(the repo's no-tolerance-widening ethos applied to an aggregate): when prediction
improves, ratchet the relevant floor up in the same change. The end-to-end
SAP / carbon / PE guards are deliberately *not* here — they need the calculator,
whose API-path residual is a separate workstream; the component floors are the
real gate (ADR-0030).
"""
from pathlib import Path
import pytest
from domain.epc_prediction.validation import (
ComponentAccuracy,
evaluate_component_accuracy,
)
from harness.epc_prediction_corpus import load_corpus
# Directory holding the committed, anonymised EPC Prediction fixture.
# ``parents[3]`` is three directories above the one containing this file
# (presumably the repository root — confirm if this module is ever moved).
_FIXTURE = Path(__file__).parents[3] / "tests" / "fixtures" / "epc_prediction"
# Minimum classification hit-rate per component (ratchet floors). Tighten — never
# loosen — as prediction improves. Values are the measured rates over the frozen
# 36-target fixture, written to four decimal places; the 1e-3 tolerance applied
# by the test absorbs that rounding only.
# Each key is the component name handed to ``ComponentAccuracy.rate`` and each
# entry becomes one case of ``test_classification_rate_does_not_regress``.
_RATE_FLOORS: dict[str, float] = {
    "wall_construction": 0.8889,
    "wall_insulation_type": 0.7778,
    "construction_age_band": 0.6389,
    "roof_construction": 0.7222,
    "floor_construction": 0.7500,
    "heating_main_fuel": 0.9722,
    "heating_main_category": 0.8889,
    "heating_main_control": 0.7500,
    "water_heating_fuel": 0.9167,
    "water_heating_code": 0.8889,
    "has_hot_water_cylinder": 0.8889,
    "cylinder_insulation_type": 0.1667,
    # A floor of 0 cannot be undercut by a rate in [0, 1], so for this component
    # only the "had no applicable targets" assertion can fail until it is raised.
    "secondary_heating_type": 0.0000,
    "roof_insulation_thickness": 0.1471,
    "floor_insulation": 0.9062,
    "has_room_in_roof": 0.8333,
    "modal_glazing_type": 0.5000,
    # Already at the maximum: these two cannot be ratcheted any further.
    "has_pv": 1.0000,
    "solar_water_heating": 1.0000,
}
# Maximum mean absolute residual per numeric component (ratchet ceilings).
# Lower — never raise — as prediction improves. Each key is the component name
# handed to ``ComponentAccuracy.mean_abs_residual`` and each entry becomes one
# case of ``test_residual_does_not_regress``.
# NOTE(review): units are whatever the scorer reports per component (not visible
# in this module) — confirm before comparing values across components.
_RESIDUAL_CEILINGS: dict[str, float] = {
    "floor_area": 12.2175,
    "window_count": 3.8889,
    "total_window_area": 4.4067,
    "building_parts": 0.3333,
    "door_count": 0.6389,
}
# Slack subtracted from every floor and added to every ceiling before comparing.
# It exists to absorb rounding of the committed baselines, not real regressions.
_TOLERANCE = 1e-3
@pytest.fixture(scope="module")
def accuracy() -> ComponentAccuracy:
    """Score the committed fixture once and share the result across this module.

    Every dependent test is skipped, with a message naming the expected
    location, when the fixture's ``_index.json`` is not present.
    """
    index_path = _FIXTURE / "_index.json"
    if not index_path.exists():
        pytest.skip(f"no EPC Prediction fixture at {_FIXTURE}")

    corpus = load_corpus(_FIXTURE)
    return evaluate_component_accuracy(corpus)
def test_fixture_yields_the_expected_target_count(
    accuracy: ComponentAccuracy,
) -> None:
    """Guard the size of the evaluation set behind the committed baselines.

    The frozen fixture must still produce its full set of SAP-10.2 targets. A
    drop means the fixture or the target filter changed, so the measured rates
    and residuals would no longer be comparable with the committed values.
    """
    # Arrange
    expected_targets = 36

    # Act
    targets = accuracy.targets

    # Assert — report the observed count so a failure is self-explanatory,
    # matching the diagnostic style of the per-component tests in this module.
    assert targets >= expected_targets, (
        f"fixture yielded {targets} targets, expected >= {expected_targets}: "
        "the fixture or the target filter changed"
    )
@pytest.mark.parametrize("component,floor", sorted(_RATE_FLOORS.items()))
def test_classification_rate_does_not_regress(
    accuracy: ComponentAccuracy, component: str, floor: float
) -> None:
    """Check one component's classification rate against its committed floor."""
    # Act — look up the hit-rate the scorer measured for this component.
    measured = accuracy.rate(component)

    # Assert — the component must still have been scored at all ...
    assert measured is not None, f"{component} had no applicable targets"

    # ... and must sit at or above its floor, less the shared tolerance.
    lowest_acceptable = floor - _TOLERANCE
    assert measured >= lowest_acceptable, (
        f"{component} classification regressed: {measured:.4f} < floor {floor:.4f}"
    )
@pytest.mark.parametrize("component,ceiling", sorted(_RESIDUAL_CEILINGS.items()))
def test_residual_does_not_regress(
    accuracy: ComponentAccuracy, component: str, ceiling: float
) -> None:
    """Check one component's mean absolute residual against its ceiling."""
    # Act — look up the mean absolute residual measured for this component.
    observed = accuracy.mean_abs_residual(component)

    # Assert — residuals must still have been recorded for the component ...
    assert observed is not None, f"{component} had no residuals"

    # ... and their mean must sit at or below the ceiling, plus the tolerance.
    highest_acceptable = ceiling + _TOLERANCE
    assert observed <= highest_acceptable, (
        f"{component} residual regressed: {observed:.4f} > ceiling {ceiling:.4f}"
    )