feat(epc-prediction): ±1-band age scoring + window_count cosmetic (#1222)

Measurement honesty so we optimise SAP-relevant accuracy, not SAP-neutral misses (ADR-0030 Component Accuracy):

- Add construction_age_band_pm1: an exact-or-adjacent-band hit. Adjacent RdSAP age bands carry near-identical U-values, so an off-by-one is ~SAP-neutral. Full corpus: exact 78.5% but ±1-band 91.7% (fixture 63.9% -> 83.3%) — most age misses are adjacent.
- Drop window_count from the gate's residual ceilings (cosmetic): the predicted picture clusters at a mapper-default 4 windows vs actuals 1-21, but total_window_area (the SAP-relevant signal) stays tight at ~3.4 m2.

Gate: + construction_age_band_pm1 floor 0.8333; window_count no longer gated.

Closes #1222

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 13:10:47 +00:00 · 2026-06-14 10:01:20 +00:00 · 2026-06-14 10:01:20 +00:00 · ffaedd8d14
commit ffaedd8d14
parent a5b7310911
3 changed files with 62 additions and 1 deletions
--- a/domain/epc_prediction/prediction_comparison.py
+++ b/domain/epc_prediction/prediction_comparison.py
@ -59,6 +59,32 @@ def _classify(predicted: object, actual: object) -> Optional[bool]:
    return predicted == actual
 # RdSAP construction age bands, oldest → newest. Adjacent bands carry near-
 # identical U-values, so an off-by-one is treated as a (SAP-neutral) ±1 hit.
 _AGE_BAND_ORDER: str = "ABCDEFGHIJKL"
 def _age_band_within_one(predicted: object, actual: object) -> Optional[bool]:
    """A ±1-band age hit: None when the actual is absent, True on an exact or
    adjacent-band match, else False (issue #1222 — exact match overstates the
    SAP impact of age-band misses)."""
    if actual is None:
        return None
    if predicted == actual:
        return True
    if (
        isinstance(predicted, str)
        and isinstance(actual, str)
        and predicted in _AGE_BAND_ORDER
        and actual in _AGE_BAND_ORDER
    ):
        return (
            abs(_AGE_BAND_ORDER.index(predicted) - _AGE_BAND_ORDER.index(actual))
            <= 1
        )
    return False
 def _main_heating_detail(epc: EpcPropertyData) -> Optional[MainHeatingDetail]:
    """The primary heating system's detail row, or None when none is lodged."""
    details = epc.sap_heating.main_heating_details
@ -179,6 +205,10 @@ def compare_prediction(
            _main(predicted).construction_age_band,
            _main(actual).construction_age_band,
        ),
        "construction_age_band_pm1": _age_band_within_one(
            _main(predicted).construction_age_band,
            _main(actual).construction_age_band,
        ),
        "roof_construction": _classify(
            _main(predicted).roof_construction,
            _main(actual).roof_construction,
--- a/tests/domain/epc_prediction/test_component_accuracy_gate.py
+++ b/tests/domain/epc_prediction/test_component_accuracy_gate.py
@ -33,6 +33,7 @@ _RATE_FLOORS: dict[str, float] = {
    "wall_construction": 0.8889,
    "wall_insulation_type": 0.7778,
    "construction_age_band": 0.6389,
    "construction_age_band_pm1": 0.8333,
    "roof_construction": 0.7222,
    "floor_construction": 0.7500,
    "heating_main_fuel": 0.9722,
@ -52,9 +53,11 @@ _RATE_FLOORS: dict[str, float] = {
 }
 # Maximum mean absolute residual per numeric component (ratchet ceilings).
 # window_count is deliberately excluded — it is cosmetic for SAP (issue #1222):
 # the predicted picture clusters at a mapper-default 4 windows while actuals
 # spread 1-21, yet total_window_area (the SAP-relevant signal) stays tight.
 _RESIDUAL_CEILINGS: dict[str, float] = {
    "floor_area": 12.2175,
    "window_count": 3.8889,
    "total_window_area": 4.4067,
    "building_parts": 0.3333,
    "door_count": 0.6389,
--- a/tests/domain/epc_prediction/test_prediction_comparison.py
+++ b/tests/domain/epc_prediction/test_prediction_comparison.py
@ -97,6 +97,34 @@ def _epc(
    return epc
 def test_scores_age_band_within_one_band() -> None:
    # Arrange — an adjacent pair: the prediction says band K, the actual is
    # band J. Adjacent RdSAP age bands carry near-identical U-values, so this
    # off-by-one is ~SAP-neutral (issue #1222): the exact score must miss
    # while the ±1-band score must hit.
    predicted_epc = _epc(construction_age_band="K")
    actual_epc = _epc(construction_age_band="J")
    # Act
    comparison = compare_prediction(predicted_epc, actual_epc)
    categorical = comparison.categorical_hits
    # Assert
    assert categorical["construction_age_band"] is False
    assert categorical["construction_age_band_pm1"] is True
 def test_age_band_two_apart_misses_both() -> None:
    # Arrange — predicted K, actual I (exactly two bands apart). Two is the
    # first distance outside the ±1 tolerance, so this pins the boundary: it
    # must be a real miss on both the exact and the ±1-band score.
    predicted = _epc(construction_age_band="K")
    actual = _epc(construction_age_band="I")
    # Act
    hits = compare_prediction(predicted, actual).categorical_hits
    # Assert
    assert hits["construction_age_band"] is False
    assert hits["construction_age_band_pm1"] is False
 def test_flags_a_correct_main_wall_construction_classification() -> None:
    # Arrange — predicted and actual agree on cavity (1).
    predicted = _epc(wall_construction=1)