From a5b7310911d628a3248a607569ec582b07122e29 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 14 Jun 2026 09:45:22 +0000 Subject: [PATCH] feat(epc-prediction): recency-weighted mode for roof insulation (ADR-0029/0030) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Investigated recency-weighting (weight cohort votes by an exponential decay in cert age). Key finding: it must be SELECTIVE. On the validation corpus it HURTS permanent categoricals (wall 91.2->89.5, age 78.5->75.7 — discards still-valid data) but clearly HELPS time-varying ones, where a recent neighbour reflects the current physical state: roof_insulation_thickness 56.7 -> 60.7% corpus (+4pp) 29.4 -> 41.2% fixture (+12pp) So apply a recency-weighted mode only to roof_insulation_thickness (loft top-ups happen over time); keep the plain mode for permanent categoricals. tau = 4yr (~2.8yr half-life); falls back to plain mode when no registration dates are lodged. Gate floor ratcheted 0.2941 -> 0.4118. Co-Authored-By: Claude Opus 4.8 --- domain/epc_prediction/epc_prediction.py | 46 ++++++++++++++++++- .../test_component_accuracy_gate.py | 2 +- .../epc_prediction/test_epc_prediction.py | 34 ++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) diff --git a/domain/epc_prediction/epc_prediction.py b/domain/epc_prediction/epc_prediction.py index d522aa56..8d6a9b3c 100644 --- a/domain/epc_prediction/epc_prediction.py +++ b/domain/epc_prediction/epc_prediction.py @@ -10,8 +10,10 @@ logic — deterministic neighbour synthesis, not ML. from __future__ import annotations import copy +import math import statistics -from collections import Counter +from collections import Counter, defaultdict +from datetime import date from typing import Iterable, Optional, Union from datatypes.epc.domain.epc_property_data import ( @@ -71,7 +73,10 @@ class EpcPrediction: main: SapBuildingPart = predicted.sap_building_parts[0] members = comparables.members for attr in _MAIN_PART_CATEGORICALS: - mode = _mode(_main_part_attr(c, attr) for c in members) + if attr in _RECENCY_WEIGHTED_CATEGORICALS: + mode = _recency_weighted_mode(members, attr) + else: + mode = _mode(_main_part_attr(c, attr) for c in members) if mode is not None: setattr(main, attr, mode) floor_dims = main.sap_floor_dimensions @@ -112,6 +117,19 @@ _FLOOR_DIM_CATEGORICALS: tuple[str, ...] = ( "floor_insulation", ) +# Categoricals whose physical value CHANGES over time (e.g. loft top-ups), so a +# recent neighbour reflects the current state better than an old one — these take +# a recency-WEIGHTED mode. Permanent categoricals (wall / age) take the plain +# mode: recency-weighting them was net-negative on the validation corpus (it +# discards data that is still valid). `_RECENCY_TAU_YEARS` is the exponential +# decay constant (≈2.8-year half-life), chosen on the corpus (roof insulation +# +4pp / +12pp on the fixture). +_RECENCY_WEIGHTED_CATEGORICALS: frozenset[str] = frozenset( + {"roof_insulation_thickness"} +) +_RECENCY_TAU_YEARS: float = 4.0 +_DAYS_PER_YEAR: float = 365.0 + def _main_part_attr( comparable: Comparable, attr: str @@ -139,6 +157,30 @@ def _mode( return Counter(present).most_common(1)[0][0] +def _recency_weighted_mode( + members: tuple[Comparable, ...], attr: str +) -> Optional[Union[int, str]]: + """The cohort mode of a main-part attribute, weighting each comparable's vote + by recency — an exponential decay in the cert's age relative to the newest in + the cohort. Newer neighbours dominate, so a stale majority can't outvote the + current state. Falls back to a plain mode when no registration dates are + lodged (all ages 0 ⇒ equal weight).""" + newest: date = max( + (c.registration_date or date.min for c in members), default=date.min + ) + weights: dict[Union[int, str], float] = defaultdict(float) + for comparable in members: + value = _main_part_attr(comparable, attr) + if value is None: + continue + lodged: date = comparable.registration_date or date.min + age_years: float = (newest - lodged).days / _DAYS_PER_YEAR + weights[value] += math.exp(-age_years / _RECENCY_TAU_YEARS) + if not weights: + return None + return max(weights, key=lambda value: weights[value]) + + def _int_mode(values: Iterable[Optional[int]]) -> Optional[int]: """`_mode` narrowed to int-coded fields (keeps pyright strict happy when the target attribute is typed `Optional[int]`).""" diff --git a/tests/domain/epc_prediction/test_component_accuracy_gate.py b/tests/domain/epc_prediction/test_component_accuracy_gate.py index 63841aaf..8edac364 100644 --- a/tests/domain/epc_prediction/test_component_accuracy_gate.py +++ b/tests/domain/epc_prediction/test_component_accuracy_gate.py @@ -43,7 +43,7 @@ _RATE_FLOORS: dict[str, float] = { "has_hot_water_cylinder": 0.8889, "cylinder_insulation_type": 0.1667, "secondary_heating_type": 0.0000, - "roof_insulation_thickness": 0.2941, + "roof_insulation_thickness": 0.4118, "floor_insulation": 0.9062, "has_room_in_roof": 0.8333, "modal_glazing_type": 0.5000, diff --git a/tests/domain/epc_prediction/test_epc_prediction.py b/tests/domain/epc_prediction/test_epc_prediction.py index df1bb8f4..c18e113e 100644 --- a/tests/domain/epc_prediction/test_epc_prediction.py +++ b/tests/domain/epc_prediction/test_epc_prediction.py @@ -5,6 +5,7 @@ homogeneous categoricals to the recency-weighted cohort mode, apply Landlord Overrides on top. Pure domain logic. """ +from datetime import date from typing import Optional, Union from datatypes.epc.domain.epc_property_data import ( @@ -68,6 +69,17 @@ def _cohort(*epcs: EpcPropertyData) -> ComparableProperties: ) +def _dated_cohort( + *dated: tuple[EpcPropertyData, date], +) -> ComparableProperties: + return ComparableProperties( + members=tuple( + Comparable(epc=e, certificate_number=str(i), registration_date=d) + for i, (e, d) in enumerate(dated) + ) + ) + + def test_predicts_a_picture_by_copying_a_representative_template() -> None: # Arrange — a single comparable with a distinctive structure (2 building # parts, 92 m²); with nothing else to go on it is the template. @@ -189,6 +201,28 @@ def test_modes_roof_and_floor_insulation() -> None: assert main.sap_floor_dimensions[0].floor_insulation == 2 +def test_recency_weights_roof_insulation_mode() -> None: + # Arrange — an old majority (three 2015 certs at 100 mm) and a recent + # minority (two 2025 certs at 300 mm). Roof insulation is topped up over + # time, so the recent neighbours reflect the current state: the recency- + # weighted mode must pick 300 over the plain-majority 100. + cohort = _dated_cohort( + (_epc(roof_insulation_thickness=100), date(2015, 1, 1)), + (_epc(roof_insulation_thickness=100), date(2015, 1, 1)), + (_epc(roof_insulation_thickness=100), date(2015, 1, 1)), + (_epc(roof_insulation_thickness=300), date(2025, 1, 1)), + (_epc(roof_insulation_thickness=300), date(2025, 1, 1)), + ) + + # Act + predicted: EpcPropertyData = EpcPrediction().predict( + PredictionTarget(postcode="LS6 1AA", property_type="2"), cohort + ) + + # Assert — recency overrides the stale majority. + assert predicted.sap_building_parts[0].roof_insulation_thickness == 300 + + def test_applies_a_known_wall_override_over_the_mode() -> None: # Arrange — the cohort mode is cavity (1), but we KNOW the target is solid # brick (2), a Landlord Override. The known value must win over the estimate.