feat(epc-prediction): recency-weighted mode for roof insulation (ADR-0029/0030)

Investigated recency-weighting (weight cohort votes by an exponential decay
in cert age). Key finding: it must be SELECTIVE. On the validation corpus it
HURTS permanent categoricals (wall 91.2->89.5, age 78.5->75.7 — discards
still-valid data) but clearly HELPS time-varying ones, where a recent
neighbour reflects the current physical state:
  roof_insulation_thickness  56.7 -> 60.7%  corpus   (+4pp)
                             29.4 -> 41.2%  fixture  (+12pp)

So apply a recency-weighted mode only to roof_insulation_thickness (loft
top-ups happen over time); keep the plain mode for permanent categoricals.
tau = 4yr (~2.8yr half-life); falls back to plain mode when no registration
dates are lodged. Gate floor ratcheted 0.2941 -> 0.4118.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-14 09:45:22 +00:00
parent 9dd23477ac
commit a5b7310911
3 changed files with 79 additions and 3 deletions

View file

@ -10,8 +10,10 @@ logic — deterministic neighbour synthesis, not ML.
from __future__ import annotations
import copy
import math
import statistics
from collections import Counter
from collections import Counter, defaultdict
from datetime import date
from typing import Iterable, Optional, Union
from datatypes.epc.domain.epc_property_data import (
@ -71,7 +73,10 @@ class EpcPrediction:
main: SapBuildingPart = predicted.sap_building_parts[0]
members = comparables.members
for attr in _MAIN_PART_CATEGORICALS:
mode = _mode(_main_part_attr(c, attr) for c in members)
if attr in _RECENCY_WEIGHTED_CATEGORICALS:
mode = _recency_weighted_mode(members, attr)
else:
mode = _mode(_main_part_attr(c, attr) for c in members)
if mode is not None:
setattr(main, attr, mode)
floor_dims = main.sap_floor_dimensions
@ -112,6 +117,19 @@ _FLOOR_DIM_CATEGORICALS: tuple[str, ...] = (
"floor_insulation",
)
# Categoricals whose physical value CHANGES over time (e.g. loft top-ups), so a
# recent neighbour reflects the current state better than an old one — these take
# a recency-WEIGHTED mode. Permanent categoricals (wall / age) take the plain
# mode: recency-weighting them was net-negative on the validation corpus (it
# discards data that is still valid).
_RECENCY_WEIGHTED_CATEGORICALS: frozenset[str] = frozenset(
{"roof_insulation_thickness"}
)
# Exponential decay constant, in years, of the recency weight
# exp(-age_years / tau); the half-life is tau * ln 2 ≈ 2.8 years. Chosen on the
# validation corpus (roof insulation +4pp on the corpus, +12pp on the fixture).
_RECENCY_TAU_YEARS: float = 4.0
# Divisor that converts a cert's age from whole days to years (leap days are
# ignored).
_DAYS_PER_YEAR: float = 365.0
def _main_part_attr(
comparable: Comparable, attr: str
@ -139,6 +157,30 @@ def _mode(
return Counter(present).most_common(1)[0][0]
def _recency_weighted_mode(
    members: tuple[Comparable, ...], attr: str
) -> Optional[Union[int, str]]:
    """Return the recency-weighted cohort mode of a main-part attribute.

    Every comparable votes for its own value of ``attr`` with weight
    ``exp(-age_years / _RECENCY_TAU_YEARS)``, where ``age_years`` is the gap
    between its registration date and the newest registration date in the
    cohort, at ``_DAYS_PER_YEAR`` days per year. Newer neighbours therefore
    dominate, so a stale majority cannot outvote the current state.

    Comparables whose value is ``None`` do not vote. A missing registration
    date is treated as ``date.min``: when no member is dated every age is 0,
    all votes weigh the same and the result is the plain mode; when only some
    members are dated, the undated ones are aged from ``date.min`` and so
    carry a vanishingly small weight for present-day dates.

    Returns the value with the largest total weight (the first one seen wins
    a tie), or ``None`` when no comparable has a value.
    """
    # Resolve each member's effective lodgement date once, in member order.
    lodged_dates: list[date] = [
        member.registration_date or date.min for member in members
    ]
    newest: date = max(lodged_dates, default=date.min)

    totals: dict[Union[int, str], float] = {}
    for member, lodged in zip(members, lodged_dates):
        vote = _main_part_attr(member, attr)
        if vote is not None:
            age_years: float = (newest - lodged).days / _DAYS_PER_YEAR
            weight = math.exp(-age_years / _RECENCY_TAU_YEARS)
            totals[vote] = totals.get(vote, 0.0) + weight

    # An empty tally means nobody lodged a value for this attribute.
    return max(totals, key=totals.__getitem__) if totals else None
def _int_mode(values: Iterable[Optional[int]]) -> Optional[int]:
"""`_mode` narrowed to int-coded fields (keeps pyright strict happy when the
target attribute is typed `Optional[int]`)."""

View file

@ -43,7 +43,7 @@ _RATE_FLOORS: dict[str, float] = {
"has_hot_water_cylinder": 0.8889,
"cylinder_insulation_type": 0.1667,
"secondary_heating_type": 0.0000,
"roof_insulation_thickness": 0.2941,
"roof_insulation_thickness": 0.4118,
"floor_insulation": 0.9062,
"has_room_in_roof": 0.8333,
"modal_glazing_type": 0.5000,

View file

@ -5,6 +5,7 @@ homogeneous categoricals to the recency-weighted cohort mode, apply Landlord
Overrides on top. Pure domain logic.
"""
from datetime import date
from typing import Optional, Union
from datatypes.epc.domain.epc_property_data import (
@ -68,6 +69,17 @@ def _cohort(*epcs: EpcPropertyData) -> ComparableProperties:
)
def _dated_cohort(
    *dated: tuple[EpcPropertyData, date],
) -> ComparableProperties:
    """Build a cohort from ``(epc, registration_date)`` pairs.

    Each pair becomes one ``Comparable`` whose certificate number is its
    positional index rendered as a string.
    """
    members = []
    for index, (epc, lodged) in enumerate(dated):
        members.append(
            Comparable(
                epc=epc,
                certificate_number=str(index),
                registration_date=lodged,
            )
        )
    return ComparableProperties(members=tuple(members))
def test_predicts_a_picture_by_copying_a_representative_template() -> None:
# Arrange — a single comparable with a distinctive structure (2 building
# parts, 92 m²); with nothing else to go on it is the template.
@ -189,6 +201,28 @@ def test_modes_roof_and_floor_insulation() -> None:
assert main.sap_floor_dimensions[0].floor_insulation == 2
def test_recency_weights_roof_insulation_mode() -> None:
    # Arrange — roof insulation gets topped up over time, so recent certs
    # describe the current state better than old ones. Three 2015 certs at
    # 100 mm form the plain majority; two 2025 certs at 300 mm are the recent
    # minority that the recency-weighted mode is expected to favour.
    stale_majority = [
        (_epc(roof_insulation_thickness=100), date(2015, 1, 1))
        for _ in range(3)
    ]
    recent_minority = [
        (_epc(roof_insulation_thickness=300), date(2025, 1, 1))
        for _ in range(2)
    ]
    cohort = _dated_cohort(*stale_majority, *recent_minority)
    target = PredictionTarget(postcode="LS6 1AA", property_type="2")

    # Act
    predicted: EpcPropertyData = EpcPrediction().predict(target, cohort)

    # Assert — the recent 300 mm votes beat the older 100 mm majority.
    main_part = predicted.sap_building_parts[0]
    assert main_part.roof_insulation_thickness == 300
def test_applies_a_known_wall_override_over_the_mode() -> None:
# Arrange — the cohort mode is cavity (1), but we KNOW the target is solid
# brick (2), a Landlord Override. The known value must win over the estimate.