mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
feat(epc-prediction): recency-weighted mode for roof insulation (ADR-0029/0030)
Investigated recency-weighting (weight cohort votes by an exponential decay
in cert age). Key finding: it must be SELECTIVE. On the validation corpus it
HURTS permanent categoricals (wall 91.2->89.5, age 78.5->75.7 — discards
still-valid data) but clearly HELPS time-varying ones, where a recent
neighbour reflects the current physical state:
roof_insulation_thickness: 56.7 -> 60.7% on the corpus (+4pp),
29.4 -> 41.2% on the fixture (+12pp).
So apply a recency-weighted mode only to roof_insulation_thickness (loft
top-ups happen over time); keep the plain mode for permanent categoricals.
tau = 4yr (~2.8yr half-life); falls back to plain mode when no registration
dates are lodged. Gate floor ratcheted 0.2941 -> 0.4118.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
9dd23477ac
commit
a5b7310911
3 changed files with 79 additions and 3 deletions
|
|
@ -10,8 +10,10 @@ logic — deterministic neighbour synthesis, not ML.
|
|||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import math
|
||||
import statistics
|
||||
from collections import Counter
|
||||
from collections import Counter, defaultdict
|
||||
from datetime import date
|
||||
from typing import Iterable, Optional, Union
|
||||
|
||||
from datatypes.epc.domain.epc_property_data import (
|
||||
|
|
@ -71,7 +73,10 @@ class EpcPrediction:
|
|||
main: SapBuildingPart = predicted.sap_building_parts[0]
|
||||
members = comparables.members
|
||||
for attr in _MAIN_PART_CATEGORICALS:
|
||||
mode = _mode(_main_part_attr(c, attr) for c in members)
|
||||
if attr in _RECENCY_WEIGHTED_CATEGORICALS:
|
||||
mode = _recency_weighted_mode(members, attr)
|
||||
else:
|
||||
mode = _mode(_main_part_attr(c, attr) for c in members)
|
||||
if mode is not None:
|
||||
setattr(main, attr, mode)
|
||||
floor_dims = main.sap_floor_dimensions
|
||||
|
|
@ -112,6 +117,19 @@ _FLOOR_DIM_CATEGORICALS: tuple[str, ...] = (
|
|||
"floor_insulation",
|
||||
)
|
||||
|
||||
# Categoricals whose physical value CHANGES over time (e.g. loft top-ups), so a
|
||||
# recent neighbour reflects the current state better than an old one — these take
|
||||
# a recency-WEIGHTED mode. Permanent categoricals (wall / age) take the plain
|
||||
# mode: recency-weighting them was net-negative on the validation corpus (it
|
||||
# discards data that is still valid). `_RECENCY_TAU_YEARS` is the exponential
|
||||
# decay constant (≈2.8-year half-life), chosen on the corpus (roof insulation
|
||||
# +4pp / +12pp on the fixture).
|
||||
_RECENCY_WEIGHTED_CATEGORICALS: frozenset[str] = frozenset(
|
||||
{"roof_insulation_thickness"}
|
||||
)
|
||||
_RECENCY_TAU_YEARS: float = 4.0
|
||||
_DAYS_PER_YEAR: float = 365.0
|
||||
|
||||
|
||||
def _main_part_attr(
|
||||
comparable: Comparable, attr: str
|
||||
|
|
@ -139,6 +157,30 @@ def _mode(
|
|||
return Counter(present).most_common(1)[0][0]
|
||||
|
||||
|
||||
def _recency_weighted_mode(
    members: tuple[Comparable, ...], attr: str
) -> Optional[Union[int, str]]:
    """Return the recency-weighted cohort mode of main-part attribute `attr`.

    Every comparable that lodges a value for `attr` votes for that value with
    weight ``exp(-age_years / _RECENCY_TAU_YEARS)``, where ``age_years`` is the
    gap between its registration date and the newest registration date found
    anywhere in the cohort. The value with the largest summed weight wins, so
    a stale majority cannot outvote a recent minority.

    A comparable without a registration date is treated as lodged on
    ``date.min``. When nobody in the cohort has a date every age is 0 and all
    votes weigh the same, i.e. the result is the plain mode. When only some
    members are dated, the undated ones end up with a vanishingly small
    weight next to the dated ones.

    Returns ``None`` when no member lodges a value for `attr`. On an exact tie
    the value that was seen first among `members` is returned.
    """
    # Resolve each member's effective lodgement date once; the same dates feed
    # both the cohort-wide reference point and the per-member age below.
    lodged_dates: list[date] = [
        member.registration_date or date.min for member in members
    ]
    reference: date = max(lodged_dates, default=date.min)

    tally: dict[Union[int, str], float] = defaultdict(float)
    for member, lodged in zip(members, lodged_dates):
        vote = _main_part_attr(member, attr)
        if vote is not None:
            age_years: float = (reference - lodged).days / _DAYS_PER_YEAR
            tally[vote] += math.exp(-age_years / _RECENCY_TAU_YEARS)

    # `max` over the dict walks keys in insertion order and keeps the first
    # maximal one, which fixes the tie-break to first-seen.
    return max(tally, key=tally.__getitem__) if tally else None
|
||||
|
||||
|
||||
def _int_mode(values: Iterable[Optional[int]]) -> Optional[int]:
|
||||
"""`_mode` narrowed to int-coded fields (keeps pyright strict happy when the
|
||||
target attribute is typed `Optional[int]`)."""
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ _RATE_FLOORS: dict[str, float] = {
|
|||
"has_hot_water_cylinder": 0.8889,
|
||||
"cylinder_insulation_type": 0.1667,
|
||||
"secondary_heating_type": 0.0000,
|
||||
"roof_insulation_thickness": 0.2941,
|
||||
"roof_insulation_thickness": 0.4118,
|
||||
"floor_insulation": 0.9062,
|
||||
"has_room_in_roof": 0.8333,
|
||||
"modal_glazing_type": 0.5000,
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ homogeneous categoricals to the recency-weighted cohort mode, apply Landlord
|
|||
Overrides on top. Pure domain logic.
|
||||
"""
|
||||
|
||||
from datetime import date
|
||||
from typing import Optional, Union
|
||||
|
||||
from datatypes.epc.domain.epc_property_data import (
|
||||
|
|
@ -68,6 +69,17 @@ def _cohort(*epcs: EpcPropertyData) -> ComparableProperties:
|
|||
)
|
||||
|
||||
|
||||
def _dated_cohort(
    *dated: tuple[EpcPropertyData, date],
) -> ComparableProperties:
    """Build a cohort from `(epc, registration_date)` pairs.

    Each pair becomes one `Comparable`; its certificate number is the pair's
    position in the argument list, rendered as a string.
    """
    comparables = [
        Comparable(
            epc=epc,
            certificate_number=str(position),
            registration_date=lodged_on,
        )
        for position, (epc, lodged_on) in enumerate(dated)
    ]
    return ComparableProperties(members=tuple(comparables))
|
||||
|
||||
|
||||
def test_predicts_a_picture_by_copying_a_representative_template() -> None:
|
||||
# Arrange — a single comparable with a distinctive structure (2 building
|
||||
# parts, 92 m²); with nothing else to go on it is the template.
|
||||
|
|
@ -189,6 +201,28 @@ def test_modes_roof_and_floor_insulation() -> None:
|
|||
assert main.sap_floor_dimensions[0].floor_insulation == 2
|
||||
|
||||
|
||||
def test_recency_weights_roof_insulation_mode() -> None:
    # Arrange — three certs from 2015 at 100 mm form the stale majority; two
    # certs from 2025 at 300 mm form the recent minority. Lofts get topped up
    # over time, so the recent pair describes the current state and the
    # recency-weighted mode has to prefer 300 over the plain-majority 100.
    stale_majority = [
        (_epc(roof_insulation_thickness=100), date(2015, 1, 1))
        for _ in range(3)
    ]
    recent_minority = [
        (_epc(roof_insulation_thickness=300), date(2025, 1, 1))
        for _ in range(2)
    ]
    cohort = _dated_cohort(*stale_majority, *recent_minority)
    target = PredictionTarget(postcode="LS6 1AA", property_type="2")

    # Act
    predicted: EpcPropertyData = EpcPrediction().predict(target, cohort)

    # Assert — the recent minority beats the stale majority.
    main_part = predicted.sap_building_parts[0]
    assert main_part.roof_insulation_thickness == 300
|
||||
|
||||
|
||||
def test_applies_a_known_wall_override_over_the_mode() -> None:
|
||||
# Arrange — the cohort mode is cavity (1), but we KNOW the target is solid
|
||||
# brick (2), a Landlord Override. The known value must win over the estimate.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue