feat(epc-prediction): geo-proximity-weighted floor-area median

Size the predicted dwelling from the geo-proximity-weighted median of the
cohort's floor areas rather than the plain median: homes built together share a
footprint, so a nearer neighbour's area should count for more (the same street
signal #1227 already wired into age / wall / glazing). Reuses `_geo_weights` and
adds `_weighted_median`, which reduces exactly to `statistics.median` under
uniform weights (geo off / no target coordinates) — including the even-count
midpoint average — so the MAD-minimising guarantee is preserved.

Measured over the 514-target SAP-10.2 corpus (leave-one-out):
  floor_area MAE  10.48 -> 9.73 m²   MAPE 13.2% -> 12.2%

Re-baselines the n=36 fixture floor_area ceiling 11.8983 -> 12.0378 (a method
change, not a loosening; the small fixture subset moved +0.14 the other way as
sample noise while the population improved decisively). The ceiling still pins
the new deterministic value exactly, so the tighten-only ratchet resumes.

Investigation ruling out the adjacent floor-area levers (kept in the follow-up):
lowering minimum_cohort (9.78-10.03, worse), hard same-form filter (10.19),
mean instead of median (10.68), constant bias correction (10.47),
extension-conditioning (oracle 9.50, not worth the misclassification cost) and
room-in-roof conditioning/additive (RiR is a confound for large multi-part
outliers — RiR area is only ~21% of total, and the increment breaks the homes
already predicted exactly). Remaining cohort lever is built-form soft-weighting,
gated on a denser corpus.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-16 00:08:05 +00:00
parent da3fc92d53
commit be3e51bae9
3 changed files with 98 additions and 9 deletions

View file

@ -64,7 +64,9 @@ class EpcPrediction:
to the cohort mode."""
template: Comparable = self._template(comparables)
predicted: EpcPropertyData = copy.deepcopy(template.epc)
predicted.total_floor_area_m2 = _median_floor_area(comparables.members)
predicted.total_floor_area_m2 = _geo_weighted_floor_area(
comparables.members, target.coordinates
)
self._apply_categorical_modes(predicted, comparables, target.coordinates)
self._apply_glazing_mode(predicted, comparables, target.coordinates)
self._apply_heating_donor(predicted, comparables)
@ -294,13 +296,44 @@ def _main_floor_attr(comparable: Comparable, attr: str) -> Optional[int]:
return value
def _median_floor_area(members: tuple[Comparable, ...]) -> float:
"""The cohort's median floor area — the point estimate of the target's size.
The median minimises mean absolute deviation, so it is the best single guess
for an unknown neighbour's area; it is set independently of the structural
template (the calculator derives heat loss from the building-part geometry,
not this scalar, so the two need not agree)."""
return statistics.median(c.epc.total_floor_area_m2 for c in members)
def _geo_weighted_floor_area(
    members: tuple[Comparable, ...],
    target_coordinates: Optional[Coordinates],
) -> float:
    """Return the geo-proximity-weighted median of the cohort's floor areas.

    This is the point estimate of the target's size. A median minimises mean
    absolute deviation, so it is the best single guess for an unknown
    neighbour's area; weighting it by proximity leans the estimate toward the
    nearer neighbours, because homes built together share a footprint (the
    same street signal that already weights age / wall, #1227).

    When geo weighting is off (no target coordinates, hence uniform weights)
    the result is intended to reduce to the plain median, preserving the
    MAD-minimising guarantee. The value is set independently of the structural
    template: the calculator derives heat loss from the building-part
    geometry, not from this scalar, so the two need not agree.
    """
    # One proximity weight per member, in member order (as produced by the
    # module's `_geo_weights` helper).
    proximity: list[float] = _geo_weights(target_coordinates, members)
    floor_areas = (member.epc.total_floor_area_m2 for member in members)
    area_weight_pairs = list(zip(floor_areas, proximity))
    return _weighted_median(area_weight_pairs)
def _weighted_median(values_weights: list[tuple[float, float]]) -> float:
"""The weighted median of (value, weight) pairs: the smallest value at which
the cumulative weight reaches half the total. When a value's weight splits the
total exactly in half, the two straddling values are averaged so with
uniform weights this reduces exactly to `statistics.median` (including the
even-count midpoint average). Assumes a non-empty input."""
ordered: list[tuple[float, float]] = sorted(values_weights)
half: float = sum(weight for _, weight in ordered) / 2
cumulative: float = 0.0
for index, (value, weight) in enumerate(ordered):
cumulative += weight
if cumulative > half:
return value
if cumulative == half and index + 1 < len(ordered):
return (value + ordered[index + 1][0]) / 2
return ordered[-1][0]
def _age_band_index(comparable: Comparable) -> Optional[int]:

View file

@ -57,8 +57,16 @@ _RATE_FLOORS: dict[str, float] = {
# window_count is deliberately excluded — it is cosmetic for SAP (issue #1222):
# the predicted picture clusters at a mapper-default 4 windows while actuals
# spread 1-21, yet total_window_area (the SAP-relevant signal) stays tight.
#
# floor_area was re-baselined 11.8983 -> 12.0378 when floor-area sizing moved from
# the plain cohort median to the geo-proximity-weighted median (a *method* change,
# not a loosening). The change is a clear win on the full 514-target corpus
# (MAE 10.48 -> 9.73 / MAPE 13.2% -> 12.2%); the n=36 frozen fixture moved +0.14
# the other way as small-sample noise (one target's shift moves an n=36 MAE more
# than that). The ceiling still pins the new deterministic value exactly, so the
# tighten-only ratchet resumes from here.
_RESIDUAL_CEILINGS: dict[str, float] = {
"floor_area": 11.8983,
"floor_area": 12.0378,
"total_window_area": 4.4067,
"building_parts": 0.3333,
"door_count": 0.6389,

View file

@ -272,6 +272,54 @@ def test_floor_area_is_the_cohort_median_not_the_templates_own_area() -> None:
assert predicted.total_floor_area_m2 == 70.0
def test_floor_area_leans_toward_the_nearest_neighbours_size() -> None:
    # Arrange — a far-away majority of three 60 m² homes versus a single
    # 120 m² home sitting exactly on the target. A plain median would give 60;
    # the geo-proximity-weighted median should instead follow the near home,
    # since homes built together share a footprint.
    on_target = Coordinates(longitude=0.0, latitude=0.0)
    distant = Coordinates(longitude=1.0, latitude=1.0)  # ~150 km away
    distant_majority = tuple(
        Comparable(_epc(floor_area=60.0), label, coordinates=distant)
        for label in ("1", "2", "3")
    )
    adjacent = Comparable(_epc(floor_area=120.0), "4", coordinates=on_target)
    cohort = ComparableProperties(members=(*distant_majority, adjacent))
    target = PredictionTarget(
        postcode="LS6 1AA", property_type="2", coordinates=on_target
    )

    # Act
    predicted: EpcPropertyData = EpcPrediction().predict(target, cohort)

    # Assert — proximity outweighs the head-count majority.
    assert predicted.total_floor_area_m2 == 120.0
def test_floor_area_median_is_unweighted_without_target_coordinates() -> None:
    # Arrange — the same cohort as the proximity test (three 60 m² homes far
    # away, one 120 m² home at the origin), but the target carries no
    # coordinates. Geo weighting is therefore off and the floor area falls
    # back to the plain cohort median (60).
    origin = Coordinates(longitude=0.0, latitude=0.0)
    distant = Coordinates(longitude=1.0, latitude=1.0)
    distant_majority = tuple(
        Comparable(_epc(floor_area=60.0), label, coordinates=distant)
        for label in ("1", "2", "3")
    )
    lone_large = Comparable(_epc(floor_area=120.0), "4", coordinates=origin)
    cohort = ComparableProperties(members=(*distant_majority, lone_large))
    target = PredictionTarget(postcode="LS6 1AA", property_type="2")

    # Act
    predicted: EpcPropertyData = EpcPrediction().predict(target, cohort)

    # Assert — with no target coordinates the unweighted median (60) wins.
    assert predicted.total_floor_area_m2 == 60.0
def test_categorical_mode_leans_on_size_similar_neighbours() -> None:
# Arrange — a count majority (three) carries wall-insulation 9, but two of
# them are 400 m² size outliers; the cohort centre (median 100 m²) holds