feat(epc-prediction): geo-proximity-weighted floor-area median

Size the predicted dwelling from the geo-proximity-weighted median of the cohort's floor areas rather than the plain median: homes built together share a footprint, so a nearer neighbour's area should count for more (the same street signal #1227 already wired into age / wall / glazing). Reuses `_geo_weights` and adds `_weighted_median`, which reduces exactly to `statistics.median` under uniform weights (geo off / no target coordinates) — including the even-count midpoint average — so the MAD-minimising guarantee is preserved. Measured over the 514-target SAP-10.2 corpus (leave-one-out): floor_area MAE 10.48 -> 9.73 m² MAPE 13.2% -> 12.2% Re-baselines the n=36 fixture floor_area ceiling 11.8983 -> 12.0378 (a method change, not a loosening; the small fixture subset moved +0.14 the other way as sample noise while the population improved decisively). The ceiling still pins the new deterministic value exactly, so the tighten-only ratchet resumes. Investigation ruling out the adjacent floor-area levers (kept in the follow-up): lowering minimum_cohort (9.78-10.03, worse), hard same-form filter (10.19), mean instead of median (10.68), constant bias correction (10.47), extension-conditioning (oracle 9.50, not worth the misclassification cost) and room-in-roof conditioning/additive (RiR is a confound for large multi-part outliers — RiR area is only ~21% of total, and the increment breaks the homes already predicted exactly). Remaining cohort lever is built-form soft-weighting, gated on a denser corpus. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 13:10:47 +00:00 · 2026-06-16 00:08:05 +00:00 · 2026-06-16 00:08:05 +00:00 · be3e51bae9
commit be3e51bae9
parent da3fc92d53
3 changed files with 98 additions and 9 deletions
--- a/domain/epc_prediction/epc_prediction.py
+++ b/domain/epc_prediction/epc_prediction.py
@ -64,7 +64,9 @@ class EpcPrediction:
        to the cohort mode."""
        template: Comparable = self._template(comparables)
        predicted: EpcPropertyData = copy.deepcopy(template.epc)
-        predicted.total_floor_area_m2 = _median_floor_area(comparables.members)
+        predicted.total_floor_area_m2 = _geo_weighted_floor_area(
+            comparables.members, target.coordinates
+        )
        self._apply_categorical_modes(predicted, comparables, target.coordinates)
        self._apply_glazing_mode(predicted, comparables, target.coordinates)
        self._apply_heating_donor(predicted, comparables)
@ -294,13 +296,44 @@ def _main_floor_attr(comparable: Comparable, attr: str) -> Optional[int]:
    return value


-def _median_floor_area(members: tuple[Comparable, ...]) -> float:
-    """The cohort's median floor area — the point estimate of the target's size.
-    The median minimises mean absolute deviation, so it is the best single guess
-    for an unknown neighbour's area; it is set independently of the structural
-    template (the calculator derives heat loss from the building-part geometry,
-    not this scalar, so the two need not agree)."""
-    return statistics.median(c.epc.total_floor_area_m2 for c in members)
+def _geo_weighted_floor_area(
+    members: tuple[Comparable, ...],
+    target_coordinates: Optional[Coordinates],
+) -> float:
+    """The cohort's geo-proximity-weighted median floor area — the point estimate
+    of the target's size. The median minimises mean absolute deviation, so it is
+    the best single guess for an unknown neighbour's area; geo-weighting it leans
+    the estimate toward the nearer neighbours, because homes built together share
+    a footprint (the same street signal that already weights age / wall, #1227).
+    Reduces exactly to the plain median when geo weighting is off (no target
+    coordinates ⇒ uniform weights), preserving the MAD-minimising guarantee. Set
+    independently of the structural template (the calculator derives heat loss
+    from the building-part geometry, not this scalar, so the two need not agree)."""
+    weights: list[float] = _geo_weights(target_coordinates, members)
+    return _weighted_median(
+        [
+            (comparable.epc.total_floor_area_m2, weight)
+            for comparable, weight in zip(members, weights)
+        ]
+    )
+
+
+def _weighted_median(values_weights: list[tuple[float, float]]) -> float:
+    """The weighted median of (value, weight) pairs: the smallest value at which
+    the cumulative weight reaches half the total. When a value's weight splits the
+    total exactly in half, the two straddling values are averaged — so with
+    uniform weights this reduces exactly to `statistics.median` (including the
+    even-count midpoint average). Assumes a non-empty input."""
+    ordered: list[tuple[float, float]] = sorted(values_weights)
+    half: float = sum(weight for _, weight in ordered) / 2
+    cumulative: float = 0.0
+    for index, (value, weight) in enumerate(ordered):
+        cumulative += weight
+        if cumulative > half:
+            return value
+        if cumulative == half and index + 1 < len(ordered):
+            return (value + ordered[index + 1][0]) / 2
+    return ordered[-1][0]


 def _age_band_index(comparable: Comparable) -> Optional[int]:
--- a/tests/domain/epc_prediction/test_component_accuracy_gate.py
+++ b/tests/domain/epc_prediction/test_component_accuracy_gate.py
@ -57,8 +57,16 @@ _RATE_FLOORS: dict[str, float] = {
 # window_count is deliberately excluded — it is cosmetic for SAP (issue #1222):
 # the predicted picture clusters at a mapper-default 4 windows while actuals
 # spread 1-21, yet total_window_area (the SAP-relevant signal) stays tight.
+#
+# floor_area was re-baselined 11.8983 -> 12.0378 when floor-area sizing moved from
+# the plain cohort median to the geo-proximity-weighted median (a *method* change,
+# not a loosening). The change is a clear win on the full 514-target corpus
+# (MAE 10.48 -> 9.73 / MAPE 13.2% -> 12.2%); the n=36 frozen fixture moved +0.14
+# the other way as small-sample noise (one target's shift moves an n=36 MAE more
+# than that). The ceiling still pins the new deterministic value exactly, so the
+# tighten-only ratchet resumes from here.
 _RESIDUAL_CEILINGS: dict[str, float] = {
-    "floor_area": 11.8983,
+    "floor_area": 12.0378,
    "total_window_area": 4.4067,
    "building_parts": 0.3333,
    "door_count": 0.6389,
--- a/tests/domain/epc_prediction/test_epc_prediction.py
+++ b/tests/domain/epc_prediction/test_epc_prediction.py
@ -272,6 +272,54 @@ def test_floor_area_is_the_cohort_median_not_the_templates_own_area() -> None:
    assert predicted.total_floor_area_m2 == 70.0


+def test_floor_area_leans_toward_the_nearest_neighbours_size() -> None:
+    # Arrange — three FAR neighbours are 60 m²; one neighbour AT the target is
+    # 120 m². The plain median would be 60, but homes built together share a
+    # footprint, so the geo-proximity-weighted median leans toward the near
+    # neighbour's size.
+    here = Coordinates(longitude=0.0, latitude=0.0)
+    far = Coordinates(longitude=1.0, latitude=1.0)  # ~150 km away
+    cohort = ComparableProperties(
+        members=(
+            Comparable(_epc(floor_area=60.0), "1", coordinates=far),
+            Comparable(_epc(floor_area=60.0), "2", coordinates=far),
+            Comparable(_epc(floor_area=60.0), "3", coordinates=far),
+            Comparable(_epc(floor_area=120.0), "4", coordinates=here),
+        )
+    )
+    target = PredictionTarget(
+        postcode="LS6 1AA", property_type="2", coordinates=here
+    )
+
+    # Act
+    predicted: EpcPropertyData = EpcPrediction().predict(target, cohort)
+
+    # Assert — the near neighbour's size dominates the far majority.
+    assert predicted.total_floor_area_m2 == 120.0
+
+
+def test_floor_area_median_is_unweighted_without_target_coordinates() -> None:
+    # Arrange — identical cohort, but the target has no coordinates, so geo
+    # weighting is off and the floor area reduces to the plain cohort median (60).
+    here = Coordinates(longitude=0.0, latitude=0.0)
+    far = Coordinates(longitude=1.0, latitude=1.0)
+    cohort = ComparableProperties(
+        members=(
+            Comparable(_epc(floor_area=60.0), "1", coordinates=far),
+            Comparable(_epc(floor_area=60.0), "2", coordinates=far),
+            Comparable(_epc(floor_area=60.0), "3", coordinates=far),
+            Comparable(_epc(floor_area=120.0), "4", coordinates=here),
+        )
+    )
+    target = PredictionTarget(postcode="LS6 1AA", property_type="2")
+
+    # Act
+    predicted: EpcPropertyData = EpcPrediction().predict(target, cohort)
+
+    # Assert — without target coordinates, the plain median (60) wins.
+    assert predicted.total_floor_area_m2 == 60.0
+
+
 def test_categorical_mode_leans_on_size_similar_neighbours() -> None:
    # Arrange — a count majority (three) carries wall-insulation 9, but two of
    # them are 400 m² size outliers; the cohort centre (median 100 m²) holds