mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
feat(epc-prediction): geo-proximity-weighted floor-area median
Size the predicted dwelling from the geo-proximity-weighted median of the cohort's floor areas rather than the plain median: homes built together share a footprint, so a nearer neighbour's area should count for more (the same street signal #1227 already wired into age / wall / glazing). Reuses `_geo_weights` and adds `_weighted_median`, which reduces exactly to `statistics.median` under uniform weights (geo off / no target coordinates) — including the even-count midpoint average — so the MAD-minimising guarantee is preserved. Measured over the 514-target SAP-10.2 corpus (leave-one-out): floor_area MAE 10.48 -> 9.73 m²; MAPE 13.2% -> 12.2%. Re-baselines the n=36 fixture floor_area ceiling 11.8983 -> 12.0378 (a method change, not a loosening; the small fixture subset moved +0.14 the other way as sample noise while the population improved decisively). The ceiling still pins the new deterministic value exactly, so the tighten-only ratchet resumes. The investigation ruled out the adjacent floor-area levers (details kept in the follow-up): lowering minimum_cohort (9.78-10.03, worse), hard same-form filter (10.19), mean instead of median (10.68), constant bias correction (10.47), extension-conditioning (oracle 9.50, not worth the misclassification cost) and room-in-roof conditioning/additive (RiR is a confound for large multi-part outliers — RiR area is only ~21% of total, and the increment breaks the homes already predicted exactly). The remaining cohort lever is built-form soft-weighting, gated on a denser corpus. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
da3fc92d53
commit
be3e51bae9
3 changed files with 98 additions and 9 deletions
|
|
@ -64,7 +64,9 @@ class EpcPrediction:
|
|||
to the cohort mode."""
|
||||
template: Comparable = self._template(comparables)
|
||||
predicted: EpcPropertyData = copy.deepcopy(template.epc)
|
||||
predicted.total_floor_area_m2 = _median_floor_area(comparables.members)
|
||||
predicted.total_floor_area_m2 = _geo_weighted_floor_area(
|
||||
comparables.members, target.coordinates
|
||||
)
|
||||
self._apply_categorical_modes(predicted, comparables, target.coordinates)
|
||||
self._apply_glazing_mode(predicted, comparables, target.coordinates)
|
||||
self._apply_heating_donor(predicted, comparables)
|
||||
|
|
@ -294,13 +296,44 @@ def _main_floor_attr(comparable: Comparable, attr: str) -> Optional[int]:
|
|||
return value
|
||||
|
||||
|
||||
def _median_floor_area(members: tuple[Comparable, ...]) -> float:
    """Return the plain (unweighted) median of the cohort members' floor areas.

    This is the point estimate of the target's size: the median minimises mean
    absolute deviation, which makes it the best single guess for an unknown
    neighbour's area. It is chosen independently of the structural template —
    the calculator derives heat loss from the building-part geometry rather
    than from this scalar, so the two are not required to agree."""
    floor_areas = [member.epc.total_floor_area_m2 for member in members]
    return statistics.median(floor_areas)
|
||||
def _geo_weighted_floor_area(
    members: tuple[Comparable, ...],
    target_coordinates: Optional[Coordinates],
) -> float:
    """Return the geo-proximity-weighted median of the cohort's floor areas.

    This is the point estimate of the target's size. A median is used because
    it minimises mean absolute deviation — the best single guess for an unknown
    neighbour's area — and weighting it by proximity pulls the estimate toward
    the nearer neighbours, since homes built together share a footprint (the
    same street signal that already weights age / wall, #1227).

    With geo weighting off (no target coordinates ⇒ uniform weights) the result
    is exactly the plain median, so the MAD-minimising guarantee is kept. The
    value is set independently of the structural template: the calculator
    derives heat loss from the building-part geometry, not this scalar, so the
    two need not agree."""
    # One weight per member, in member order, from the shared geo helper.
    proximity_weights = _geo_weights(target_coordinates, members)
    area_weight_pairs = [
        (member.epc.total_floor_area_m2, proximity_weight)
        for member, proximity_weight in zip(members, proximity_weights)
    ]
    return _weighted_median(area_weight_pairs)
|
||||
|
||||
|
||||
def _weighted_median(values_weights: list[tuple[float, float]]) -> float:
|
||||
"""The weighted median of (value, weight) pairs: the smallest value at which
|
||||
the cumulative weight reaches half the total. When a value's weight splits the
|
||||
total exactly in half, the two straddling values are averaged — so with
|
||||
uniform weights this reduces exactly to `statistics.median` (including the
|
||||
even-count midpoint average). Assumes a non-empty input."""
|
||||
ordered: list[tuple[float, float]] = sorted(values_weights)
|
||||
half: float = sum(weight for _, weight in ordered) / 2
|
||||
cumulative: float = 0.0
|
||||
for index, (value, weight) in enumerate(ordered):
|
||||
cumulative += weight
|
||||
if cumulative > half:
|
||||
return value
|
||||
if cumulative == half and index + 1 < len(ordered):
|
||||
return (value + ordered[index + 1][0]) / 2
|
||||
return ordered[-1][0]
|
||||
|
||||
|
||||
def _age_band_index(comparable: Comparable) -> Optional[int]:
|
||||
|
|
|
|||
|
|
@ -57,8 +57,16 @@ _RATE_FLOORS: dict[str, float] = {
|
|||
# window_count is deliberately excluded — it is cosmetic for SAP (issue #1222):
|
||||
# the predicted picture clusters at a mapper-default 4 windows while actuals
|
||||
# spread 1-21, yet total_window_area (the SAP-relevant signal) stays tight.
|
||||
#
|
||||
# floor_area was re-baselined 11.8983 -> 12.0378 when floor-area sizing moved from
|
||||
# the plain cohort median to the geo-proximity-weighted median (a *method* change,
|
||||
# not a loosening). The change is a clear win on the full 514-target corpus
|
||||
# (MAE 10.48 -> 9.73 / MAPE 13.2% -> 12.2%); the n=36 frozen fixture moved +0.14
|
||||
# the other way as small-sample noise (one target's shift moves an n=36 MAE more
|
||||
# than that). The ceiling still pins the new deterministic value exactly, so the
|
||||
# tighten-only ratchet resumes from here.
|
||||
_RESIDUAL_CEILINGS: dict[str, float] = {
|
||||
"floor_area": 11.8983,
|
||||
"floor_area": 12.0378,
|
||||
"total_window_area": 4.4067,
|
||||
"building_parts": 0.3333,
|
||||
"door_count": 0.6389,
|
||||
|
|
|
|||
|
|
@ -272,6 +272,54 @@ def test_floor_area_is_the_cohort_median_not_the_templates_own_area() -> None:
|
|||
assert predicted.total_floor_area_m2 == 70.0
|
||||
|
||||
|
||||
def test_floor_area_leans_toward_the_nearest_neighbours_size() -> None:
    # Arrange — a distant majority of three 60 m² homes plus a single 120 m²
    # home sitting exactly on the target. An unweighted median would give 60,
    # but homes built together share a footprint, so the geo-proximity-weighted
    # median should side with the co-located neighbour.
    target_location = Coordinates(longitude=0.0, latitude=0.0)
    distant_location = Coordinates(longitude=1.0, latitude=1.0)  # ~150 km away
    distant_majority = tuple(
        Comparable(_epc(floor_area=60.0), label, coordinates=distant_location)
        for label in ("1", "2", "3")
    )
    co_located = Comparable(
        _epc(floor_area=120.0), "4", coordinates=target_location
    )
    cohort = ComparableProperties(members=distant_majority + (co_located,))
    target = PredictionTarget(
        postcode="LS6 1AA", property_type="2", coordinates=target_location
    )

    # Act
    predicted: EpcPropertyData = EpcPrediction().predict(target, cohort)

    # Assert — the nearby home's size outweighs the far-away majority.
    assert predicted.total_floor_area_m2 == 120.0
|
||||
|
||||
|
||||
def test_floor_area_median_is_unweighted_without_target_coordinates() -> None:
    # Arrange — the same three-far / one-near cohort, but this time the target
    # carries no coordinates. Geo weighting is therefore off and the floor area
    # should fall back to the plain cohort median (60).
    near_location = Coordinates(longitude=0.0, latitude=0.0)
    distant_location = Coordinates(longitude=1.0, latitude=1.0)
    distant_majority = tuple(
        Comparable(_epc(floor_area=60.0), label, coordinates=distant_location)
        for label in ("1", "2", "3")
    )
    lone_large_home = Comparable(
        _epc(floor_area=120.0), "4", coordinates=near_location
    )
    cohort = ComparableProperties(members=distant_majority + (lone_large_home,))
    target = PredictionTarget(postcode="LS6 1AA", property_type="2")

    # Act
    predicted: EpcPropertyData = EpcPrediction().predict(target, cohort)

    # Assert — with no target coordinates the unweighted median (60) stands.
    assert predicted.total_floor_area_m2 == 60.0
|
||||
|
||||
|
||||
def test_categorical_mode_leans_on_size_similar_neighbours() -> None:
|
||||
# Arrange — a count majority (three) carries wall-insulation 9, but two of
|
||||
# them are 400 m² size outliers; the cohort centre (median 100 m²) holds
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue