feat(epc-prediction): cohort-median floor-area estimate (#1223)

Per-component method, not a global template change: the predicted floor
area is now the cohort median (the point estimate of the target's size that
minimises mean absolute deviation) rather than the area of whichever
structural template happens to be selected. The calculator derives heat loss
from building-part geometry, not from this scalar, so decoupling the two is
safe and the scalar becomes a better size estimate.

floor_area mean|.|: corpus (150pc/514 targets) 10.62 -> 10.48; fixture
12.2175 -> 11.8983 (ceiling ratcheted down). No other component moves.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-15 13:30:33 +00:00
parent 6e9f831296
commit 51cdc25ce8
3 changed files with 38 additions and 3 deletions

View file

@ -56,10 +56,13 @@ class EpcPrediction:
self, target: PredictionTarget, comparables: ComparableProperties
) -> EpcPropertyData:
"""Predict the target's EPC picture: copy a representative template's
structure (coherent for the calculator), then set the homogeneous
categoricals to the cohort mode."""
structure (coherent for the calculator), set the predicted floor area to
the cohort median (the best point estimate of the target's size, decoupled
from the one template's own area), then set the homogeneous categoricals
to the cohort mode."""
template: Comparable = self._template(comparables)
predicted: EpcPropertyData = copy.deepcopy(template.epc)
predicted.total_floor_area_m2 = _median_floor_area(comparables.members)
self._apply_categorical_modes(predicted, comparables)
self._apply_overrides(predicted, target)
return predicted
@ -213,6 +216,15 @@ def _main_floor_attr(comparable: Comparable, attr: str) -> Optional[int]:
return value
def _median_floor_area(members: tuple[Comparable, ...]) -> float:
    """Return the median of the cohort members' total floor areas.

    The result serves as the point estimate of the target's size: the median
    is the value that minimises the mean absolute deviation from the members'
    areas, which makes it the best single guess for an unknown neighbour. It
    is computed from the cohort alone and deliberately not tied to the
    structural template's own area; the calculator derives heat loss from the
    building-part geometry rather than from this scalar, so the two are
    allowed to disagree.

    Raises:
        statistics.StatisticsError: if ``members`` is empty.
    """
    # Collect each member's lodged floor area, then let the stdlib pick the
    # middle value (or the mean of the two middle values for an even cohort).
    floor_areas = [member.epc.total_floor_area_m2 for member in members]
    return statistics.median(floor_areas)
def _age_band_index(comparable: Comparable) -> Optional[int]:
"""The main building part's construction-age-band position (A=0 … L=11), or
None when no recognisable band is lodged."""

View file

@ -57,7 +57,7 @@ _RATE_FLOORS: dict[str, float] = {
# the predicted picture clusters at a mapper-default 4 windows while actuals
# spread 1-21, yet total_window_area (the SAP-relevant signal) stays tight.
_RESIDUAL_CEILINGS: dict[str, float] = {
"floor_area": 12.2175,
"floor_area": 11.8983,
"total_window_area": 4.4067,
"building_parts": 0.3333,
"door_count": 0.6389,

View file

@ -226,6 +226,29 @@ def test_recency_weights_roof_insulation_mode() -> None:
assert predicted.sap_building_parts[0].roof_insulation_thickness == 300
def test_floor_area_is_the_cohort_median_not_the_templates_own_area() -> None:
    # Arrange — four members, so the cohort median (70) lies midway between
    # the two central areas and no member sits exactly on it. The
    # size-representative template is the first member closest to the median
    # (60 m²), so its own area differs from the median. The predicted floor
    # area is a point estimate of the target's size and should therefore be
    # the cohort median (which minimises mean absolute deviation), regardless
    # of which template seeds the structure.
    member_areas = (40.0, 60.0, 80.0, 100.0)
    cohort = _cohort(*(_epc(floor_area=area) for area in member_areas))
    predictor = EpcPrediction()
    target = PredictionTarget(postcode="LS6 1AA", property_type="2")

    # Act
    result: EpcPropertyData = predictor.predict(target, cohort)

    # Assert — the cohort median (70) wins over the template's own 60.
    assert result.total_floor_area_m2 == 70.0
def test_categorical_mode_leans_on_size_similar_neighbours() -> None:
# Arrange — a count majority (three) carries wall-insulation 9, but two of
# them are 400 m² size outliers; the cohort centre (median 100 m²) holds