From 51cdc25ce89b7a8740ae44e4cb56108ae08bf4bd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 15 Jun 2026 13:30:33 +0000 Subject: [PATCH] feat(epc-prediction): cohort-median floor-area estimate (#1223) Per-component method, not a global template change: the predicted floor area is now the cohort median (the MAD-minimising point estimate of the target's size) rather than whichever structural template's own area. The calculator derives heat loss from building-part geometry, not this scalar, so decoupling them is safe and the scalar becomes a better size estimate. floor_area mean|.|: corpus (150pc/514 targets) 10.62 -> 10.48; fixture 12.2175 -> 11.8983 (ceiling ratcheted down). No other component moves. Co-Authored-By: Claude Opus 4.8 --- domain/epc_prediction/epc_prediction.py | 16 +++++++++++-- .../test_component_accuracy_gate.py | 2 +- .../epc_prediction/test_epc_prediction.py | 23 +++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/domain/epc_prediction/epc_prediction.py b/domain/epc_prediction/epc_prediction.py index 184159da..f6bf2166 100644 --- a/domain/epc_prediction/epc_prediction.py +++ b/domain/epc_prediction/epc_prediction.py @@ -56,10 +56,13 @@ class EpcPrediction: self, target: PredictionTarget, comparables: ComparableProperties ) -> EpcPropertyData: """Predict the target's EPC picture: copy a representative template's - structure (coherent for the calculator), then set the homogeneous - categoricals to the cohort mode.""" + structure (coherent for the calculator), set the predicted floor area to + the cohort median (the best point estimate of the target's size, decoupled + from the one template's own area), then set the homogeneous categoricals + to the cohort mode.""" template: Comparable = self._template(comparables) predicted: EpcPropertyData = copy.deepcopy(template.epc) + predicted.total_floor_area_m2 = _median_floor_area(comparables.members) self._apply_categorical_modes(predicted, comparables) self._apply_overrides(predicted, target) return predicted @@ -213,6 +216,15 @@ def _main_floor_attr(comparable: Comparable, attr: str) -> Optional[int]: return value +def _median_floor_area(members: tuple[Comparable, ...]) -> float: + """The cohort's median floor area — the point estimate of the target's size. + The median minimises mean absolute deviation, so it is the best single guess + for an unknown neighbour's area; it is set independently of the structural + template (the calculator derives heat loss from the building-part geometry, + not this scalar, so the two need not agree).""" + return statistics.median(c.epc.total_floor_area_m2 for c in members) + + def _age_band_index(comparable: Comparable) -> Optional[int]: """The main building part's construction-age-band position (A=0 … L=11), or None when no recognisable band is lodged.""" diff --git a/tests/domain/epc_prediction/test_component_accuracy_gate.py b/tests/domain/epc_prediction/test_component_accuracy_gate.py index 82789304..fccbe437 100644 --- a/tests/domain/epc_prediction/test_component_accuracy_gate.py +++ b/tests/domain/epc_prediction/test_component_accuracy_gate.py @@ -57,7 +57,7 @@ _RATE_FLOORS: dict[str, float] = { # the predicted picture clusters at a mapper-default 4 windows while actuals # spread 1-21, yet total_window_area (the SAP-relevant signal) stays tight. _RESIDUAL_CEILINGS: dict[str, float] = { - "floor_area": 12.2175, + "floor_area": 11.8983, "total_window_area": 4.4067, "building_parts": 0.3333, "door_count": 0.6389, diff --git a/tests/domain/epc_prediction/test_epc_prediction.py b/tests/domain/epc_prediction/test_epc_prediction.py index 4cdb2794..6c103d57 100644 --- a/tests/domain/epc_prediction/test_epc_prediction.py +++ b/tests/domain/epc_prediction/test_epc_prediction.py @@ -226,6 +226,29 @@ def test_recency_weights_roof_insulation_mode() -> None: assert predicted.sap_building_parts[0].roof_insulation_thickness == 300 +def test_floor_area_is_the_cohort_median_not_the_templates_own_area() -> None: + # Arrange — an even-sized cohort whose median (70) falls between members, so + # the size-representative template (the first member closest to the median, + # 60 m²) does not itself sit on the median. The predicted floor area is a + # point estimate of the target's size, best served by the cohort median (the + # MAD-minimising estimator), decoupled from whichever template seeds the + # structure. + cohort = _cohort( + _epc(floor_area=40.0), + _epc(floor_area=60.0), + _epc(floor_area=80.0), + _epc(floor_area=100.0), + ) + + # Act + predicted: EpcPropertyData = EpcPrediction().predict( + PredictionTarget(postcode="LS6 1AA", property_type="2"), cohort + ) + + # Assert — the floor area is the cohort median (70), not the template's 60. + assert predicted.total_floor_area_m2 == 70.0 + + def test_categorical_mode_leans_on_size_similar_neighbours() -> None: # Arrange — a count majority (three) carries wall-insulation 9, but two of # them are 400 m² size outliers; the cohort centre (median 100 m²) holds