diff --git a/domain/epc_prediction/epc_prediction.py b/domain/epc_prediction/epc_prediction.py
index 184159da..f6bf2166 100644
--- a/domain/epc_prediction/epc_prediction.py
+++ b/domain/epc_prediction/epc_prediction.py
@@ -56,10 +56,13 @@ class EpcPrediction:
         self, target: PredictionTarget, comparables: ComparableProperties
     ) -> EpcPropertyData:
         """Predict the target's EPC picture: copy a representative template's
-        structure (coherent for the calculator), then set the homogeneous
-        categoricals to the cohort mode."""
+        structure (coherent for the calculator), set the predicted floor area to
+        the cohort median (the best point estimate of the target's size, decoupled
+        from the one template's own area), then set the homogeneous categoricals
+        to the cohort mode."""
         template: Comparable = self._template(comparables)
         predicted: EpcPropertyData = copy.deepcopy(template.epc)
+        predicted.total_floor_area_m2 = _median_floor_area(comparables.members)
         self._apply_categorical_modes(predicted, comparables)
         self._apply_overrides(predicted, target)
         return predicted
@@ -213,6 +216,15 @@ def _main_floor_attr(comparable: Comparable, attr: str) -> Optional[int]:
     return value
 
 
+def _median_floor_area(members: tuple[Comparable, ...]) -> float:
+    """The cohort's median floor area — the point estimate of the target's size.
+    The median minimises mean absolute deviation, so it is the best single guess
+    for an unknown neighbour's area; it is set independently of the structural
+    template (the calculator derives heat loss from the building-part geometry,
+    not this scalar, so the two need not agree)."""
+    return statistics.median(c.epc.total_floor_area_m2 for c in members)
+
+
 def _age_band_index(comparable: Comparable) -> Optional[int]:
     """The main building part's construction-age-band position (A=0 … L=11), or
     None when no recognisable band is lodged."""
diff --git a/tests/domain/epc_prediction/test_component_accuracy_gate.py b/tests/domain/epc_prediction/test_component_accuracy_gate.py
index 82789304..fccbe437 100644
--- a/tests/domain/epc_prediction/test_component_accuracy_gate.py
+++ b/tests/domain/epc_prediction/test_component_accuracy_gate.py
@@ -57,7 +57,7 @@ _RATE_FLOORS: dict[str, float] = {
 # the predicted picture clusters at a mapper-default 4 windows while actuals
 # spread 1-21, yet total_window_area (the SAP-relevant signal) stays tight.
 _RESIDUAL_CEILINGS: dict[str, float] = {
-    "floor_area": 12.2175,
+    "floor_area": 11.8983,
     "total_window_area": 4.4067,
     "building_parts": 0.3333,
     "door_count": 0.6389,
diff --git a/tests/domain/epc_prediction/test_epc_prediction.py b/tests/domain/epc_prediction/test_epc_prediction.py
index 4cdb2794..6c103d57 100644
--- a/tests/domain/epc_prediction/test_epc_prediction.py
+++ b/tests/domain/epc_prediction/test_epc_prediction.py
@@ -226,6 +226,29 @@ def test_recency_weights_roof_insulation_mode() -> None:
     assert predicted.sap_building_parts[0].roof_insulation_thickness == 300
 
 
+def test_floor_area_is_the_cohort_median_not_the_templates_own_area() -> None:
+    # Arrange — an even-sized cohort whose median (70) falls between members, so
+    # the size-representative template (the first member closest to the median,
+    # 60 m²) does not itself sit on the median. The predicted floor area is a
+    # point estimate of the target's size, best served by the cohort median (the
+    # MAD-minimising estimator), decoupled from whichever template seeds the
+    # structure.
+    cohort = _cohort(
+        _epc(floor_area=40.0),
+        _epc(floor_area=60.0),
+        _epc(floor_area=80.0),
+        _epc(floor_area=100.0),
+    )
+
+    # Act
+    predicted: EpcPropertyData = EpcPrediction().predict(
+        PredictionTarget(postcode="LS6 1AA", property_type="2"), cohort
+    )
+
+    # Assert — the floor area is the cohort median (70), not the template's 60.
+    assert predicted.total_floor_area_m2 == 70.0
+
+
 def test_categorical_mode_leans_on_size_similar_neighbours() -> None:
     # Arrange — a count majority (three) carries wall-insulation 9, but two of
     # them are 400 m² size outliers; the cohort centre (median 100 m²) holds