From 4fa20ae76bcb137e39998c11a9cc7afbc996343a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 14 Jun 2026 00:05:40 +0000 Subject: [PATCH] fix(epc-prediction): size-representative template selection (ADR-0029) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Template (the comparable whose structure/geometry is copied wholesale) was members[0] — an arbitrary draw from the API search order. With floor area varying widely within a property_type cohort (NG71AA houses span 51-340 m2), this made the copied geometry noisy and systematically large. Pick the member whose floor area is closest to the cohort median instead, implementing ADR-0029 decision 4's unimplemented "closest on size" criterion while keeping the structure coherent (it is still one real property, so floor dims / windows / parts stay internally consistent for the calculator). Smoke corpus (29 leave-one-out predictions): floor_area mean|.| 68.0 -> 37.9 m2 (bias +46.8 -> -3.9) window_area mean|.| 11.1 -> 7.3 m2 parts mean|.| 1.00 -> 0.38 SAP |pred-calc - calc(actual)| MAE 7.19 -> 4.86 Co-Authored-By: Claude Opus 4.8 --- domain/epc_prediction/epc_prediction.py | 16 ++++++++++++-- .../epc_prediction/test_epc_prediction.py | 21 +++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/domain/epc_prediction/epc_prediction.py b/domain/epc_prediction/epc_prediction.py index 9806b87d..68624632 100644 --- a/domain/epc_prediction/epc_prediction.py +++ b/domain/epc_prediction/epc_prediction.py @@ -10,6 +10,7 @@ logic — deterministic neighbour synthesis, not ML. from __future__ import annotations import copy +import statistics from collections import Counter from typing import Iterable, Optional, Union @@ -41,8 +42,19 @@ class EpcPrediction: @staticmethod def _template(comparables: ComparableProperties) -> Comparable: - """The representative comparable whose structure seeds the prediction.""" - return comparables.members[0] + """The representative comparable whose structure seeds the prediction: + the member whose floor area is closest to the cohort median. A single + neighbour's geometry is copied wholesale, so a size-representative + template keeps the prediction off the cohort's size outliers (ADR-0029 + decision 4: closest on size).""" + members: tuple[Comparable, ...] = comparables.members + median_area: float = statistics.median( + c.epc.total_floor_area_m2 for c in members + ) + return min( + members, + key=lambda c: abs(c.epc.total_floor_area_m2 - median_area), + ) @staticmethod def _apply_categorical_modes( diff --git a/tests/domain/epc_prediction/test_epc_prediction.py b/tests/domain/epc_prediction/test_epc_prediction.py index 8e2a139c..43da0737 100644 --- a/tests/domain/epc_prediction/test_epc_prediction.py +++ b/tests/domain/epc_prediction/test_epc_prediction.py @@ -59,6 +59,27 @@ def test_predicts_a_picture_by_copying_a_representative_template() -> None: assert predicted is not template +def test_template_is_the_member_closest_to_the_cohort_median_size() -> None: + # Arrange — the cohort spans a wide range of sizes; members[0] is an atypical + # tiny 20 m² outlier. A single neighbour's geometry is copied wholesale, so + # the template must be the size-representative member (closest to the median), + # not whoever happens to come first (ADR-0029 decision 4: closest on size). + cohort = _cohort( + _epc(floor_area=20.0), + _epc(floor_area=80.0), + _epc(floor_area=200.0), + ) + + # Act + predicted: EpcPropertyData = EpcPrediction().predict( + PredictionTarget(postcode="LS6 1AA", property_type="2"), cohort + ) + + # Assert — the 80 m² member (the median) seeds the structure, not the 20 m² + # outlier sitting at members[0]. + assert predicted.total_floor_area_m2 == 80.0 + + def test_sets_main_wall_construction_to_the_cohort_mode() -> None: # Arrange — the template (members[0]) is solid brick (2), but the cohort # majority is cavity (1). The homogeneous categorical should follow the mode,