diff --git a/domain/epc_prediction/comparable_properties.py b/domain/epc_prediction/comparable_properties.py
index 6b87881b..bccff188 100644
--- a/domain/epc_prediction/comparable_properties.py
+++ b/domain/epc_prediction/comparable_properties.py
@@ -9,6 +9,7 @@ IO (postcode search → per-cert fetch) lives behind a repository port.
 from __future__ import annotations
 
 from dataclasses import dataclass
+from datetime import date
 from typing import Callable, Optional, Union
 
 from datatypes.epc.domain.epc_property_data import EpcPropertyData
@@ -23,10 +24,12 @@ _DEFAULT_MINIMUM_COHORT = 5
 class Comparable:
     """One candidate neighbour: its structured `EpcPropertyData` picture plus
     the register metadata not carried on the cert (identity for leave-one-out
-    exclusion; recency + address for weighting)."""
+    exclusion; recency + address for weighting + re-lodgement dedup)."""
 
     epc: EpcPropertyData
     certificate_number: str
+    address: Optional[str] = None
+    registration_date: Optional[date] = None
 
 
 @dataclass(frozen=True)
@@ -74,10 +77,13 @@ def select_comparables(
     minimum_cohort: int = _DEFAULT_MINIMUM_COHORT,
 ) -> ComparableProperties:
     """Select the Comparable Properties for `target` from the raw postcode
-    cohort. Property type is an always-hard filter (a flat is never a comparable
-    for a house); built form is a conditioning filter on the relax ladder."""
+    cohort. The register lists every historical lodgement, so first dedupe each
+    address to its latest cert (one comparable per real neighbour); then property
+    type is an always-hard filter (a flat is never a comparable for a house) and
+    built form is a conditioning filter on the relax ladder."""
+    cohort = _dedupe_to_latest_per_address(candidates)
     cohort = [
-        c for c in candidates if c.epc.property_type == target.property_type
+        c for c in cohort if c.epc.property_type == target.property_type
     ]
     cohort = _maybe_filter(
         cohort,
@@ -94,6 +100,35 @@ def select_comparables(
     return ComparableProperties(members=tuple(cohort))
 
 
+def _dedupe_to_latest_per_address(
+    candidates: list[Comparable],
+) -> list[Comparable]:
+    """Collapse the register's re-lodgements: keep one comparable per address —
+    the latest by registration date (ties broken by certificate number, for
+    determinism) — so a re-lodged neighbour does not count more than once.
+    Candidates with no (or a blank) address are passed through untouched, each
+    its own neighbour, after the deduped ones in first-seen address order."""
+    latest: dict[str, Comparable] = {}
+    passthrough: list[Comparable] = []
+    for c in candidates:
+        if not c.address:
+            passthrough.append(c)
+            continue
+        incumbent = latest.get(c.address)
+        if incumbent is None or _recency_key(c) > _recency_key(incumbent):
+            latest[c.address] = c
+    return list(latest.values()) + passthrough
+
+
+def _recency_key(comparable: Comparable) -> tuple[date, str]:
+    """Sort key making the most recent (then highest cert number) win. A missing
+    registration date sorts oldest."""
+    return (
+        comparable.registration_date or date.min,
+        comparable.certificate_number,
+    )
+
+
 def _main_wall_construction(comparable: Comparable) -> object:
     """The main building part's wall construction, or None when no part lodged."""
     parts = comparable.epc.sap_building_parts
diff --git a/scripts/validate_epc_prediction.py b/scripts/validate_epc_prediction.py
index 25890445..ab7e2e11 100644
--- a/scripts/validate_epc_prediction.py
+++ b/scripts/validate_epc_prediction.py
@@ -26,6 +26,7 @@ from __future__ import annotations
 import json
 import os
 import statistics
+from datetime import date
 from pathlib import Path
 from typing import Optional
 
@@ -45,20 +46,73 @@ CORPUS = Path(os.environ.get("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpu
 
 def _load_cohort(postcode: str, certs: list[str]) -> list[Comparable]:
     """Map a postcode's cached cert payloads to Comparables, skipping any the
-    mapper rejects (unsupported schema, malformed)."""
+    mapper rejects (unsupported schema, malformed). Address + registration date
+    come straight off the cached payload (the register metadata) so the harness
+    can dedupe re-lodgements and hold out a whole address. A payload whose JSON
+    or register metadata cannot be parsed is skipped like any other bad cert."""
     cohort: list[Comparable] = []
     for cert in certs:
         path = CORPUS / postcode / f"{cert}.json"
         if not path.exists():
             continue
         try:
-            epc = EpcPropertyDataMapper.from_api_response(json.loads(path.read_text()))
+            # Parse inside the guard: a truncated cache file or an unparseable
+            # registration date is a bad cert too, not a reason to abort.
+            raw = json.loads(path.read_text())
+            epc = EpcPropertyDataMapper.from_api_response(raw)
+            comparable = Comparable(
+                epc=epc,
+                certificate_number=cert,
+                address=_address(raw),
+                registration_date=_registration_date(raw),
+            )
         except Exception:  # noqa: BLE001 — a bad cert must not abort the sweep
             continue
-        cohort.append(Comparable(epc=epc, certificate_number=cert))
+        cohort.append(comparable)
     return cohort
 
 
+def _address(raw: dict[str, object]) -> Optional[str]:
+    """The payload's `address_line_1`, trimmed and upper-cased so re-lodgements
+    of one address compare equal; None when absent or blank."""
+    # NOTE(review): keys on `address_line_1` alone — confirm that is unique per
+    # dwelling within a postcode (e.g. "FLAT 3" in two different buildings).
+    value = raw.get("address_line_1")
+    normalised = str(value).strip().upper() if value else ""
+    return normalised or None
+
+
+def _registration_date(raw: dict[str, object]) -> Optional[date]:
+    """The payload's `registration_date` parsed as an ISO-format date, or None
+    when absent. A value `date.fromisoformat` rejects raises ValueError, which
+    `_load_cohort` treats as a bad cert."""
+    value = raw.get("registration_date")
+    return date.fromisoformat(str(value)) if value else None
+
+
+def _ground_truth_properties(cohort: list[Comparable]) -> list[Comparable]:
+    """Collapse a postcode's certs to one held-out property per address — the
+    latest cert, the best ground truth. Comparables with no address each stand
+    alone."""
+    latest: dict[str, Comparable] = {}
+    standalone: list[Comparable] = []
+    for c in cohort:
+        if c.address is None:
+            standalone.append(c)
+        elif c.address not in latest or _recency(c) > _recency(latest[c.address]):
+            latest[c.address] = c
+    return list(latest.values()) + standalone
+
+
+def _recency(comparable: Comparable) -> tuple[date, str]:
+    """Ordering key: registration date (missing sorts oldest), then certificate
+    number as a deterministic tie-break."""
+    return (
+        comparable.registration_date or date.min,
+        comparable.certificate_number,
+    )
+
+
 def _sap(calculator: Sap10Calculator, epc: EpcPropertyData) -> Optional[float]:
     try:
         return calculator.calculate(epc).sap_score_continuous
@@ -95,11 +149,21 @@ def main() -> None:
 
     for postcode, certs in index.items():
         cohort = _load_cohort(postcode, certs)
-        if len(cohort) < 2:
-            skipped_no_cohort += len(cohort)
+        targets = _ground_truth_properties(cohort)
+        if len(targets) < 2:
+            skipped_no_cohort += len(targets)
             continue
-        for i, held_out in enumerate(cohort):
-            others = [c for j, c in enumerate(cohort) if j != i]
+        for held_out in targets:
+            # Exclude every cert of the held-out address (not just the held cert)
+            # so a re-lodgement of the same property cannot leak into the cohort.
+            # The identity check covers an address-less held-out cert, which the
+            # address comparison alone would let back in as its own comparable.
+            others = [
+                c
+                for c in cohort
+                if c is not held_out
+                and (c.address is None or c.address != held_out.address)
+            ]
             actual = held_out.epc
             target = PredictionTarget(
                 postcode=postcode,
diff --git a/tests/domain/epc_prediction/test_comparable_properties.py b/tests/domain/epc_prediction/test_comparable_properties.py
index 4894c017..223ff601 100644
--- a/tests/domain/epc_prediction/test_comparable_properties.py
+++ b/tests/domain/epc_prediction/test_comparable_properties.py
@@ -5,6 +5,7 @@ hard filters on identity (property type, built form) + known overrides while
 enough remain, weighted by recency × similarity. Pure domain logic.
 """
 
+from datetime import date
 from typing import Optional, Union
 
 from datatypes.epc.domain.epc_property_data import EpcPropertyData, SapBuildingPart
@@ -22,6 +23,8 @@ def _comparable(
     certificate_number: str,
     built_form: str = "1",
     wall_construction: Optional[Union[int, str]] = None,
+    address: Optional[str] = None,
+    registration_date: Optional[date] = None,
 ) -> Comparable:
     """A Comparable carrying only the fields under test (opaque EpcPropertyData
     with property_type / built_form / main wall set — the partial-instance idiom)."""
@@ -32,7 +35,12 @@ def _comparable(
     if wall_construction is not None:
         main.wall_construction = wall_construction
     epc.sap_building_parts = [main]
-    return Comparable(epc=epc, certificate_number=certificate_number)
+    return Comparable(
+        epc=epc,
+        certificate_number=certificate_number,
+        address=address,
+        registration_date=registration_date,
+    )
 
 
 def test_selects_only_candidates_of_the_same_property_type() -> None:
@@ -51,6 +59,62 @@ def test_selects_only_candidates_of_the_same_property_type() -> None:
     assert {c.certificate_number for c in result.members} == {"A", "B"}
 
 
+def test_dedupes_re_lodgements_to_the_latest_cert_per_address() -> None:
+    # Arrange — a register cohort with one address (FLAT 3) lodged three times,
+    # listed newest-first so a "last one seen wins" dedupe would fail. A
+    # re-lodged address counts once; its latest cert is its current state.
+    target = PredictionTarget(postcode="LS6 1AA", property_type="2")
+    candidates = [
+        _comparable(
+            property_type="2",
+            certificate_number="NEW",
+            address="FLAT 3",
+            registration_date=date(2025, 1, 20),
+        ),
+        _comparable(
+            property_type="2",
+            certificate_number="OLD",
+            address="FLAT 3",
+            registration_date=date(2020, 4, 6),
+        ),
+        _comparable(
+            property_type="2",
+            certificate_number="MID",
+            address="FLAT 3",
+            registration_date=date(2021, 2, 1),
+        ),
+        _comparable(
+            property_type="2",
+            certificate_number="OTHER",
+            address="FLAT 5",
+            registration_date=date(2024, 9, 27),
+        ),
+    ]
+
+    # Act
+    result: ComparableProperties = select_comparables(target, candidates)
+
+    # Assert — FLAT 3 collapses to its latest cert; FLAT 5 is untouched.
+    assert {c.certificate_number for c in result.members} == {"NEW", "OTHER"}
+
+
+def test_candidates_without_an_address_are_never_merged() -> None:
+    # Arrange — three certs with no usable address (None and blank): each is
+    # its own neighbour, so none may be collapsed into another.
+    target = PredictionTarget(postcode="LS6 1AA", property_type="2")
+    candidates = [
+        _comparable(property_type="2", certificate_number="A"),
+        _comparable(property_type="2", certificate_number="B", address=""),
+        _comparable(property_type="2", certificate_number="C", address=""),
+    ]
+
+    # Act
+    result: ComparableProperties = select_comparables(target, candidates)
+
+    # Assert — all three survive the dedupe.
+    assert {c.certificate_number for c in result.members} == {"A", "B", "C"}
+
+
 def test_filters_to_the_known_built_form_when_enough_remain() -> None:
     # Arrange — a mid-terrace target (built_form "4"); cohort of 5 mid-terraces
     # + 2 detached, all houses. The built form is known and leaves ≥ k, so it is