fix(epc-prediction): dedupe re-lodgements + leak-free leave-one-out (ADR-0029)

The register lists every historical lodgement, so a postcode cohort
contains the same physical address many times (LS61AA: 15 certs / 11
addresses; NG71AA: 15 / 9 — "FLAT 3" appears 3x in each). Two
consequences:

  - Production: a re-lodged neighbour was counting up to 3x towards the
    cohort mode. select_comparables now dedupes candidates to the latest
    cert per address (one comparable per real neighbour) — Comparable
    gains address + registration_date (the register metadata its docstring
    already anticipated, read straight off the cached payload).

  - Validation: leave-one-out leaked — predicting a flat from a near-
    identical re-lodgement of itself. The harness now holds out a whole
    address (excludes every sibling cert) and evaluates on the latest cert
    per address (the best ground truth).

Removing the leak gives the honest numbers (19 distinct addresses):
  wall_construction      93.1% -> 89.5%
  construction_age_band  65.5% -> 52.6%
  roof_construction      79.3% -> 68.4%
  floor_area mean|.|     37.9  -> 52.6 m2
The earlier figures were inflated by self-leakage; these are the real
accuracy figures to beat.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-14 00:40:23 +00:00
parent 54a57363f8
commit fa11df56c2
3 changed files with 143 additions and 12 deletions

View file

@ -9,6 +9,7 @@ IO (postcode search → per-cert fetch) lives behind a repository port.
from __future__ import annotations
from dataclasses import dataclass
from datetime import date
from typing import Callable, Optional, Union
from datatypes.epc.domain.epc_property_data import EpcPropertyData
@ -23,10 +24,12 @@ _DEFAULT_MINIMUM_COHORT = 5
class Comparable:
    """One candidate neighbour: its structured `EpcPropertyData` picture plus the
    register metadata not carried on the cert (identity for leave-one-out
    exclusion; recency + address for weighting + re-lodgement dedup)."""

    # The neighbour's structured EPC picture.
    epc: EpcPropertyData
    # Identifier of this lodgement; also the tie-breaker when two certs of
    # the same address share a registration date.
    certificate_number: str
    # Address used as the grouping key when re-lodgements are collapsed;
    # None means the cert is never merged with any other.
    address: Optional[str] = None
    # Lodgement date used to pick the latest cert per address; None sorts
    # as the oldest possible date.
    registration_date: Optional[date] = None
@dataclass(frozen=True)
@ -74,10 +77,13 @@ def select_comparables(
minimum_cohort: int = _DEFAULT_MINIMUM_COHORT,
) -> ComparableProperties:
"""Select the Comparable Properties for `target` from the raw postcode
cohort. Property type is an always-hard filter (a flat is never a comparable
for a house); built form is a conditioning filter on the relax ladder."""
cohort. The register lists every historical lodgement, so first dedupe each
address to its latest cert (one comparable per real neighbour); then property
type is an always-hard filter (a flat is never a comparable for a house) and
built form is a conditioning filter on the relax ladder."""
cohort = _dedupe_to_latest_per_address(candidates)
cohort = [
c for c in candidates if c.epc.property_type == target.property_type
c for c in cohort if c.epc.property_type == target.property_type
]
cohort = _maybe_filter(
cohort,
@ -94,6 +100,35 @@ def select_comparables(
return ComparableProperties(members=tuple(cohort))
def _dedupe_to_latest_per_address(
    candidates: list[Comparable],
) -> list[Comparable]:
    """Collapse re-lodgements so each address contributes one comparable.

    For every address the surviving cert is the one with the greatest
    recency key (registration date, then certificate number), so a
    neighbour lodged several times counts once. Candidates whose address
    is None are never merged — each stands as its own neighbour.

    Result order: addressed survivors first, in the order each address
    first appeared in `candidates`, followed by the address-less
    candidates in their input order.
    """
    newest_by_address: dict[str, Comparable] = {}
    unaddressed: list[Comparable] = []
    for candidate in candidates:
        key = candidate.address
        if key is None:
            unaddressed.append(candidate)
        elif key not in newest_by_address:
            newest_by_address[key] = candidate
        elif _recency_key(candidate) > _recency_key(newest_by_address[key]):
            # Re-assigning an existing key keeps the address's original slot.
            newest_by_address[key] = candidate
    return [*newest_by_address.values(), *unaddressed]
def _recency_key(comparable: Comparable) -> tuple[date, str]:
"""Sort key making the most recent (then highest cert number) win. A missing
registration date sorts oldest."""
return (
comparable.registration_date or date.min,
comparable.certificate_number,
)
def _main_wall_construction(comparable: Comparable) -> object:
"""The main building part's wall construction, or None when no part lodged."""
parts = comparable.epc.sap_building_parts

View file

@ -26,6 +26,7 @@ from __future__ import annotations
import json
import os
import statistics
from datetime import date
from pathlib import Path
from typing import Optional
@ -45,20 +46,61 @@ CORPUS = Path(os.environ.get("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpu
def _load_cohort(postcode: str, certs: list[str]) -> list[Comparable]:
    """Map a postcode's cached cert payloads to Comparables.

    Each cert is read from ``CORPUS / postcode / f"{cert}.json"``. Certs with
    no cached file are skipped, as is any cert whose payload cannot be turned
    into a Comparable: unreadable or invalid JSON, a payload the mapper
    rejects (unsupported schema, malformed), or register metadata that cannot
    be parsed (e.g. a malformed registration date).

    Address + registration date come straight off the cached payload (the
    register metadata) so the harness can dedupe re-lodgements and hold out a
    whole address.
    """
    cohort: list[Comparable] = []
    for cert in certs:
        path = CORPUS / postcode / f"{cert}.json"
        if not path.exists():
            continue
        # Reading, JSON decoding, mapping and metadata extraction all sit
        # under the same guard: any of them can fail on a single bad payload,
        # and one bad payload must cost one cert, not the whole sweep.
        try:
            raw = json.loads(path.read_text())
            epc = EpcPropertyDataMapper.from_api_response(raw)
            address = _address(raw)
            registration_date = _registration_date(raw)
        except Exception:  # noqa: BLE001 — a bad cert must not abort the sweep
            continue
        cohort.append(
            Comparable(
                epc=epc,
                certificate_number=cert,
                address=address,
                registration_date=registration_date,
            )
        )
    return cohort
def _address(raw: dict[str, object]) -> Optional[str]:
value = raw.get("address_line_1")
return str(value).strip().upper() if value else None
def _registration_date(raw: dict[str, object]) -> Optional[date]:
value = raw.get("registration_date")
return date.fromisoformat(str(value)) if value else None
def _ground_truth_properties(cohort: list[Comparable]) -> list[Comparable]:
    """Pick the certs to hold out: one per address, namely the cert with the
    greatest recency (the latest lodgement, taken as the best ground truth).
    Certs with no address are never merged and are each held out on their
    own.

    Addressed picks come first, ordered by each address's first appearance in
    `cohort`; address-less certs follow in input order.
    """
    newest: dict[str, Comparable] = {}
    unaddressed: list[Comparable] = []
    for cert in cohort:
        if cert.address is None:
            unaddressed.append(cert)
            continue
        current = newest.get(cert.address)
        if current is None or _recency(cert) > _recency(current):
            newest[cert.address] = cert
    return [*newest.values(), *unaddressed]
def _recency(comparable: Comparable) -> tuple[date, str]:
return (
comparable.registration_date or date.min,
comparable.certificate_number,
)
def _sap(calculator: Sap10Calculator, epc: EpcPropertyData) -> Optional[float]:
try:
return calculator.calculate(epc).sap_score_continuous
@ -95,11 +137,18 @@ def main() -> None:
for postcode, certs in index.items():
cohort = _load_cohort(postcode, certs)
if len(cohort) < 2:
skipped_no_cohort += len(cohort)
targets = _ground_truth_properties(cohort)
if len(targets) < 2:
skipped_no_cohort += len(targets)
continue
for i, held_out in enumerate(cohort):
others = [c for j, c in enumerate(cohort) if j != i]
for held_out in targets:
# Exclude the held-out cert itself and every other cert of its address, so
# neither the target nor a re-lodgement of the same property can leak into
# the cohort. The identity check is what protects address-less targets:
# they have no address to match on, so the address test alone would keep
# the held-out cert among its own comparables.
others = [
    c
    for c in cohort
    if c is not held_out
    and (c.address is None or c.address != held_out.address)
]
actual = held_out.epc
target = PredictionTarget(
postcode=postcode,

View file

@ -5,6 +5,7 @@ hard filters on identity (property type, built form) + known overrides while
enough remain, weighted by recency × similarity. Pure domain logic.
"""
from datetime import date
from typing import Optional, Union
from datatypes.epc.domain.epc_property_data import EpcPropertyData, SapBuildingPart
@ -22,6 +23,8 @@ def _comparable(
certificate_number: str,
built_form: str = "1",
wall_construction: Optional[Union[int, str]] = None,
address: Optional[str] = None,
registration_date: Optional[date] = None,
) -> Comparable:
"""A Comparable carrying only the fields under test (opaque EpcPropertyData
with property_type / built_form / main wall set the partial-instance idiom)."""
@ -32,7 +35,12 @@ def _comparable(
if wall_construction is not None:
main.wall_construction = wall_construction
epc.sap_building_parts = [main]
return Comparable(epc=epc, certificate_number=certificate_number)
return Comparable(
epc=epc,
certificate_number=certificate_number,
address=address,
registration_date=registration_date,
)
def test_selects_only_candidates_of_the_same_property_type() -> None:
@ -51,6 +59,45 @@ def test_selects_only_candidates_of_the_same_property_type() -> None:
assert {c.certificate_number for c in result.members} == {"A", "B"}
def test_dedupes_re_lodgements_to_the_latest_cert_per_address() -> None:
    # Arrange — FLAT 3 appears on the register three times, FLAT 5 once.
    # Each real neighbour should contribute a single comparable, represented
    # by its most recent lodgement, so FLAT 3 must not be counted thrice.
    lodgements = [
        ("OLD", "FLAT 3", date(2020, 4, 6)),
        ("MID", "FLAT 3", date(2021, 2, 1)),
        ("NEW", "FLAT 3", date(2025, 1, 20)),
        ("OTHER", "FLAT 5", date(2024, 9, 27)),
    ]
    candidates = [
        _comparable(
            property_type="2",
            certificate_number=cert,
            address=address,
            registration_date=lodged,
        )
        for cert, address, lodged in lodgements
    ]
    target = PredictionTarget(postcode="LS6 1AA", property_type="2")

    # Act
    result: ComparableProperties = select_comparables(target, candidates)

    # Assert — only FLAT 3's newest cert survives; FLAT 5 is left alone.
    surviving = {member.certificate_number for member in result.members}
    assert surviving == {"NEW", "OTHER"}
def test_filters_to_the_known_built_form_when_enough_remain() -> None:
# Arrange — a mid-terrace target (built_form "4"); cohort of 5 mid-terraces
# + 2 detached, all houses. The built form is known and leaves ≥ k, so it is