mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
fix(epc-prediction): dedupe re-lodgements + leak-free leave-one-out (ADR-0029)
The register lists every historical lodgement, so a postcode cohort
contains the same physical address many times (LS61AA: 15 certs / 11
addresses; NG71AA: 15 / 9 — "FLAT 3" appears 3x in each). Two
consequences:
- Production: a re-lodged neighbour was counting up to 3x towards the
cohort mode. select_comparables now dedupes candidates to the latest
cert per address (one comparable per real neighbour) — Comparable
gains address + registration_date (the register metadata its docstring
already anticipated, read straight off the cached payload).
- Validation: leave-one-out leaked — predicting a flat from a near-
identical re-lodgement of itself. The harness now holds out a whole
address (excludes every sibling cert) and evaluates on the latest cert
per address (the best ground truth).
Removing the leak gives the honest numbers (19 distinct addresses):
wall_construction 93.1% -> 89.5%
construction_age_band 65.5% -> 52.6%
roof_construction 79.3% -> 68.4%
floor_area mean abs. error 37.9 -> 52.6 m2
The earlier figures were inflated by self-leakage; these are the real
accuracy figures to beat.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
54a57363f8
commit
fa11df56c2
3 changed files with 143 additions and 12 deletions
|
|
@ -9,6 +9,7 @@ IO (postcode search → per-cert fetch) lives behind a repository port.
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
from datatypes.epc.domain.epc_property_data import EpcPropertyData
|
||||
|
|
@ -23,10 +24,12 @@ _DEFAULT_MINIMUM_COHORT = 5
|
|||
class Comparable:
|
||||
"""One candidate neighbour: its structured `EpcPropertyData` picture plus the
|
||||
register metadata not carried on the cert (identity for leave-one-out
|
||||
exclusion; recency + address for weighting)."""
|
||||
exclusion; recency + address for weighting + re-lodgement dedup)."""
|
||||
|
||||
epc: EpcPropertyData
|
||||
certificate_number: str
|
||||
address: Optional[str] = None
|
||||
registration_date: Optional[date] = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
|
@ -74,10 +77,13 @@ def select_comparables(
|
|||
minimum_cohort: int = _DEFAULT_MINIMUM_COHORT,
|
||||
) -> ComparableProperties:
|
||||
"""Select the Comparable Properties for `target` from the raw postcode
|
||||
cohort. Property type is an always-hard filter (a flat is never a comparable
|
||||
for a house); built form is a conditioning filter on the relax ladder."""
|
||||
cohort. The register lists every historical lodgement, so first dedupe each
|
||||
address to its latest cert (one comparable per real neighbour); then property
|
||||
type is an always-hard filter (a flat is never a comparable for a house) and
|
||||
built form is a conditioning filter on the relax ladder."""
|
||||
cohort = _dedupe_to_latest_per_address(candidates)
|
||||
cohort = [
|
||||
c for c in candidates if c.epc.property_type == target.property_type
|
||||
c for c in cohort if c.epc.property_type == target.property_type
|
||||
]
|
||||
cohort = _maybe_filter(
|
||||
cohort,
|
||||
|
|
@ -94,6 +100,35 @@ def select_comparables(
|
|||
return ComparableProperties(members=tuple(cohort))
|
||||
|
||||
|
||||
def _dedupe_to_latest_per_address(
    candidates: list[Comparable],
) -> list[Comparable]:
    """Keep a single comparable per address, dropping older re-lodgements.

    For each distinct ``address`` the survivor is the candidate with the
    greatest ``_recency_key``, so a re-lodged neighbour counts only once.
    Candidates whose ``address`` is ``None`` are never collapsed: each one is
    kept as its own neighbour.

    The result lists the per-address survivors first, ordered by where each
    address first appears in ``candidates``, followed by the address-less
    candidates in their input order.
    """
    newest_by_address: dict[str, Comparable] = {}
    unaddressed: list[Comparable] = []
    for candidate in candidates:
        key = candidate.address
        if key is None:
            unaddressed.append(candidate)
        elif key not in newest_by_address:
            # First sighting of this address: it is the winner so far.
            newest_by_address[key] = candidate
        elif _recency_key(candidate) > _recency_key(newest_by_address[key]):
            # A strictly more recent lodgement displaces the incumbent.
            newest_by_address[key] = candidate
    return [*newest_by_address.values(), *unaddressed]
|
||||
|
||||
|
||||
def _recency_key(comparable: Comparable) -> tuple[date, str]:
    """Ordering key for choosing between certs: a larger key is more recent.

    Compares the registration date first and the certificate number second
    (the latter only to make ties deterministic). A comparable without a
    registration date is keyed at ``date.min``, i.e. it sorts as the oldest
    possible lodgement.
    """
    registered = comparable.registration_date
    if not registered:
        registered = date.min
    return registered, comparable.certificate_number
|
||||
|
||||
|
||||
def _main_wall_construction(comparable: Comparable) -> object:
|
||||
"""The main building part's wall construction, or None when no part lodged."""
|
||||
parts = comparable.epc.sap_building_parts
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ from __future__ import annotations
|
|||
import json
|
||||
import os
|
||||
import statistics
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
|
@ -45,20 +46,61 @@ CORPUS = Path(os.environ.get("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpu
|
|||
|
||||
def _load_cohort(postcode: str, certs: list[str]) -> list[Comparable]:
    """Map a postcode's cached cert payloads to Comparables.

    For each certificate number in ``certs`` the cached JSON payload at
    ``CORPUS / postcode / "<cert>.json"`` is read and mapped. Address and
    registration date are taken straight off that payload (the register
    metadata) so the harness can dedupe re-lodgements and hold out a whole
    address.

    A cert is skipped, never fatal, when its file is absent or when anything
    about it is bad: unparseable JSON, a payload the mapper rejects, or
    unusable metadata.

    Returns the successfully loaded Comparables in the order of ``certs``.
    """
    cohort: list[Comparable] = []
    for cert in certs:
        path = CORPUS / postcode / f"{cert}.json"
        if not path.exists():
            continue
        try:
            # Parsing, mapping and metadata extraction all sit under one guard:
            # any of them can fail on a corrupt cached file, and none of those
            # failures should end the run.
            raw = json.loads(path.read_text())
            comparable = Comparable(
                epc=EpcPropertyDataMapper.from_api_response(raw),
                certificate_number=cert,
                address=_address(raw),
                registration_date=_registration_date(raw),
            )
        except Exception:  # noqa: BLE001 — a bad cert must not abort the sweep
            continue
        cohort.append(comparable)
    return cohort
|
||||
|
||||
|
||||
def _address(raw: dict[str, object]) -> Optional[str]:
    """Return the payload's ``address_line_1`` as a normalised grouping key.

    The value is upper-cased, trimmed, and has internal runs of whitespace
    collapsed to single spaces, so "Flat  3 " and "FLAT 3" identify the same
    address. Returns ``None`` when the field is missing, empty or
    whitespace-only: a blank address must mean "no address" rather than become
    an empty-string key that would lump unrelated certs together.
    """
    value = raw.get("address_line_1")
    if not value:
        return None
    # str.split() with no argument both trims and collapses whitespace runs.
    normalised = " ".join(str(value).split()).upper()
    return normalised or None
|
||||
|
||||
|
||||
def _registration_date(raw: dict[str, object]) -> Optional[date]:
    """Return the payload's ``registration_date`` as a ``date``, if usable.

    Accepts an ISO calendar date ("2020-04-06"), and also a value that merely
    starts with one (e.g. the timestamp "2020-04-06T10:15:00"), in which case
    the leading date is used. Returns ``None`` when the field is missing,
    empty, or cannot be parsed: an unreadable date is treated as unknown
    rather than raised, so one odd payload cannot abort the caller's sweep.
    """
    value = raw.get("registration_date")
    if not value:
        return None
    text = str(value).strip()
    # Try the whole value first, then only its first ten characters
    # (the length of "YYYY-MM-DD") to tolerate a trailing time component.
    for candidate in (text, text[:10]):
        try:
            return date.fromisoformat(candidate)
        except ValueError:
            continue
    return None
|
||||
|
||||
|
||||
def _ground_truth_properties(cohort: list[Comparable]) -> list[Comparable]:
    """Pick the held-out properties for a postcode: one cert per address.

    Each distinct ``address`` contributes only its most recent cert (greatest
    ``_recency``), which serves as the ground truth for that property. A
    comparable whose ``address`` is ``None`` stands alone and is always kept.

    The per-address picks come first, ordered by first appearance of the
    address in ``cohort``, then the address-less comparables in input order.
    """
    newest: dict[str, Comparable] = {}
    unaddressed: list[Comparable] = []
    for candidate in cohort:
        address = candidate.address
        if address is None:
            unaddressed.append(candidate)
            continue
        current = newest.get(address)
        if current is None or _recency(candidate) > _recency(current):
            newest[address] = candidate
    return [*newest.values(), *unaddressed]
|
||||
|
||||
|
||||
def _recency(comparable: Comparable) -> tuple[date, str]:
    """Ordering key where a larger value means a more recent cert.

    Registration date is compared first and certificate number second; a
    missing registration date is keyed at ``date.min``.
    """
    registered = comparable.registration_date
    if not registered:
        registered = date.min
    return registered, comparable.certificate_number
|
||||
|
||||
|
||||
def _sap(calculator: Sap10Calculator, epc: EpcPropertyData) -> Optional[float]:
|
||||
try:
|
||||
return calculator.calculate(epc).sap_score_continuous
|
||||
|
|
@ -95,11 +137,18 @@ def main() -> None:
|
|||
|
||||
for postcode, certs in index.items():
|
||||
cohort = _load_cohort(postcode, certs)
|
||||
if len(cohort) < 2:
|
||||
skipped_no_cohort += len(cohort)
|
||||
targets = _ground_truth_properties(cohort)
|
||||
if len(targets) < 2:
|
||||
skipped_no_cohort += len(targets)
|
||||
continue
|
||||
for i, held_out in enumerate(cohort):
|
||||
others = [c for j, c in enumerate(cohort) if j != i]
|
||||
for held_out in targets:
|
||||
# Exclude every cert of the held-out address (not just the held cert)
|
||||
# so a re-lodgement of the same property cannot leak into the cohort.
|
||||
others = [
|
||||
c
|
||||
for c in cohort
|
||||
if c.address is None or c.address != held_out.address
|
||||
]
|
||||
actual = held_out.epc
|
||||
target = PredictionTarget(
|
||||
postcode=postcode,
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ hard filters on identity (property type, built form) + known overrides while
|
|||
enough remain, weighted by recency × similarity. Pure domain logic.
|
||||
"""
|
||||
|
||||
from datetime import date
|
||||
from typing import Optional, Union
|
||||
|
||||
from datatypes.epc.domain.epc_property_data import EpcPropertyData, SapBuildingPart
|
||||
|
|
@ -22,6 +23,8 @@ def _comparable(
|
|||
certificate_number: str,
|
||||
built_form: str = "1",
|
||||
wall_construction: Optional[Union[int, str]] = None,
|
||||
address: Optional[str] = None,
|
||||
registration_date: Optional[date] = None,
|
||||
) -> Comparable:
|
||||
"""A Comparable carrying only the fields under test (opaque EpcPropertyData
|
||||
with property_type / built_form / main wall set — the partial-instance idiom)."""
|
||||
|
|
@ -32,7 +35,12 @@ def _comparable(
|
|||
if wall_construction is not None:
|
||||
main.wall_construction = wall_construction
|
||||
epc.sap_building_parts = [main]
|
||||
return Comparable(epc=epc, certificate_number=certificate_number)
|
||||
return Comparable(
|
||||
epc=epc,
|
||||
certificate_number=certificate_number,
|
||||
address=address,
|
||||
registration_date=registration_date,
|
||||
)
|
||||
|
||||
|
||||
def test_selects_only_candidates_of_the_same_property_type() -> None:
|
||||
|
|
@ -51,6 +59,45 @@ def test_selects_only_candidates_of_the_same_property_type() -> None:
|
|||
assert {c.certificate_number for c in result.members} == {"A", "B"}
|
||||
|
||||
|
||||
def test_dedupes_re_lodgements_to_the_latest_cert_per_address() -> None:
    # Arrange — a register cohort in which one address (FLAT 3) was lodged
    # three times and another (FLAT 5) once. A comparable stands for one real
    # neighbour, so the re-lodged address must not be counted three times
    # towards the mode; its latest cert represents its current state.
    target = PredictionTarget(postcode="LS6 1AA", property_type="2")
    lodgements = [
        ("OLD", "FLAT 3", date(2020, 4, 6)),
        ("MID", "FLAT 3", date(2021, 2, 1)),
        ("NEW", "FLAT 3", date(2025, 1, 20)),
        ("OTHER", "FLAT 5", date(2024, 9, 27)),
    ]
    candidates = [
        _comparable(
            property_type="2",
            certificate_number=cert,
            address=address,
            registration_date=registered,
        )
        for cert, address, registered in lodgements
    ]

    # Act
    result: ComparableProperties = select_comparables(target, candidates)

    # Assert — only FLAT 3's newest cert survives; FLAT 5 is unaffected.
    surviving = {member.certificate_number for member in result.members}
    assert surviving == {"NEW", "OTHER"}
|
||||
|
||||
|
||||
def test_filters_to_the_known_built_form_when_enough_remain() -> None:
|
||||
# Arrange — a mid-terrace target (built_form "4"); cohort of 5 mid-terraces
|
||||
# + 2 detached, all houses. The built form is known and leaves ≥ k, so it is
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue