feat(epc-prediction): thread coordinates onto Comparable + target (#1227)

Adds coordinates: Optional[Coordinates] to Comparable and PredictionTarget
(data carriers — the pure predictor stays IO-free), and wires load_corpus to
read an optional _coordinates.json sidecar ({uprn: [lon, lat]}) and populate
each Comparable from its cert's uprn; iter_predictions threads the held-out
target's coordinates through. Absent sidecar -> geo-weighting stays off (no
behaviour change yet — weighting lands next slice). fetch_corpus_coordinates
now writes the sidecar into the corpus dir. With the sidecar present,
load_corpus populates coordinates on 99% of corpus comparables.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-15 14:46:01 +00:00
parent 95719dd587
commit fdc314c857
5 changed files with 65 additions and 4 deletions

View file

@ -13,6 +13,7 @@ from datetime import date
from typing import Callable, Optional, Union
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from domain.geospatial.coordinates import Coordinates
# Default floor on the cohort: a conditioning filter (built form, a known
# override) is applied only while at least this many comparables survive it,
@ -30,6 +31,10 @@ class Comparable:
certificate_number: str
address: Optional[str] = None
registration_date: Optional[date] = None
# Resolved from the neighbour's UPRN at the boundary (the harness / modelling
# orchestrator), so the pure predictor can weight by physical distance to the
# target without an IO dependency. None when no UPRN/coordinate is available.
coordinates: Optional[Coordinates] = None
@dataclass(frozen=True)
@ -45,6 +50,9 @@ class PredictionTarget:
# A known Landlord Override (e.g. solid brick) conditions cohort selection —
# matching comparables are emphasised while enough remain (ADR-0029).
wall_construction: Optional[Union[int, str]] = None
# The target Property's own coordinates (resolved from its UPRN), against
# which neighbours are distance-weighted. None disables geo-weighting.
coordinates: Optional[Coordinates] = None
@dataclass(frozen=True)

View file

@ -115,6 +115,7 @@ def iter_predictions(
postcode=actual.postcode,
property_type=actual.property_type or "",
built_form=actual.built_form,
coordinates=held_out.coordinates,
)
comparables = select_comparables(target, others)
if not comparables.members:

View file

@ -21,6 +21,7 @@ from typing import Any, Optional
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.epc_prediction.comparable_properties import Comparable
from domain.geospatial.coordinates import Coordinates
# Identifying free-text fields blanked when freezing a payload into the committed
# fixture (postcode is kept — it is coarse open data and the cohort key).
@ -36,14 +37,18 @@ def load_corpus(corpus_dir: Path) -> list[list[Comparable]]:
f"no corpus index at {index_path} — run a corpus fetch first"
)
index: dict[str, list[str]] = json.loads(index_path.read_text())
coordinates = load_coordinates(corpus_dir)
return [
_load_cohort(corpus_dir, postcode, certs)
_load_cohort(corpus_dir, postcode, certs, coordinates)
for postcode, certs in index.items()
]
def _load_cohort(
corpus_dir: Path, postcode: str, certs: list[str]
corpus_dir: Path,
postcode: str,
certs: list[str],
coordinates: dict[int, Coordinates],
) -> list[Comparable]:
cohort: list[Comparable] = []
for cert in certs:
@ -55,17 +60,38 @@ def _load_cohort(
epc = EpcPropertyDataMapper.from_api_response(raw)
except Exception: # noqa: BLE001 — a bad cert must not abort the sweep
continue
uprn = _uprn(raw)
cohort.append(
Comparable(
epc=epc,
certificate_number=cert,
address=_address(raw),
registration_date=_registration_date(raw),
coordinates=coordinates.get(uprn) if uprn is not None else None,
)
)
return cohort
def load_coordinates(corpus_dir: Path) -> dict[int, Coordinates]:
    """Read the optional `_coordinates.json` sidecar from `corpus_dir`.

    The sidecar is a JSON object of the form `{uprn: [lon, lat]}`, resolved
    from the OS Open-UPRN data by `fetch_corpus_coordinates.py`. Each key is
    converted to an int UPRN and each pair to a `Coordinates`.

    Returns an empty mapping when the sidecar does not exist (a corpus
    without geo data) — geo-weighting then simply stays off.
    """
    sidecar = corpus_dir / "_coordinates.json"
    by_uprn: dict[int, Coordinates] = {}
    if sidecar.exists():
        entries: dict[str, list[float]] = json.loads(sidecar.read_text())
        for uprn_text, pair in entries.items():
            # JSON object keys are always strings; the lookup side uses ints.
            uprn = int(uprn_text)
            # Stored as [lon, lat]: index 0 is longitude, index 1 is latitude.
            by_uprn[uprn] = Coordinates(longitude=pair[0], latitude=pair[1])
    return by_uprn
def _uprn(raw: dict[str, Any]) -> Optional[int]:
value = raw.get("uprn")
return int(value) if value is not None else None
def stable_hash(prefix: str, value: str) -> str:
"""A short, deterministic, one-way token for a free-text identifier. Stable
across re-lodgements of the same address (normalised first), so dedup still

View file

@ -30,7 +30,8 @@ import boto3
import pandas as pd
CORPUS = Path(os.environ.get("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpus"))
OUT = CORPUS.parent / "epc_prediction_corpus_coords.json"
# Sidecar inside the corpus dir, so `load_corpus` picks it up automatically.
OUT = CORPUS / "_coordinates.json"
_BUCKET = os.environ["DATA_BUCKET"]
_META_KEY = "spatial/filename_meta.parquet"

View file

@ -3,7 +3,11 @@ cert payload for the committed fixture without disturbing the component data the
scorer reads. Pure dict-in / dict-out.
"""
from harness.epc_prediction_corpus import anonymise_payload
import json
from pathlib import Path
from domain.geospatial.coordinates import Coordinates
from harness.epc_prediction_corpus import load_coordinates, anonymise_payload
def _payload() -> dict[str, object]:
@ -69,3 +73,24 @@ def test_does_not_mutate_the_input() -> None:
# Assert — the caller's payload is left intact.
assert raw["address_line_1"] == "12 Acacia Avenue"
def test_loads_the_coordinates_sidecar(tmp_path: Path) -> None:
    # Arrange — write a sidecar holding a single UPRN -> [lon, lat] entry.
    sidecar = tmp_path / "_coordinates.json"
    sidecar.write_text(json.dumps({"100024": [-1.5, 53.4]}))

    # Act
    loaded = load_coordinates(tmp_path)

    # Assert — the string key becomes an int UPRN, the pair a Coordinates.
    expected = {100024: Coordinates(longitude=-1.5, latitude=53.4)}
    assert loaded == expected
def test_coordinates_sidecar_absent_yields_empty(tmp_path: Path) -> None:
    # Arrange / Act — the directory contains no `_coordinates.json` at all
    # (a corpus without geo data).
    loaded = load_coordinates(tmp_path)

    # Assert — an empty mapping, so geo-weighting simply stays off.
    assert loaded == {}