From fdc314c8574c86bbe629cb39a413dfa6abaf1684 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 15 Jun 2026 14:46:01 +0000 Subject: [PATCH] feat(epc-prediction): thread coordinates onto Comparable + target (#1227) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds coordinates: Optional[Coordinates] to Comparable and PredictionTarget (data carriers — the pure predictor stays IO-free), and wires load_corpus to read an optional _coordinates.json sidecar ({uprn: [lon, lat]}) and populate each Comparable from its cert's uprn; iter_predictions threads the held-out target's coordinates through. Absent sidecar -> geo-weighting stays off (no behaviour change yet — weighting lands next slice). fetch_corpus_coordinates now writes the sidecar into the corpus dir. load_corpus populates 99% of corpus comparables. Co-Authored-By: Claude Opus 4.8 --- .../epc_prediction/comparable_properties.py | 8 +++++ domain/epc_prediction/validation.py | 1 + harness/epc_prediction_corpus.py | 30 +++++++++++++++++-- scripts/fetch_corpus_coordinates.py | 3 +- tests/harness/test_epc_prediction_corpus.py | 27 ++++++++++++++++- 5 files changed, 65 insertions(+), 4 deletions(-) diff --git a/domain/epc_prediction/comparable_properties.py b/domain/epc_prediction/comparable_properties.py index bccff188..dedbc9e9 100644 --- a/domain/epc_prediction/comparable_properties.py +++ b/domain/epc_prediction/comparable_properties.py @@ -13,6 +13,7 @@ from datetime import date from typing import Callable, Optional, Union from datatypes.epc.domain.epc_property_data import EpcPropertyData +from domain.geospatial.coordinates import Coordinates # Default floor on the cohort: a conditioning filter (built form, a known # override) is applied only while at least this many comparables survive it, @@ -30,6 +31,10 @@ class Comparable: certificate_number: str address: Optional[str] = None registration_date: Optional[date] = None + # Resolved from the neighbour's UPRN at the boundary (the harness / modelling + # orchestrator), so the pure predictor can weight by physical distance to the + # target without an IO dependency. None when no UPRN/coordinate is available. + coordinates: Optional[Coordinates] = None @dataclass(frozen=True) @@ -45,6 +50,9 @@ class PredictionTarget: # A known Landlord Override (e.g. solid brick) conditions cohort selection — # matching comparables are emphasised while enough remain (ADR-0029). wall_construction: Optional[Union[int, str]] = None + # The target Property's own coordinates (resolved from its UPRN), against + # which neighbours are distance-weighted. None disables geo-weighting. + coordinates: Optional[Coordinates] = None @dataclass(frozen=True) diff --git a/domain/epc_prediction/validation.py b/domain/epc_prediction/validation.py index d778246e..0e1234a6 100644 --- a/domain/epc_prediction/validation.py +++ b/domain/epc_prediction/validation.py @@ -115,6 +115,7 @@ def iter_predictions( postcode=actual.postcode, property_type=actual.property_type or "", built_form=actual.built_form, + coordinates=held_out.coordinates, ) comparables = select_comparables(target, others) if not comparables.members: diff --git a/harness/epc_prediction_corpus.py b/harness/epc_prediction_corpus.py index 6b1761e8..e29117da 100644 --- a/harness/epc_prediction_corpus.py +++ b/harness/epc_prediction_corpus.py @@ -21,6 +21,7 @@ from typing import Any, Optional from datatypes.epc.domain.mapper import EpcPropertyDataMapper from domain.epc_prediction.comparable_properties import Comparable +from domain.geospatial.coordinates import Coordinates # Identifying free-text fields blanked when freezing a payload into the committed # fixture (postcode is kept — it is coarse open data and the cohort key). @@ -36,14 +37,18 @@ def load_corpus(corpus_dir: Path) -> list[list[Comparable]]: f"no corpus index at {index_path} — run a corpus fetch first" ) index: dict[str, list[str]] = json.loads(index_path.read_text()) + coordinates = load_coordinates(corpus_dir) return [ - _load_cohort(corpus_dir, postcode, certs) + _load_cohort(corpus_dir, postcode, certs, coordinates) for postcode, certs in index.items() ] def _load_cohort( - corpus_dir: Path, postcode: str, certs: list[str] + corpus_dir: Path, + postcode: str, + certs: list[str], + coordinates: dict[int, Coordinates], ) -> list[Comparable]: cohort: list[Comparable] = [] for cert in certs: @@ -55,17 +60,38 @@ def _load_cohort( epc = EpcPropertyDataMapper.from_api_response(raw) except Exception: # noqa: BLE001 — a bad cert must not abort the sweep continue + uprn = _uprn(raw) cohort.append( Comparable( epc=epc, certificate_number=cert, address=_address(raw), registration_date=_registration_date(raw), + coordinates=coordinates.get(uprn) if uprn is not None else None, ) ) return cohort +def load_coordinates(corpus_dir: Path) -> dict[int, Coordinates]: + """The optional `_coordinates.json` sidecar (`{uprn: [lon, lat]}`), resolved + from the OS Open-UPRN data by `fetch_corpus_coordinates.py`. Absent for a + corpus without geo data — geo-weighting then simply stays off.""" + path = corpus_dir / "_coordinates.json" + if not path.exists(): + return {} + raw: dict[str, list[float]] = json.loads(path.read_text()) + return { + int(uprn): Coordinates(longitude=lon_lat[0], latitude=lon_lat[1]) + for uprn, lon_lat in raw.items() + } + + +def _uprn(raw: dict[str, Any]) -> Optional[int]: + value = raw.get("uprn") + return int(value) if value is not None else None + + def stable_hash(prefix: str, value: str) -> str: """A short, deterministic, one-way token for a free-text identifier. Stable across re-lodgements of the same address (normalised first), so dedup still diff --git a/scripts/fetch_corpus_coordinates.py b/scripts/fetch_corpus_coordinates.py index bd87acf1..8dca8516 100644 --- a/scripts/fetch_corpus_coordinates.py +++ b/scripts/fetch_corpus_coordinates.py @@ -30,7 +30,8 @@ import boto3 import pandas as pd CORPUS = Path(os.environ.get("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpus")) -OUT = CORPUS.parent / "epc_prediction_corpus_coords.json" +# Sidecar inside the corpus dir, so `load_corpus` picks it up automatically. +OUT = CORPUS / "_coordinates.json" _BUCKET = os.environ["DATA_BUCKET"] _META_KEY = "spatial/filename_meta.parquet" diff --git a/tests/harness/test_epc_prediction_corpus.py b/tests/harness/test_epc_prediction_corpus.py index 543f29af..877739d8 100644 --- a/tests/harness/test_epc_prediction_corpus.py +++ b/tests/harness/test_epc_prediction_corpus.py @@ -3,7 +3,11 @@ cert payload for the committed fixture without disturbing the component data the scorer reads. Pure dict-in / dict-out. """ -from harness.epc_prediction_corpus import anonymise_payload +import json +from pathlib import Path + +from domain.geospatial.coordinates import Coordinates +from harness.epc_prediction_corpus import load_coordinates, anonymise_payload def _payload() -> dict[str, object]: @@ -69,3 +73,24 @@ def test_does_not_mutate_the_input() -> None: # Assert — the caller's payload is left intact. assert raw["address_line_1"] == "12 Acacia Avenue" + + +def test_loads_the_coordinates_sidecar(tmp_path: Path) -> None: + # Arrange — a `_coordinates.json` sidecar mapping UPRN -> [lon, lat]. + (tmp_path / "_coordinates.json").write_text( + json.dumps({"100024": [-1.5, 53.4]}) + ) + + # Act + coordinates = load_coordinates(tmp_path) + + # Assert — parsed into UPRN-keyed Coordinates. + assert coordinates == {100024: Coordinates(longitude=-1.5, latitude=53.4)} + + +def test_coordinates_sidecar_absent_yields_empty(tmp_path: Path) -> None: + # Arrange / Act — no sidecar present (a corpus without geo data). + coordinates = load_coordinates(tmp_path) + + # Assert — geo-weighting simply stays off. + assert coordinates == {}