mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
feat(epc-prediction): thread coordinates onto Comparable + target (#1227)
Adds coordinates: Optional[Coordinates] to Comparable and PredictionTarget
(data carriers — the pure predictor stays IO-free), and wires load_corpus to
read an optional _coordinates.json sidecar ({uprn: [lon, lat]}) and populate
each Comparable from its cert's uprn; iter_predictions threads the held-out
target's coordinates through. Absent sidecar -> geo-weighting stays off (no
behaviour change yet — weighting lands next slice). fetch_corpus_coordinates
now writes the sidecar into the corpus dir. With it in place, load_corpus
populates coordinates on 99% of corpus comparables.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
95719dd587
commit
fdc314c857
5 changed files with 65 additions and 4 deletions
|
|
@ -13,6 +13,7 @@ from datetime import date
|
|||
from typing import Callable, Optional, Union
|
||||
|
||||
from datatypes.epc.domain.epc_property_data import EpcPropertyData
|
||||
from domain.geospatial.coordinates import Coordinates
|
||||
|
||||
# Default floor on the cohort: a conditioning filter (built form, a known
|
||||
# override) is applied only while at least this many comparables survive it,
|
||||
|
|
@ -30,6 +31,10 @@ class Comparable:
|
|||
certificate_number: str
|
||||
address: Optional[str] = None
|
||||
registration_date: Optional[date] = None
|
||||
# Resolved from the neighbour's UPRN at the boundary (the harness / modelling
|
||||
# orchestrator), so the pure predictor can weight by physical distance to the
|
||||
# target without an IO dependency. None when no UPRN/coordinate is available.
|
||||
coordinates: Optional[Coordinates] = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
|
@ -45,6 +50,9 @@ class PredictionTarget:
|
|||
# A known Landlord Override (e.g. solid brick) conditions cohort selection —
|
||||
# matching comparables are emphasised while enough remain (ADR-0029).
|
||||
wall_construction: Optional[Union[int, str]] = None
|
||||
# The target Property's own coordinates (resolved from its UPRN), against
|
||||
# which neighbours are distance-weighted. None disables geo-weighting.
|
||||
coordinates: Optional[Coordinates] = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
|
|
|||
|
|
@ -115,6 +115,7 @@ def iter_predictions(
|
|||
postcode=actual.postcode,
|
||||
property_type=actual.property_type or "",
|
||||
built_form=actual.built_form,
|
||||
coordinates=held_out.coordinates,
|
||||
)
|
||||
comparables = select_comparables(target, others)
|
||||
if not comparables.members:
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ from typing import Any, Optional
|
|||
|
||||
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
||||
from domain.epc_prediction.comparable_properties import Comparable
|
||||
from domain.geospatial.coordinates import Coordinates
|
||||
|
||||
# Identifying free-text fields blanked when freezing a payload into the committed
|
||||
# fixture (postcode is kept — it is coarse open data and the cohort key).
|
||||
|
|
@ -36,14 +37,18 @@ def load_corpus(corpus_dir: Path) -> list[list[Comparable]]:
|
|||
f"no corpus index at {index_path} — run a corpus fetch first"
|
||||
)
|
||||
index: dict[str, list[str]] = json.loads(index_path.read_text())
|
||||
coordinates = load_coordinates(corpus_dir)
|
||||
return [
|
||||
_load_cohort(corpus_dir, postcode, certs)
|
||||
_load_cohort(corpus_dir, postcode, certs, coordinates)
|
||||
for postcode, certs in index.items()
|
||||
]
|
||||
|
||||
|
||||
def _load_cohort(
|
||||
corpus_dir: Path, postcode: str, certs: list[str]
|
||||
corpus_dir: Path,
|
||||
postcode: str,
|
||||
certs: list[str],
|
||||
coordinates: dict[int, Coordinates],
|
||||
) -> list[Comparable]:
|
||||
cohort: list[Comparable] = []
|
||||
for cert in certs:
|
||||
|
|
@ -55,17 +60,38 @@ def _load_cohort(
|
|||
epc = EpcPropertyDataMapper.from_api_response(raw)
|
||||
except Exception: # noqa: BLE001 — a bad cert must not abort the sweep
|
||||
continue
|
||||
uprn = _uprn(raw)
|
||||
cohort.append(
|
||||
Comparable(
|
||||
epc=epc,
|
||||
certificate_number=cert,
|
||||
address=_address(raw),
|
||||
registration_date=_registration_date(raw),
|
||||
coordinates=coordinates.get(uprn) if uprn is not None else None,
|
||||
)
|
||||
)
|
||||
return cohort
|
||||
|
||||
|
||||
def load_coordinates(corpus_dir: Path) -> dict[int, Coordinates]:
    """Read the optional `_coordinates.json` sidecar found in `corpus_dir`.

    The sidecar is a JSON object `{uprn: [lon, lat]}`, resolved from the OS
    Open-UPRN data by `fetch_corpus_coordinates.py`. The result maps each UPRN
    (as an int) to its `Coordinates`. A corpus without the file yields an empty
    mapping, which simply leaves geo-weighting off."""
    sidecar = corpus_dir / "_coordinates.json"
    if not sidecar.exists():
        return {}

    entries: dict[str, list[float]] = json.loads(sidecar.read_text())
    by_uprn: dict[int, Coordinates] = {}
    for key, pair in entries.items():
        # JSON object keys are always strings, so the UPRN is converted here;
        # the value is read positionally as [longitude, latitude].
        uprn = int(key)
        by_uprn[uprn] = Coordinates(longitude=pair[0], latitude=pair[1])
    return by_uprn
|
||||
|
||||
|
||||
def _uprn(raw: dict[str, Any]) -> Optional[int]:
|
||||
value = raw.get("uprn")
|
||||
return int(value) if value is not None else None
|
||||
|
||||
|
||||
def stable_hash(prefix: str, value: str) -> str:
|
||||
"""A short, deterministic, one-way token for a free-text identifier. Stable
|
||||
across re-lodgements of the same address (normalised first), so dedup still
|
||||
|
|
|
|||
|
|
@ -30,7 +30,8 @@ import boto3
|
|||
import pandas as pd
|
||||
|
||||
CORPUS = Path(os.environ.get("EPC_PREDICTION_CORPUS", "/tmp/epc_prediction_corpus"))
|
||||
OUT = CORPUS.parent / "epc_prediction_corpus_coords.json"
|
||||
# Sidecar inside the corpus dir, so `load_corpus` picks it up automatically.
|
||||
OUT = CORPUS / "_coordinates.json"
|
||||
_BUCKET = os.environ["DATA_BUCKET"]
|
||||
_META_KEY = "spatial/filename_meta.parquet"
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,11 @@ cert payload for the committed fixture without disturbing the component data the
|
|||
scorer reads. Pure dict-in / dict-out.
|
||||
"""
|
||||
|
||||
from harness.epc_prediction_corpus import anonymise_payload
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from domain.geospatial.coordinates import Coordinates
|
||||
from harness.epc_prediction_corpus import load_coordinates, anonymise_payload
|
||||
|
||||
|
||||
def _payload() -> dict[str, object]:
|
||||
|
|
@ -69,3 +73,24 @@ def test_does_not_mutate_the_input() -> None:
|
|||
|
||||
# Assert — the caller's payload is left intact.
|
||||
assert raw["address_line_1"] == "12 Acacia Avenue"
|
||||
|
||||
|
||||
def test_loads_the_coordinates_sidecar(tmp_path: Path) -> None:
    # Arrange — write a one-entry sidecar: UPRN (a JSON string key) -> [lon, lat].
    sidecar = tmp_path / "_coordinates.json"
    sidecar.write_text(json.dumps({"100024": [-1.5, 53.4]}))

    # Act
    loaded = load_coordinates(tmp_path)

    # Assert — the key comes back as an int and the pair as Coordinates.
    expected = {100024: Coordinates(longitude=-1.5, latitude=53.4)}
    assert loaded == expected
|
||||
|
||||
|
||||
def test_coordinates_sidecar_absent_yields_empty(tmp_path: Path) -> None:
    # Arrange / Act — the directory is empty, so there is no sidecar to read
    # (a corpus without geo data).
    loaded = load_coordinates(tmp_path)

    # Assert — an empty mapping comes back, so geo-weighting stays off.
    assert loaded == {}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue