mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
chore(epc-prediction): dense-corpus fetcher + cross-postcode geo no-go
Build a geographically DENSE postcode-clustered corpus to test cross-postcode geo expansion (the handover's anticipated "real geo payoff").

The gov EPC API has no area/prefix search (a partial postcode 400s; the old opendatacommunities partial-search API is decommissioned), so neighbourhood enumeration is external: seed K postcodes nationally, expand each via postcodes.io's nearest-postcode endpoint into every unit within RADIUS_M, pull each one's full EPC cohort. postcodes.io is a corpus-BUILD dependency only — the predictor stays pure. Same on-disk layout as the scattered corpus, so load_corpus + the coords resolver consume it unchanged.

MEASURE-FIRST RESULT — cross-postcode expansion is a NO-GO. On a 2-seed pilot (York YO19 + Islington N51, 81 postcodes / 1558 certs, 140 SAP-10.2 targets), pooling nearby postcodes regresses accuracy across the board:

    same-postcode   FA_MAE  9.53   wall 92%   age 72%   floor_con 85%   cylinder 91%
    cross <=0.3km   FA_MAE 13.1    wall 80%   age 61%   floor_con 82%   cylinder 79%

Even as a thin-cohort top-up it hurts (thin n=18: FA 5.24 -> 7.15).

Root cause: the postcode boundary is itself a strong homogeneity prior (a postcode is one coherent street/development), so same-postcode neighbours beat geographically near cross-boundary ones even when the home postcode is sparse (and they rarely are — median same-postcode cohort here is 34). Geo-proximity helps WITHIN a postcode (#1227) but does not survive crossing the boundary.

Cross-postcode geo closed; geo weighting stays intra-postcode. Tooling kept (reusable).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
be3e51bae9
commit
58d5b17145
1 changed files with 197 additions and 0 deletions
197
scripts/fetch_epc_prediction_dense_corpus.py
Normal file
197
scripts/fetch_epc_prediction_dense_corpus.py
Normal file
|
|
@ -0,0 +1,197 @@
|
|||
"""Build a *geographically dense* postcode-clustered corpus for EPC Prediction
|
||||
(cross-postcode geo expansion — follow-up to ADR-0029 / issue #1227, #1237).
|
||||
|
||||
WHY A SECOND CORPUS
|
||||
-------------------
|
||||
`fetch_epc_prediction_corpus.py` samples *scattered* national postcodes — fine
|
||||
for intra-postcode validation, but a held-out target's true geo-neighbours (the
|
||||
adjacent postcodes on its street) are NOT in that corpus, so the cross-postcode
|
||||
geo lever (distance-weighting a cohort that spans postcode boundaries) and
|
||||
built-form-aware sizing (#1237) cannot be measured on it.
|
||||
|
||||
This builds dense clusters instead: each of K reproducible seed postcodes is
|
||||
expanded — via postcodes.io's nearest-postcode endpoint — into EVERY unit
|
||||
postcode within `RADIUS_M`, and each of those gets its full EPC cohort pulled.
|
||||
The result is a handful of dense neighbourhoods (a target's real neighbours ARE
|
||||
in-corpus) spread across the country (the seeds are nationally sampled, so the
|
||||
validation set stays diverse).
|
||||
|
||||
postcodes.io is a CORPUS-BUILD dependency only (a free, public, OGL postcode
|
||||
service) — the predictor stays pure. The gov EPC API has no area/prefix search
|
||||
(a partial postcode 400s; only a full unit is accepted), which is why the
|
||||
neighbour enumeration is external.
|
||||
|
||||
USAGE
|
||||
-----
|
||||
PYTHONPATH=. python scripts/fetch_epc_prediction_dense_corpus.py # full
|
||||
PYTHONPATH=. python scripts/fetch_epc_prediction_dense_corpus.py --pilot # 2 seeds
|
||||
|
||||
Resumable — re-running skips cached certs. Token from `backend/.env`. Cache dir
|
||||
defaults to `/tmp/epc_prediction_dense_corpus` (separate from the scattered one),
|
||||
overridable via `EPC_PREDICTION_DENSE_CORPUS`. Layout matches the other corpus
|
||||
(`<POSTCODE_NOSPACE>/<cert>.json` + `_index.json`), so `load_corpus` and the
|
||||
coordinate resolver consume it unchanged.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import httpx
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Populate the process environment from the backend env file before reading
# the token below.
load_dotenv("backend/.env")
# Direct indexing: a missing token raises KeyError at import time.
TOKEN = os.environ["OPEN_EPC_API_TOKEN"]
# Base URL of the gov EPC API; endpoint paths are appended to it.
BASE = "https://api.get-energy-performance-data.communities.gov.uk"
# Headers passed on every EPC API request (bearer token, JSON responses).
H = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/json"}
# Base URL of postcodes.io, used only for neighbour enumeration.
POSTCODES_IO = "https://api.postcodes.io"
# Cache root: EPC_PREDICTION_DENSE_CORPUS when set, otherwise the /tmp default.
CACHE = Path(
    os.environ.get("EPC_PREDICTION_DENSE_CORPUS", "/tmp/epc_prediction_dense_corpus")
)
# Created at import time so the cache root always exists.
CACHE.mkdir(parents=True, exist_ok=True)

# Seed sampling mirrors the scattered fetch (random search pages → an unbiased
# national postcode spread), then each seed is densified. `date_end` must be
# strictly before today.
WINDOW = {"date_start": "2026-01-01", "date_end": "2026-05-31"}
# Upper bound (inclusive) of the page numbers drawn in sample_seed_postcodes.
# NOTE(review): presumably the number of search result pages for WINDOW at
# page_size=100 — confirm against the API before changing WINDOW.
TOTAL_PAGES = 7402
SEED_PAGES = 8  # random search pages → seed postcodes
N_SEEDS = 25  # dense neighbourhood clusters to build
RADIUS_M = 300  # postcodes.io nearest-postcode radius around each seed
MAX_PER_SEED = 60  # cap unit postcodes per seed (dense urban seeds can be huge)
# Seeds the global `random` module state, which sample_seed_postcodes draws from.
random.seed(2026)  # reproducible draw
|
||||
|
||||
|
||||
def _retry_delay(retry_after: Optional[str], attempt: int) -> float:
    """Seconds to wait before the next attempt.

    `retry_after` is the raw `Retry-After` header value (or None). HTTP allows
    it to be either a number of seconds or an HTTP-date; both are honoured.
    Anything missing, unparseable, negative or non-finite falls back to the
    linear backoff `1.5 * (attempt + 1)` (or 0 for a date already in the past).
    """
    fallback = 1.5 * (attempt + 1)
    if not retry_after:
        return fallback
    try:
        seconds = float(retry_after)
    except ValueError:
        seconds = None
    if seconds is not None:
        # Rejects NaN, inf and negatives, all of which time.sleep cannot take.
        return seconds if 0 <= seconds < float("inf") else fallback

    # Not numeric: try the HTTP-date form (e.g. "Wed, 21 Oct 2026 07:28:00 GMT").
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    try:
        when = parsedate_to_datetime(retry_after)
    except (TypeError, ValueError):
        return fallback
    if when.tzinfo is None:
        when = when.replace(tzinfo=timezone.utc)
    return max(0.0, (when - datetime.now(timezone.utc)).total_seconds())


def _get(url: str, params: dict[str, Any], headers: Optional[dict[str, str]] = None,
         timeout: float = 20.0, tries: int = 5):
    """GET with retry/backoff on 429 + 5xx (honours Retry-After).

    Args:
        url: Absolute URL to request.
        params: Query-string parameters.
        headers: Optional request headers; an empty dict is sent when omitted.
        timeout: Per-request timeout passed to httpx, in seconds.
        tries: Maximum number of attempts.

    Returns:
        The first response whose status is neither 429 nor >= 500. If every
        attempt is exhausted, the last response received (which may be a
        429/5xx), or None when no attempt produced a response (every attempt
        raised `httpx.HTTPError`, or `tries` <= 0). Callers must check
        `is_success` themselves.
    """
    last = None
    for attempt in range(tries):
        # No point sleeping after the final attempt: nothing follows it.
        is_final = attempt == tries - 1
        try:
            last = httpx.get(url, params=params, headers=headers or {}, timeout=timeout)
        except httpx.HTTPError:
            if not is_final:
                time.sleep(1.5 * (attempt + 1))
            continue
        if last.status_code == 429 or last.status_code >= 500:
            if not is_final:
                time.sleep(_retry_delay(last.headers.get("Retry-After"), attempt))
            continue
        return last
    return last
|
||||
|
||||
|
||||
def _normalise_postcode(postcode: str) -> str:
    """Return `postcode` upper-cased with every space character removed.

    Only the plain space (" ") is stripped; other whitespace is left untouched.
    """
    compact = "".join(postcode.split(" "))
    return compact.upper()
|
||||
|
||||
|
||||
def sample_seed_postcodes(n_seeds: int) -> list[str]:
    """Draw up to `n_seeds` distinct seed postcodes from random search pages.

    `SEED_PAGES` page numbers are sampled from 1..`TOTAL_PAGES` (using the
    module-level `random` state) and visited in ascending order. Postcodes are
    kept in first-seen order; pages that fail are reported and skipped.

    NOTE(review): paging stops as soon as `n_seeds` postcodes have been
    collected, so with 100 rows per page the seeds can all come from the first
    page(s) visited rather than from all `SEED_PAGES` pages.
    """
    page_numbers = sorted(random.sample(range(1, TOTAL_PAGES + 1), SEED_PAGES))
    # A dict with dummy values acts as an insertion-ordered set.
    distinct: dict[str, None] = {}
    for page in page_numbers:
        query = {**WINDOW, "current_page": page, "page_size": 100}
        resp = _get(f"{BASE}/api/domestic/search", query, headers=H)
        if resp is None or not resp.is_success:
            print(f" seed page {page} -> {getattr(resp, 'status_code', 'ERR')}")
            continue
        candidates = (row.get("postcode") for row in resp.json().get("data", []))
        # update() leaves already-seen keys at their original position.
        distinct.update(dict.fromkeys(pc for pc in candidates if pc))
        if len(distinct) >= n_seeds:
            break
    return list(distinct)[:n_seeds]
|
||||
|
||||
|
||||
def nearby_postcodes(seed: str) -> list[str]:
    """Unit postcodes within `RADIUS_M` of `seed`, via postcodes.io.

    The seed is first looked up to obtain its coordinates, then the
    nearest-postcode endpoint is queried around that point (the request asks
    for at most 100 results).

    Returns:
        The seed itself followed by its neighbours in the order postcodes.io
        returned them, deduplicated on the normalised (no-space, upper-case)
        form and capped at `MAX_PER_SEED` entries. If either lookup fails or
        the seed has no coordinates, just `[seed]`.
    """
    from urllib.parse import quote

    # Percent-encode the whole path segment rather than only spaces.
    lookup = _get(f"{POSTCODES_IO}/postcodes/{quote(seed, safe='')}", {})
    if lookup is None or not lookup.is_success:
        return [seed]
    res: dict[str, Any] = lookup.json().get("result") or {}
    lat: Any = res.get("latitude")
    lon: Any = res.get("longitude")
    if lat is None or lon is None:
        return [seed]

    r = _get(
        f"{POSTCODES_IO}/postcodes",
        {"lon": lon, "lat": lat, "radius": RADIUS_M, "limit": 100},
    )
    if r is None or not r.is_success:
        return [seed]
    items: list[dict[str, Any]] = r.json().get("result") or []

    ordered: list[str] = [seed]
    # Compare on the normalised form so "yo19 5aa" and "YO19 5AA" (or a
    # repeated neighbour) are recognised as the same postcode.
    seen: set[str] = {_normalise_postcode(seed)}
    for item in items:
        raw = item.get("postcode")
        if not raw:
            continue
        pc = str(raw)
        key = _normalise_postcode(pc)
        if key in seen:
            continue
        seen.add(key)
        ordered.append(pc)
    return ordered[:MAX_PER_SEED]
|
||||
|
||||
|
||||
def cohort_cert_numbers(postcode: str) -> list[str]:
    """Certificate numbers the EPC search endpoint returns for `postcode`.

    Reads only the `data` rows of the single search response; rows without a
    `certificateNumber` are skipped. Returns an empty list when the request
    fails or is unsuccessful.
    """
    resp = _get(f"{BASE}/api/domestic/search", {"postcode": postcode}, headers=H)
    if resp is None or not resp.is_success:
        return []
    numbers = []
    for row in resp.json().get("data", []):
        number = row.get("certificateNumber")
        if number:
            numbers.append(number)
    return numbers
|
||||
|
||||
|
||||
def fetch_cert(postcode_nospace: str, cert: str) -> bool:
    """Fetch + cache one cert's raw `data` payload (True on success / cached).

    Args:
        postcode_nospace: Cache sub-directory name (normalised postcode).
        cert: Certificate number; also the cache file's stem.

    Returns:
        True if `<CACHE>/<postcode_nospace>/<cert>.json` already exists or was
        written by this call; False if the request failed or the response had
        no usable `data` member.
    """
    out = CACHE / postcode_nospace / f"{cert}.json"
    if out.exists():
        return True
    r = _get(f"{BASE}/api/certificate", {"certificate_number": cert}, headers=H)
    if r is None or not r.is_success:
        return False
    try:
        payload = r.json()["data"]
    except (KeyError, TypeError, ValueError):
        # Missing key, non-dict body, or a body that is not JSON at all.
        return False
    out.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place: resumption trusts
    # `out.exists()`, so a run killed mid-write must never leave a truncated
    # `<cert>.json` behind.
    tmp = out.with_name(out.name + ".tmp")
    tmp.write_text(json.dumps(payload), encoding="utf-8")
    tmp.replace(out)
    return True
|
||||
|
||||
|
||||
def main() -> None:
    """Build the dense corpus: sample seeds, expand each, cache every cohort.

    With `--pilot` on the command line only 2 seeds are used, otherwise
    `N_SEEDS`. For every postcode in every seed's neighbourhood the cohort's
    certificates are fetched (or found in cache), and `_index.json` under
    `CACHE` maps each normalised postcode to the certificates obtained for it.
    Postcodes that yield no certificates are left out of the index.
    """
    pilot = "--pilot" in sys.argv
    n_seeds = 2 if pilot else N_SEEDS
    print(f"sampling {n_seeds} seed postcodes ...")
    seeds = sample_seed_postcodes(n_seeds)
    print(f"seeds: {seeds}")

    index: dict[str, list[str]] = {}
    # Every postcode already processed this run, including those that produced
    # no certs. `index` alone cannot serve as the dedup key, because
    # empty-cohort postcodes never enter it and would be re-queried for each
    # overlapping neighbourhood.
    visited: set[str] = set()
    index_path = CACHE / "_index.json"
    t0 = time.time()
    total_certs = 0
    for si, seed in enumerate(seeds, 1):
        neighbourhood = nearby_postcodes(seed)
        print(f"\n[seed {si}/{len(seeds)}] {seed}: {len(neighbourhood)} postcodes "
              f"within {RADIUS_M}m")
        for pc in neighbourhood:
            nospace = _normalise_postcode(pc)
            if nospace in visited:
                continue  # neighbourhoods can overlap; fetch each postcode once
            visited.add(nospace)
            certs = cohort_cert_numbers(pc)
            fetched = [c for c in certs if fetch_cert(nospace, c)]
            if fetched:
                index[nospace] = fetched
                total_certs += len(fetched)
        print(f" cumulative: {len(index)} postcodes, {total_certs} certs")
        # Checkpoint after each seed so an interrupted run still leaves an
        # index describing what has been cached so far.
        index_path.write_text(json.dumps(index, indent=2))
    # Final write (also covers the case where no seeds were sampled).
    index_path.write_text(json.dumps(index, indent=2))
    print(
        f"\nDONE in {time.time() - t0:.0f}s: {len(seeds)} seeds, "
        f"{len(index)} postcodes, {total_certs} certs under {CACHE}"
    )
|
||||
|
||||
|
||||
# Run the fetch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Reference in a new issue