From 58d5b171453eee6e7dfdfcf80d5ef7d7e962ce69 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 16 Jun 2026 03:03:15 +0000 Subject: [PATCH] chore(epc-prediction): dense-corpus fetcher + cross-postcode geo no-go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build a geographically DENSE postcode-clustered corpus to test cross-postcode geo expansion (the handover's anticipated "real geo payoff"). The gov EPC API has no area/prefix search (a partial postcode 400s; the old opendatacommunities partial-search API is decommissioned), so neighbourhood enumeration is external: seed K postcodes nationally, expand each via postcodes.io's nearest-postcode endpoint into every unit within RADIUS_M, pull each one's full EPC cohort. postcodes.io is a corpus-BUILD dependency only — the predictor stays pure. Same on-disk layout as the scattered corpus, so load_corpus + the coords resolver consume it unchanged. MEASURE-FIRST RESULT — cross-postcode expansion is a NO-GO. On a 2-seed pilot (York YO19 + Islington N51, 81 postcodes / 1558 certs, 140 SAP-10.2 targets), pooling nearby postcodes regresses accuracy across the board: same-postcode FA_MAE 9.53 wall 92% age 72% floor_con 85% cylinder 91% cross <=0.3km FA_MAE 13.1 wall 80% age 61% floor_con 82% cylinder 79% Even as a thin-cohort top-up it hurts (thin n=18: FA 5.24 -> 7.15). Root cause: the postcode boundary is itself a strong homogeneity prior (a postcode is one coherent street/development), so same-postcode neighbours beat geographically near cross-boundary ones even when the home postcode is sparse (and they rarely are — median same-postcode cohort here is 34). Geo-proximity helps WITHIN a postcode (#1227) but does not survive crossing the boundary. Cross-postcode geo closed; geo weighting stays intra-postcode. Tooling kept (reusable). 
Co-Authored-By: Claude Opus 4.8 --- scripts/fetch_epc_prediction_dense_corpus.py | 197 +++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 scripts/fetch_epc_prediction_dense_corpus.py diff --git a/scripts/fetch_epc_prediction_dense_corpus.py b/scripts/fetch_epc_prediction_dense_corpus.py new file mode 100644 index 00000000..97ed0aba --- /dev/null +++ b/scripts/fetch_epc_prediction_dense_corpus.py @@ -0,0 +1,197 @@ +"""Build a *geographically dense* postcode-clustered corpus for EPC Prediction +(cross-postcode geo expansion — follow-up to ADR-0029 / issue #1227, #1237). + +WHY A SECOND CORPUS +------------------- +`fetch_epc_prediction_corpus.py` samples *scattered* national postcodes — fine +for intra-postcode validation, but a held-out target's true geo-neighbours (the +adjacent postcodes on its street) are NOT in that corpus, so the cross-postcode +geo lever (distance-weighting a cohort that spans postcode boundaries) and +built-form-aware sizing (#1237) cannot be measured on it. + +This builds dense clusters instead: each of K reproducible seed postcodes is +expanded — via postcodes.io's nearest-postcode endpoint — into EVERY unit +postcode within `RADIUS_M`, and each of those gets its full EPC cohort pulled. +The result is a handful of dense neighbourhoods (a target's real neighbours ARE +in-corpus) spread across the country (the seeds are nationally sampled, so the +validation set stays diverse). + +postcodes.io is a CORPUS-BUILD dependency only (a free, public, OGL postcode +service) — the predictor stays pure. The gov EPC API has no area/prefix search +(a partial postcode 400s; only a full unit is accepted), which is why the +neighbour enumeration is external. + +USAGE +----- + PYTHONPATH=. python scripts/fetch_epc_prediction_dense_corpus.py # full + PYTHONPATH=. python scripts/fetch_epc_prediction_dense_corpus.py --pilot # 2 seeds + +Resumable — re-running skips cached certs. Token from `backend/.env`. 
Cache dir +defaults to `/tmp/epc_prediction_dense_corpus` (separate from the scattered one), +overridable via `EPC_PREDICTION_DENSE_CORPUS`. Layout matches the other corpus +(`{postcode_nospace}/{cert}.json` + `_index.json`), so `load_corpus` and the +coordinate resolver consume it unchanged. +""" + +import json +import os +import random +import sys +import time +from pathlib import Path +from typing import Any, Optional + +import httpx +from dotenv import load_dotenv + +load_dotenv("backend/.env") +TOKEN = os.environ["OPEN_EPC_API_TOKEN"] +BASE = "https://api.get-energy-performance-data.communities.gov.uk" +H = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/json"} +POSTCODES_IO = "https://api.postcodes.io" +CACHE = Path( + os.environ.get("EPC_PREDICTION_DENSE_CORPUS", "/tmp/epc_prediction_dense_corpus") +) +CACHE.mkdir(parents=True, exist_ok=True) + +# Seed sampling mirrors the scattered fetch (random search pages → an unbiased +# national postcode spread), then each seed is densified. `date_end` must be +# strictly before today. 
WINDOW = {"date_start": "2026-01-01", "date_end": "2026-05-31"}
# NOTE(review): presumably the page count of the WINDOW search at
# page_size=100 — re-check whenever WINDOW changes.
TOTAL_PAGES = 7402
SEED_PAGES = 8  # random search pages → seed postcodes
N_SEEDS = 25  # dense neighbourhood clusters to build
RADIUS_M = 300  # postcodes.io nearest-postcode radius around each seed
MAX_PER_SEED = 60  # cap unit postcodes per seed (dense urban seeds can be huge)
NEAREST_LIMIT = 100  # `limit` sent to the postcodes.io nearest-postcode lookup
random.seed(2026)  # reproducible draw


def _retry_delay(retry_after: Optional[str], attempt: int) -> float:
    """Return the number of seconds to wait before the next attempt.

    A `Retry-After` value is used only when it parses as a finite,
    non-negative number of seconds. Anything else (header absent, an
    HTTP-date, garbage) falls back to the linear backoff
    ``1.5 * (attempt + 1)`` instead of raising.
    """
    fallback = 1.5 * (attempt + 1)
    if not retry_after:
        return fallback
    try:
        delay = float(retry_after)
    except ValueError:
        return fallback
    # Rejects negatives, inf and nan (every comparison against nan is False),
    # all of which `time.sleep` would refuse.
    return delay if 0 <= delay < float("inf") else fallback


def _get(url: str, params: dict[str, Any], headers: Optional[dict[str, str]] = None,
         timeout: float = 20.0, tries: int = 5) -> "Optional[httpx.Response]":
    """GET with retry/backoff on transport errors, 429 and 5xx.

    Args:
        url: Absolute URL to request.
        params: Query-string parameters.
        headers: Optional request headers (none are sent when omitted).
        timeout: Per-attempt timeout in seconds, passed to `httpx.get`.
        tries: Maximum number of attempts.

    Returns:
        The first response that is neither 429 nor 5xx. If every attempt
        fails, the last 429/5xx response received, or None when no attempt
        produced a response at all (including ``tries <= 0``).
    """
    last = None
    for attempt in range(tries):
        try:
            resp = httpx.get(url, params=params, headers=headers or {}, timeout=timeout)
        except httpx.HTTPError:
            delay = _retry_delay(None, attempt)
        else:
            if resp.status_code != 429 and resp.status_code < 500:
                return resp
            last = resp
            delay = _retry_delay(resp.headers.get("Retry-After"), attempt)
        # No point sleeping once there is no further attempt to wait for.
        if attempt + 1 < tries:
            time.sleep(delay)
    return last


def _json_object(resp: Any) -> dict[str, Any]:
    """Decode a response body as a JSON object; return {} when it is not one.

    Keeps one malformed response from aborting the whole crawl — callers
    treat an empty dict as "no data".
    """
    try:
        body = resp.json()
    except ValueError:  # includes json.JSONDecodeError
        return {}
    return body if isinstance(body, dict) else {}


def _atomic_write_text(path: Path, text: str) -> None:
    """Write `text` to `path` via a sibling temp file and a rename.

    The resume logic treats "file exists" as "already fetched", so a
    half-written file left by an interrupted run must never appear under
    the final name.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_name(path.name + ".tmp")
    tmp.write_text(text)
    tmp.replace(path)


def _normalise_postcode(postcode: str) -> str:
    """Return `postcode` upper-cased with all whitespace removed."""
    return "".join(postcode.split()).upper()


def sample_seed_postcodes(n_seeds: int) -> list[str]:
    """Draw up to `n_seeds` distinct seed postcodes from random search pages.

    `SEED_PAGES` page numbers are sampled from ``1..TOTAL_PAGES`` and fetched
    in ascending order. Postcodes are collected in the order seen, deduped on
    their normalised form, and paging stops as soon as enough are collected.
    Fewer than `n_seeds` may be returned if the sampled pages fail or run dry.
    """
    if n_seeds <= 0:
        return []
    pages = sorted(random.sample(range(1, TOTAL_PAGES + 1), SEED_PAGES))
    seeds: dict[str, str] = {}  # normalised postcode -> postcode as first seen
    for page in pages:
        if len(seeds) >= n_seeds:
            break  # enough seeds: don't spend requests on the remaining pages
        resp = _get(
            f"{BASE}/api/domestic/search",
            {**WINDOW, "current_page": page, "page_size": 100},
            headers=H,
        )
        if resp is None or not resp.is_success:
            print(f" seed page {page} -> {getattr(resp, 'status_code', 'ERR')}")
            continue
        for row in _json_object(resp).get("data") or []:
            pc = row.get("postcode")
            if pc:
                seeds.setdefault(_normalise_postcode(pc), pc)
            if len(seeds) >= n_seeds:
                break
    return list(seeds.values())[:n_seeds]


def _merge_neighbourhood(seed: str, found: list[str], cap: int) -> list[str]:
    """Return `seed` followed by `found` (order kept), deduped and capped.

    Deduplication is on the normalised form, so a seed spelt "YO195AB" is
    not repeated when the lookup returns it as "YO19 5AB".
    """
    ordered = [seed]
    seen = {_normalise_postcode(seed)}
    for pc in found:
        key = _normalise_postcode(pc)
        if key not in seen:
            seen.add(key)
            ordered.append(pc)
    return ordered[:cap]


def nearby_postcodes(seed: str) -> list[str]:
    """Unit postcodes within `RADIUS_M` of `seed`, via postcodes.io.

    The seed is first resolved to coordinates, then the nearest-postcode
    endpoint is queried around that point. At most `NEAREST_LIMIT` results
    are requested, so a very dense area may be truncated before the
    `MAX_PER_SEED` cap is applied.

    Returns:
        The seed itself followed by its neighbours in the order postcodes.io
        returned them, deduped and capped at `MAX_PER_SEED`. Falls back to
        ``[seed]`` if either lookup fails or the seed has no coordinates.
    """
    lookup = _get(f"{POSTCODES_IO}/postcodes/{seed.replace(' ', '%20')}", {})
    if lookup is None or not lookup.is_success:
        return [seed]
    res: dict[str, Any] = _json_object(lookup).get("result") or {}
    lat: Any = res.get("latitude")
    lon: Any = res.get("longitude")
    if lat is None or lon is None:
        return [seed]
    nearest = _get(
        f"{POSTCODES_IO}/postcodes",
        {"lon": lon, "lat": lat, "radius": RADIUS_M, "limit": NEAREST_LIMIT},
    )
    if nearest is None or not nearest.is_success:
        return [seed]
    items: list[dict[str, Any]] = _json_object(nearest).get("result") or []
    found = [str(x["postcode"]) for x in items if x.get("postcode")]
    return _merge_neighbourhood(seed, found, MAX_PER_SEED)


def cohort_cert_numbers(postcode: str) -> list[str]:
    """Certificate numbers the EPC search returns for one full postcode.

    Returns an empty list when the request fails or the body is unusable.
    """
    resp = _get(f"{BASE}/api/domestic/search", {"postcode": postcode}, headers=H)
    if resp is None or not resp.is_success:
        return []
    return [
        row["certificateNumber"]
        for row in _json_object(resp).get("data") or []
        if row.get("certificateNumber")
    ]


def fetch_cert(postcode_nospace: str, cert: str) -> bool:
    """Fetch + cache one cert's raw `data` payload (True on success / cached).

    The payload is stored at ``CACHE/<postcode_nospace>/<cert>.json``. An
    existing file short-circuits the request, which is what makes re-runs
    resumable; the file is therefore written atomically.
    """
    out = CACHE / postcode_nospace / f"{cert}.json"
    if out.exists():
        return True
    resp = _get(f"{BASE}/api/certificate", {"certificate_number": cert}, headers=H)
    if resp is None or not resp.is_success:
        return False
    body = _json_object(resp)
    if "data" not in body:
        return False
    _atomic_write_text(out, json.dumps(body["data"]))
    return True


def main() -> None:
    """Build the dense corpus: sample seeds, expand each, pull every cohort.

    Pass ``--pilot`` on the command line to build only 2 seeds. The index
    (normalised postcode -> fetched cert numbers) is rewritten after every
    seed so an interrupted run still leaves a usable `_index.json`.
    """
    pilot = "--pilot" in sys.argv
    n_seeds = 2 if pilot else N_SEEDS
    print(f"sampling {n_seeds} seed postcodes ...")
    seeds = sample_seed_postcodes(n_seeds)
    print(f"seeds: {seeds}")

    index: dict[str, list[str]] = {}
    # Every postcode already queried, including those that yielded no certs,
    # so overlapping neighbourhoods never re-query the same postcode.
    visited: set[str] = set()
    index_path = CACHE / "_index.json"
    t0 = time.time()
    total_certs = 0
    for si, seed in enumerate(seeds, 1):
        neighbourhood = nearby_postcodes(seed)
        print(f"\n[seed {si}/{len(seeds)}] {seed}: {len(neighbourhood)} postcodes "
              f"within {RADIUS_M}m")
        for pc in neighbourhood:
            nospace = _normalise_postcode(pc)
            if nospace in visited:
                continue  # neighbourhoods can overlap; fetch each postcode once
            visited.add(nospace)
            certs = cohort_cert_numbers(pc)
            fetched = [c for c in certs if fetch_cert(nospace, c)]
            if fetched:
                index[nospace] = fetched
                total_certs += len(fetched)
        print(f" cumulative: {len(index)} postcodes, {total_certs} certs")
        _atomic_write_text(index_path, json.dumps(index, indent=2))
    # Final write also covers the case where no seed was sampled at all.
    _atomic_write_text(index_path, json.dumps(index, indent=2))
    print(
        f"\nDONE in {time.time() - t0:.0f}s: {len(seeds)} seeds, "
        f"{len(index)} postcodes, {total_certs} certs under {CACHE}"
    )


if __name__ == "__main__":
    main()