From 58d5b171453eee6e7dfdfcf80d5ef7d7e962ce69 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 16 Jun 2026 03:03:15 +0000
Subject: [PATCH] chore(epc-prediction): dense-corpus fetcher + cross-postcode
 geo no-go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Build a geographically DENSE postcode-clustered corpus to test cross-postcode
geo expansion (the handover's anticipated "real geo payoff"). The gov EPC API
has no area/prefix search (a partial postcode 400s; the old opendatacommunities
partial-search API is decommissioned), so neighbourhood enumeration is external:
seed K postcodes nationally, expand each via postcodes.io's nearest-postcode
endpoint into every unit within RADIUS_M, pull each one's full EPC cohort.
postcodes.io is a corpus-BUILD dependency only — the predictor stays pure. Same
on-disk layout as the scattered corpus, so load_corpus + the coords resolver
consume it unchanged.

MEASURE-FIRST RESULT — cross-postcode expansion is a NO-GO. On a 2-seed pilot
(York YO19 + Islington N51, 81 postcodes / 1558 certs, 140 SAP-10.2 targets),
pooling nearby postcodes regresses accuracy across the board:
  same-postcode  FA_MAE 9.53  wall 92%  age 72%  floor_con 85%  cylinder 91%
  cross <=0.3km  FA_MAE 13.1  wall 80%  age 61%  floor_con 82%  cylinder 79%
Even as a thin-cohort top-up it hurts (thin n=18: FA 5.24 -> 7.15). Root cause:
the postcode boundary is itself a strong homogeneity prior (a postcode is one
coherent street/development), so same-postcode neighbours beat geographically
near cross-boundary ones even when the home postcode is sparse (and it rarely
is — median same-postcode cohort here is 34). Geo-proximity helps WITHIN a
postcode (#1227) but does not survive crossing the boundary. Cross-postcode geo
closed; geo weighting stays intra-postcode. Tooling kept (reusable).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 scripts/fetch_epc_prediction_dense_corpus.py | 197 +++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 scripts/fetch_epc_prediction_dense_corpus.py

diff --git a/scripts/fetch_epc_prediction_dense_corpus.py b/scripts/fetch_epc_prediction_dense_corpus.py
new file mode 100644
index 00000000..97ed0aba
--- /dev/null
+++ b/scripts/fetch_epc_prediction_dense_corpus.py
@@ -0,0 +1,197 @@
+"""Build a *geographically dense* postcode-clustered corpus for EPC Prediction
+(cross-postcode geo expansion — follow-up to ADR-0029 / issue #1227, #1237).
+
+WHY A SECOND CORPUS
+-------------------
+`fetch_epc_prediction_corpus.py` samples *scattered* national postcodes — fine
+for intra-postcode validation, but a held-out target's true geo-neighbours (the
+adjacent postcodes on its street) are NOT in that corpus, so the cross-postcode
+geo lever (distance-weighting a cohort that spans postcode boundaries) and
+built-form-aware sizing (#1237) cannot be measured on it.
+
+This builds dense clusters instead: each of K reproducible seed postcodes is
+expanded — via postcodes.io's nearest-postcode endpoint — into the unit postcodes
+within `RADIUS_M` (at most `MAX_PER_SEED`), each with its full EPC cohort pulled.
+The result is a handful of dense neighbourhoods (a target's real neighbours ARE
+in-corpus) spread across the country (the seeds are nationally sampled, so the
+validation set stays diverse).
+
+postcodes.io is a CORPUS-BUILD dependency only (a free, public, OGL postcode
+service) — the predictor stays pure. The gov EPC API has no area/prefix search
+(a partial postcode 400s; only a full unit is accepted), which is why the
+neighbour enumeration is external.
+
+USAGE
+-----
+    PYTHONPATH=. python scripts/fetch_epc_prediction_dense_corpus.py          # full
+    PYTHONPATH=. python scripts/fetch_epc_prediction_dense_corpus.py --pilot  # 2 seeds
+
+Resumable — re-running skips cached certs. Token from `backend/.env`. Cache dir
+defaults to `/tmp/epc_prediction_dense_corpus` (separate from the scattered one),
+overridable via `EPC_PREDICTION_DENSE_CORPUS`. Layout matches the other corpus
+(`<POSTCODE_NOSPACE>/<cert>.json` + `_index.json`), so `load_corpus` and the
+coordinate resolver consume it unchanged.
+"""
+
+import json
+import os
+import random
+import sys
+import time
+from pathlib import Path
+from typing import Any, Optional
+
+import httpx
+from dotenv import load_dotenv
+
+load_dotenv("backend/.env")  # NOTE(review): relative path — presumably run from repo root
+TOKEN = os.environ["OPEN_EPC_API_TOKEN"]  # raises KeyError at import if unset
+BASE = "https://api.get-energy-performance-data.communities.gov.uk"
+H = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/json"}
+POSTCODES_IO = "https://api.postcodes.io"
+CACHE = Path(
+    os.environ.get("EPC_PREDICTION_DENSE_CORPUS", "/tmp/epc_prediction_dense_corpus")
+)
+CACHE.mkdir(parents=True, exist_ok=True)  # created at import time
+
+# Seed sampling mirrors the scattered fetch (random search pages → an unbiased
+# national postcode spread), then each seed is densified. `date_end` must be
+# strictly before today.
+WINDOW = {"date_start": "2026-01-01", "date_end": "2026-05-31"}
+TOTAL_PAGES = 7402     # page-draw upper bound; NOTE(review): confirm it matches WINDOW
+SEED_PAGES = 8         # random search pages → seed postcodes
+N_SEEDS = 25           # dense neighbourhood clusters to build
+RADIUS_M = 300         # postcodes.io nearest-postcode radius around each seed
+MAX_PER_SEED = 60      # cap unit postcodes per seed (dense urban seeds can be huge)
+random.seed(2026)      # reproducible draw
+
+
+def _get(url: str, params: dict[str, Any], headers: Optional[dict[str, str]] = None,
+         timeout: float = 20.0, tries: int = 5) -> "Optional[httpx.Response]":
+    """GET with linear backoff on transport errors, 429 and 5xx; returns the last
+    response seen, or None if every try raised. Numeric Retry-After is honoured."""
+    r, wait = None, 0.0
+    for i in range(tries):
+        time.sleep(wait)  # 0 before the first try; nothing is slept after the last
+        wait = 1.5 * (i + 1)
+        try:
+            r = httpx.get(url, params=params, headers=headers or {}, timeout=timeout)
+            if r.status_code != 429 and r.status_code < 500:
+                return r
+            wait = max(0.0, float(r.headers.get("Retry-After") or wait))
+        except (httpx.HTTPError, ValueError):  # ValueError: HTTP-date Retry-After
+            pass
+    return r
+
+
+def _normalise_postcode(postcode: str) -> str:  # e.g. "yo19 5xx" -> "YO195XX"
+    return postcode.upper().replace(" ", "")
+
+
+def sample_seed_postcodes(n_seeds: int) -> list[str]:
+    """Return up to `n_seeds` distinct postcodes, in first-seen order.
+
+    Visits SEED_PAGES randomly drawn search pages in ascending page order and
+    stops once enough postcodes are collected; a failed page is printed and
+    skipped. NOTE(review): at page_size=100 the first good page may suffice.
+    """
+    found: dict[str, None] = {}  # dict used as an insertion-ordered set
+    for page in sorted(random.sample(range(1, TOTAL_PAGES + 1), SEED_PAGES)):
+        query = {**WINDOW, "current_page": page, "page_size": 100}
+        resp = _get(f"{BASE}/api/domestic/search", query, headers=H)
+        if resp is None or not resp.is_success:
+            print(f"  seed page {page} -> {getattr(resp, 'status_code', 'ERR')}")
+            continue
+        for row in resp.json().get("data", []):
+            if row.get("postcode"):
+                found.setdefault(row["postcode"])
+        if len(found) >= n_seeds:
+            break
+    return list(found)[:n_seeds]
+
+
+def nearby_postcodes(seed: str) -> list[str]:
+    """Unit postcodes within `RADIUS_M` of `seed`, from postcodes.io's nearest-
+    postcode endpoint queried at the seed's own coordinates (at most 100 results).
+    Returns the seed first, then its neighbours, deduped on the normalised form and
+    capped at MAX_PER_SEED; `[seed]` alone if a lookup fails or has no coordinates."""
+    s = _get(f"{POSTCODES_IO}/postcodes/{seed.replace(' ', '%20')}", {})
+    if s is None or not s.is_success:
+        return [seed]
+    res: dict[str, Any] = s.json().get("result") or {}
+    lat, lon = res.get("latitude"), res.get("longitude")
+    if lat is None or lon is None:
+        return [seed]
+    r = _get(f"{POSTCODES_IO}/postcodes",
+             {"lon": lon, "lat": lat, "radius": RADIUS_M, "limit": 100})
+    if r is None or not r.is_success:
+        return [seed]
+    # The seed's spacing/case (EPC API) may differ from postcodes.io's rendering.
+    unique: dict[str, str] = {_normalise_postcode(seed): seed}
+    for pc in [str(x["postcode"]) for x in r.json().get("result") or []
+               if x.get("postcode")]:
+        unique.setdefault(_normalise_postcode(pc), pc)
+    return list(unique.values())[:MAX_PER_SEED]
+
+
+def cohort_cert_numbers(postcode: str) -> list[str]:
+    """Certificate numbers the EPC search returns for `postcode` ([] on failure).
+    NOTE(review): single request, no paging — confirm one page covers a cohort."""
+    resp = _get(f"{BASE}/api/domestic/search", {"postcode": postcode}, headers=H)
+    if resp is not None and resp.is_success:
+        rows = resp.json().get("data", [])
+        numbers = (row.get("certificateNumber") for row in rows)
+        return [number for number in numbers if number]
+    return []
+
+
+def fetch_cert(postcode_nospace: str, cert: str) -> bool:
+    """Fetch + cache (atomically) one cert's raw `data`; True on success / cached."""
+    out = CACHE / postcode_nospace / f"{cert}.json"
+    if not out.exists():
+        r = _get(f"{BASE}/api/certificate", {"certificate_number": cert}, headers=H)
+        if r is None or not r.is_success:
+            return False
+        try:
+            text = json.dumps(r.json()["data"])
+        except (KeyError, ValueError):
+            return False
+        out.parent.mkdir(parents=True, exist_ok=True)
+        (tmp := out.with_name(out.name + ".tmp")).write_text(text)
+        tmp.replace(out)  # rename last, so a killed run leaves no partial .json
+    return True
+
+
+def main() -> None:
+    """Sample seeds, expand each into its neighbourhood, cache every postcode's
+    certs, then write `_index.json` (postcode -> certs). `--pilot` = 2 seeds."""
+    n_seeds = 2 if "--pilot" in sys.argv else N_SEEDS
+    print(f"sampling {n_seeds} seed postcodes ...")
+    seeds = sample_seed_postcodes(n_seeds)
+    print(f"seeds: {seeds}")
+
+    index: dict[str, list[str]] = {}
+    attempted: set[str] = set()  # also covers postcodes with no certs (absent from index)
+    t0 = time.time()
+    total_certs = 0
+    for si, seed in enumerate(seeds, 1):
+        neighbourhood = nearby_postcodes(seed)
+        print(f"\n[seed {si}/{len(seeds)}] {seed}: {len(neighbourhood)} postcodes "
+              f"within {RADIUS_M}m")
+        for pc in neighbourhood:
+            nospace = _normalise_postcode(pc)
+            if nospace in attempted:
+                continue  # neighbourhoods can overlap; fetch each postcode once
+            attempted.add(nospace)
+            fetched = [c for c in cohort_cert_numbers(pc) if fetch_cert(nospace, c)]
+            if fetched:
+                index[nospace] = fetched
+                total_certs += len(fetched)
+        print(f"  cumulative: {len(index)} postcodes, {total_certs} certs")
+    (CACHE / "_index.json").write_text(json.dumps(index, indent=2))
+    print(f"\nDONE in {time.time() - t0:.0f}s: {len(seeds)} seeds, "
+          f"{len(index)} postcodes, {total_certs} certs under {CACHE}")
+
+
+if __name__ == "__main__":  # start the fetch only when executed as a script
+    main()