Model/harness/epc_bulk.py
Khalim Conn-Kowlessar afabfa0147 feat(modelling): sample a year from the EPC bulk export, offline-ready
fetch_epc_bulk_sample streams certificates-<year>.json out of the bulk ZIP via
range requests, keeps the first N SAP-version matches, and writes each cert's
inner document to <out>/<cert>.json for run_property_report. Stops after N, so
only the member prefix transfers, not the 15.7 GB archive (RangeFile.bytes_read
reports the true transfer vs the absolute ZIP offset). Verified on 2026: 100
SAP-10.2 certs -> report ran 81 scorable (MAE 2.03), 46 flagged, 19 raises
(11 full-SAP schema 19.1.0, 7 unmapped floor_construction 0/3, 1 missing
post_town) — real shadow-validation signal vs the curated golden 57.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 12:20:57 +00:00

96 lines
3.5 KiB
Python

"""Read the gov EPC **bulk** export without downloading the 15.7 GB archive.
The live API's bulk endpoint (`/api/files/domestic/json`) 302-redirects to a
temporary S3 ZIP holding one NDJSON member per year (`certificates-<year>.json`,
e.g. 2026 is ~559 MB compressed / ~7.6 GB uncompressed). Each NDJSON line is a
warehouse record whose per-cert payload is a *stringified* `document` field; the
parsed document is the same shape `EpcPropertyDataMapper.from_api_response`
already handles (`RdSAP-Schema-21.0.1`, `sap_building_parts`,
`energy_rating_current`, ...).
`RangeFile` exposes the S3 object as a seekable file backed by HTTP range
requests, so `zipfile` reads the central directory and streams a single member's
deflate stream — and a sampler can stop early after N records, fetching only the
compressed prefix it needs. The line-level parsing is pure and unit-tested here;
the network wiring lives in `scripts/fetch_epc_bulk_sample.py`.
"""
from __future__ import annotations
import io
import json
from typing import Any, Optional
import httpx
def parse_bulk_line(line: str) -> Optional[tuple[str, dict[str, Any]]]:
    """Turn one NDJSON line of the bulk export into `(certificate_number, document)`.

    The record's `document` field is decoded a second time when it arrives as
    a JSON string; any other value is passed through untouched. A line that is
    empty once surrounding whitespace is removed yields None.

    Raises whatever `json.loads` raises on malformed JSON, and `KeyError` when
    the record lacks `document` or `certificate_number`.
    """
    text = line.strip()
    if text == "":
        return None

    row = json.loads(text)
    payload = row["document"]
    if isinstance(payload, str):
        # The per-cert payload is itself JSON, stored as a string.
        payload = json.loads(payload)
    return row["certificate_number"], payload
def is_sap_version(document: dict[str, Any], wanted: str) -> bool:
    """Report whether `document["sap_version"]` matches `wanted`.

    The value is compared via its `str()` form because the export stores it as
    a number. A missing or None `sap_version` never matches.
    """
    found = document.get("sap_version")
    if found is None:
        return False
    return str(found) == wanted
class RangeFile(io.RawIOBase):
    """A seekable read-only file over an HTTP object that supports byte ranges
    (an S3 presigned URL).

    Each `read` issues one `Range` GET, so `zipfile` can parse the central
    directory and stream one member without downloading the whole archive.

    Attributes:
        bytes_read: Number of bytes actually transferred so far. This differs
            from `tell()`, which is the absolute offset into the object (a
            deep member sits GBs into the archive).
    """

    def __init__(self, url: str, size: int) -> None:
        """Open a range-backed view of `url`, whose total length is `size` bytes."""
        super().__init__()
        self._url = url
        self._size = size
        self._pos = 0
        self._client = httpx.Client(timeout=120)
        self.bytes_read = 0

    def seekable(self) -> bool:
        return True

    def readable(self) -> bool:
        return True

    def tell(self) -> int:
        """Return the current absolute offset into the remote object."""
        return self._pos

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        """Move the read position and return the new absolute offset.

        Seeking past the end is allowed (a later `read` returns `b""`).

        Raises:
            ValueError: if `whence` is not SEEK_SET/SEEK_CUR/SEEK_END, or the
                resulting position would be negative. The position is left
                unchanged in both cases.
        """
        if whence == io.SEEK_SET:
            target = offset
        elif whence == io.SEEK_CUR:
            target = self._pos + offset
        elif whence == io.SEEK_END:
            target = self._size + offset
        else:
            raise ValueError(f"invalid whence ({whence!r}, should be 0, 1 or 2)")
        if target < 0:
            # A negative offset would later produce a malformed Range header.
            raise ValueError(f"negative seek position {target!r}")
        self._pos = target
        return self._pos

    def read(self, size: Optional[int] = -1) -> bytes:
        """Fetch up to `size` bytes from the current position with one Range GET.

        `size` of None or a negative number means "everything up to the end of
        the object". Returns `b""` at or past the end without any request.

        Raises:
            ValueError: if the file has been closed.
            OSError: if the server returns more bytes than the requested range,
                i.e. it ignored the `Range` header.
            Whatever `raise_for_status()` raises on an HTTP error response.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if size is None or size < 0:
            size = self._size - self._pos
        if size <= 0 or self._pos >= self._size:
            return b""

        # HTTP ranges are inclusive on both ends, hence the -1.
        last = min(self._pos + size, self._size) - 1
        wanted = last - self._pos + 1
        resp = self._client.get(
            self._url, headers={"Range": f"bytes={self._pos}-{last}"}
        )
        resp.raise_for_status()
        data: bytes = resp.content
        if len(data) > wanted:
            # A server that ignores Range answers with the whole object from
            # byte 0; returning that would silently corrupt the stream.
            raise OSError(
                f"requested {wanted} bytes at offset {self._pos} but received "
                f"{len(data)}; the server appears to ignore Range requests"
            )
        self._pos += len(data)
        self.bytes_read += len(data)
        return data

    def readinto(self, b: Any) -> int:
        """Fill the writable buffer `b` from the current position.

        Returns the number of bytes stored (0 at end of object). Provided so
        the object satisfies the raw-I/O protocol used by `io.BufferedReader`.
        """
        view = memoryview(b).cast("B")
        data = self.read(len(view))
        count = len(data)
        view[:count] = data
        return count

    def close(self) -> None:
        """Close the HTTP client and mark the file closed; safe to call twice."""
        if self.closed:
            return
        try:
            # `_client` may be absent if __init__ failed before creating it
            # (close() is also invoked from __del__).
            client = getattr(self, "_client", None)
            if client is not None:
                client.close()
        finally:
            super().close()