mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
fetch_epc_bulk_sample streams certificates-<year>.json out of the bulk ZIP via range requests, keeps the first N SAP-version matches, and writes each cert's inner document to <out>/<cert>.json for run_property_report. Stops after N, so only the member prefix transfers, not the 15.7 GB archive (RangeFile.bytes_read reports the true transfer vs the absolute ZIP offset). Verified on 2026: 100 SAP-10.2 certs -> report ran 81 scorable (MAE 2.03), 46 flagged, 19 raises (11 full-SAP schema 19.1.0, 7 unmapped floor_construction 0/3, 1 missing post_town) — real shadow-validation signal vs the curated golden 57. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
96 lines
3.5 KiB
Python
96 lines
3.5 KiB
Python
"""Read the gov EPC **bulk** export without downloading the 15.7 GB archive.

The live API's bulk endpoint (`/api/files/domestic/json`) 302-redirects to a
temporary S3 ZIP holding one NDJSON member per year (`certificates-<year>.json`,
e.g. 2026 is ~559 MB compressed / ~7.6 GB uncompressed). Each NDJSON line is a
warehouse record whose per-cert payload is a *stringified* `document` field; the
parsed document is the same shape `EpcPropertyDataMapper.from_api_response`
already handles (`RdSAP-Schema-21.0.1`, `sap_building_parts`,
`energy_rating_current`, ...).

`RangeFile` exposes the S3 object as a seekable file backed by HTTP range
requests, so `zipfile` reads the central directory and streams a single member's
deflate stream — and a sampler can stop early after N records, fetching only the
compressed prefix it needs. The line-level parsing is pure and unit-tested here;
the network wiring lives in `scripts/fetch_epc_bulk_sample.py`.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import json
|
|
from typing import Any, Optional
|
|
|
|
import httpx
|
|
|
|
|
|
def parse_bulk_line(line: str) -> Optional[tuple[str, dict[str, Any]]]:
    """Turn one NDJSON line of the bulk export into `(certificate_number, document)`.

    The record's `document` field is JSON-decoded when it arrives as a string
    and passed through unchanged otherwise. A line that is empty or contains
    only whitespace yields None.
    """
    text = line.strip()
    if not text:
        return None

    entry = json.loads(text)
    payload = entry["document"]
    # The export stores the per-cert payload as a JSON string inside the record.
    if isinstance(payload, str):
        payload = json.loads(payload)
    return entry["certificate_number"], payload
|
|
|
|
|
|
def is_sap_version(document: dict[str, Any], wanted: str) -> bool:
    """Report whether the document's `sap_version` matches `wanted`.

    The comparison is made on `str(sap_version)` because the export carries the
    version as a number. A missing or None `sap_version` never matches.
    """
    found = document.get("sap_version")
    if found is None:
        return False
    return str(found) == wanted
|
|
|
|
|
|
class RangeFile(io.RawIOBase):
    """A seekable read-only file over an HTTP object that supports byte ranges
    (an S3 presigned URL).

    Each `read` issues a `Range` GET, so `zipfile` can parse the central
    directory and stream one member without downloading the whole archive.
    """

    def __init__(self, url: str, size: int) -> None:
        """Create a file view over `url`, whose total length is `size` bytes."""
        super().__init__()
        self._url = url
        self._size = size
        self._pos = 0
        self._client = httpx.Client(timeout=120)
        # Bytes actually transferred — distinct from `tell()`, which is the
        # absolute offset (a deep member sits GBs into the archive).
        self.bytes_read = 0

    def seekable(self) -> bool:
        """Always True: seeking only moves the local offset."""
        return True

    def readable(self) -> bool:
        """Always True: the object is read-only but readable."""
        return True

    def tell(self) -> int:
        """Return the current absolute offset into the remote object."""
        return self._pos

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        """Move the offset and return the new absolute position.

        Raises:
            ValueError: if `whence` is not SEEK_SET/SEEK_CUR/SEEK_END, or the
                resulting position would be negative. The position is left
                unchanged in both cases.
        """
        if whence == io.SEEK_SET:
            target = offset
        elif whence == io.SEEK_CUR:
            target = self._pos + offset
        elif whence == io.SEEK_END:
            target = self._size + offset
        else:
            raise ValueError(f"invalid whence ({whence!r}, should be 0, 1 or 2)")
        # A negative offset would otherwise end up in a malformed Range header.
        if target < 0:
            raise ValueError(f"negative seek position {target}")
        self._pos = target
        return target

    def read(self, size: Optional[int] = -1) -> bytes:
        """Read up to `size` bytes from the current offset via one Range GET.

        `size` of None or a negative number means "everything up to the end".
        Returns `b""` at or past the end of the object, or when `size` is 0.

        Raises:
            ValueError: if the file has been closed.
            OSError: if the server did not answer with 206 Partial Content for
                a partial range (i.e. it ignored the `Range` header).
            httpx.HTTPStatusError: propagated from `raise_for_status()`.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        remaining = self._size - self._pos
        if size is None or size < 0:
            size = remaining
        if size == 0 or remaining <= 0:
            return b""

        last = min(self._pos + size, self._size) - 1
        resp = self._client.get(
            self._url, headers={"Range": f"bytes={self._pos}-{last}"}
        )
        resp.raise_for_status()
        # A non-206 success means the Range header was ignored and the body is
        # the whole object; that is only the right data when the whole object
        # was what we asked for.
        whole_object = self._pos == 0 and last == self._size - 1
        if resp.status_code != 206 and not whole_object:
            raise OSError(
                f"server ignored Range request (HTTP {resp.status_code}); "
                "expected 206 Partial Content"
            )
        # Never hand back more than was requested, even if the server over-delivers.
        data: bytes = resp.content[: last - self._pos + 1]
        self._pos += len(data)
        self.bytes_read += len(data)
        return data

    def readinto(self, buffer: Any) -> int:
        """Fill `buffer` from the current offset and return the byte count.

        Implemented on top of `read` so wrappers that rely on `readinto`
        (e.g. `io.BufferedReader`) can use this object.
        """
        view = memoryview(buffer).cast("B")
        data = self.read(len(view))
        view[: len(data)] = data
        return len(data)

    def close(self) -> None:
        """Close the HTTP client (once) and mark the file closed.

        Safe to call repeatedly and on an instance whose `__init__` did not
        get as far as creating the client.
        """
        client = getattr(self, "_client", None)
        try:
            if client is not None and not self.closed:
                client.close()
        finally:
            super().close()
|