# Model/harness/epc_bulk.py

"""Read the gov EPC **bulk** export without downloading the 15.7 GB archive.

The live API's bulk endpoint (`/api/files/domestic/json`) 302-redirects to a
temporary S3 ZIP holding one NDJSON member per year (`certificates-<year>.json`,
e.g. 2026 is ~559 MB compressed / ~7.6 GB uncompressed). Each NDJSON line is a
warehouse record whose per-cert payload is a *stringified* `document` field; the
parsed document is the same shape `EpcPropertyDataMapper.from_api_response`
already handles (`RdSAP-Schema-21.0.1`, `sap_building_parts`,
`energy_rating_current`, ...).

`RangeFile` exposes the S3 object as a seekable file backed by HTTP range
requests, so `zipfile` reads the central directory and streams a single member's
deflate stream — and a sampler can stop early after N records, fetching only the
compressed prefix it needs. The line-level parsing is pure and unit-tested here;
the network wiring lives in `scripts/fetch_epc_bulk_sample.py`.
"""

from __future__ import annotations

import io
import json
from typing import Any, Optional

import httpx


def parse_bulk_line(line: str) -> Optional[tuple[str, dict[str, Any]]]:
    """Decode a single NDJSON line of the bulk export.

    Returns a `(certificate_number, document)` pair, or None when the line is
    empty or whitespace-only. The record's `document` value is JSON-decoded a
    second time when it arrives as a string; any other value is passed through
    as-is.

    Raises whatever `json.loads` raises on malformed JSON, and `KeyError` when
    the record lacks `document` or `certificate_number`.
    """
    payload: str = line.strip()
    if payload:
        entry: dict[str, Any] = json.loads(payload)
        # Resolve the document before touching `certificate_number`, so a bad
        # record fails on its document first.
        body: Any = entry["document"]
        if isinstance(body, str):
            body = json.loads(body)
        return entry["certificate_number"], body
    return None


def is_sap_version(document: dict[str, Any], wanted: str) -> bool:
    """Report whether `document["sap_version"]` matches `wanted`.

    The export stores the version as a number, so the comparison is made on its
    `str()` form. A missing or None `sap_version` never matches.
    """
    found: Any = document.get("sap_version")
    if found is None:
        return False
    return str(found) == wanted


class RangeFile(io.RawIOBase):
    """A seekable read-only file over an HTTP object that supports byte ranges
    (an S3 presigned URL). Each `read` issues a `Range` GET, so `zipfile` can
    parse the central directory and stream one member without downloading the
    whole archive.

    `url` is the object to fetch and `size` its total length in bytes; the size
    is taken on trust and is used to resolve `SEEK_END` and to clamp reads.
    `bytes_read` counts the bytes actually transferred so far.
    """

    def __init__(self, url: str, size: int) -> None:
        super().__init__()
        self._url = url
        self._size = size
        self._pos = 0
        self._client = httpx.Client(timeout=120)
        # Bytes actually transferred — distinct from `tell()`, which is the
        # absolute offset (a deep member sits GBs into the archive).
        self.bytes_read = 0

    def seekable(self) -> bool:
        return True

    def readable(self) -> bool:
        return True

    def tell(self) -> int:
        """Return the current absolute offset into the remote object."""
        return self._pos

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        """Move the read position and return the new absolute offset.

        No network traffic happens here. Seeking past the end is allowed (a
        later `read` then returns `b""`).

        Raises:
            ValueError: `whence` is not SEEK_SET, SEEK_CUR or SEEK_END.
            OSError: the resulting position would be negative. This matches
                real files, which `zipfile` relies on when probing for the
                end-of-central-directory record.
        """
        if whence == io.SEEK_SET:
            target = offset
        elif whence == io.SEEK_CUR:
            target = self._pos + offset
        elif whence == io.SEEK_END:
            target = self._size + offset
        else:
            raise ValueError(f"unsupported whence value: {whence!r}")
        if target < 0:
            # A negative offset would otherwise end up in the Range header.
            raise OSError(f"negative seek position {target}")
        self._pos = target
        return self._pos

    def read(self, size: Optional[int] = -1) -> bytes:
        """Fetch up to `size` bytes from the current position.

        `None` or a negative `size` reads through to the end of the object.
        Returns `b""` at or past end-of-file, or when `size` is 0, without
        issuing a request.

        Raises:
            ValueError: the file has been closed.
            OSError: the server returned more bytes than the requested range.
            httpx.HTTPStatusError: the server answered with an error status.
        """
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if size is None or size < 0:
            size = self._size - self._pos
        if size == 0 or self._pos >= self._size:
            return b""
        # HTTP ranges are inclusive on both ends, hence the `- 1`.
        end: int = min(self._pos + size, self._size) - 1
        wanted: int = end - self._pos + 1
        resp = self._client.get(self._url, headers={"Range": f"bytes={self._pos}-{end}"})
        resp.raise_for_status()
        data: bytes = resp.content
        if len(data) > wanted:
            # A server that ignores `Range` replies with the whole object;
            # handing that back would silently corrupt the stream.
            raise OSError(
                f"range request for {wanted} bytes returned {len(data)} bytes; "
                "server may not support byte ranges"
            )
        self._pos += len(data)
        self.bytes_read += len(data)
        return data

    def readinto(self, buffer: Any) -> int:
        """Fill `buffer` from the current position; return the byte count.

        Implemented on top of `read` so buffered wrappers such as
        `io.BufferedReader` can sit on top of this object.
        """
        view = memoryview(buffer).cast("B")
        data: bytes = self.read(len(view))
        count: int = len(data)
        view[:count] = data
        return count

    def close(self) -> None:
        """Close the HTTP client and mark the file closed. Safe to call twice."""
        if self.closed:
            return
        # `_client` is absent when `__init__` failed before creating it; the
        # base-class finalizer still calls `close()` in that case.
        client = getattr(self, "_client", None)
        try:
            if client is not None:
                client.close()
        finally:
            super().close()