Model/tests/harness/test_epc_bulk.py
Khalim Conn-Kowlessar cf8e5b9ec6 feat(modelling): read the gov EPC bulk export via HTTP range requests
The bulk endpoint 302-redirects to a 15.7 GB S3 ZIP with one NDJSON member per
year; each line wraps the per-cert payload in a stringified 'document' that
parses to the same RdSAP-Schema-21.0.1 shape from_api_response already handles.
parse_bulk_line unwraps a record; is_sap_version filters to SAP 10.2; RangeFile
exposes the S3 object as a seekable file so zipfile streams a single year's
member (and a sampler stops early) without downloading the whole archive.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-04 12:16:18 +00:00

43 lines
1.6 KiB
Python

"""Parse records from the gov EPC bulk export (NDJSON, stringified `document`)."""
from __future__ import annotations
import json
from harness.epc_bulk import is_sap_version, parse_bulk_line
def test_parse_bulk_line_unwraps_the_stringified_document() -> None:
    """A bulk NDJSON line yields its certificate number and decoded document."""
    # Arrange — encode the per-cert payload to a JSON string first, then embed
    # that string under `document`, so the line is JSON-inside-JSON.
    payload: dict[str, object] = {
        "schema_type": "RdSAP-Schema-21.0.1",
        "sap_version": 10.2,
        "energy_rating_current": 71,
    }
    encoded_payload: str = json.dumps(payload)
    record: dict[str, str] = {
        "certificate_number": "0000-1111-2222-3333-4444",
        "document": encoded_payload,
    }
    raw_line: str = json.dumps(record)

    # Act
    result = parse_bulk_line(raw_line)

    # Assert — a (certificate number, parsed document) pair comes back.
    assert result is not None
    number, doc = result
    assert number == "0000-1111-2222-3333-4444"
    assert doc["schema_type"] == "RdSAP-Schema-21.0.1"
    assert doc["energy_rating_current"] == 71
def test_parse_bulk_line_ignores_blank_lines() -> None:
    """Empty and whitespace-only lines produce no record."""
    # Arrange — the two blank shapes: a truly empty line and one holding only
    # whitespace plus the newline terminator.
    blank_lines: tuple[str, ...] = ("", " \n")

    # Act / Assert — each one is skipped rather than parsed.
    for blank in blank_lines:
        assert parse_bulk_line(blank) is None
def test_is_sap_version_matches_regardless_of_numeric_or_string_form() -> None:
    """`is_sap_version` accepts numeric or string `sap_version` values alike."""
    # Arrange — each case pairs a document with the expected verdict when
    # checked against "10.2": numeric match, string match, a different
    # version, and a document with no `sap_version` key at all.
    cases: tuple[tuple[dict[str, object], bool], ...] = (
        ({"sap_version": 10.2}, True),
        ({"sap_version": "10.2"}, True),
        ({"sap_version": 10.1}, False),
        ({}, False),
    )

    # Act / Assert — identity comparison pins the result to a real bool.
    for document, expected in cases:
        assert is_sap_version(document, "10.2") is expected