Model/scripts/fetch_cohort2_api_jsons.py
Khalim Conn-Kowlessar 22ae6f4d77 Slice S0380.39: bulk-fetch 38 cohort-2 EPC API JSONs for cross-mapper parity
Adds scripts/fetch_cohort2_api_jsons.py (throwaway one-off) plus 38
golden fixtures under domain/sap10_calculator/rdsap/tests/fixtures/golden/
covering every cert in "sap worksheets/additional with api 2/".

Each JSON is the inner `data` payload from the gov.uk EPB
/api/certificate endpoint — the same shape EpcPropertyDataMapper
.from_api_response consumes today.

Required prerequisite for Slice B (parametrized API-path chain test
that mirrors the cohort-2 Summary-path sweep at 1e-4 vs worksheet).
Per the cross-mapper-parity primitive: API EPC and Elmhurst EPC must
produce SAP within 1e-4 of each other and of the worksheet — the SAP
cascade is the load-bearing equivalence check.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 16:40:58 +00:00

85 lines
2.5 KiB
Python

"""Throwaway one-off: bulk-fetch cohort-2 EPC API JSONs from gov.uk EPB.
Persists the inner `data` payload (as returned by EpcClientService._fetch_certificate)
to domain/sap10_calculator/rdsap/tests/fixtures/golden/<cert>.json. Skips certs
whose JSON already exists.
"""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
from typing import Any
import httpx
from dotenv import load_dotenv
# Second ancestor of this file's resolved path, i.e. the directory that
# contains the folder this script lives in.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Prepended to sys.path before the `backend.*` imports below execute —
# presumably so they resolve when this file is run directly as a script.
sys.path.insert(0, str(REPO_ROOT))
from backend.epc_client._retry import call_with_retry
from backend.epc_client.epc_client_service import EpcClientService
from backend.epc_client.exceptions import (
EpcApiError,
EpcNotFoundError,
EpcRateLimitError,
)
def _fetch_raw(token: str, cert_num: str) -> dict[str, Any]:
    """Request one certificate from the EPC API and return its ``data`` member.

    Args:
        token: Bearer token sent in the ``Authorization`` header.
        cert_num: Value sent as the ``certificate_number`` query parameter.

    Returns:
        The value stored under the ``"data"`` key of the JSON response body.

    Raises:
        EpcNotFoundError: The API answered with HTTP 404.
        EpcRateLimitError: The API answered with HTTP 429.
        EpcApiError: The API answered with any other non-success status.
    """
    request_headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
    }
    response = httpx.get(
        f"{EpcClientService.BASE_URL}/api/certificate",
        params={"certificate_number": cert_num},
        headers=request_headers,
        timeout=EpcClientService.REQUEST_TIMEOUT,
    )

    # Translate HTTP failures into the client's exception types; the specific
    # statuses are checked before the generic non-success fallback.
    status = response.status_code
    if status == 404:
        raise EpcNotFoundError(cert_num)
    if status == 429:
        raise EpcRateLimitError("Rate limited by EPC API")
    if not response.is_success:
        raise EpcApiError(f"EPC API error {status}: {response.text}")

    body = response.json()
    payload: dict[str, Any] = body["data"]
    return payload
def main() -> int:
    """Fetch a golden JSON fixture for every cohort-2 certificate lacking one.

    Each sub-directory of ``sap worksheets/additional with api 2`` is treated
    as a certificate number. For every such directory without a matching
    ``<cert>.json`` in the golden fixtures directory, the certificate is
    fetched via ``_fetch_raw`` (wrapped in ``call_with_retry``) and written
    as indented JSON. Progress and a final summary are printed to stdout.

    Returns:
        Always ``0``. Certificates the API reports as not found are listed
        in the summary rather than treated as a failure.

    Raises:
        KeyError: ``OPEN_EPC_API_TOKEN`` is not set in the environment after
            ``backend/.env`` has been loaded.
        FileNotFoundError: The source worksheet directory does not exist.
        Exception: Anything other than ``EpcNotFoundError`` raised by the
            fetch propagates unchanged and aborts the run.
    """
    load_dotenv(REPO_ROOT / "backend" / ".env")
    token = os.environ["OPEN_EPC_API_TOKEN"]
    src = REPO_ROOT / "sap worksheets" / "additional with api 2"
    dst = REPO_ROOT / "domain" / "sap10_calculator" / "rdsap" / "tests" / "fixtures" / "golden"

    # List the source first so a wrong REPO_ROOT fails here, before any
    # directory is created or any API call is made.
    cert_dirs = sorted(entry for entry in src.iterdir() if entry.is_dir())

    # write_text() does not create parent directories; make sure the target
    # exists up front instead of failing after the first successful fetch.
    dst.mkdir(parents=True, exist_ok=True)

    fetched = 0
    skipped = 0
    missing: list[str] = []
    for cd in cert_dirs:
        cert_num = cd.name
        out_path = dst / f"{cert_num}.json"
        if out_path.exists():
            print(f"skip {cert_num}")
            skipped += 1
            continue
        try:
            raw = call_with_retry(lambda: _fetch_raw(token, cert_num))
        except EpcNotFoundError:
            print(f"404 {cert_num}")
            missing.append(cert_num)
            continue
        # Write to a sibling temp file and rename it into place. An
        # interrupted run must never leave a truncated <cert>.json behind,
        # because the exists() check above would skip it on every later run.
        tmp_path = out_path.with_name(f"{out_path.name}.tmp")
        tmp_path.write_text(json.dumps(raw, indent=2), encoding="utf-8")
        tmp_path.replace(out_path)
        print(f"fetch {cert_num}")
        fetched += 1

    print(f"\nfetched={fetched} skipped={skipped} missing={len(missing)}")
    if missing:
        print("missing:")
        for c in missing:
            print(f" {c}")
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())