Dispatch and map RdSAP-Schema-18.0 certs end-to-end 🟥

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 13:10:47 +00:00 · 2026-06-11 11:12:53 +00:00 · 2026-06-11 11:12:53 +00:00 · cfc337f04a
commit cfc337f04a
parent 362cd20f11
5 changed files with 2252 additions and 0 deletions
--- a/backend/epc_api/json_samples/RdSAP-Schema-17.1/corpus.jsonl
+++ b/backend/epc_api/json_samples/RdSAP-Schema-17.1/corpus.jsonl
--- a/backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl
+++ b/backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl
--- a/datatypes/epc/domain/tests/test_from_rdsap_schema.py
+++ b/datatypes/epc/domain/tests/test_from_rdsap_schema.py
@ -1424,3 +1424,45 @@ class TestRdSap20_0_0ReducedFieldSynthesis:
        assert len(result.sap_windows) == len(lodged)
        total_area = sum(w.window_width * w.window_height for w in result.sap_windows)
        assert total_area == pytest.approx(expected_total)
 # ---------------------------------------------------------------------------
 # RdSAP 18.0 Reduced-Field Synthesis (ADR-0028 — inherit-and-validate). 18.0 is
 # the same pre-SAP10 reduced family as 20.0.0: glazed_area *band* not window m²,
 # bath/shower *room counts* not bath counts, lighting OUTLET counts not bulbs.
 # The mapper synthesises the measured form from the cert alone (no neighbour
 # data), reusing 20.0.0's coefficients (validated against 18.0's own band-4 rich
 # certs: observed 0.223 ≈ 0.148 × 1.51). Each test name pins one assumption,
 # because a pre-SAP10 cert has no same-spec lodged figure (Validation-Cohort).
 # ---------------------------------------------------------------------------
 # Location of the harvested RdSAP-Schema-18.0 corpus (JSONL, read line by line
 # by _load_18_0_corpus). The path is anchored on this test file's directory
 # and climbs four levels before descending into backend/, rather than relying
 # on the current working directory.
 _CORPUS_18_0 = os.path.join(
    os.path.dirname(__file__),
    "../../../../backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl",
 )
 def _load_18_0_corpus() -> list[Dict[str, Any]]:
    """Return every cert in the RdSAP-Schema-18.0 corpus, or [] if the file is absent.

    Each non-blank line of the JSONL file is decoded as one JSON document;
    blank lines are ignored.
    """
    certs: list[Dict[str, Any]] = []
    if os.path.exists(_CORPUS_18_0):
        with open(_CORPUS_18_0) as corpus_file:
            for raw_line in corpus_file:
                if raw_line.strip():
                    certs.append(json.loads(raw_line))
    return certs
 class TestRdSap18_0ReducedFieldSynthesis:
    """End-to-end mapping checks for RdSAP-Schema-18.0 certificates."""

    def test_cert_dispatches_and_maps_without_missing_required_field(self) -> None:
        # Arrange — background: the placeholder 18.0 schema was generated from a
        # single example, which over-constrained its required fields (986/1000
        # corpus certs failed to parse), and `from_api_response` did not
        # dispatch RdSAP-Schema-18.0 at all. With dispatch in place and those
        # fields relaxed to optional, a real cert must map end-to-end.
        certs = _load_18_0_corpus()
        if len(certs) == 0:
            pytest.skip("no RdSAP-Schema-18.0 corpus harvested")
        first_cert = certs[0]
        # Act
        mapped = EpcPropertyDataMapper.from_api_response(first_cert)
        # Assert
        assert isinstance(mapped, EpcPropertyData)
--- a/scripts/eon/find_epc_data.py
+++ b/scripts/eon/find_epc_data.py
@ -0,0 +1,114 @@
 from __future__ import annotations
 import os
 from pathlib import Path
 from typing import Any, Optional
 import pandas as pd
 from dotenv import load_dotenv
 from datatypes.epc.domain.epc import Epc
 from datatypes.epc.domain.mapper import EpcPropertyDataMapper
 from domain.sap10_calculator.calculator import calculate_sap_from_inputs
 from domain.sap10_calculator.rdsap.cert_to_inputs import cert_to_inputs
 from infrastructure.epc_client.epc_client_service import EpcClientService
 # UPRNs to compare. Most are RdSAP 20.0.0 (pre-SAP10) certs — the ones the
 # Reduced-Field Synthesis mapper (ADR-0027) re-maps so the SAP10 calculator can
 # re-score them. The commented rows are non-20.0.0 neighbours kept for context.
 # Each row's trailing comment records the cert's schema version and address;
 # main() walks this list in order and looks up one cert per UPRN.
 UPRNS: list[int] = [
    10003318624,  # 20.0.0  Flat 1, 6 Alexandra Gardens, PO38 1EE
    10003318625,  # 20.0.0  Flat 2, 6 Alexandra Gardens, PO38 1EE
    10003318626,  # 20.0.0  Flat 3, 6 Alexandra Gardens, PO38 1EE
    # 10003318698,  # 17.1    Flat 4, 6 Alexandra Gardens, PO38 1EE
    100062430247,  # 20.0.0  Flat 5, Adelaide Court, Adelaide Place, PO33 3DG
    100062430248,  # 20.0.0  Flat 6, Adelaide Court, Adelaide Place, PO33 3DG
    100062430250,  # 20.0.0  Flat 8, Adelaide Court, Adelaide Place, PO33 3DG
    100062429797,  # 20.0.0  Flat 1, 10-11 Cross Street, PO33 2AD
    10003320577,  # 20.0.0  Flat 3, 10-11 Cross Street, PO33 2AD
    # 10003320573,  # 18.0    Flat 7, 10-11 Cross Street, PO33 2AD
    10024248769,  # 20.0.0  Flat 8, 10-11 Cross Street, PO33 2AD
    # 10024248772,  # 18.0    Flat 9, 10-11 Cross Street, PO33 2AD
 ]
 def fetch_raw_cert(service: EpcClientService, uprn: int) -> Optional[dict[str, Any]]:
    """Return the raw certificate dict of the most recently registered EPC for `uprn`.

    The RAW cert is returned (not the mapped EpcPropertyData) because the
    lodged SAP score lives there as `energy_rating_current` — the mapper does
    not carry it onto the domain object.

    Returns None when the search yields no results for the UPRN.
    """
    matches = service._search(uprn=uprn)  # pyright: ignore[reportPrivateUsage]
    if matches:
        # A UPRN can have several certs; keep the one with the greatest
        # registration_date (the first such entry wins on a tie).
        newest = max(matches, key=lambda hit: hit.registration_date)
        cert_number = newest.certificate_number
        return service._fetch_certificate(  # pyright: ignore[reportPrivateUsage]
            cert_number
        )
    return None
 def compare_sap(raw: dict[str, Any]) -> dict[str, object]:
    """Run a raw cert through our SAP10 calculator and pair the result with
    the score the surveyor lodged. For a 20.0.0 cert the calculated value is
    the counterfactual "what EPC would this get under today's spec" (ADR-0027).

    The returned row holds the address, postcode, SAP methodology version,
    lodged score/band, our score/band, and their difference (`delta`, None
    when the cert carries no lodged score).
    """
    epc = EpcPropertyDataMapper.from_api_response(raw)
    sap_inputs = cert_to_inputs(epc)
    calculated = calculate_sap_from_inputs(sap_inputs)
    # Lodged Performance: the surveyor's original SAP score comes straight off
    # the raw cert. Both bands are derived from a score the same way.
    lodged_score = raw.get("energy_rating_current")
    if lodged_score is None:
        lodged_band = "?"
    else:
        lodged_band = Epc.from_sap_score(lodged_score).value
    our_score = calculated.sap_score
    our_band = Epc.from_sap_score(our_score).value
    comparison: dict[str, object] = {
        "address": epc.address_line_1,
        "postcode": epc.postcode,
        # The SAP methodology version (RdSAP 2012 lodges 9.9x); the *schema*
        # version (20.0.0) is annotated in the UPRNS list above.
        "sap_ver": raw.get("sap_version"),
        "lodged_sap": lodged_score,
        "lodged_band": lodged_band,
        "our_sap": our_score,
        "our_band": our_band,
    }
    # Added last so the column order of the row is unchanged.
    comparison["delta"] = None if lodged_score is None else our_score - lodged_score
    return comparison
 def main() -> None:
    """Score every UPRN in UPRNS and print a lodged-vs-calculated SAP table.

    Raises:
        RuntimeError: If OPEN_EPC_API_TOKEN is not set after loading backend/.env.
    """
    # Mirror conftest.py: pull OPEN_EPC_API_TOKEN out of backend/.env so the
    # script runs standalone (`python scripts/eon/find_epc_data.py`).
    env_file = Path(__file__).resolve().parents[2] / "backend" / ".env"
    load_dotenv(env_file)
    token = os.getenv("OPEN_EPC_API_TOKEN")
    if token is None:
        raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env")
    service = EpcClientService(auth_token=token)
    scored: list[dict[str, object]] = []
    for uprn in UPRNS:
        raw = fetch_raw_cert(service, uprn)
        if raw is None:
            print(f"UPRN {uprn}: no EPC found")
            continue
        try:
            comparison = compare_sap(raw)
        except Exception as exc:  # surface, don't abort the whole sweep
            print(f"UPRN {uprn}: failed to score — {type(exc).__name__}: {exc}")
        else:
            scored.append({"uprn": uprn, **comparison})
    if scored:
        table = pd.DataFrame(scored)
        # Lift pandas' column/width limits so the whole table prints untruncated.
        with pd.option_context("display.max_columns", None, "display.width", None):
            print(table.to_string(index=False))
    else:
        print("No certs scored.")
 if __name__ == "__main__":
    main()
--- a/scripts/eon/harvest_certs.py
+++ b/scripts/eon/harvest_certs.py
@ -0,0 +1,96 @@
 """Harvest raw EPC certificates into a JSONL corpus for mapper tests.
 Source: the bulk EPC dumps in downloads/certificates-YYYY.json. Each line is
    {"certificate_number": "...", "document": "<json string>", ...}
 where ``document`` is the cert in the exact shape
 ``EpcClientService._fetch_certificate`` returns and
 ``EpcPropertyDataMapper.from_api_response`` consumes (it has ``schema_type``,
 ``roofs``, ``walls`` ... and matches the committed json_samples).
 We want a balanced sample per schema so we can build out and regression-test
 the mappers (notably the incomplete ``RdSapSchema20.0.0``). Schema version
 tracks the dump year, so we read each target schema from a year that's rich in
 it and stop once its cap is full — no need to stream whole multi-GB files.
 Year -> dominant schema (see downloads/README.txt):
    2026 -> RdSAP-Schema-21.0.1
    2021-2024 -> RdSAP-Schema-20.0.0
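 SAP-Schema-18.0.0 is a minority schema (~12% of the 2021 dump) but each year
 holds ~1.6M lines, so 2021 still yields well over 1000 — it just scans deeper
 before the cap fills. SAP-Schema-17.1 is richest in the 2019 dump (~20%).
 21.0.0 is skipped — it's effectively absent from these dumps.
 NOTE: the entries currently enabled in SOURCES target the pre-SAP10 *RdSAP*
 family instead (RdSAP-Schema-18.0 from the 2018 dump, RdSAP-Schema-17.1 from
 the 2017 dump), which is distinct from the SAP-Schema-* family named above.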
 Run cell by cell. No API token needed — this is pure local streaming.
 """
 from __future__ import annotations
 import json
 from pathlib import Path
 import pandas as pd
 # Both paths are relative, so they resolve against the current working
 # directory. NOTE(review): presumably the repository root — confirm.
 DOWNLOADS = Path("downloads")  # folder holding the bulk dump files harvest_one reads
 SAMPLES = Path("backend/epc_api/json_samples")  # parent of the per-schema folders
 # One corpus per schema, written into that schema's own json_samples folder
 # (alongside its epc.json) as corpus.jsonl. Each schema is read from a year
 # where it dominates, so we hit the cap within the first few-thousand lines.
 # Each entry is (dump filename under DOWNLOADS, schema_type to keep, cap on the
 # number of certs collected).
 SOURCES: list[tuple[str, str, int]] = [
    # ("certificates-2026.json", "RdSAP-Schema-21.0.1", 1000),
    # ("certificates-2022.json", "RdSAP-Schema-20.0.0", 1000),
    # pre-SAP10 RdSAP family — NOT the SAP-Schema-* full/design-SAP family.
    # schema_type scan: RdSAP-Schema-18.0 is ~82% of certificates-2018.json,
    # 17.1 dominant in 2017, 17.0 dominant in 2015.
    ("certificates-2018.json", "RdSAP-Schema-18.0", 1000),
    ("certificates-2017.json", "RdSAP-Schema-17.1", 1000),
    # ("certificates-2015.json", "RdSAP-Schema-17.0", 1000),
 ]
 def corpus_path(schema: str) -> Path:
    """Return the corpus.jsonl location inside `schema`'s folder under SAMPLES."""
    return SAMPLES.joinpath(schema, "corpus.jsonl")
 # %%
 def harvest_one(filename: str, schema: str, cap: int) -> list[dict[str, object]]:
    """Stream `filename`, returning up to `cap` cert docs of `schema`.

    Each line of the dump is expected to be a JSON object whose ``document``
    value is itself a JSON-encoded certificate. Lines that do not fit that
    shape (invalid JSON, no ``document`` key, a non-string ``document``, or a
    document that is not a JSON object) are counted as scanned and skipped, so
    one malformed row cannot abort the whole harvest.

    Args:
        filename: Dump file name, resolved under ``DOWNLOADS``.
        schema: ``schema_type`` value a certificate must carry to be kept.
        cap: Maximum number of certificates to return; reading stops once it
            is reached.

    Returns:
        The matching certificate dicts, in file order.

    Raises:
        OSError: If the dump file cannot be opened.
    """
    path = DOWNLOADS / filename
    docs: list[dict[str, object]] = []
    scanned = 0
    # JSON text is UTF-8; decode it explicitly instead of inheriting the
    # platform's locale encoding.
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            if len(docs) >= cap:
                break
            scanned += 1
            try:
                doc = json.loads(json.loads(line)["document"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # TypeError: the row is not a JSON object, or `document` is
                # not a string (e.g. null).
                continue
            # A document that decodes to a list/scalar has no schema_type.
            if isinstance(doc, dict) and doc.get("schema_type") == schema:
                docs.append(doc)
        print(f"{schema}: {len(docs)}/{cap} from {filename} (scanned {scanned} lines)")
    return docs
 # %%
 # Build one corpus per schema, into that schema's json_samples folder.
 # Overwrites each run — deterministic and cheap. The dump is harvested BEFORE
 # the output file is opened for writing: opening with "w" truncates it, so a
 # missing or unreadable dump must fail without emptying an existing corpus.
 for filename, schema, cap in SOURCES:
    docs = harvest_one(filename, schema, cap)
    out_path = corpus_path(schema)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w") as out:
        # One JSON document per line (JSONL).
        out.writelines(json.dumps(doc) + "\n" for doc in docs)
    print(f"wrote {out_path}")
 # %%
 # Sanity-check each corpus: report how many non-blank lines each schema's
 # corpus file holds.
 for _, schema, _ in SOURCES:
    path = corpus_path(schema)
    non_blank = [line for line in path.read_text().splitlines() if line.strip()]
    n = len(non_blank)
    print(f"{schema}: {n} ({path})")