Dispatch and map RdSAP-Schema-18.0 certs end-to-end 🟥

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-30 13:10:47 +00:00 · 2026-06-11 11:12:53 +00:00 · 2026-06-11 11:12:53 +00:00 · cfc337f04a
commit cfc337f04a
parent 362cd20f11
5 changed files with 2252 additions and 0 deletions
--- a/backend/epc_api/json_samples/RdSAP-Schema-17.1/corpus.jsonl
+++ b/backend/epc_api/json_samples/RdSAP-Schema-17.1/corpus.jsonl
--- a/backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl
+++ b/backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl
--- a/datatypes/epc/domain/tests/test_from_rdsap_schema.py
+++ b/datatypes/epc/domain/tests/test_from_rdsap_schema.py
@ -1424,3 +1424,45 @@ class TestRdSap20_0_0ReducedFieldSynthesis:
        assert len(result.sap_windows) == len(lodged)
        total_area = sum(w.window_width * w.window_height for w in result.sap_windows)
        assert total_area == pytest.approx(expected_total)
+
+
+# ---------------------------------------------------------------------------
+# RdSAP 18.0 Reduced-Field Synthesis (ADR-0028 — inherit-and-validate). 18.0 is
+# the same pre-SAP10 reduced family as 20.0.0: glazed_area *band* not window m²,
+# bath/shower *room counts* not bath counts, lighting OUTLET counts not bulbs.
+# The mapper synthesises the measured form from the cert alone (no neighbour
+# data), reusing 20.0.0's coefficients (validated against 18.0's own band-4 rich
+# certs: observed 0.223 ≈ 0.148 × 1.51). Each test name pins one assumption,
+# because a pre-SAP10 cert has no same-spec lodged figure (Validation-Cohort).
+# ---------------------------------------------------------------------------
+
+_CORPUS_18_0 = os.path.join(
+    os.path.dirname(__file__),
+    "../../../../backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl",
+)
+
+
+def _load_18_0_corpus() -> list[Dict[str, Any]]:
+    """Return every cert in the 18.0 corpus, or ``[]`` if the file is absent.
+
+    The corpus is JSONL: one JSON document per line. Whitespace-only lines
+    are ignored.
+    """
+    if os.path.exists(_CORPUS_18_0):
+        with open(_CORPUS_18_0) as corpus_file:
+            non_blank = (entry for entry in corpus_file if entry.strip())
+            return list(map(json.loads, non_blank))
+    return []
+
+
+class TestRdSap18_0ReducedFieldSynthesis:
+    """End-to-end mapping checks against the harvested RdSAP-Schema-18.0 corpus."""
+
+    def test_cert_dispatches_and_maps_without_missing_required_field(self) -> None:
+        # Arrange — the placeholder 18.0 schema was generated from one example, so
+        # 986/1000 corpus certs fail to parse (over-constrained required fields),
+        # and `from_api_response` never dispatched RdSAP-Schema-18.0 at all.
+        # Dispatch + required→optional must let a real cert through end-to-end.
+        certs = _load_18_0_corpus()
+        if len(certs) == 0:
+            # Corpus file missing or empty: nothing to exercise.
+            pytest.skip("no RdSAP-Schema-18.0 corpus harvested")
+        first_cert = certs[0]
+
+        # Act
+        mapped = EpcPropertyDataMapper.from_api_response(first_cert)
+
+        # Assert
+        assert isinstance(mapped, EpcPropertyData)
--- a/scripts/eon/find_epc_data.py
+++ b/scripts/eon/find_epc_data.py
@ -0,0 +1,114 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Any, Optional
+
+import pandas as pd
+from dotenv import load_dotenv
+
+from datatypes.epc.domain.epc import Epc
+from datatypes.epc.domain.mapper import EpcPropertyDataMapper
+from domain.sap10_calculator.calculator import calculate_sap_from_inputs
+from domain.sap10_calculator.rdsap.cert_to_inputs import cert_to_inputs
+from infrastructure.epc_client.epc_client_service import EpcClientService
+
+# UPRNs to compare. Every active entry is an RdSAP 20.0.0 (pre-SAP10) cert — the
+# kind the Reduced-Field Synthesis mapper (ADR-0027) re-maps so the SAP10
+# calculator can re-score them. The commented-out rows are non-20.0.0 (17.1 /
+# 18.0) neighbours at the same addresses, kept for context.
+UPRNS: list[int] = [
+    10003318624,  # 20.0.0  Flat 1, 6 Alexandra Gardens, PO38 1EE
+    10003318625,  # 20.0.0  Flat 2, 6 Alexandra Gardens, PO38 1EE
+    10003318626,  # 20.0.0  Flat 3, 6 Alexandra Gardens, PO38 1EE
+    # 10003318698,  # 17.1    Flat 4, 6 Alexandra Gardens, PO38 1EE
+    100062430247,  # 20.0.0  Flat 5, Adelaide Court, Adelaide Place, PO33 3DG
+    100062430248,  # 20.0.0  Flat 6, Adelaide Court, Adelaide Place, PO33 3DG
+    100062430250,  # 20.0.0  Flat 8, Adelaide Court, Adelaide Place, PO33 3DG
+    100062429797,  # 20.0.0  Flat 1, 10-11 Cross Street, PO33 2AD
+    10003320577,  # 20.0.0  Flat 3, 10-11 Cross Street, PO33 2AD
+    # 10003320573,  # 18.0    Flat 7, 10-11 Cross Street, PO33 2AD
+    10024248769,  # 20.0.0  Flat 8, 10-11 Cross Street, PO33 2AD
+    # 10024248772,  # 18.0    Flat 9, 10-11 Cross Street, PO33 2AD
+]
+
+
+def fetch_raw_cert(service: EpcClientService, uprn: int) -> Optional[dict[str, Any]]:
+    """Return the most recently registered raw certificate for `uprn`.
+
+    The RAW cert dict is returned (not the mapped EpcPropertyData) because the
+    lodged SAP score is read from it as `energy_rating_current` — the mapper
+    does not carry it onto the domain object.
+
+    Returns None when the search yields no results.
+    """
+    matches = service._search(uprn=uprn)  # pyright: ignore[reportPrivateUsage]
+    if matches:
+        newest = max(matches, key=lambda hit: hit.registration_date)
+        return service._fetch_certificate(  # pyright: ignore[reportPrivateUsage]
+            newest.certificate_number
+        )
+    return None
+
+
+def compare_sap(raw: dict[str, Any]) -> dict[str, object]:
+    """Re-score a raw cert and line it up against the lodged figure.
+
+    Returns one table row: address, postcode, SAP version, the lodged
+    score/band, our score/band and `delta` (ours minus lodged). When the cert
+    has no `energy_rating_current`, the lodged band is "?" and `delta` is None.
+    For a 20.0.0 cert the calculated value is the counterfactual "what EPC
+    would this get under today's spec" (ADR-0027).
+    """
+    epc = EpcPropertyDataMapper.from_api_response(raw)
+    result = calculate_sap_from_inputs(cert_to_inputs(epc))
+
+    # Lodged Performance: the surveyor's original SAP score, read directly from
+    # the raw cert. Bands are derived from the score the same way for both sides.
+    lodged_score = raw.get("energy_rating_current")
+    has_lodged = lodged_score is not None
+    if has_lodged:
+        lodged_band = Epc.from_sap_score(lodged_score).value
+    else:
+        lodged_band = "?"
+    our_band = Epc.from_sap_score(result.sap_score).value
+
+    row: dict[str, object] = {
+        "address": epc.address_line_1,
+        "postcode": epc.postcode,
+        # The SAP methodology version (RdSAP 2012 lodges 9.9x); the *schema*
+        # version (20.0.0) is annotated in the UPRNS list above.
+        "sap_ver": raw.get("sap_version"),
+        "lodged_sap": lodged_score,
+        "lodged_band": lodged_band,
+        "our_sap": result.sap_score,
+        "our_band": our_band,
+    }
+    # Added last so the column order of the resulting table is unchanged.
+    row["delta"] = result.sap_score - lodged_score if has_lodged else None
+    return row
+
+
+def main() -> None:
+    """Fetch, re-score and tabulate every UPRN in `UPRNS`.
+
+    Loads OPEN_EPC_API_TOKEN from backend/.env, pulls the latest raw cert for
+    each UPRN, compares our SAP score with the lodged one and prints the rows
+    as a table. A failure on one UPRN (fetching or scoring) is reported and
+    skipped so the rest of the sweep still runs.
+
+    Raises:
+        RuntimeError: if OPEN_EPC_API_TOKEN is not set.
+    """
+    # Mirror conftest.py: pull OPEN_EPC_API_TOKEN out of backend/.env so the
+    # script runs standalone (`python scripts/eon/find_epc_data.py`).
+    repo_root = Path(__file__).resolve().parents[2]
+    load_dotenv(repo_root / "backend" / ".env")
+
+    token = os.getenv("OPEN_EPC_API_TOKEN")
+    if token is None:
+        raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env")
+    service = EpcClientService(auth_token=token)
+
+    rows: list[dict[str, object]] = []
+    for uprn in UPRNS:
+        # The fetch hits the network; one bad lookup must not abort the sweep
+        # any more than one bad score does.
+        try:
+            raw = fetch_raw_cert(service, uprn)
+        except Exception as exc:  # surface, don't abort the whole sweep
+            print(f"UPRN {uprn}: failed to fetch — {type(exc).__name__}: {exc}")
+            continue
+        if raw is None:
+            print(f"UPRN {uprn}: no EPC found")
+            continue
+        try:
+            rows.append({"uprn": uprn, **compare_sap(raw)})
+        except Exception as exc:  # surface, don't abort the whole sweep
+            print(f"UPRN {uprn}: failed to score — {type(exc).__name__}: {exc}")
+
+    if not rows:
+        print("No certs scored.")
+        return
+
+    table = pd.DataFrame(rows)
+    # Lift pandas' column/width limits so the whole table prints untruncated.
+    with pd.option_context("display.max_columns", None, "display.width", None):
+        print(table.to_string(index=False))
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/eon/harvest_certs.py
+++ b/scripts/eon/harvest_certs.py
@ -0,0 +1,96 @@
+"""Harvest raw EPC certificates into a JSONL corpus for mapper tests.
+
+Source: the bulk EPC dumps in downloads/certificates-YYYY.json. Each line is
+
+    {"certificate_number": "...", "document": "<json string>", ...}
+
+where ``document`` is the cert in the exact shape
+``EpcClientService._fetch_certificate`` returns and
+``EpcPropertyDataMapper.from_api_response`` consumes (it has ``schema_type``,
+``roofs``, ``walls`` ... and matches the committed json_samples).
+
+We want a balanced sample per schema so we can build out and regression-test
+the mappers (notably the incomplete ``RdSapSchema20.0.0``). Schema version
+tracks the dump year, so we read each target schema from a year that's rich in
+it and stop once its cap is full — no need to stream whole multi-GB files.
+
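+Year -> dominant schema (see downloads/README.txt):
+    2026 -> RdSAP-Schema-21.0.1
+    2021-2024 -> RdSAP-Schema-20.0.0
+    2018 -> RdSAP-Schema-18.0 (~82% of the dump)
+    2017 -> RdSAP-Schema-17.1
+    2015 -> RdSAP-Schema-17.0
+
+The full/design-SAP family (``SAP-Schema-*``) is a different family from the
+RdSAP schemas above and is not what ``SOURCES`` currently harvests. For
+reference: SAP-Schema-18.0.0 is a minority schema (~12% of the 2021 dump) but
+each year holds ~1.6M lines, so 2021 still yields well over 1000 — it just
+scans deeper before the cap fills. SAP-Schema-17.1 is richest in the 2019 dump
+(~20%).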
+
+21.0.0 is skipped — it's effectively absent from these dumps.
+
+Run cell by cell. No API token needed — this is pure local streaming.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pandas as pd
+
+# Both paths are relative, so they resolve against the current working
+# directory at run time.
+DOWNLOADS = Path("downloads")
+SAMPLES = Path("backend/epc_api/json_samples")
+
+# One corpus per schema, written into that schema's own json_samples folder
+# (alongside its epc.json) as corpus.jsonl. Each schema is read from a year
+# where it dominates, so we hit the cap within the first few-thousand lines.
+# Each entry is (dump filename under DOWNLOADS, schema_type to keep, cap on the
+# number of certs harvested).
+SOURCES: list[tuple[str, str, int]] = [
+    # ("certificates-2026.json", "RdSAP-Schema-21.0.1", 1000),
+    # ("certificates-2022.json", "RdSAP-Schema-20.0.0", 1000),
+    # pre-SAP10 RdSAP family — NOT the SAP-Schema-* full/design-SAP family.
+    # schema_type scan: RdSAP-Schema-18.0 is ~82% of certificates-2018.json,
+    # 17.1 dominant in 2017, 17.0 dominant in 2015.
+    ("certificates-2018.json", "RdSAP-Schema-18.0", 1000),
+    ("certificates-2017.json", "RdSAP-Schema-17.1", 1000),
+    # ("certificates-2015.json", "RdSAP-Schema-17.0", 1000),
+]
+
+
+def corpus_path(schema: str) -> Path:
+    """Return the ``corpus.jsonl`` path inside `schema`'s json_samples folder."""
+    schema_dir = SAMPLES / schema
+    return schema_dir / "corpus.jsonl"
+
+
+# %%
+def harvest_one(filename: str, schema: str, cap: int) -> list[dict[str, object]]:
+    """Stream `filename`, returning up to `cap` cert docs of `schema`.
+
+    Each line of the dump is expected to be a JSON object whose ``document``
+    value is itself a JSON-encoded cert. Lines that do not fit that shape
+    (invalid JSON, no ``document`` key, a non-string ``document``, or a cert
+    that is not a JSON object) are skipped rather than aborting the harvest.
+    Reading stops as soon as `cap` matching certs have been collected, and a
+    one-line summary (matches, cap, lines scanned) is printed.
+
+    Raises whatever ``Path.open`` raises (e.g. FileNotFoundError) when the
+    dump is missing from ``DOWNLOADS``.
+    """
+    path = DOWNLOADS / filename
+    docs: list[dict[str, object]] = []
+    scanned = 0
+    # JSON text is UTF-8; don't depend on the platform's default encoding.
+    with path.open(encoding="utf-8") as fh:
+        for line in fh:
+            if len(docs) >= cap:
+                break
+            scanned += 1
+            try:
+                doc = json.loads(json.loads(line)["document"])
+            except (json.JSONDecodeError, KeyError, TypeError):
+                # TypeError: the line parsed to something other than an object,
+                # or its `document` is not a string.
+                continue
+            # A document that decodes to a non-object has no schema_type.
+            if isinstance(doc, dict) and doc.get("schema_type") == schema:
+                docs.append(doc)
+    print(f"{schema}: {len(docs)}/{cap} from {filename} (scanned {scanned} lines)")
+    return docs
+
+
+# %%
+# Build one corpus per schema, into that schema's json_samples folder.
+# Overwrites each run — deterministic and cheap.
+for filename, schema, cap in SOURCES:
+    # Harvest BEFORE opening the output: opening with "w" truncates, so a
+    # missing or unreadable dump would otherwise wipe the existing corpus and
+    # then crash, leaving an empty file behind.
+    harvested = harvest_one(filename, schema, cap)
+    out_path = corpus_path(schema)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w") as out:
+        # One JSON document per line (JSONL).
+        out.writelines(json.dumps(doc) + "\n" for doc in harvested)
+    print(f"wrote {out_path}")
+
+# %%
+# Sanity-check each corpus: line count per schema.
+for _, schema, _ in SOURCES:
+    path = corpus_path(schema)
+    # Count only lines with content, so a trailing blank line is not a cert.
+    n = len([entry for entry in path.read_text().splitlines() if entry.strip()])
+    print(f"{schema}: {n} ({path})")