Dispatch and map RdSAP-Schema-18.0 certs end-to-end 🟥

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jun-te Kim 2026-06-11 11:12:53 +00:00
parent 362cd20f11
commit cfc337f04a
5 changed files with 2252 additions and 0 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1424,3 +1424,45 @@ class TestRdSap20_0_0ReducedFieldSynthesis:
assert len(result.sap_windows) == len(lodged)
total_area = sum(w.window_width * w.window_height for w in result.sap_windows)
assert total_area == pytest.approx(expected_total)
# ---------------------------------------------------------------------------
# RdSAP 18.0 Reduced-Field Synthesis (ADR-0028 — inherit-and-validate). 18.0 is
# the same pre-SAP10 reduced family as 20.0.0: glazed_area *band* not window m²,
# bath/shower *room counts* not bath counts, lighting OUTLET counts not bulbs.
# The mapper synthesises the measured form from the cert alone (no neighbour
# data), reusing 20.0.0's coefficients (validated against 18.0's own band-4 rich
# certs: observed 0.223 ≈ 0.148 × 1.51). Each test name pins one assumption,
# because a pre-SAP10 cert has no same-spec lodged figure (Validation-Cohort).
# ---------------------------------------------------------------------------
# Location of the harvested RdSAP-Schema-18.0 corpus (one JSON cert per line),
# resolved relative to this test file's directory. The file may be absent on a
# checkout where the corpus has not been harvested.
_CORPUS_18_0 = os.path.join(
    os.path.dirname(__file__),
    "../../../../backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl",
)
def _load_18_0_corpus() -> list[Dict[str, Any]]:
    """Load every cert from the RdSAP-Schema-18.0 JSONL corpus.

    Returns:
        One parsed JSON document per non-blank line of ``_CORPUS_18_0``, or an
        empty list when the corpus file does not exist (callers use that to
        skip rather than fail).
    """
    if not os.path.exists(_CORPUS_18_0):
        return []
    # JSON text is UTF-8; pin the encoding so the read does not depend on the
    # platform's locale default.
    with open(_CORPUS_18_0, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
class TestRdSap18_0ReducedFieldSynthesis:
    """Mapper tests driven by the harvested RdSAP-Schema-18.0 corpus."""

    def test_cert_dispatches_and_maps_without_missing_required_field(self) -> None:
        # Arrange — the placeholder 18.0 schema was generated from a single
        # example, which over-constrained its required fields: 986/1000 corpus
        # certs failed to parse, and `from_api_response` never dispatched
        # RdSAP-Schema-18.0 at all. With dispatch wired up and required fields
        # relaxed to optional, a real cert must make it through end-to-end.
        certs = _load_18_0_corpus()
        if not certs:
            pytest.skip("no RdSAP-Schema-18.0 corpus harvested")
        first_cert = certs[0]

        # Act
        mapped = EpcPropertyDataMapper.from_api_response(first_cert)

        # Assert
        assert isinstance(mapped, EpcPropertyData)

View file

@ -0,0 +1,114 @@
from __future__ import annotations
import os
from pathlib import Path
from typing import Any, Optional
import pandas as pd
from dotenv import load_dotenv
from datatypes.epc.domain.epc import Epc
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
from domain.sap10_calculator.rdsap.cert_to_inputs import cert_to_inputs
from infrastructure.epc_client.epc_client_service import EpcClientService
# UPRNs to compare. Most are RdSAP 20.0.0 (pre-SAP10) certs — the ones the
# Reduced-Field Synthesis mapper (ADR-0027) re-maps so the SAP10 calculator can
# re-score them. The commented-out rows are non-20.0.0 neighbours (17.1 / 18.0)
# kept for context. Each trailing comment gives the cert's schema version
# followed by the property's address and postcode.
UPRNS: list[int] = [
    10003318624, # 20.0.0 Flat 1, 6 Alexandra Gardens, PO38 1EE
    10003318625, # 20.0.0 Flat 2, 6 Alexandra Gardens, PO38 1EE
    10003318626, # 20.0.0 Flat 3, 6 Alexandra Gardens, PO38 1EE
    # 10003318698, # 17.1 Flat 4, 6 Alexandra Gardens, PO38 1EE
    100062430247, # 20.0.0 Flat 5, Adelaide Court, Adelaide Place, PO33 3DG
    100062430248, # 20.0.0 Flat 6, Adelaide Court, Adelaide Place, PO33 3DG
    100062430250, # 20.0.0 Flat 8, Adelaide Court, Adelaide Place, PO33 3DG
    100062429797, # 20.0.0 Flat 1, 10-11 Cross Street, PO33 2AD
    10003320577, # 20.0.0 Flat 3, 10-11 Cross Street, PO33 2AD
    # 10003320573, # 18.0 Flat 7, 10-11 Cross Street, PO33 2AD
    10024248769, # 20.0.0 Flat 8, 10-11 Cross Street, PO33 2AD
    # 10024248772, # 18.0 Flat 9, 10-11 Cross Street, PO33 2AD
]
def fetch_raw_cert(service: EpcClientService, uprn: int) -> Optional[dict[str, Any]]:
    """Fetch the most recently registered raw certificate for a UPRN.

    The RAW cert dict is returned (not the mapped EpcPropertyData) because the
    lodged SAP score lives there as `energy_rating_current` — the mapper does
    not carry it onto the domain object.

    Returns:
        The raw certificate dict, or ``None`` when the search finds nothing
        for this UPRN.
    """
    matches = service._search(uprn=uprn)  # pyright: ignore[reportPrivateUsage]
    if matches:
        # Several certs can exist for one UPRN; keep the newest registration.
        newest = max(matches, key=lambda match: match.registration_date)
        cert_number = newest.certificate_number
        return service._fetch_certificate(  # pyright: ignore[reportPrivateUsage]
            cert_number
        )
    return None
def compare_sap(raw: dict[str, Any]) -> dict[str, object]:
    """Score a raw cert with our SAP10 calculator alongside its lodged figure.

    The raw cert is mapped to the domain object, converted to calculator
    inputs and re-scored; the outcome is lined up against the score the
    surveyor lodged. For a 20.0.0 cert the calculated value is the
    counterfactual "what EPC would this get under today's spec" (ADR-0027).

    Returns:
        A flat row with the address, postcode, SAP version, lodged and
        calculated score/band, and ``delta`` (calculated minus lodged, or
        ``None`` when the cert carries no lodged score).
    """
    epc = EpcPropertyDataMapper.from_api_response(raw)
    sap_inputs = cert_to_inputs(epc)
    result = calculate_sap_from_inputs(sap_inputs)

    # Lodged Performance: the surveyor's original SAP score comes straight off
    # the raw cert. Both bands are derived from their score the same way.
    lodged_score = raw.get("energy_rating_current")
    if lodged_score is None:
        lodged_band = "?"
    else:
        lodged_band = Epc.from_sap_score(lodged_score).value
    our_band = Epc.from_sap_score(result.sap_score).value

    row: dict[str, object] = {
        "address": epc.address_line_1,
        "postcode": epc.postcode,
        # The SAP methodology version (RdSAP 2012 lodges 9.9x); the *schema*
        # version (20.0.0) is annotated in the UPRNS list above.
        "sap_ver": raw.get("sap_version"),
        "lodged_sap": lodged_score,
        "lodged_band": lodged_band,
        "our_sap": result.sap_score,
        "our_band": our_band,
    }
    # Without a lodged score there is nothing to subtract from.
    row["delta"] = (
        None if lodged_score is None else result.sap_score - lodged_score
    )
    return row
def main() -> None:
    """Re-score every UPRN in ``UPRNS`` and print a lodged-vs-calculated table.

    A failure on one UPRN (while fetching or while scoring) is printed and the
    sweep moves on to the next UPRN.

    Raises:
        RuntimeError: if ``OPEN_EPC_API_TOKEN`` is not set after loading
            ``backend/.env``.
    """
    # Mirror conftest.py: pull OPEN_EPC_API_TOKEN out of backend/.env so the
    # script runs standalone (`python scripts/eon/find_epc_data.py`).
    repo_root = Path(__file__).resolve().parents[2]
    load_dotenv(repo_root / "backend" / ".env")
    token = os.getenv("OPEN_EPC_API_TOKEN")
    if token is None:
        raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env")
    service = EpcClientService(auth_token=token)
    rows: list[dict[str, object]] = []
    for uprn in UPRNS:
        # The fetch is guarded too: a single failed lookup must not abort the
        # remaining UPRNs any more than a scoring failure does.
        try:
            raw = fetch_raw_cert(service, uprn)
        except Exception as exc:  # surface, don't abort the whole sweep
            print(f"UPRN {uprn}: failed to fetch — {type(exc).__name__}: {exc}")
            continue
        if raw is None:
            print(f"UPRN {uprn}: no EPC found")
            continue
        try:
            rows.append({"uprn": uprn, **compare_sap(raw)})
        except Exception as exc:  # surface, don't abort the whole sweep
            print(f"UPRN {uprn}: failed to score — {type(exc).__name__}: {exc}")
    if not rows:
        print("No certs scored.")
        return
    table = pd.DataFrame(rows)
    # Lift pandas' column/width limits so the full table prints untruncated.
    with pd.option_context("display.max_columns", None, "display.width", None):
        print(table.to_string(index=False))


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,96 @@
"""Harvest raw EPC certificates into a JSONL corpus for mapper tests.
Source: the bulk EPC dumps in downloads/certificates-YYYY.json. Each line is
{"certificate_number": "...", "document": "<json string>", ...}
where ``document`` is the cert in the exact shape
``EpcClientService._fetch_certificate`` returns and
``EpcPropertyDataMapper.from_api_response`` consumes (it has ``schema_type``,
``roofs``, ``walls`` ... and matches the committed json_samples).
We want a balanced sample per schema so we can build out and regression-test
the mappers (notably the incomplete ``RdSapSchema20.0.0``). Schema version
tracks the dump year, so we read each target schema from a year that's rich in
it and stop once its cap is full — no need to stream whole multi-GB files.
Year -> dominant schema (see downloads/README.txt):
2026 -> RdSAP-Schema-21.0.1
2021-2024 -> RdSAP-Schema-20.0.0
SAP-Schema-18.0.0 is a minority schema (~12% of the 2021 dump) but each year
holds ~1.6M lines, so 2021 still yields well over 1000 — it just scans deeper
before the cap fills. SAP-Schema-17.1 is richest in the 2019 dump (~20%).
21.0.0 is skipped — it's effectively absent from these dumps.
Run cell by cell. No API token needed — this is pure local streaming.
"""
from __future__ import annotations
import json
from pathlib import Path
import pandas as pd
# Input dumps and output sample folders. Both paths are relative, so they
# resolve against the current working directory.
DOWNLOADS = Path("downloads")
SAMPLES = Path("backend/epc_api/json_samples")
# One corpus per schema, written into that schema's own json_samples folder
# (alongside its epc.json) as corpus.jsonl. Each schema is read from a year
# where it dominates, so we hit the cap within the first few-thousand lines.
# Each entry is (dump filename under DOWNLOADS, schema_type to keep, cap on the
# number of certs to keep). Commented-out entries are not harvested.
SOURCES: list[tuple[str, str, int]] = [
    # ("certificates-2026.json", "RdSAP-Schema-21.0.1", 1000),
    # ("certificates-2022.json", "RdSAP-Schema-20.0.0", 1000),
    # pre-SAP10 RdSAP family — NOT the SAP-Schema-* full/design-SAP family.
    # schema_type scan: RdSAP-Schema-18.0 is ~82% of certificates-2018.json,
    # 17.1 dominant in 2017, 17.0 dominant in 2015.
    ("certificates-2018.json", "RdSAP-Schema-18.0", 1000),
    ("certificates-2017.json", "RdSAP-Schema-17.1", 1000),
    # ("certificates-2015.json", "RdSAP-Schema-17.0", 1000),
]
def corpus_path(schema: str) -> Path:
    """Return the ``corpus.jsonl`` path inside `schema`'s json_samples folder."""
    return SAMPLES.joinpath(schema, "corpus.jsonl")
# %%
def harvest_one(filename: str, schema: str, cap: int) -> list[dict[str, object]]:
    """Stream `filename`, returning up to `cap` cert docs of `schema`.

    Each line of the dump is expected to be a JSON object whose ``document``
    value is itself a JSON string holding the cert. Lines that do not fit that
    shape (invalid JSON, no ``document`` key, a non-string ``document``, or a
    document that is not a JSON object) are skipped, not fatal.

    Args:
        filename: Dump file name, resolved under ``DOWNLOADS``.
        schema: ``schema_type`` value a cert must carry to be kept.
        cap: Maximum number of certs to return; scanning stops once reached.

    Returns:
        The matching cert documents in file order.
    """
    path = DOWNLOADS / filename
    docs: list[dict[str, object]] = []
    scanned = 0
    # JSON dumps are UTF-8; pin the encoding so the read does not depend on
    # the platform's locale default.
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            if len(docs) >= cap:
                break
            scanned += 1
            try:
                doc = json.loads(json.loads(line)["document"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # TypeError: the line is not a JSON object, or `document` is
                # not a string (e.g. null).
                continue
            # A document that parses to a list/scalar has no schema_type.
            if isinstance(doc, dict) and doc.get("schema_type") == schema:
                docs.append(doc)
    print(f"{schema}: {len(docs)}/{cap} from {filename} (scanned {scanned} lines)")
    return docs
# %%
# Build one corpus per schema, into that schema's json_samples folder.
# Overwrites each run — deterministic and cheap.
for filename, schema, cap in SOURCES:
    # Harvest BEFORE opening the output: opening with "w" truncates the file
    # immediately, so a harvest that fails (e.g. missing dump) would otherwise
    # wipe the existing corpus.
    docs = harvest_one(filename, schema, cap)
    out_path = corpus_path(schema)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as out:
        for doc in docs:
            out.write(json.dumps(doc) + "\n")
    print(f"wrote {out_path}")
# %%
# Sanity-check each corpus: count the non-blank lines written per schema.
for _, schema, _ in SOURCES:
    path = corpus_path(schema)
    n = len([row for row in path.read_text().splitlines() if row.strip()])
    print(f"{schema}: {n} ({path})")