mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
Dispatch and map RdSAP-Schema-18.0 certs end-to-end 🟥
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
362cd20f11
commit
cfc337f04a
5 changed files with 2252 additions and 0 deletions
1000
backend/epc_api/json_samples/RdSAP-Schema-17.1/corpus.jsonl
Normal file
1000
backend/epc_api/json_samples/RdSAP-Schema-17.1/corpus.jsonl
Normal file
File diff suppressed because one or more lines are too long
1000
backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl
Normal file
1000
backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -1424,3 +1424,45 @@ class TestRdSap20_0_0ReducedFieldSynthesis:
|
||||||
assert len(result.sap_windows) == len(lodged)
|
assert len(result.sap_windows) == len(lodged)
|
||||||
total_area = sum(w.window_width * w.window_height for w in result.sap_windows)
|
total_area = sum(w.window_width * w.window_height for w in result.sap_windows)
|
||||||
assert total_area == pytest.approx(expected_total)
|
assert total_area == pytest.approx(expected_total)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# RdSAP 18.0 Reduced-Field Synthesis (ADR-0028 — inherit-and-validate). 18.0 is
|
||||||
|
# the same pre-SAP10 reduced family as 20.0.0: glazed_area *band* not window m²,
|
||||||
|
# bath/shower *room counts* not bath counts, lighting OUTLET counts not bulbs.
|
||||||
|
# The mapper synthesises the measured form from the cert alone (no neighbour
|
||||||
|
# data), reusing 20.0.0's coefficients (validated against 18.0's own band-4 rich
|
||||||
|
# certs: observed 0.223 ≈ 0.148 × 1.51). Each test name pins one assumption,
|
||||||
|
# because a pre-SAP10 cert has no same-spec lodged figure (Validation-Cohort).
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Harvested RdSAP-Schema-18.0 corpus (JSON Lines), located relative to this
# test module so the lookup does not depend on the working directory.
_CORPUS_18_0 = os.path.join(
    os.path.dirname(__file__),
    "../../../../backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl",
)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_18_0_corpus() -> list[Dict[str, Any]]:
    """Return every cert in the RdSAP-Schema-18.0 corpus file.

    The corpus is JSON Lines: each non-blank line is decoded as one cert.
    An empty list is returned when the corpus file does not exist, so callers
    can skip rather than fail.
    """
    if os.path.exists(_CORPUS_18_0):
        certs: list[Dict[str, Any]] = []
        with open(_CORPUS_18_0) as handle:
            for raw_line in handle:
                # Blank / whitespace-only lines carry no cert.
                if raw_line.strip():
                    certs.append(json.loads(raw_line))
        return certs
    return []
|
||||||
|
|
||||||
|
|
||||||
|
class TestRdSap18_0ReducedFieldSynthesis:
    """End-to-end mapping checks against the harvested RdSAP-Schema-18.0 corpus."""

    def test_cert_dispatches_and_maps_without_missing_required_field(self) -> None:
        # Arrange — the placeholder 18.0 schema was generated from a single
        # example, so its over-constrained required fields reject 986/1000
        # corpus certs, and `from_api_response` never dispatched
        # RdSAP-Schema-18.0 at all. With dispatch added and required fields
        # relaxed to optional, a real cert must map end-to-end.
        harvested = _load_18_0_corpus()
        if not harvested:
            pytest.skip("no RdSAP-Schema-18.0 corpus harvested")
        first_cert = harvested[0]

        # Act
        mapped = EpcPropertyDataMapper.from_api_response(first_cert)

        # Assert
        assert isinstance(mapped, EpcPropertyData)
|
||||||
|
|
|
||||||
114
scripts/eon/find_epc_data.py
Normal file
114
scripts/eon/find_epc_data.py
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
from datatypes.epc.domain.epc import Epc
|
||||||
|
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
||||||
|
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
|
||||||
|
from domain.sap10_calculator.rdsap.cert_to_inputs import cert_to_inputs
|
||||||
|
from infrastructure.epc_client.epc_client_service import EpcClientService
|
||||||
|
|
||||||
|
# UPRNs to compare. Most are RdSAP 20.0.0 (pre-SAP10) certs — the ones the
|
||||||
|
# Reduced-Field Synthesis mapper (ADR-0027) re-maps so the SAP10 calculator can
|
||||||
|
# re-score them. The commented rows are non-20.0.0 neighbours kept for context.
|
||||||
|
UPRNS: list[int] = [
    # 6 Alexandra Gardens, PO38 1EE
    10003318624,  # 20.0.0  Flat 1
    10003318625,  # 20.0.0  Flat 2
    10003318626,  # 20.0.0  Flat 3
    # 10003318698,  # 17.1  Flat 4 (context only)
    # Adelaide Court, Adelaide Place, PO33 3DG
    100062430247,  # 20.0.0  Flat 5
    100062430248,  # 20.0.0  Flat 6
    100062430250,  # 20.0.0  Flat 8
    # 10-11 Cross Street, PO33 2AD
    100062429797,  # 20.0.0  Flat 1
    10003320577,  # 20.0.0  Flat 3
    # 10003320573,  # 18.0  Flat 7 (context only)
    10024248769,  # 20.0.0  Flat 8
    # 10024248772,  # 18.0  Flat 9 (context only)
]
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_raw_cert(service: EpcClientService, uprn: int) -> Optional[dict[str, Any]]:
    """Return the raw certificate with the latest registration date for ``uprn``.

    The RAW cert is fetched (rather than the mapped EpcPropertyData) because
    the lodged SAP score lives on it as `energy_rating_current`, which the
    mapper does not carry onto the domain object.

    Returns ``None`` when the search for the UPRN yields nothing.
    """
    matches = service._search(uprn=uprn)  # pyright: ignore[reportPrivateUsage]
    if matches:
        # Several certs can exist for one UPRN; keep the newest registration.
        newest = max(matches, key=lambda match: match.registration_date)
        return service._fetch_certificate(  # pyright: ignore[reportPrivateUsage]
            newest.certificate_number
        )
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def compare_sap(raw: dict[str, Any]) -> dict[str, object]:
    """Re-score a raw cert with our SAP10 calculator and pair it with the lodged score.

    The raw cert is mapped, converted to calculator inputs and scored; the
    lodged figure is read from ``energy_rating_current`` on the raw cert. For
    a 20.0.0 cert the calculated value is the counterfactual "what EPC would
    this get under today's spec" (ADR-0027).

    Returns one table row: address, postcode, SAP version, lodged and
    calculated score/band, and their difference. When the cert has no lodged
    score the lodged band is "?" and the delta is ``None``.
    """
    property_data = EpcPropertyDataMapper.from_api_response(raw)
    rescored = calculate_sap_from_inputs(cert_to_inputs(property_data))

    # Lodged Performance comes straight off the raw cert; both bands are
    # derived from their score by the same Epc.from_sap_score lookup.
    lodged_score = raw.get("energy_rating_current")
    if lodged_score is None:
        lodged_band = "?"
    else:
        lodged_band = Epc.from_sap_score(lodged_score).value
    our_band = Epc.from_sap_score(rescored.sap_score).value

    row: dict[str, object] = {}
    row["address"] = property_data.address_line_1
    row["postcode"] = property_data.postcode
    # This is the SAP methodology version (RdSAP 2012 lodges 9.9x); the
    # *schema* version (20.0.0) is annotated next to each entry in UPRNS.
    row["sap_ver"] = raw.get("sap_version")
    row["lodged_sap"] = lodged_score
    row["lodged_band"] = lodged_band
    row["our_sap"] = rescored.sap_score
    row["our_band"] = our_band
    if lodged_score is None:
        row["delta"] = None
    else:
        row["delta"] = rescored.sap_score - lodged_score
    return row
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Score every UPRN in UPRNS and print lodged vs. calculated SAP as a table.

    Raises:
        RuntimeError: if OPEN_EPC_API_TOKEN is not set after loading the .env.
    """
    # Mirror conftest.py: read OPEN_EPC_API_TOKEN from backend/.env so the
    # script runs standalone (`python scripts/eon/find_epc_data.py`).
    env_file = Path(__file__).resolve().parents[2] / "backend" / ".env"
    load_dotenv(env_file)

    token = os.getenv("OPEN_EPC_API_TOKEN")
    if token is None:
        raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env")
    service = EpcClientService(auth_token=token)

    scored: list[dict[str, object]] = []
    for uprn in UPRNS:
        raw = fetch_raw_cert(service, uprn)
        if raw is None:
            print(f"UPRN {uprn}: no EPC found")
            continue
        try:
            comparison = compare_sap(raw)
            scored.append({"uprn": uprn, **comparison})
        except Exception as exc:  # report the failure and keep sweeping
            print(f"UPRN {uprn}: failed to score — {type(exc).__name__}: {exc}")

    if scored:
        table = pd.DataFrame(scored)
        # Lift pandas' column/width limits so the table prints untruncated.
        with pd.option_context("display.max_columns", None, "display.width", None):
            print(table.to_string(index=False))
    else:
        print("No certs scored.")


if __name__ == "__main__":
    main()
|
||||||
96
scripts/eon/harvest_certs.py
Normal file
96
scripts/eon/harvest_certs.py
Normal file
|
|
@ -0,0 +1,96 @@
|
||||||
|
"""Harvest raw EPC certificates into a JSONL corpus for mapper tests.
|
||||||
|
|
||||||
|
Source: the bulk EPC dumps in downloads/certificates-YYYY.json. Each line is
|
||||||
|
|
||||||
|
{"certificate_number": "...", "document": "<json string>", ...}
|
||||||
|
|
||||||
|
where ``document`` is the cert in the exact shape
|
||||||
|
``EpcClientService._fetch_certificate`` returns and
|
||||||
|
``EpcPropertyDataMapper.from_api_response`` consumes (it has ``schema_type``,
|
||||||
|
``roofs``, ``walls`` ... and matches the committed json_samples).
|
||||||
|
|
||||||
|
We want a balanced sample per schema so we can build out and regression-test
|
||||||
|
the mappers (notably the incomplete ``RdSapSchema20.0.0``). Schema version
|
||||||
|
tracks the dump year, so we read each target schema from a year that's rich in
|
||||||
|
it and stop once its cap is full — no need to stream whole multi-GB files.
|
||||||
|
|
||||||
|
Year -> dominant schema (see downloads/README.txt):
|
||||||
|
2026 -> RdSAP-Schema-21.0.1
|
||||||
|
2021-2024 -> RdSAP-Schema-20.0.0
2018 -> RdSAP-Schema-18.0
2017 -> RdSAP-Schema-17.1
|
||||||
|
|
||||||
|
SAP-Schema-18.0.0 is a minority schema (~12% of the 2021 dump) but each year
|
||||||
|
holds ~1.6M lines, so 2021 still yields well over 1000 — it just scans deeper
|
||||||
|
before the cap fills. SAP-Schema-17.1 is richest in the 2019 dump (~20%).
|
||||||
|
|
||||||
|
21.0.0 is skipped — it's effectively absent from these dumps.
|
||||||
|
|
||||||
|
Run cell by cell. No API token needed — this is pure local streaming.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Bulk dumps are read from DOWNLOADS; corpora are written under SAMPLES.
# Both are relative paths, resolved against the current working directory.
DOWNLOADS = Path("downloads")
SAMPLES = Path("backend/epc_api/json_samples")

# (dump filename, schema_type to keep, cap) per corpus. Each corpus lands in
# its schema's own json_samples folder (alongside its epc.json) as
# corpus.jsonl. A schema is read from a year where it dominates, so the cap
# fills within the first few-thousand lines.
SOURCES: list[tuple[str, str, int]] = [
    # ("certificates-2026.json", "RdSAP-Schema-21.0.1", 1000),
    # ("certificates-2022.json", "RdSAP-Schema-20.0.0", 1000),
    # pre-SAP10 RdSAP family — NOT the SAP-Schema-* full/design-SAP family.
    # schema_type scan: RdSAP-Schema-18.0 is ~82% of certificates-2018.json,
    # 17.1 dominant in 2017, 17.0 dominant in 2015.
    ("certificates-2018.json", "RdSAP-Schema-18.0", 1000),
    ("certificates-2017.json", "RdSAP-Schema-17.1", 1000),
    # ("certificates-2015.json", "RdSAP-Schema-17.0", 1000),
]
|
||||||
|
|
||||||
|
|
||||||
|
def corpus_path(schema: str) -> Path:
    """Return the corpus.jsonl path inside `schema`'s folder under SAMPLES."""
    return SAMPLES.joinpath(schema, "corpus.jsonl")
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
def harvest_one(filename: str, schema: str, cap: int) -> list[dict[str, object]]:
    """Stream `filename`, returning up to `cap` cert docs of `schema`.

    Each line of the dump is expected to be a JSON object whose ``document``
    value is itself a JSON string holding the cert. Lines that do not fit that
    shape are skipped, so one malformed record cannot abort a long scan.

    Args:
        filename: Dump file name, resolved against DOWNLOADS.
        schema: Only docs whose ``schema_type`` equals this value are kept.
        cap: Maximum number of docs to return; scanning stops once reached.

    Returns:
        The matching cert docs, in file order.

    Raises:
        OSError: if the dump file cannot be opened.
    """
    path = DOWNLOADS / filename
    docs: list[dict[str, object]] = []
    scanned = 0
    # JSON interchange text is UTF-8; do not depend on the platform locale.
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            if len(docs) >= cap:
                break
            scanned += 1
            try:
                doc = json.loads(json.loads(line)["document"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # TypeError covers a line that is valid JSON but not an
                # object, and a `document` that is null or not a string.
                continue
            # A document can decode to a list or scalar; only dicts are certs.
            if isinstance(doc, dict) and doc.get("schema_type") == schema:
                docs.append(doc)
    print(f"{schema}: {len(docs)}/{cap} from {filename} (scanned {scanned} lines)")
    return docs
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Build one corpus per schema, into that schema's json_samples folder.
|
||||||
|
# Overwrites each run — deterministic and cheap.
|
||||||
|
for filename, schema, cap in SOURCES:
    # Harvest BEFORE opening the output: mode "w" truncates on open, so a
    # missing dump or a failed scan would otherwise wipe the existing corpus.
    docs = harvest_one(filename, schema, cap)
    out_path = corpus_path(schema)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as out:
        # One cert per line (JSON Lines).
        out.writelines(json.dumps(doc) + "\n" for doc in docs)
    print(f"wrote {out_path}")
|
||||||
|
|
||||||
|
# %%
|
||||||
|
# Sanity-check each corpus: line count per schema.
|
||||||
|
for _, schema, _ in SOURCES:
    path = corpus_path(schema)
    # Count non-blank lines only; each one is a harvested cert.
    n = len([line for line in path.read_text().splitlines() if line.strip()])
    print(f"{schema}: {n} ({path})")
|
||||||
Loading…
Add table
Reference in a new issue