mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
Dispatch and map RdSAP-Schema-18.0 certs end-to-end 🟥
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
362cd20f11
commit
cfc337f04a
5 changed files with 2252 additions and 0 deletions
1000
backend/epc_api/json_samples/RdSAP-Schema-17.1/corpus.jsonl
Normal file
1000
backend/epc_api/json_samples/RdSAP-Schema-17.1/corpus.jsonl
Normal file
File diff suppressed because one or more lines are too long
1000
backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl
Normal file
1000
backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl
Normal file
File diff suppressed because one or more lines are too long
|
|
@ -1424,3 +1424,45 @@ class TestRdSap20_0_0ReducedFieldSynthesis:
|
|||
assert len(result.sap_windows) == len(lodged)
|
||||
total_area = sum(w.window_width * w.window_height for w in result.sap_windows)
|
||||
assert total_area == pytest.approx(expected_total)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# RdSAP 18.0 Reduced-Field Synthesis (ADR-0028 — inherit-and-validate). 18.0 is
|
||||
# the same pre-SAP10 reduced family as 20.0.0: glazed_area *band* not window m²,
|
||||
# bath/shower *room counts* not bath counts, lighting OUTLET counts not bulbs.
|
||||
# The mapper synthesises the measured form from the cert alone (no neighbour
|
||||
# data), reusing 20.0.0's coefficients (validated against 18.0's own band-4 rich
|
||||
# certs: observed 0.223 ≈ 0.148 × 1.51). Each test name pins one assumption,
|
||||
# because a pre-SAP10 cert has no same-spec lodged figure (Validation-Cohort).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_CORPUS_18_0 = os.path.join(
|
||||
os.path.dirname(__file__),
|
||||
"../../../../backend/epc_api/json_samples/RdSAP-Schema-18.0/corpus.jsonl",
|
||||
)
|
||||
|
||||
|
||||
def _load_18_0_corpus() -> list[Dict[str, Any]]:
|
||||
if not os.path.exists(_CORPUS_18_0):
|
||||
return []
|
||||
with open(_CORPUS_18_0) as f:
|
||||
return [json.loads(line) for line in f if line.strip()]
|
||||
|
||||
|
||||
class TestRdSap18_0ReducedFieldSynthesis:
    """End-to-end mapping checks for certificates lodged as RdSAP-Schema-18.0."""

    def test_cert_dispatches_and_maps_without_missing_required_field(self) -> None:
        # Arrange — the placeholder 18.0 schema was generated from one example,
        # so 986/1000 corpus certs fail to parse (over-constrained required
        # fields), and `from_api_response` never dispatched RdSAP-Schema-18.0
        # at all. Dispatch + required→optional must let a real cert through
        # end-to-end.
        harvested = _load_18_0_corpus()
        if not harvested:
            pytest.skip("no RdSAP-Schema-18.0 corpus harvested")
        first_cert = harvested[0]

        # Act
        mapped = EpcPropertyDataMapper.from_api_response(first_cert)

        # Assert
        assert isinstance(mapped, EpcPropertyData)
|
||||
|
|
|
|||
114
scripts/eon/find_epc_data.py
Normal file
114
scripts/eon/find_epc_data.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
import pandas as pd
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from datatypes.epc.domain.epc import Epc
|
||||
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
||||
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
|
||||
from domain.sap10_calculator.rdsap.cert_to_inputs import cert_to_inputs
|
||||
from infrastructure.epc_client.epc_client_service import EpcClientService
|
||||
|
||||
# UPRNs to compare. Most are RdSAP 20.0.0 (pre-SAP10) certs — the ones the
|
||||
# Reduced-Field Synthesis mapper (ADR-0027) re-maps so the SAP10 calculator can
|
||||
# re-score them. The commented rows are non-20.0.0 neighbours kept for context.
|
||||
UPRNS: list[int] = [
    # 6 Alexandra Gardens, PO38 1EE
    10003318624,  # 20.0.0  Flat 1
    10003318625,  # 20.0.0  Flat 2
    10003318626,  # 20.0.0  Flat 3
    # 10003318698,  # 17.1    Flat 4
    # Adelaide Court, Adelaide Place, PO33 3DG
    100062430247,  # 20.0.0  Flat 5
    100062430248,  # 20.0.0  Flat 6
    100062430250,  # 20.0.0  Flat 8
    # 10-11 Cross Street, PO33 2AD
    100062429797,  # 20.0.0  Flat 1
    10003320577,  # 20.0.0  Flat 3
    # 10003320573,  # 18.0    Flat 7
    10024248769,  # 20.0.0  Flat 8
    # 10024248772,  # 18.0    Flat 9
]
|
||||
|
||||
|
||||
def fetch_raw_cert(service: EpcClientService, uprn: int) -> Optional[dict[str, Any]]:
    """Fetch the most recently registered raw certificate for a UPRN.

    The RAW cert dict is returned (not the mapped EpcPropertyData) because
    the lodged SAP score lives there as `energy_rating_current` — the mapper
    does not carry it onto the domain object.

    Returns:
        The certificate dict for the search hit with the greatest
        `registration_date`, or None when the search returns nothing.
    """
    hits = service._search(uprn=uprn)  # pyright: ignore[reportPrivateUsage]
    if hits:
        newest = max(hits, key=lambda hit: hit.registration_date)
        return service._fetch_certificate(  # pyright: ignore[reportPrivateUsage]
            newest.certificate_number
        )
    return None
|
||||
|
||||
|
||||
def compare_sap(raw: dict[str, Any]) -> dict[str, object]:
    """Score a raw cert with our SAP10 calculator beside its lodged figure.

    The cert is mapped with `EpcPropertyDataMapper.from_api_response`, turned
    into calculator inputs by `cert_to_inputs` and scored by
    `calculate_sap_from_inputs`. For a 20.0.0 cert the calculated value is the
    counterfactual "what EPC would this get under today's spec" (ADR-0027).

    Returns:
        One table row: address, postcode, SAP version, the lodged score and
        band, our score and band, and `delta` (ours minus lodged). When the
        cert carries no `energy_rating_current`, the lodged band is "?" and
        `delta` is None.
    """
    mapped = EpcPropertyDataMapper.from_api_response(raw)
    calculated = calculate_sap_from_inputs(cert_to_inputs(mapped))

    # Lodged Performance: the surveyor's original SAP score, taken straight
    # from the raw cert. Both bands are derived from a score the same way.
    surveyor_score = raw.get("energy_rating_current")
    if surveyor_score is None:
        surveyor_band = "?"
    else:
        surveyor_band = Epc.from_sap_score(surveyor_score).value
    calculated_band = Epc.from_sap_score(calculated.sap_score).value

    row: dict[str, object] = {
        "address": mapped.address_line_1,
        "postcode": mapped.postcode,
        # The SAP methodology version (RdSAP 2012 lodges 9.9x); the *schema*
        # version (20.0.0) is annotated in the UPRNS list.
        "sap_ver": raw.get("sap_version"),
        "lodged_sap": surveyor_score,
        "lodged_band": surveyor_band,
        "our_sap": calculated.sap_score,
        "our_band": calculated_band,
        "delta": None,
    }
    if surveyor_score is not None:
        row["delta"] = calculated.sap_score - surveyor_score
    return row
|
||||
|
||||
|
||||
def main() -> None:
    """Score every UPRN in `UPRNS` and print the comparison table.

    Reads OPEN_EPC_API_TOKEN from backend/.env (mirroring conftest.py) so the
    script runs standalone (`python scripts/eon/find_epc_data.py`).

    Raises:
        RuntimeError: if OPEN_EPC_API_TOKEN is not set after loading the env.
    """
    env_file = Path(__file__).resolve().parents[2] / "backend" / ".env"
    load_dotenv(env_file)

    api_token = os.getenv("OPEN_EPC_API_TOKEN")
    if api_token is None:
        raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env")
    client = EpcClientService(auth_token=api_token)

    scored: list[dict[str, object]] = []
    for uprn in UPRNS:
        cert = fetch_raw_cert(client, uprn)
        if cert is None:
            print(f"UPRN {uprn}: no EPC found")
            continue
        try:
            row = {"uprn": uprn, **compare_sap(cert)}
        except Exception as exc:  # surface, don't abort the whole sweep
            print(f"UPRN {uprn}: failed to score — {type(exc).__name__}: {exc}")
        else:
            scored.append(row)

    if scored:
        table = pd.DataFrame(scored)
        # Lift pandas' column/width limits so no column is elided.
        with pd.option_context("display.max_columns", None, "display.width", None):
            print(table.to_string(index=False))
    else:
        print("No certs scored.")


if __name__ == "__main__":
    main()
|
||||
96
scripts/eon/harvest_certs.py
Normal file
96
scripts/eon/harvest_certs.py
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
"""Harvest raw EPC certificates into a JSONL corpus for mapper tests.
|
||||
|
||||
Source: the bulk EPC dumps in downloads/certificates-YYYY.json. Each line is
|
||||
|
||||
{"certificate_number": "...", "document": "<json string>", ...}
|
||||
|
||||
where ``document`` is the cert in the exact shape
|
||||
``EpcClientService._fetch_certificate`` returns and
|
||||
``EpcPropertyDataMapper.from_api_response`` consumes (it has ``schema_type``,
|
||||
``roofs``, ``walls`` ... and matches the committed json_samples).
|
||||
|
||||
We want a balanced sample per schema so we can build out and regression-test
|
||||
the mappers (notably the incomplete ``RdSapSchema20.0.0``). Schema version
|
||||
tracks the dump year, so we read each target schema from a year that's rich in
|
||||
it and stop once its cap is full — no need to stream whole multi-GB files.
|
||||
|
||||
Year -> dominant schema (see downloads/README.txt):
|
||||
2026 -> RdSAP-Schema-21.0.1
|
||||
2021-2024 -> RdSAP-Schema-20.0.0
|
||||
|
||||
SAP-Schema-18.0.0 is a minority schema (~12% of the 2021 dump) but each year
|
||||
holds ~1.6M lines, so 2021 still yields well over 1000 — it just scans deeper
|
||||
before the cap fills. SAP-Schema-17.1 is richest in the 2019 dump (~20%).
|
||||
|
||||
21.0.0 is skipped — it's effectively absent from these dumps.
|
||||
|
||||
Run cell by cell. No API token needed — this is pure local streaming.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Both locations are relative to the current working directory.
DOWNLOADS = Path("downloads")  # bulk certificates-YYYY.json dumps
SAMPLES = Path("backend") / "epc_api" / "json_samples"  # per-schema sample folders
|
||||
|
||||
# One corpus per schema, written into that schema's own json_samples folder
|
||||
# (alongside its epc.json) as corpus.jsonl. Each schema is read from a year
|
||||
# where it dominates, so we hit the cap within the first few-thousand lines.
|
||||
# Entries are (dump filename, schema_type to keep, max certs to keep).
SOURCES: list[tuple[str, str, int]] = [
    # ("certificates-2026.json", "RdSAP-Schema-21.0.1", 1000),
    # ("certificates-2022.json", "RdSAP-Schema-20.0.0", 1000),
    # pre-SAP10 RdSAP family — NOT the SAP-Schema-* full/design-SAP family.
    # schema_type scan: RdSAP-Schema-18.0 is ~82% of certificates-2018.json,
    # 17.1 dominant in 2017, 17.0 dominant in 2015.
    ("certificates-2018.json", "RdSAP-Schema-18.0", 1000),
    ("certificates-2017.json", "RdSAP-Schema-17.1", 1000),
    # ("certificates-2015.json", "RdSAP-Schema-17.0", 1000),
]
|
||||
|
||||
|
||||
def corpus_path(schema: str) -> Path:
    """Return the corpus.jsonl path inside `schema`'s folder under SAMPLES."""
    return SAMPLES.joinpath(schema, "corpus.jsonl")
|
||||
|
||||
|
||||
# %%
|
||||
def harvest_one(filename: str, schema: str, cap: int) -> list[dict[str, object]]:
    """Stream `filename`, returning up to `cap` cert docs of `schema`.

    Each line of the dump is a JSON object whose "document" value is itself a
    JSON string holding the certificate. Lines that do not fit that shape are
    skipped rather than aborting the scan.

    Args:
        filename: Dump file name, resolved under DOWNLOADS.
        schema: Only documents whose "schema_type" equals this are kept.
        cap: Maximum number of documents to collect; reading stops once full.

    Returns:
        The matching certificate documents, in file order.
    """
    path = DOWNLOADS / filename
    docs: list[dict[str, object]] = []
    scanned = 0
    # JSON interchange is UTF-8; don't inherit the platform's locale encoding.
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            if len(docs) >= cap:
                break
            scanned += 1
            try:
                doc = json.loads(json.loads(line)["document"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # TypeError: the outer value is not an object, or "document"
                # is not a string — malformed like the other cases, so skip.
                continue
            # A document that parses to a non-object cannot carry schema_type.
            if isinstance(doc, dict) and doc.get("schema_type") == schema:
                docs.append(doc)
    print(f"{schema}: {len(docs)}/{cap} from {filename} (scanned {scanned} lines)")
    return docs
|
||||
|
||||
|
||||
# %%
|
||||
# Build one corpus per schema, into that schema's json_samples folder.
|
||||
# Overwrites each run — deterministic and cheap.
|
||||
for filename, schema, cap in SOURCES:
    # Harvest BEFORE opening the output. Opening in "w" mode truncates the
    # file, so if the harvest raised (e.g. the dump is not downloaded) an
    # already-committed corpus would otherwise be left empty.
    docs = harvest_one(filename, schema, cap)
    out_path = corpus_path(schema)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as out:
        # One JSON document per line.
        out.writelines(json.dumps(doc) + "\n" for doc in docs)
    print(f"wrote {out_path}")
|
||||
|
||||
# %%
|
||||
# Sanity-check each corpus: line count per schema.
|
||||
for _, schema, _ in SOURCES:
    path = corpus_path(schema)
    # Blank lines are not certificates, so they are left out of the count.
    n = len([line for line in path.read_text().splitlines() if line.strip()])
    print(f"{schema}: {n} ({path})")
|
||||
Loading…
Add table
Reference in a new issue