Map RdSAP 20.0.0 certs that omit reduced fields or lodge localised text 🟩

Give over-constrained required fields optional defaults (kw_only dataclass;
which fields get a default is data-driven from corpus presence) so that the
993/1000 certs that omit sap_windows parse. Type description/dwelling_type
honestly as Union[str, DescriptionV1], because the corpus lodges them as
localised dicts in ~half the certs. The previously never-run 20.0.0 mapper
path now produces EpcPropertyData; 974/1000 corpus certs map (xpass), up from 7.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jun-te Kim 2026-06-10 14:14:18 +00:00
parent 5589a66e7c
commit 8074f4152c
3 changed files with 79 additions and 16 deletions

View file

@ -1081,7 +1081,12 @@ class EpcPropertyDataMapper:
uprn=schema.uprn,
assessment_type=schema.assessment_type,
sap_version=schema.sap_version,
dwelling_type=schema.dwelling_type,
# ADR-0027: 20.0.0 lodges dwelling_type as str OR localised dict.
dwelling_type=(
schema.dwelling_type
if isinstance(schema.dwelling_type, str)
else schema.dwelling_type.value
),
property_type=str(schema.property_type),
built_form=str(schema.built_form),
address_line_1=schema.address_line_1,

View file

@ -1131,3 +1131,49 @@ class TestApiRoofConstructionCode:
# Assert
assert result == "Pitched, sloping ceiling"
# ---------------------------------------------------------------------------
# Schema 20.0.0 — Reduced-Field Synthesis (ADR-0027)
#
# RdSAP 20.0.0 is a pre-SAP10 reduced-data schema: it records as categories or
# aggregates the measured fields the calculator needs (a glazed_area *band*, not
# window m²; bath/shower *room counts*, not bath counts). The mapper synthesises
# the measured form from the cert alone (no neighbour data). Each test name
# encodes the synthesis ASSUMPTION it pins, because a pre-SAP10 cert has no
# same-spec lodged figure to validate against (Validation-Cohort rule).
# ---------------------------------------------------------------------------
# Path to the harvested RdSAP 20.0.0 corpus, resolved relative to this test
# module. The file is read as JSON Lines (one cert object per non-blank line)
# and may be absent, in which case the loader returns an empty list.
_CORPUS_20_0_0 = os.path.join(
os.path.dirname(__file__),
"../../../../backend/epc_api/json_samples/RdSAP-Schema-20.0.0/corpus.jsonl",
)
def _load_20_0_0_corpus() -> list[Dict[str, Any]]:
    """Load the harvested RdSAP 20.0.0 corpus as decoded JSON records.

    Reads the JSON Lines file at ``_CORPUS_20_0_0`` and decodes every
    non-blank line with ``json.loads``.

    Returns:
        One decoded object per non-blank line, or ``[]`` when the corpus
        file does not exist, so callers can skip rather than fail.
    """
    if not os.path.exists(_CORPUS_20_0_0):
        return []
    # Pin UTF-8: without an explicit encoding, open() falls back to the
    # platform locale, which can mis-decode (or reject) non-ASCII text in
    # the corpus on machines whose default is not UTF-8.
    with open(_CORPUS_20_0_0, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
class TestRdSap20_0_0ReducedFieldSynthesis:
    """Corpus-driven checks that reduced-field RdSAP 20.0.0 certs map cleanly."""

    def test_cert_omitting_sap_windows_maps_without_missing_required_field(
        self,
    ) -> None:
        """The first corpus cert lacking `sap_windows` maps to EpcPropertyData."""
        # Arrange — 993/1000 corpus certs omit `sap_windows` entirely; the
        # placeholder schema declared it required, so every one failed to parse.
        # Required→optional (default []) must let them through.
        certs = _load_20_0_0_corpus()
        if not certs:
            pytest.skip("no RdSAP-Schema-20.0.0 corpus harvested")
        for candidate in certs:
            if "sap_windows" not in candidate:
                break
        else:
            # Loop ran to completion: every cert carries the key.
            pytest.skip("no corpus cert omits sap_windows")

        # Act
        mapped = EpcPropertyDataMapper.from_api_response(candidate)

        # Assert
        assert isinstance(mapped, EpcPropertyData)

View file

@ -1,20 +1,24 @@
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import List, Optional, Union
from .common import Measurement
from .common import DescriptionV1, Measurement
@dataclass
class EnergyElement:
    """A described building element with its two efficiency ratings.

    Used for schema entries such as ``roofs``, ``main_heating`` and
    ``secondary_heating``.
    """

    # ADR-0027: the corpus lodges description as EITHER a plain str OR a
    # localised {value,language} dict (DescriptionV1) — not str-only as a
    # one-example placeholder assumed. Union so _coerce builds the right one.
    description: Union[str, DescriptionV1]
    energy_efficiency_rating: int
    environmental_efficiency_rating: int
@dataclass
class Addendum:
    """Addendum block of a cert: optional numbers list plus optional flags."""

    # ADR-0027: an addendum block can lodge only stone_walls/system_build flags
    # with no numbers list → optional. default_factory gives every instance
    # its own empty list instead of a shared mutable default.
    addendum_numbers: List[int] = field(default_factory=list)
    stone_walls: Optional[str] = None
    system_build: Optional[str] = None
@ -134,7 +138,8 @@ class SapBuildingPart:
party_wall_construction: Union[int, str]
wall_thickness_measured: str
roof_insulation_location: Union[int, str]
roof_insulation_thickness: Union[str, int]
# ADR-0027: absent on 254/1506 building parts (flat-roof / no-loft) → optional.
roof_insulation_thickness: Optional[Union[str, int]] = None
sap_room_in_roof: Optional[SapRoomInRoof] = None
wall_thickness: Optional[int] = None
wall_insulation_thickness: Optional[str] = None
@ -194,7 +199,12 @@ class RenewableHeatIncentive:
impact_of_solid_wall_insulation: Optional[int] = None
@dataclass
# ADR-0027: 20.0.0 is a reduced-data schema generated from a single example, so
# it over-constrains — fields the corpus routinely omits were declared required,
# failing 993/1000 certs at parse. Required→optional is data-driven (any field
# present in <100% of the corpus gets a default); `kw_only=True` lifts the
# dataclass non-default-after-default ordering rule so defaults can sit inline.
@dataclass(kw_only=True)
class RdSapSchema20_0_0:
uprn: int
roofs: List[EnergyElement]
@ -214,13 +224,14 @@ class RdSapSchema20_0_0:
report_type: int
sap_heating: SapHeating
sap_version: float
sap_windows: List[SapWindow]
# ADR-0027: 993/1000 omit this; synthesised by Reduced-Field Synthesis.
sap_windows: List[SapWindow] = field(default_factory=list)
schema_type: str
uprn_source: str
country_code: str
main_heating: List[EnergyElement]
# dwelling_type is a plain string in schema 20.0.0 onwards
dwelling_type: str
# ADR-0027: mixed str / localised-dict in the corpus (see EnergyElement).
dwelling_type: Union[str, DescriptionV1]
language_code: int
property_type: int
address_line_1: str
@ -236,7 +247,7 @@ class RdSapSchema20_0_0:
registration_date: str
sap_energy_source: SapEnergySource
secondary_heating: EnergyElement
lzc_energy_sources: List[int]
lzc_energy_sources: List[int] = field(default_factory=list)
sap_building_parts: List[SapBuildingPart]
low_energy_lighting: int
solar_water_heating: str
@ -252,24 +263,25 @@ class RdSapSchema20_0_0:
open_fireplaces_count: int
heating_cost_potential: float
hot_water_cost_current: float
insulated_door_u_value: float
insulated_door_u_value: Optional[float] = None
mechanical_ventilation: int
percent_draughtproofed: int
suggested_improvements: List[SuggestedImprovement]
suggested_improvements: List[SuggestedImprovement] = field(default_factory=list)
co2_emissions_potential: float
energy_rating_potential: int
lighting_cost_potential: float
schema_version_original: str
hot_water_cost_potential: float
renewable_heat_incentive: RenewableHeatIncentive
windows_transmission_details: WindowsTransmissionDetails
# ADR-0027: cert-level U/g present in 687/1000; Table-24 default otherwise.
windows_transmission_details: Optional[WindowsTransmissionDetails] = None
energy_consumption_current: int
multiple_glazed_proportion: int
calculation_software_version: str
energy_consumption_potential: int
environmental_impact_current: int
fixed_lighting_outlets_count: int
multiple_glazed_proportion_nr: Optional[str]
multiple_glazed_proportion_nr: Optional[str] = None
current_energy_efficiency_band: str
environmental_impact_potential: int
potential_energy_efficiency_band: str