Generalise to a 'broken schema type' gate keyed on assessment_type

Per review (Khalim): lift the full-SAP detection out of the SAP-Schema-16.x
branch into a single top-level gate in from_api_response, and key it on the
gov-API's own assessment_type declaration rather than the structural
sap_opening_types proxy.

- _is_full_sap_assessment(data): assessment_type == "SAP" — the authoritative
  SAP-vs-RdSAP classification. Verified to separate the entire fixture corpus:
  every full-SAP schema (SAP-Schema-17.x/18.x + the broken LIG 16.x) is "SAP";
  every reduced cert (RdSAP-Schema-* and reduced SAP-Schema-16.x, incl.
  sap_16_0.json) is "RdSAP". data_type / sap_opening_types agree but are derived
  shape artifacts; assessment_type is the meaning.
- A cert that is full-SAP by assessment_type but whose schema_type LABEL is not a
  recognised full-SAP schema is a *broken schema type* (label disagrees with the
  assessment). _record_broken_schema_type logs it — visible, not silently
  rerouted — so unreliable labels surface and coverage grows as new mislabels
  appear. Generalises beyond 16.x to any future mislabel.
- _from_full_sap maps it via the full-SAP 17.1 mapper (real measured openings,
  no defaulting; only `tenure` defaulted). Correctly-labelled full-SAP certs keep
  their dedicated branches (one-mapper-per-schema convention); reduced certs are
  unchanged.

Tests: broken cert routed AND recorded; correctly-labelled full-SAP not recorded.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jun-te Kim 2026-06-24 15:39:16 +00:00
parent 5e0963593a
commit e312aae95d
2 changed files with 114 additions and 38 deletions

View file

@ -1,4 +1,5 @@
import copy import copy
import logging
import re import re
from dataclasses import replace from dataclasses import replace
from datetime import date from datetime import date
@ -958,23 +959,14 @@ class EpcPropertyDataMapper:
@staticmethod @staticmethod
def _from_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData: def _from_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData:
"""Shared body for the SAP-Schema-16.x dedicated mappers. """Shared body for the SAP-Schema-16.x dedicated mappers: normalise the
reduced-field doc onto the RdSAP-17.1 shape and reuse that mapper.
The `SAP-Schema-16.x` *name* covers two structurally different certs:
* **RdSAP** (`assessment_type == "RdSAP"`) the reduced-field shape
(top-level `door_count` / `glazed_area` band, construction-code building
parts). Normalised onto RdSAP-17.1 and mapped by `from_rdsap_schema_17_1`.
* **full SAP** (`assessment_type == "SAP"`, e.g. the as-designed `LIG-*`
new-builds) the *measured* shape: `sap_opening_types` + structured
`sap_building_parts` carrying measured U-values and door/window openings,
NOT the reduced top-level count fields. These omit `door_count` because
the doors are lodged as openings, so the reduced normaliser failed loud
(`RdSapSchema17_1: missing required field 'door_count'`). They are
full-SAP certs and map via the full-SAP 17.1 mapper instead."""
if _is_full_sap_cert(data):
return EpcPropertyDataMapper._from_full_sap_schema_16_x(data)
Only the reduced-RdSAP shape reaches here. A full-SAP cert mis-lodged
under a SAP-Schema-16.x label (the `LIG-*` as-designed new-builds) is a
*broken schema type* `from_api_response` detects the full-SAP shape and
routes it to the full-SAP mapper before this label dispatch (see
`_is_full_sap_assessment`)."""
from datatypes.epc.schema.rdsap_schema_17_1 import RdSapSchema17_1 from datatypes.epc.schema.rdsap_schema_17_1 import RdSapSchema17_1
return EpcPropertyDataMapper.from_rdsap_schema_17_1( return EpcPropertyDataMapper.from_rdsap_schema_17_1(
@ -982,15 +974,12 @@ class EpcPropertyDataMapper:
) )
@staticmethod @staticmethod
def _from_full_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData: def _from_full_sap(data: Dict[str, Any]) -> EpcPropertyData:
"""Map a full-SAP cert lodged under a `SAP-Schema-16.x` version. """Map a cert whose *structure* is full-SAP (measured `sap_opening_types`)
via the full-SAP 17.1 mapper, regardless of its `schema_type` label the
Structurally a full-SAP cert (`sap_opening_types` + measured door/window/fabric come from the real measured openings, no reduced-field
`sap_building_parts`), so it parses with the full-SAP 17.1 dataclass and defaulting. The only `SapSchema17_1` field the broken-label (16.x) full-SAP
reuses `from_sap_schema_17_1` door/window/fabric come from the real shape omits is `tenure` (register metadata, no SAP effect), defaulted."""
measured openings, no reduced-field defaulting. The only `SapSchema17_1`
field the 16.x full-SAP shape omits is `tenure` (register metadata with no
SAP effect), defaulted here."""
normalised = copy.deepcopy(data) normalised = copy.deepcopy(data)
normalised.setdefault("tenure", "unknown") normalised.setdefault("tenure", "unknown")
return EpcPropertyDataMapper.from_sap_schema_17_1( return EpcPropertyDataMapper.from_sap_schema_17_1(
@ -2639,7 +2628,20 @@ class EpcPropertyDataMapper:
data = _normalize_shower_outlets(data) data = _normalize_shower_outlets(data)
data = _default_missing_post_town(data) data = _default_missing_post_town(data)
schema = data.get("schema_type", "") schema = data.get("schema_type", "")
if schema == "RdSAP-Schema-21.0.1":
# Shape over label. The `schema_type` is a LABEL that can disagree with the
# cert's STRUCTURE. A cert that is structurally full-SAP (measured
# `sap_opening_types`) is mapped by the full-SAP mapper whatever its label
# claims. When the label is NOT a recognised full-SAP schema it is a
# *broken schema type* — e.g. the LIG as-designed certs mis-lodged under
# SAP-Schema-16.x — recorded so the unreliable labels stay visible and
# coverage grows as new shapes surface. Correctly-labelled full-SAP certs
# fall through to their own dedicated branch below, keeping the
# one-mapper-per-schema convention.
if _is_full_sap_assessment(data) and not _is_full_sap_label(schema):
_record_broken_schema_type(schema, data)
mapped = EpcPropertyDataMapper._from_full_sap(data)
elif schema == "RdSAP-Schema-21.0.1":
from datatypes.epc.schema.rdsap_schema_21_0_1 import RdSapSchema21_0_1 from datatypes.epc.schema.rdsap_schema_21_0_1 import RdSapSchema21_0_1
mapped = EpcPropertyDataMapper.from_rdsap_schema_21_0_1( mapped = EpcPropertyDataMapper.from_rdsap_schema_21_0_1(
@ -3321,15 +3323,57 @@ def _derive_built_form_16x(dwelling_type: Any) -> int:
return 4 # flats / unstated form → modal built_form (SAP- and gate-neutral) return 4 # flats / unstated form → modal built_form (SAP- and gate-neutral)
def _is_full_sap_cert(data: Dict[str, Any]) -> bool: logger = logging.getLogger(__name__)
"""Whether a `SAP-Schema-16.x` doc is structurally a full-SAP cert rather than
the reduced RdSAP shape: an as-designed/full SAP assessment # The recognised full-SAP schema_type labels: a full-SAP *shape* under one of
(`assessment_type == "SAP"`) that lodges fabric as measured openings # these is correctly labelled, not a broken schema type. Anything else carrying
(`sap_opening_types`) instead of the reduced top-level count fields. Both # the full-SAP shape is a mislabel (e.g. the LIG as-designed certs lodged as
signals agree on the real corpus; requiring both avoids mis-routing a reduced # SAP-Schema-16.x).
cert that happens to carry one.""" _FULL_SAP_SCHEMA_LABELS: frozenset[str] = frozenset(
return data.get("assessment_type") == "SAP" and bool( {
data.get("sap_opening_types") "SAP-Schema-17.0",
"SAP-Schema-17.1",
"SAP-Schema-18.0.0",
"SAP-Schema-19.1.0",
}
)
def _is_full_sap_assessment(data: Dict[str, Any]) -> bool:
"""Whether a cert is a full-SAP assessment (vs reduced RdSAP), keyed on the
gov-API's own `assessment_type` declaration — the authoritative SAP-vs-RdSAP
classification, not a structural proxy. A full-SAP assessment lodges fabric
as measured openings + measured U-values; a reduced RdSAP one carries the
top-level count fields.
`assessment_type` separates the entire cert corpus cleanly every full-SAP
schema (SAP-Schema-17.x/18.x and the broken `LIG` 16.x) is `"SAP"`, every
reduced cert (RdSAP-Schema-* and the reduced SAP-Schema-16.x) is `"RdSAP"`
and it is independent of the `schema_type` LABEL, which can be broken. The
structural signals (`data_type`, `sap_opening_types`) agree with it on the
corpus but are derived shape artifacts; `assessment_type` is the meaning."""
return data.get("assessment_type") == "SAP"
def _is_full_sap_label(schema: str) -> bool:
"""Whether `schema` is a recognised full-SAP `schema_type` — so a full-SAP
*shape* under it is correctly labelled, not a broken schema type."""
return schema in _FULL_SAP_SCHEMA_LABELS
def _record_broken_schema_type(schema: str, data: Dict[str, Any]) -> None:
"""Surface a *broken schema type*: a cert whose `schema_type` label disagrees
with its structure the label is not a full-SAP schema, yet the cert is
structurally full-SAP. Logged rather than silently rerouted so the unreliable
labels and their frequency stay visible (mirrors the skipped-cohort-cert
capture), and coverage can grow as new mislabelled shapes surface. e.g. the
LIG as-designed certs lodged under SAP-Schema-16.x."""
logger.warning(
"broken schema_type %r: structurally full-SAP "
"(assessment_type=%r) — routing to the full-SAP mapper (uprn=%s)",
schema,
data.get("assessment_type"),
data.get("uprn"),
) )

View file

@ -11,6 +11,7 @@ exercise the shape variation the design decisions hinge on
""" """
import json import json
import logging
import os import os
from typing import Any, Dict from typing import Any, Dict
@ -684,9 +685,8 @@ class TestFullSapSchema16xRouting:
assert epc.dwelling_type == "Detached house" assert epc.dwelling_type == "Detached house"
def test_reduced_16_x_cert_unaffected_by_full_sap_routing(self) -> None: def test_reduced_16_x_cert_unaffected_by_full_sap_routing(self) -> None:
# Arrange — a reduced 16.2 cert (assessment_type RdSAP, no # Arrange — a reduced 16.2 cert (assessment_type RdSAP) must stay on the
# sap_opening_types) must stay on the RdSAP path, keeping its top-level # RdSAP path, keeping its top-level property_type.
# property_type.
data = load("sap_16_2.json") data = load("sap_16_2.json")
# Act # Act
@ -694,3 +694,35 @@ class TestFullSapSchema16xRouting:
# Assert # Assert
assert epc.property_type is not None assert epc.property_type is not None
def test_broken_schema_type_is_recorded(
self, caplog: pytest.LogCaptureFixture
) -> None:
# Arrange — a full-SAP cert mislabelled as SAP-Schema-16.0: the label
# disagrees with the assessment_type, so the mismatch must be surfaced
# (not silently rerouted).
data = load("sap_16_0_full.json")
# Act
with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"):
EpcPropertyDataMapper.from_api_response(data)
# Assert
assert any(
"broken schema_type" in r.message and "SAP-Schema-16.0" in r.message
for r in caplog.records
)
def test_correctly_labelled_full_sap_is_not_recorded_as_broken(
self, caplog: pytest.LogCaptureFixture
) -> None:
# Arrange — a correctly-labelled full-SAP cert: assessment_type SAP AND a
# recognised full-SAP label, so no mismatch to record.
data = load("sap_17_1.json")
# Act
with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"):
EpcPropertyDataMapper.from_api_response(data)
# Assert
assert not any("broken schema_type" in r.message for r in caplog.records)