Generalise to a 'broken schema type' gate keyed on assessment_type

Per review (Khalim): lift the full-SAP detection out of the SAP-Schema-16.x
branch into a single top-level gate in from_api_response, and key it on the
gov-API's own assessment_type declaration rather than the structural
sap_opening_types proxy.

- _is_full_sap_assessment(data): assessment_type == "SAP" — the authoritative
  SAP-vs-RdSAP classification. Verified to separate the entire fixture corpus:
  every full-SAP schema (SAP-Schema-17.x/18.x + the broken LIG 16.x) is "SAP";
  every reduced cert (RdSAP-Schema-* and reduced SAP-Schema-16.x, incl.
  sap_16_0.json) is "RdSAP". data_type / sap_opening_types agree but are derived
  shape artifacts; assessment_type is the meaning.
- A cert that is full-SAP by assessment_type but whose schema_type LABEL is not a
  recognised full-SAP schema is a *broken schema type* (label disagrees with the
  assessment). _record_broken_schema_type logs it — visible, not silently
  rerouted — so unreliable labels surface and coverage grows as new mislabels
  appear. Generalises beyond 16.x to any future mislabel.
- _from_full_sap maps it via the full-SAP 17.1 mapper (door/window/fabric come
  from the real measured openings, with no reduced-field defaulting; `tenure` is
  the only field defaulted). Correctly-labelled full-SAP certs keep their
  dedicated branches (one-mapper-per-schema convention); reduced certs are
  unchanged.

Tests: broken cert routed AND recorded; correctly-labelled full-SAP not recorded.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jun-te Kim 2026-06-24 15:39:16 +00:00
parent 5e0963593a
commit e312aae95d
2 changed files with 114 additions and 38 deletions

View file

@ -1,4 +1,5 @@
import copy
import logging
import re
from dataclasses import replace
from datetime import date
@ -958,23 +959,14 @@ class EpcPropertyDataMapper:
@staticmethod
def _from_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData:
"""Shared body for the SAP-Schema-16.x dedicated mappers.
The `SAP-Schema-16.x` *name* covers two structurally different certs:
* **RdSAP** (`assessment_type == "RdSAP"`) the reduced-field shape
(top-level `door_count` / `glazed_area` band, construction-code building
parts). Normalised onto RdSAP-17.1 and mapped by `from_rdsap_schema_17_1`.
* **full SAP** (`assessment_type == "SAP"`, e.g. the as-designed `LIG-*`
new-builds) the *measured* shape: `sap_opening_types` + structured
`sap_building_parts` carrying measured U-values and door/window openings,
NOT the reduced top-level count fields. These omit `door_count` because
the doors are lodged as openings, so the reduced normaliser failed loud
(`RdSapSchema17_1: missing required field 'door_count'`). They are
full-SAP certs and map via the full-SAP 17.1 mapper instead."""
if _is_full_sap_cert(data):
return EpcPropertyDataMapper._from_full_sap_schema_16_x(data)
"""Shared body for the SAP-Schema-16.x dedicated mappers: normalise the
reduced-field doc onto the RdSAP-17.1 shape and reuse that mapper.
Only the reduced-RdSAP shape reaches here. A full-SAP cert mis-lodged
under a SAP-Schema-16.x label (the `LIG-*` as-designed new-builds) is a
*broken schema type* `from_api_response` detects the full-SAP shape and
routes it to the full-SAP mapper before this label dispatch (see
`_is_full_sap_assessment`)."""
from datatypes.epc.schema.rdsap_schema_17_1 import RdSapSchema17_1
return EpcPropertyDataMapper.from_rdsap_schema_17_1(
@ -982,15 +974,12 @@ class EpcPropertyDataMapper:
)
@staticmethod
def _from_full_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData:
"""Map a full-SAP cert lodged under a `SAP-Schema-16.x` version.
Structurally a full-SAP cert (`sap_opening_types` + measured
`sap_building_parts`), so it parses with the full-SAP 17.1 dataclass and
reuses `from_sap_schema_17_1` door/window/fabric come from the real
measured openings, no reduced-field defaulting. The only `SapSchema17_1`
field the 16.x full-SAP shape omits is `tenure` (register metadata with no
SAP effect), defaulted here."""
def _from_full_sap(data: Dict[str, Any]) -> EpcPropertyData:
"""Map a cert whose *structure* is full-SAP (measured `sap_opening_types`)
via the full-SAP 17.1 mapper, regardless of its `schema_type` label the
door/window/fabric come from the real measured openings, no reduced-field
defaulting. The only `SapSchema17_1` field the broken-label (16.x) full-SAP
shape omits is `tenure` (register metadata, no SAP effect), defaulted."""
normalised = copy.deepcopy(data)
normalised.setdefault("tenure", "unknown")
return EpcPropertyDataMapper.from_sap_schema_17_1(
@ -2639,7 +2628,20 @@ class EpcPropertyDataMapper:
data = _normalize_shower_outlets(data)
data = _default_missing_post_town(data)
schema = data.get("schema_type", "")
if schema == "RdSAP-Schema-21.0.1":
# Assessment over label. The `schema_type` is a LABEL that can disagree with
# what the cert actually is. A cert the gov-API declares a full-SAP
# assessment (`assessment_type == "SAP"`, see `_is_full_sap_assessment`) is
# mapped by the full-SAP mapper whatever its label claims. When the label is
# NOT a recognised full-SAP schema it is a *broken schema type* — e.g. the LIG
# as-designed certs mis-lodged under SAP-Schema-16.x — recorded so the
# unreliable labels stay visible and coverage grows as new mislabels surface.
# Correctly-labelled full-SAP certs fall through to their own dedicated branch
# below, keeping the one-mapper-per-schema convention.
if _is_full_sap_assessment(data) and not _is_full_sap_label(schema):
_record_broken_schema_type(schema, data)
mapped = EpcPropertyDataMapper._from_full_sap(data)
elif schema == "RdSAP-Schema-21.0.1":
from datatypes.epc.schema.rdsap_schema_21_0_1 import RdSapSchema21_0_1
mapped = EpcPropertyDataMapper.from_rdsap_schema_21_0_1(
@ -3321,15 +3323,57 @@ def _derive_built_form_16x(dwelling_type: Any) -> int:
return 4 # flats / unstated form → modal built_form (SAP- and gate-neutral)
def _is_full_sap_cert(data: Dict[str, Any]) -> bool:
"""Whether a `SAP-Schema-16.x` doc is structurally a full-SAP cert rather than
the reduced RdSAP shape: an as-designed/full SAP assessment
(`assessment_type == "SAP"`) that lodges fabric as measured openings
(`sap_opening_types`) instead of the reduced top-level count fields. Both
signals agree on the real corpus; requiring both avoids mis-routing a reduced
cert that happens to carry one."""
return data.get("assessment_type") == "SAP" and bool(
data.get("sap_opening_types")
# Module-level logger; broken schema_type labels are reported through it.
logger = logging.getLogger(__name__)

# `schema_type` labels recognised as genuine full-SAP schemas. A full-SAP
# assessment lodged under one of these is correctly labelled; a full-SAP
# assessment lodged under any other label is treated as a mislabel (e.g. the
# LIG as-designed certs lodged as SAP-Schema-16.x).
_FULL_SAP_SCHEMA_LABELS: frozenset[str] = frozenset(
    (
        "SAP-Schema-17.0",
        "SAP-Schema-17.1",
        "SAP-Schema-18.0.0",
        "SAP-Schema-19.1.0",
    )
)
def _is_full_sap_assessment(data: Dict[str, Any]) -> bool:
    """Return whether `data` declares itself a full-SAP assessment.

    The decision rests solely on the gov-API's own `assessment_type` field —
    the authoritative SAP-vs-RdSAP classification — not on a structural proxy.
    A full-SAP assessment lodges fabric as measured openings + measured
    U-values; a reduced RdSAP one carries the top-level count fields.

    On the cert corpus `assessment_type` separates the two cleanly — every
    full-SAP schema (SAP-Schema-17.x/18.x and the broken `LIG` 16.x) is
    `"SAP"`, every reduced cert (RdSAP-Schema-* and the reduced
    SAP-Schema-16.x) is `"RdSAP"` — and it is independent of the
    `schema_type` LABEL, which can be broken. The structural signals
    (`data_type`, `sap_opening_types`) agree with it on the corpus but are
    derived shape artifacts; `assessment_type` is the meaning.

    A missing `assessment_type` key yields False.
    """
    declared_type = data.get("assessment_type")
    return declared_type == "SAP"
def _is_full_sap_label(schema: str) -> bool:
    """Return whether `schema` is one of the recognised full-SAP labels.

    A full-SAP assessment lodged under such a `schema_type` is correctly
    labelled rather than a broken schema type. Membership is an exact match
    against `_FULL_SAP_SCHEMA_LABELS`.
    """
    recognised = schema in _FULL_SAP_SCHEMA_LABELS
    return recognised
def _record_broken_schema_type(schema: str, data: Dict[str, Any]) -> None:
    """Log a *broken schema type* so the mislabel stays visible.

    A broken schema type is a cert whose `schema_type` label disagrees with
    its declared assessment — the label is not a recognised full-SAP schema,
    yet the cert's `assessment_type` says it is a full-SAP assessment (e.g.
    the LIG as-designed certs lodged under SAP-Schema-16.x). The mismatch is
    logged at WARNING rather than silently rerouted, so unreliable labels and
    their frequency stay visible and coverage can grow as new mislabels
    surface.

    `schema` is the label as lodged; from `data` only `assessment_type` and
    `uprn` are read, for the log line. This function only logs: the routing
    the message mentions is performed by the caller.
    """
    # The message reports what the gate actually checked (the declared
    # assessment_type); no structural property of the cert is verified here.
    logger.warning(
        "broken schema_type %r: declared a full-SAP assessment "
        "(assessment_type=%r) — routing to the full-SAP mapper (uprn=%s)",
        schema,
        data.get("assessment_type"),
        data.get("uprn"),
    )

View file

@ -11,6 +11,7 @@ exercise the shape variation the design decisions hinge on
"""
import json
import logging
import os
from typing import Any, Dict
@ -684,9 +685,8 @@ class TestFullSapSchema16xRouting:
assert epc.dwelling_type == "Detached house"
def test_reduced_16_x_cert_unaffected_by_full_sap_routing(self) -> None:
# Arrange — a reduced 16.2 cert (assessment_type RdSAP, no
# sap_opening_types) must stay on the RdSAP path, keeping its top-level
# property_type.
# Arrange — a reduced 16.2 cert (assessment_type RdSAP) must stay on the
# RdSAP path, keeping its top-level property_type.
data = load("sap_16_2.json")
# Act
@ -694,3 +694,35 @@ class TestFullSapSchema16xRouting:
# Assert
assert epc.property_type is not None
def test_broken_schema_type_is_recorded(
    self, caplog: pytest.LogCaptureFixture
) -> None:
    # Arrange — a full-SAP cert lodged under the SAP-Schema-16.0 label: the
    # label disagrees with the assessment_type, so the mismatch has to be
    # surfaced in the log rather than silently rerouted.
    mislabelled_cert = load("sap_16_0_full.json")

    # Act
    with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"):
        EpcPropertyDataMapper.from_api_response(mislabelled_cert)

    # Assert — at least one captured record carries both the marker text and
    # the offending label.
    flagged = [
        record
        for record in caplog.records
        if "broken schema_type" in record.message
        and "SAP-Schema-16.0" in record.message
    ]
    assert flagged
def test_correctly_labelled_full_sap_is_not_recorded_as_broken(
    self, caplog: pytest.LogCaptureFixture
) -> None:
    # Arrange — a correctly-labelled full-SAP cert: assessment_type is SAP and
    # the label is a recognised full-SAP schema, so there is nothing to record.
    well_labelled_cert = load("sap_17_1.json")

    # Act
    with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"):
        EpcPropertyDataMapper.from_api_response(well_labelled_cert)

    # Assert — no captured record mentions a broken schema_type.
    flagged = [
        record
        for record in caplog.records
        if "broken schema_type" in record.message
    ]
    assert not flagged