diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index ce1ba038..06e5a4df 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -1,4 +1,5 @@ import copy +import logging import re from dataclasses import replace from datetime import date @@ -958,23 +959,14 @@ class EpcPropertyDataMapper: @staticmethod def _from_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData: - """Shared body for the SAP-Schema-16.x dedicated mappers. - - The `SAP-Schema-16.x` *name* covers two structurally different certs: - - * **RdSAP** (`assessment_type == "RdSAP"`) — the reduced-field shape - (top-level `door_count` / `glazed_area` band, construction-code building - parts). Normalised onto RdSAP-17.1 and mapped by `from_rdsap_schema_17_1`. - * **full SAP** (`assessment_type == "SAP"`, e.g. the as-designed `LIG-*` - new-builds) — the *measured* shape: `sap_opening_types` + structured - `sap_building_parts` carrying measured U-values and door/window openings, - NOT the reduced top-level count fields. These omit `door_count` because - the doors are lodged as openings, so the reduced normaliser failed loud - (`RdSapSchema17_1: missing required field 'door_count'`). They are - full-SAP certs and map via the full-SAP 17.1 mapper instead.""" - if _is_full_sap_cert(data): - return EpcPropertyDataMapper._from_full_sap_schema_16_x(data) + """Shared body for the SAP-Schema-16.x dedicated mappers: normalise the + reduced-field doc onto the RdSAP-17.1 shape and reuse that mapper. + Only the reduced-RdSAP shape reaches here.
A full-SAP cert mis-lodged + under a SAP-Schema-16.x label (the `LIG-*` as-designed new-builds) is a + *broken schema type* — `from_api_response` detects the full-SAP assessment and + routes it to the full-SAP mapper before this label dispatch (see + `_is_full_sap_assessment`).""" from datatypes.epc.schema.rdsap_schema_17_1 import RdSapSchema17_1 return EpcPropertyDataMapper.from_rdsap_schema_17_1( @@ -982,15 +974,12 @@ class EpcPropertyDataMapper: ) @staticmethod - def _from_full_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData: - """Map a full-SAP cert lodged under a `SAP-Schema-16.x` version. - - Structurally a full-SAP cert (`sap_opening_types` + measured - `sap_building_parts`), so it parses with the full-SAP 17.1 dataclass and - reuses `from_sap_schema_17_1` — door/window/fabric come from the real - measured openings, no reduced-field defaulting. The only `SapSchema17_1` - field the 16.x full-SAP shape omits is `tenure` (register metadata with no - SAP effect), defaulted here.""" + def _from_full_sap(data: Dict[str, Any]) -> EpcPropertyData: + """Map a cert whose *structure* is full-SAP (measured `sap_opening_types`) + via the full-SAP 17.1 mapper, regardless of its `schema_type` label — the + door/window/fabric come from the real measured openings, no reduced-field + defaulting. The only `SapSchema17_1` field the broken-label (16.x) full-SAP + shape omits is `tenure` (register metadata, no SAP effect), defaulted.""" normalised = copy.deepcopy(data) normalised.setdefault("tenure", "unknown") return EpcPropertyDataMapper.from_sap_schema_17_1( @@ -2639,7 +2628,20 @@ class EpcPropertyDataMapper: data = _normalize_shower_outlets(data) data = _default_missing_post_town(data) schema = data.get("schema_type", "") - if schema == "RdSAP-Schema-21.0.1": + + # Shape over label. The `schema_type` is a LABEL that can disagree with the + # cert's STRUCTURE.
A cert declared full-SAP by its `assessment_type` + # (`"SAP"`) is mapped by the full-SAP mapper whatever its label + # claims. When the label is NOT a recognised full-SAP schema it is a + # *broken schema type* — e.g. the LIG as-designed certs mis-lodged under + # SAP-Schema-16.x — recorded so the unreliable labels stay visible and + # coverage grows as new shapes surface. Correctly-labelled full-SAP certs + # fall through to their own dedicated branch below, keeping the + # one-mapper-per-schema convention. + if _is_full_sap_assessment(data) and not _is_full_sap_label(schema): + _record_broken_schema_type(schema, data) + mapped = EpcPropertyDataMapper._from_full_sap(data) + elif schema == "RdSAP-Schema-21.0.1": from datatypes.epc.schema.rdsap_schema_21_0_1 import RdSapSchema21_0_1 mapped = EpcPropertyDataMapper.from_rdsap_schema_21_0_1( @@ -3321,15 +3323,57 @@ def _derive_built_form_16x(dwelling_type: Any) -> int: return 4 # flats / unstated form → modal built_form (SAP- and gate-neutral) -def _is_full_sap_cert(data: Dict[str, Any]) -> bool: - """Whether a `SAP-Schema-16.x` doc is structurally a full-SAP cert rather than - the reduced RdSAP shape: an as-designed/full SAP assessment - (`assessment_type == "SAP"`) that lodges fabric as measured openings - (`sap_opening_types`) instead of the reduced top-level count fields. Both - signals agree on the real corpus; requiring both avoids mis-routing a reduced - cert that happens to carry one.""" - return data.get("assessment_type") == "SAP" and bool( - data.get("sap_opening_types") +logger = logging.getLogger(__name__) + +# The recognised full-SAP schema_type labels: a full-SAP *shape* under one of +# these is correctly labelled, not a broken schema type. Anything else carrying +# the full-SAP shape is a mislabel (e.g. the LIG as-designed certs lodged as +# SAP-Schema-16.x).
+_FULL_SAP_SCHEMA_LABELS: frozenset[str] = frozenset( + { + "SAP-Schema-17.0", + "SAP-Schema-17.1", + "SAP-Schema-18.0.0", + "SAP-Schema-19.1.0", + } +) + + +def _is_full_sap_assessment(data: Dict[str, Any]) -> bool: + """Whether a cert is a full-SAP assessment (vs reduced RdSAP), keyed on the + gov-API's own `assessment_type` declaration — the authoritative SAP-vs-RdSAP + classification, not a structural proxy. A full-SAP assessment lodges fabric + as measured openings + measured U-values; a reduced RdSAP one carries the + top-level count fields. + + `assessment_type` separates the entire cert corpus cleanly — every full-SAP + schema (SAP-Schema-17.x/18.x and the broken `LIG` 16.x) is `"SAP"`, every + reduced cert (RdSAP-Schema-* and the reduced SAP-Schema-16.x) is `"RdSAP"` — + and it is independent of the `schema_type` LABEL, which can be broken. The + structural signals (`data_type`, `sap_opening_types`) agree with it on the + corpus but are derived shape artifacts; `assessment_type` is the meaning.""" + return data.get("assessment_type") == "SAP" + + +def _is_full_sap_label(schema: str) -> bool: + """Whether `schema` is a recognised full-SAP `schema_type` — so a full-SAP + *shape* under it is correctly labelled, not a broken schema type.""" + return schema in _FULL_SAP_SCHEMA_LABELS + + +def _record_broken_schema_type(schema: str, data: Dict[str, Any]) -> None: + """Surface a *broken schema type*: a cert whose `schema_type` label disagrees + with its `assessment_type` — the label is not a full-SAP schema, yet the cert is + declared a full-SAP assessment. Logged rather than silently rerouted so the unreliable + labels and their frequency stay visible (mirrors the skipped-cohort-cert + capture), and coverage can grow as new mislabelled shapes surface. e.g.
the + LIG as-designed certs lodged under SAP-Schema-16.x.""" + logger.warning( + "broken schema_type %r: structurally full-SAP " + "(assessment_type=%r) — routing to the full-SAP mapper (uprn=%s)", + schema, + data.get("assessment_type"), + data.get("uprn"), ) diff --git a/datatypes/epc/domain/tests/test_from_sap_schema.py b/datatypes/epc/domain/tests/test_from_sap_schema.py index 8a79591d..42630ca2 100644 --- a/datatypes/epc/domain/tests/test_from_sap_schema.py +++ b/datatypes/epc/domain/tests/test_from_sap_schema.py @@ -11,6 +11,7 @@ exercise the shape variation the design decisions hinge on """ import json +import logging import os from typing import Any, Dict @@ -684,9 +685,8 @@ class TestFullSapSchema16xRouting: assert epc.dwelling_type == "Detached house" def test_reduced_16_x_cert_unaffected_by_full_sap_routing(self) -> None: - # Arrange — a reduced 16.2 cert (assessment_type RdSAP, no - # sap_opening_types) must stay on the RdSAP path, keeping its top-level - # property_type. + # Arrange — a reduced 16.2 cert (assessment_type RdSAP) must stay on the + # RdSAP path, keeping its top-level property_type. data = load("sap_16_2.json") # Act @@ -694,3 +694,35 @@ class TestFullSapSchema16xRouting: # Assert assert epc.property_type is not None + + def test_broken_schema_type_is_recorded( + self, caplog: pytest.LogCaptureFixture + ) -> None: + # Arrange — a full-SAP cert mislabelled as SAP-Schema-16.0: the label + # disagrees with the assessment_type, so the mismatch must be surfaced + # (not silently rerouted).
+ data = load("sap_16_0_full.json") + + # Act + with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"): + EpcPropertyDataMapper.from_api_response(data) + + # Assert + assert any( + "broken schema_type" in r.message and "SAP-Schema-16.0" in r.message + for r in caplog.records + ) + + def test_correctly_labelled_full_sap_is_not_recorded_as_broken( + self, caplog: pytest.LogCaptureFixture + ) -> None: + # Arrange — a correctly-labelled full-SAP cert: assessment_type SAP AND a + # recognised full-SAP label, so no mismatch to record. + data = load("sap_17_1.json") + + # Act + with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"): + EpcPropertyDataMapper.from_api_response(data) + + # Assert + assert not any("broken schema_type" in r.message for r in caplog.records)