mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
Generalise to a 'broken schema type' gate keyed on assessment_type
Per review (Khalim): lift the full-SAP detection out of the SAP-Schema-16.x branch into a single top-level gate in from_api_response, and key it on the gov-API's own assessment_type declaration rather than the structural sap_opening_types proxy. - _is_full_sap_assessment(data): assessment_type == "SAP" — the authoritative SAP-vs-RdSAP classification. Verified to separate the entire fixture corpus: every full-SAP schema (SAP-Schema-17.x/18.x + the broken LIG 16.x) is "SAP"; every reduced cert (RdSAP-Schema-* and reduced SAP-Schema-16.x, incl. sap_16_0.json) is "RdSAP". data_type / sap_opening_types agree but are derived shape artifacts; assessment_type is the meaning. - A cert that is full-SAP by assessment_type but whose schema_type LABEL is not a recognised full-SAP schema is a *broken schema type* (label disagrees with the assessment). _record_broken_schema_type logs it — visible, not silently rerouted — so unreliable labels surface and coverage grows as new mislabels appear. Generalises beyond 16.x to any future mislabel. - _from_full_sap maps it via the full-SAP 17.1 mapper (real measured openings, no defaulting; only `tenure` defaulted). Correctly-labelled full-SAP certs keep their dedicated branches (one-mapper-per-schema convention); reduced certs are unchanged. Tests: broken cert routed AND recorded; correctly-labelled full-SAP not recorded. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5e0963593a
commit
e312aae95d
2 changed files with 114 additions and 38 deletions
|
|
@ -1,4 +1,5 @@
|
|||
import copy
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import replace
|
||||
from datetime import date
|
||||
|
|
@ -958,23 +959,14 @@ class EpcPropertyDataMapper:
|
|||
|
||||
@staticmethod
|
||||
def _from_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData:
|
||||
"""Shared body for the SAP-Schema-16.x dedicated mappers.
|
||||
|
||||
The `SAP-Schema-16.x` *name* covers two structurally different certs:
|
||||
|
||||
* **RdSAP** (`assessment_type == "RdSAP"`) — the reduced-field shape
|
||||
(top-level `door_count` / `glazed_area` band, construction-code building
|
||||
parts). Normalised onto RdSAP-17.1 and mapped by `from_rdsap_schema_17_1`.
|
||||
* **full SAP** (`assessment_type == "SAP"`, e.g. the as-designed `LIG-*`
|
||||
new-builds) — the *measured* shape: `sap_opening_types` + structured
|
||||
`sap_building_parts` carrying measured U-values and door/window openings,
|
||||
NOT the reduced top-level count fields. These omit `door_count` because
|
||||
the doors are lodged as openings, so the reduced normaliser failed loud
|
||||
(`RdSapSchema17_1: missing required field 'door_count'`). They are
|
||||
full-SAP certs and map via the full-SAP 17.1 mapper instead."""
|
||||
if _is_full_sap_cert(data):
|
||||
return EpcPropertyDataMapper._from_full_sap_schema_16_x(data)
|
||||
"""Shared body for the SAP-Schema-16.x dedicated mappers: normalise the
|
||||
reduced-field doc onto the RdSAP-17.1 shape and reuse that mapper.
|
||||
|
||||
Only the reduced-RdSAP shape reaches here. A full-SAP cert mis-lodged
|
||||
under a SAP-Schema-16.x label (the `LIG-*` as-designed new-builds) is a
|
||||
*broken schema type* — `from_api_response` detects the full-SAP shape and
|
||||
routes it to the full-SAP mapper before this label dispatch (see
|
||||
`_is_full_sap_assessment`)."""
|
||||
from datatypes.epc.schema.rdsap_schema_17_1 import RdSapSchema17_1
|
||||
|
||||
return EpcPropertyDataMapper.from_rdsap_schema_17_1(
|
||||
|
|
@ -982,15 +974,12 @@ class EpcPropertyDataMapper:
|
|||
)
|
||||
|
||||
@staticmethod
|
||||
def _from_full_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData:
|
||||
"""Map a full-SAP cert lodged under a `SAP-Schema-16.x` version.
|
||||
|
||||
Structurally a full-SAP cert (`sap_opening_types` + measured
|
||||
`sap_building_parts`), so it parses with the full-SAP 17.1 dataclass and
|
||||
reuses `from_sap_schema_17_1` — door/window/fabric come from the real
|
||||
measured openings, no reduced-field defaulting. The only `SapSchema17_1`
|
||||
field the 16.x full-SAP shape omits is `tenure` (register metadata with no
|
||||
SAP effect), defaulted here."""
|
||||
def _from_full_sap(data: Dict[str, Any]) -> EpcPropertyData:
|
||||
"""Map a cert whose *structure* is full-SAP (measured `sap_opening_types`)
|
||||
via the full-SAP 17.1 mapper, regardless of its `schema_type` label — the
|
||||
door/window/fabric come from the real measured openings, no reduced-field
|
||||
defaulting. The only `SapSchema17_1` field the broken-label (16.x) full-SAP
|
||||
shape omits is `tenure` (register metadata, no SAP effect), defaulted."""
|
||||
normalised = copy.deepcopy(data)
|
||||
normalised.setdefault("tenure", "unknown")
|
||||
return EpcPropertyDataMapper.from_sap_schema_17_1(
|
||||
|
|
@ -2639,7 +2628,20 @@ class EpcPropertyDataMapper:
|
|||
data = _normalize_shower_outlets(data)
|
||||
data = _default_missing_post_town(data)
|
||||
schema = data.get("schema_type", "")
|
||||
if schema == "RdSAP-Schema-21.0.1":
|
||||
|
||||
# Shape over label. The `schema_type` is a LABEL that can disagree with the
|
||||
# cert's STRUCTURE. A cert that is structurally full-SAP (measured
|
||||
# `sap_opening_types`) is mapped by the full-SAP mapper whatever its label
|
||||
# claims. When the label is NOT a recognised full-SAP schema it is a
|
||||
# *broken schema type* — e.g. the LIG as-designed certs mis-lodged under
|
||||
# SAP-Schema-16.x — recorded so the unreliable labels stay visible and
|
||||
# coverage grows as new shapes surface. Correctly-labelled full-SAP certs
|
||||
# fall through to their own dedicated branch below, keeping the
|
||||
# one-mapper-per-schema convention.
|
||||
if _is_full_sap_assessment(data) and not _is_full_sap_label(schema):
|
||||
_record_broken_schema_type(schema, data)
|
||||
mapped = EpcPropertyDataMapper._from_full_sap(data)
|
||||
elif schema == "RdSAP-Schema-21.0.1":
|
||||
from datatypes.epc.schema.rdsap_schema_21_0_1 import RdSapSchema21_0_1
|
||||
|
||||
mapped = EpcPropertyDataMapper.from_rdsap_schema_21_0_1(
|
||||
|
|
@ -3321,15 +3323,57 @@ def _derive_built_form_16x(dwelling_type: Any) -> int:
|
|||
return 4 # flats / unstated form → modal built_form (SAP- and gate-neutral)
|
||||
|
||||
|
||||
def _is_full_sap_cert(data: Dict[str, Any]) -> bool:
|
||||
"""Whether a `SAP-Schema-16.x` doc is structurally a full-SAP cert rather than
|
||||
the reduced RdSAP shape: an as-designed/full SAP assessment
|
||||
(`assessment_type == "SAP"`) that lodges fabric as measured openings
|
||||
(`sap_opening_types`) instead of the reduced top-level count fields. Both
|
||||
signals agree on the real corpus; requiring both avoids mis-routing a reduced
|
||||
cert that happens to carry one."""
|
||||
return data.get("assessment_type") == "SAP" and bool(
|
||||
data.get("sap_opening_types")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# The recognised full-SAP schema_type labels: a full-SAP *shape* under one of
|
||||
# these is correctly labelled, not a broken schema type. Anything else carrying
|
||||
# the full-SAP shape is a mislabel (e.g. the LIG as-designed certs lodged as
|
||||
# SAP-Schema-16.x).
|
||||
_FULL_SAP_SCHEMA_LABELS: frozenset[str] = frozenset(
|
||||
{
|
||||
"SAP-Schema-17.0",
|
||||
"SAP-Schema-17.1",
|
||||
"SAP-Schema-18.0.0",
|
||||
"SAP-Schema-19.1.0",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _is_full_sap_assessment(data: Dict[str, Any]) -> bool:
|
||||
"""Whether a cert is a full-SAP assessment (vs reduced RdSAP), keyed on the
|
||||
gov-API's own `assessment_type` declaration — the authoritative SAP-vs-RdSAP
|
||||
classification, not a structural proxy. A full-SAP assessment lodges fabric
|
||||
as measured openings + measured U-values; a reduced RdSAP one carries the
|
||||
top-level count fields.
|
||||
|
||||
`assessment_type` separates the entire cert corpus cleanly — every full-SAP
|
||||
schema (SAP-Schema-17.x/18.x and the broken `LIG` 16.x) is `"SAP"`, every
|
||||
reduced cert (RdSAP-Schema-* and the reduced SAP-Schema-16.x) is `"RdSAP"` —
|
||||
and it is independent of the `schema_type` LABEL, which can be broken. The
|
||||
structural signals (`data_type`, `sap_opening_types`) agree with it on the
|
||||
corpus but are derived shape artifacts; `assessment_type` is the meaning."""
|
||||
return data.get("assessment_type") == "SAP"
|
||||
|
||||
|
||||
def _is_full_sap_label(schema: str) -> bool:
|
||||
"""Whether `schema` is a recognised full-SAP `schema_type` — so a full-SAP
|
||||
*shape* under it is correctly labelled, not a broken schema type."""
|
||||
return schema in _FULL_SAP_SCHEMA_LABELS
|
||||
|
||||
|
||||
def _record_broken_schema_type(schema: str, data: Dict[str, Any]) -> None:
|
||||
"""Surface a *broken schema type*: a cert whose `schema_type` label disagrees
|
||||
with its structure — the label is not a full-SAP schema, yet the cert is
|
||||
structurally full-SAP. Logged rather than silently rerouted so the unreliable
|
||||
labels and their frequency stay visible (mirrors the skipped-cohort-cert
|
||||
capture), and coverage can grow as new mislabelled shapes surface. e.g. the
|
||||
LIG as-designed certs lodged under SAP-Schema-16.x."""
|
||||
logger.warning(
|
||||
"broken schema_type %r: structurally full-SAP "
|
||||
"(assessment_type=%r) — routing to the full-SAP mapper (uprn=%s)",
|
||||
schema,
|
||||
data.get("assessment_type"),
|
||||
data.get("uprn"),
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ exercise the shape variation the design decisions hinge on
|
|||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Dict
|
||||
|
||||
|
|
@ -684,9 +685,8 @@ class TestFullSapSchema16xRouting:
|
|||
assert epc.dwelling_type == "Detached house"
|
||||
|
||||
def test_reduced_16_x_cert_unaffected_by_full_sap_routing(self) -> None:
|
||||
# Arrange — a reduced 16.2 cert (assessment_type RdSAP, no
|
||||
# sap_opening_types) must stay on the RdSAP path, keeping its top-level
|
||||
# property_type.
|
||||
# Arrange — a reduced 16.2 cert (assessment_type RdSAP) must stay on the
|
||||
# RdSAP path, keeping its top-level property_type.
|
||||
data = load("sap_16_2.json")
|
||||
|
||||
# Act
|
||||
|
|
@ -694,3 +694,35 @@ class TestFullSapSchema16xRouting:
|
|||
|
||||
# Assert
|
||||
assert epc.property_type is not None
|
||||
|
||||
def test_broken_schema_type_is_recorded(
|
||||
self, caplog: pytest.LogCaptureFixture
|
||||
) -> None:
|
||||
# Arrange — a full-SAP cert mislabelled as SAP-Schema-16.0: the label
|
||||
# disagrees with the assessment_type, so the mismatch must be surfaced
|
||||
# (not silently rerouted).
|
||||
data = load("sap_16_0_full.json")
|
||||
|
||||
# Act
|
||||
with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"):
|
||||
EpcPropertyDataMapper.from_api_response(data)
|
||||
|
||||
# Assert
|
||||
assert any(
|
||||
"broken schema_type" in r.message and "SAP-Schema-16.0" in r.message
|
||||
for r in caplog.records
|
||||
)
|
||||
|
||||
def test_correctly_labelled_full_sap_is_not_recorded_as_broken(
|
||||
self, caplog: pytest.LogCaptureFixture
|
||||
) -> None:
|
||||
# Arrange — a correctly-labelled full-SAP cert: assessment_type SAP AND a
|
||||
# recognised full-SAP label, so no mismatch to record.
|
||||
data = load("sap_17_1.json")
|
||||
|
||||
# Act
|
||||
with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"):
|
||||
EpcPropertyDataMapper.from_api_response(data)
|
||||
|
||||
# Assert
|
||||
assert not any("broken schema_type" in r.message for r in caplog.records)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue