From 5e0963593a42091c7839d7b6488c2e6db16b5e92 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 24 Jun 2026 15:16:51 +0000 Subject: [PATCH 1/2] Route full-SAP certs lodged under SAP-Schema-16.x to the full-SAP mapper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A full-SAP cert (assessment_type=SAP, the as-designed LIG-* new-builds) lodged under a SAP-Schema-16.x version failed to map with: ValueError: RdSapSchema17_1: missing required field 'door_count' — crashing the property's modelling_e2e subtask. 89 distinct properties in the 2026-06-24 run hit this; ~22 of every 30 sampled are this full-SAP shape. Root cause: from_api_response dispatches on the schema_type STRING only, and the SAP-Schema-16.x branch assumed a single shape — "reduced-field (RdSAP-shaped)" — sending every 16.x cert through _normalize_sap_schema_16_x -> RdSapSchema17_1. But the SAP-Schema-16.x name covers two structurally different certs: * reduced RdSAP (assessment_type=RdSAP): top-level door_count / glazed_area band / construction-code building parts. * full SAP (assessment_type=SAP): measured shape — sap_opening_types + structured sap_building_parts carrying measured U-values and door/window OPENINGS, with NO top-level door_count (the door is an opening). These omit the reduced-only count fields, so the reduced normaliser failed loud on the first one it checks (door_count) — and 14 others behind it. Force-fitting a full-SAP cert to the reduced RdSAP schema was the bug: the data was never missing, it was being validated against the wrong schema. Fix: discriminate on shape (_is_full_sap_cert: assessment_type=SAP AND sap_opening_types present) and route full-SAP 16.x certs to the existing full-SAP 17.1 mapper, which reads the real measured openings (the lodged 0.9x2.1m door -> door_count 1) — no reduced-field defaulting. The only field the 16.x full-SAP shape omits that SapSchema17_1 needs is `tenure` (register metadata, no SAP effect), defaulted. Reduced 16.x certs (assessment_type=RdSAP, no opening types) are untouched — all pinned reduced 16.x fixtures stay on the RdSAP path. Regression test pins a real full-SAP-16.0 cert (0293-...-8795, lodged 83) mapping via the full-SAP path + the reduced 16.2 cert staying on the RdSAP path. Co-Authored-By: Claude Opus 4.8 (1M context) --- datatypes/epc/domain/mapper.py | 47 +- .../epc/domain/tests/test_from_sap_schema.py | 37 ++ .../schema/tests/fixtures/sap_16_0_full.json | 435 ++++++++++++++++++ 3 files changed, 517 insertions(+), 2 deletions(-) create mode 100644 datatypes/epc/schema/tests/fixtures/sap_16_0_full.json diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index 26704922..ce1ba038 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -958,14 +958,45 @@ class EpcPropertyDataMapper: @staticmethod def _from_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData: - """Shared body for the SAP-Schema-16.x dedicated mappers: normalise the - reduced-field doc onto the RdSAP-17.1 shape and reuse that mapper.""" + """Shared body for the SAP-Schema-16.x dedicated mappers. + + The `SAP-Schema-16.x` *name* covers two structurally different certs: + + * **RdSAP** (`assessment_type == "RdSAP"`) — the reduced-field shape + (top-level `door_count` / `glazed_area` band, construction-code building + parts). Normalised onto RdSAP-17.1 and mapped by `from_rdsap_schema_17_1`. + * **full SAP** (`assessment_type == "SAP"`, e.g. the as-designed `LIG-*` + new-builds) — the *measured* shape: `sap_opening_types` + structured + `sap_building_parts` carrying measured U-values and door/window openings, + NOT the reduced top-level count fields. These omit `door_count` because + the doors are lodged as openings, so the reduced normaliser failed loud + (`RdSapSchema17_1: missing required field 'door_count'`). They are + full-SAP certs and map via the full-SAP 17.1 mapper instead.""" + if _is_full_sap_cert(data): + return EpcPropertyDataMapper._from_full_sap_schema_16_x(data) + from datatypes.epc.schema.rdsap_schema_17_1 import RdSapSchema17_1 return EpcPropertyDataMapper.from_rdsap_schema_17_1( from_dict(RdSapSchema17_1, _normalize_sap_schema_16_x(data)) ) + @staticmethod + def _from_full_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData: + """Map a full-SAP cert lodged under a `SAP-Schema-16.x` version. + + Structurally a full-SAP cert (`sap_opening_types` + measured + `sap_building_parts`), so it parses with the full-SAP 17.1 dataclass and + reuses `from_sap_schema_17_1` — door/window/fabric come from the real + measured openings, no reduced-field defaulting. The only `SapSchema17_1` + field the 16.x full-SAP shape omits is `tenure` (register metadata with no + SAP effect), defaulted here.""" + normalised = copy.deepcopy(data) + normalised.setdefault("tenure", "unknown") + return EpcPropertyDataMapper.from_sap_schema_17_1( + from_dict(SapSchema17_1, normalised) + ) + @staticmethod def from_rdsap_schema_17_1(schema: RdSapSchema17_1) -> EpcPropertyData: es = schema.sap_energy_source @@ -3290,6 +3321,18 @@ def _derive_built_form_16x(dwelling_type: Any) -> int: return 4 # flats / unstated form → modal built_form (SAP- and gate-neutral) +def _is_full_sap_cert(data: Dict[str, Any]) -> bool: + """Whether a `SAP-Schema-16.x` doc is structurally a full-SAP cert rather than + the reduced RdSAP shape: an as-designed/full SAP assessment + (`assessment_type == "SAP"`) that lodges fabric as measured openings + (`sap_opening_types`) instead of the reduced top-level count fields. Both + signals agree on the real corpus; requiring both avoids mis-routing a reduced + cert that happens to carry one.""" + return data.get("assessment_type") == "SAP" and bool( + data.get("sap_opening_types") + ) + + def _normalize_sap_schema_16_x(data: Dict[str, Any]) -> Dict[str, Any]: """Rewrite a `SAP-Schema-16.2`/`16.3` API doc onto the `RdSAP-Schema-17.1` shape so it can reuse the tested `from_rdsap_schema_17_1` mapper. diff --git a/datatypes/epc/domain/tests/test_from_sap_schema.py b/datatypes/epc/domain/tests/test_from_sap_schema.py index 28c86599..8a79591d 100644 --- a/datatypes/epc/domain/tests/test_from_sap_schema.py +++ b/datatypes/epc/domain/tests/test_from_sap_schema.py @@ -657,3 +657,40 @@ class TestFromSapSchema16_2: EpcPropertyDataMapper.from_api_response(data) assert "window" not in data # the rename landed only on the copy assert set(data.keys()) == before_keys + + +class TestFullSapSchema16xRouting: + """The `SAP-Schema-16.x` name covers two structurally different certs: the + reduced RdSAP shape (top-level `door_count`/`glazed_area`) AND full SAP (the + as-designed `LIG-*` new-builds — `assessment_type=SAP`, measured + `sap_opening_types` + structured `sap_building_parts`, doors lodged as + openings not a top-level count). Dispatch must route each to its own mapper; + sending a full-SAP cert through the reduced normaliser failed loud with + `RdSapSchema17_1: missing required field 'door_count'`.""" + + def test_full_sap_16_x_cert_maps_via_full_sap_path(self) -> None: + # Arrange — a real full-SAP cert lodged under SAP-Schema-16.0 (cert + # 0293-3880-6151-9872-8795, lodged 83) that previously raised the + # missing-door_count ValueError in the reduced normaliser. + data = load("sap_16_0_full.json") + + # Act + epc = EpcPropertyDataMapper.from_api_response(data) + + # Assert — mapped from the real measured openings, not reduced-field + # defaults: the single lodged door opening (0.9 x 2.1 m) → door_count 1. + assert isinstance(epc, EpcPropertyData) + assert epc.door_count == 1 + assert epc.dwelling_type == "Detached house" + + def test_reduced_16_x_cert_unaffected_by_full_sap_routing(self) -> None: + # Arrange — a reduced 16.2 cert (assessment_type RdSAP, no + # sap_opening_types) must stay on the RdSAP path, keeping its top-level + # property_type. + data = load("sap_16_2.json") + + # Act + epc = EpcPropertyDataMapper.from_api_response(data) + + # Assert + assert epc.property_type is not None diff --git a/datatypes/epc/schema/tests/fixtures/sap_16_0_full.json b/datatypes/epc/schema/tests/fixtures/sap_16_0_full.json new file mode 100644 index 00000000..b665cab2 --- /dev/null +++ b/datatypes/epc/schema/tests/fixtures/sap_16_0_full.json @@ -0,0 +1,435 @@ +{ + "uprn": 10008075674, + "roofs": [ + { + "description": "Average thermal transmittance 0.11 W/m\u00b2K", + "energy_efficiency_rating": 5, + "environmental_efficiency_rating": 5 + } + ], + "walls": [ + { + "description": "Average thermal transmittance 0.21 W/m\u00b2K", + "energy_efficiency_rating": 5, + "environmental_efficiency_rating": 5 + } + ], + "floors": [ + { + "description": "Average thermal transmittance 0.12 W/m\u00b2K", + "energy_efficiency_rating": 5, + "environmental_efficiency_rating": 5 + } + ], + "status": "entered", + "windows": { + "description": "High performance glazing", + "energy_efficiency_rating": 5, + "environmental_efficiency_rating": 5 + }, + "lighting": { + "description": "Low energy lighting in all fixed outlets", + "energy_efficiency_rating": 5, + "environmental_efficiency_rating": 5 + }, + "postcode": "PE2 9HD", + "data_type": 2, + "hot_water": { + "description": "From main system, plus solar", + "energy_efficiency_rating": 5, + "environmental_efficiency_rating": 5 + }, + "post_town": "PETERBOROUGH", + "built_form": 1, + "created_at": "2012-05-30 13:30:25.000000", + "living_area": 18.011, + "orientation": 7, + "region_code": 2, + "report_type": 3, + "sap_heating": { + "thermal_store": 1, + "has_solar_panel": "true", + "water_fuel_type": 1, + "solar_store_volume": 100, + "water_heating_code": 901, + "hot_water_store_size": 210, + "main_heating_details": [ + { + "burner_control": 3, + "main_fuel_type": 1, + "efficiency_type": 3, + "heat_emitter_type": 1, + "main_heating_code": 102, + "is_flue_fan_present": "false", + "main_heating_number": 1, + "is_condensing_boiler": "true", + "main_heating_control": 2110, + "is_interlocked_system": "true", + "main_heating_category": 2, + "main_heating_fraction": 1, + "gas_or_oil_boiler_type": 1, + "main_heating_flue_type": 2, + "main_heating_efficiency": 89, + "main_heating_data_source": 2, + "has_delayed_start_thermostat": "false", + "load_or_weather_compensation": 0, + "is_central_heating_pump_in_heated_space": "true" + } + ], + "has_hot_water_cylinder": "true", + "has_solar_powered_pump": "false", + "has_cylinder_thermostat": "true", + "solar_panel_aperture_area": 4.51, + "has_fixed_air_conditioning": "false", + "secondary_heating_category": 1, + "solar_panel_collector_type": 2, + "is_cylinder_in_heated_space": "true", + "solar_panel_collector_pitch": 2, + "is_hot_water_separately_timed": "true", + "is_primary_pipework_insulated": "true", + "hot_water_store_insulation_type": 1, + "hot_water_store_heat_loss_source": 3, + "is_solar_store_combined_cylinder": "true", + "solar_panel_collector_data_source": 2, + "solar_panel_collector_orientation": 5, + "solar_panel_collector_overshading": 1, + "hot_water_store_insulation_thickness": 85, + "solar_panel_collector_heat_loss_rate": 3.681, + "solar_panel_collector_zero_loss_efficiency": 77 + }, + "sap_version": 9.9, + "schema_type": "SAP-Schema-16.0", + "uprn_source": "Energy Assessor", + "country_code": "EAW", + "main_heating": [ + { + "description": "Boiler and radiators, mains gas", + "energy_efficiency_rating": 4, + "environmental_efficiency_rating": 4 + } + ], + "air_tightness": { + "description": "Air permeability 4.9 m\u00b3/h.m\u00b2 (as tested)", + "energy_efficiency_rating": 4, + "environmental_efficiency_rating": 4 + }, + "dwelling_type": "Detached house", + "language_code": 1, + "property_type": 0, + "address_line_1": "43, New Road", + "address_line_2": "Woodston", + "schema_version": "LIG-15.0", + "assessment_date": "2012-05-30", + "assessment_type": "SAP", + "completion_date": "2012-05-30", + "inspection_date": "2012-05-30", + "sap_ventilation": { + "psv_count": 0, + "pressure_test": 1, + "air_permeability": 4.88, + "open_flues_count": 0, + "ventilation_type": 1, + "extract_fans_count": 3, + "open_fireplaces_count": 0, + "sheltered_sides_count": 2, + "flueless_gas_fires_count": 0 + }, + "design_water_use": 1, + "total_floor_area": 76, + "transaction_type": 6, + "conservatory_type": 1, + "registration_date": "2012-05-30", + "restricted_access": 0, + "sap_energy_source": { + "electricity_tariff": 1, + "wind_turbines_count": 0, + "wind_turbine_terrain_type": 2, + "fixed_lighting_outlets_count": 0, + "low_energy_fixed_lighting_outlets_count": 0, + "low_energy_fixed_lighting_outlets_percentage": 100 + }, + "sap_opening_types": [ + { + "name": "Door (1)", + "type": 1, + "u_value": 1.8, + "data_source": 2, + "description": "Data from Manufacturer", + "glazing_type": 1 + }, + { + "name": "Windows (1)", + "type": 4, + "u_value": 1.5, + "data_source": 2, + "description": "Data from Manufacturer", + "frame_factor": 0.7, + "glazing_type": 7, + "solar_transmittance": 0.63 + } + ], + "secondary_heating": { + "description": "None", + "energy_efficiency_rating": 0, + "environmental_efficiency_rating": 0 + }, + "lzc_energy_sources": [ + 10 + ], + "sap_building_parts": [ + { + "sap_roofs": [ + { + "name": "main roof", + "u_value": 0.11, + "roof_type": 2, + "kappa_value": 9, + "total_roof_area": 38.65 + }, + { + "name": "flat roof", + "u_value": 0.19, + "roof_type": 2, + "kappa_value": 9, + "total_roof_area": 1.39 + } + ], + "sap_walls": [ + { + "name": "brick wall", + "u_value": 0.21, + "wall_type": 2, + "kappa_value": 150, + "total_wall_area": 115.98, + "is_curtain_walling": "false" + }, + { + "name": "downstairs", + "u_value": 0, + "wall_type": 5, + "kappa_value": 9, + "total_wall_area": 44.7 + }, + { + "name": "upper", + "u_value": 0, + "wall_type": 5, + "kappa_value": 9, + "total_wall_area": 52.99 + } + ], + "identifier": "Main Dwelling", + "overshading": 2, + "sap_openings": [ + { + "name": 1, + "type": "Door (1)", + "width": 0.9, + "height": 2.1, + "location": "brick wall", + "orientation": 7 + }, + { + "name": 2, + "type": "Windows (1)", + "width": 0.665, + "height": 1.05, + "location": "brick wall", + "orientation": 1 + }, + { + "name": 3, + "type": "Windows (1)", + "width": 2.09, + "height": 1.05, + "location": "brick wall", + "orientation": 7 + }, + { + "name": 4, + "type": "Windows (1)", + "width": 0.665, + "height": 1.05, + "location": "brick wall", + "orientation": 5 + }, + { + "name": 5, + "type": "Windows (1)", + "width": 2.5, + "height": 2.1, + "location": "brick wall", + "orientation": 3 + }, + { + "name": 6, + "type": "Windows (1)", + "width": 1.05, + "height": 1.425, + "location": "brick wall", + "orientation": 7 + }, + { + "name": 7, + "type": "Windows (1)", + "width": 0.6, + "height": 1.2, + "location": "brick wall", + "orientation": 1 + }, + { + "name": 8, + "type": "Windows (1)", + "width": 0.8, + "height": 1.425, + "location": "brick wall", + "orientation": 3 + }, + { + "name": 9, + "type": "Windows (1)", + "width": 0.8, + "height": 1.425, + "location": "brick wall", + "orientation": 3 + }, + { + "name": 10, + "type": "Windows (1)", + "width": 0.6, + "height": 1.05, + "location": "brick wall", + "orientation": 5 + } + ], + "construction_year": 2012, + "sap_thermal_bridges": { + "thermal_bridges": [ + { + "length": 9.77, + "psi_value": 0.3, + "psi_value_source": 4, + "thermal_bridge_type": 2 + }, + { + "length": 7.27, + "psi_value": 0.04, + "psi_value_source": 4, + "thermal_bridge_type": 3 + }, + { + "length": 23.55, + "psi_value": 0.05, + "psi_value_source": 4, + "thermal_bridge_type": 4 + }, + { + "length": 27.18, + "psi_value": 0.16, + "psi_value_source": 4, + "thermal_bridge_type": 5 + }, + { + "length": 27.18, + "psi_value": 0.07, + "psi_value_source": 4, + "thermal_bridge_type": 6 + }, + { + "length": 8.68, + "psi_value": 0.06, + "psi_value_source": 4, + "thermal_bridge_type": 10 + }, + { + "length": 17.17, + "psi_value": 0.24, + "psi_value_source": 4, + "thermal_bridge_type": 12 + }, + { + "length": 21.04, + "psi_value": 0.09, + "psi_value_source": 4, + "thermal_bridge_type": 16 + } + ], + "thermal_bridge_code": 5 + }, + "building_part_number": 1, + "sap_floor_dimensions": [ + { + "storey": 0, + "u_value": 0.12, + "floor_type": 2, + "storey_height": 2.55, + "heat_loss_area": 38.65, + "total_floor_area": 38.65, + "kappa_value_from_below": 18 + }, + { + "storey": 1, + "u_value": 0, + "floor_type": 3, + "kappa_value": 18, + "storey_height": 2.71, + "heat_loss_area": 1.3900032, + "total_floor_area": 37.26, + "kappa_value_from_below": 18 + } + ] + } + ], + "bedf_revision_number": 323, + "heating_cost_current": 260, + "co2_emissions_current": 1.2, + "energy_rating_average": 60, + "energy_rating_current": 83, + "lighting_cost_current": 43, + "main_heating_controls": [ + { + "description": "Time and temperature zone control", + "energy_efficiency_rating": 5, + "environmental_efficiency_rating": 5 + } + ], + "has_hot_water_cylinder": "true", + "heating_cost_potential": 251, + "hot_water_cost_current": 37, + "suggested_improvements": [ + { + "sequence": 1, + "typical_saving": 219, + "indicative_cost": "\u00a311,000 - \u00a320,000", + "improvement_type": "U", + "improvement_details": { + "improvement_number": 34 + }, + "improvement_category": 3, + "energy_performance_rating": 94, + "environmental_impact_rating": 97 + } + ], + "co2_emissions_potential": 1.2, + "energy_rating_potential": 83, + "lighting_cost_potential": 43, + "hot_water_cost_potential": 47, + "is_in_smoke_control_area": "unknown", + "renewable_heat_incentive": { + "rhi_new_dwelling": { + "space_heating": 3010, + "water_heating": 2175 + } + }, + "seller_commission_report": "Y", + "energy_consumption_current": 84, + "has_fixed_air_conditioning": "false", + "calculation_software_version": "Version: 1.4.0.80", + "energy_consumption_potential": 84, + "environmental_impact_current": 87, + "current_energy_efficiency_band": "B", + "environmental_impact_potential": 87, + "has_heated_separate_conservatory": "false", + "potential_energy_efficiency_band": "B", + "co2_emissions_current_per_floor_area": 16 +} \ No newline at end of file From e312aae95dc21cbe0b59f3d623abf65367013ee4 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 24 Jun 2026 15:39:16 +0000 Subject: [PATCH 2/2] Generalise to a 'broken schema type' gate keyed on assessment_type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review (Khalim): lift the full-SAP detection out of the SAP-Schema-16.x branch into a single top-level gate in from_api_response, and key it on the gov-API's own assessment_type declaration rather than the structural sap_opening_types proxy. - _is_full_sap_assessment(data): assessment_type == "SAP" — the authoritative SAP-vs-RdSAP classification. Verified to separate the entire fixture corpus: every full-SAP schema (SAP-Schema-17.x/18.x + the broken LIG 16.x) is "SAP"; every reduced cert (RdSAP-Schema-* and reduced SAP-Schema-16.x, incl. sap_16_0.json) is "RdSAP". data_type / sap_opening_types agree but are derived shape artifacts; assessment_type is the meaning. - A cert that is full-SAP by assessment_type but whose schema_type LABEL is not a recognised full-SAP schema is a *broken schema type* (label disagrees with the assessment). _record_broken_schema_type logs it — visible, not silently rerouted — so unreliable labels surface and coverage grows as new mislabels appear. Generalises beyond 16.x to any future mislabel. - _from_full_sap maps it via the full-SAP 17.1 mapper (real measured openings, no defaulting; only `tenure` defaulted). Correctly-labelled full-SAP certs keep their dedicated branches (one-mapper-per-schema convention); reduced certs are unchanged. Tests: broken cert routed AND recorded; correctly-labelled full-SAP not recorded. Co-Authored-By: Claude Opus 4.8 (1M context) --- datatypes/epc/domain/mapper.py | 114 ++++++++++++------ .../epc/domain/tests/test_from_sap_schema.py | 38 +++++- 2 files changed, 114 insertions(+), 38 deletions(-) diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index ce1ba038..06e5a4df 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -1,4 +1,5 @@ import copy +import logging import re from dataclasses import replace from datetime import date @@ -958,23 +959,14 @@ class EpcPropertyDataMapper: @staticmethod def _from_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData: - """Shared body for the SAP-Schema-16.x dedicated mappers. - - The `SAP-Schema-16.x` *name* covers two structurally different certs: - - * **RdSAP** (`assessment_type == "RdSAP"`) — the reduced-field shape - (top-level `door_count` / `glazed_area` band, construction-code building - parts). Normalised onto RdSAP-17.1 and mapped by `from_rdsap_schema_17_1`. - * **full SAP** (`assessment_type == "SAP"`, e.g. the as-designed `LIG-*` - new-builds) — the *measured* shape: `sap_opening_types` + structured - `sap_building_parts` carrying measured U-values and door/window openings, - NOT the reduced top-level count fields. These omit `door_count` because - the doors are lodged as openings, so the reduced normaliser failed loud - (`RdSapSchema17_1: missing required field 'door_count'`). They are - full-SAP certs and map via the full-SAP 17.1 mapper instead.""" - if _is_full_sap_cert(data): - return EpcPropertyDataMapper._from_full_sap_schema_16_x(data) + """Shared body for the SAP-Schema-16.x dedicated mappers: normalise the + reduced-field doc onto the RdSAP-17.1 shape and reuse that mapper. + Only the reduced-RdSAP shape reaches here. A full-SAP cert mis-lodged + under a SAP-Schema-16.x label (the `LIG-*` as-designed new-builds) is a + *broken schema type* — `from_api_response` detects the full-SAP shape and + routes it to the full-SAP mapper before this label dispatch (see + `_is_full_sap_assessment`).""" from datatypes.epc.schema.rdsap_schema_17_1 import RdSapSchema17_1 return EpcPropertyDataMapper.from_rdsap_schema_17_1( @@ -982,15 +974,12 @@ class EpcPropertyDataMapper: ) @staticmethod - def _from_full_sap_schema_16_x(data: Dict[str, Any]) -> EpcPropertyData: - """Map a full-SAP cert lodged under a `SAP-Schema-16.x` version. - - Structurally a full-SAP cert (`sap_opening_types` + measured - `sap_building_parts`), so it parses with the full-SAP 17.1 dataclass and - reuses `from_sap_schema_17_1` — door/window/fabric come from the real - measured openings, no reduced-field defaulting. The only `SapSchema17_1` - field the 16.x full-SAP shape omits is `tenure` (register metadata with no - SAP effect), defaulted here.""" + def _from_full_sap(data: Dict[str, Any]) -> EpcPropertyData: + """Map a cert whose *structure* is full-SAP (measured `sap_opening_types`) + via the full-SAP 17.1 mapper, regardless of its `schema_type` label — the + door/window/fabric come from the real measured openings, no reduced-field + defaulting. The only `SapSchema17_1` field the broken-label (16.x) full-SAP + shape omits is `tenure` (register metadata, no SAP effect), defaulted.""" normalised = copy.deepcopy(data) normalised.setdefault("tenure", "unknown") return EpcPropertyDataMapper.from_sap_schema_17_1( @@ -2639,7 +2628,20 @@ class EpcPropertyDataMapper: data = _normalize_shower_outlets(data) data = _default_missing_post_town(data) schema = data.get("schema_type", "") - if schema == "RdSAP-Schema-21.0.1": + + # Shape over label. The `schema_type` is a LABEL that can disagree with the + # cert's STRUCTURE. A cert that is structurally full-SAP (measured + # `sap_opening_types`) is mapped by the full-SAP mapper whatever its label + # claims. When the label is NOT a recognised full-SAP schema it is a + # *broken schema type* — e.g. the LIG as-designed certs mis-lodged under + # SAP-Schema-16.x — recorded so the unreliable labels stay visible and + # coverage grows as new shapes surface. Correctly-labelled full-SAP certs + # fall through to their own dedicated branch below, keeping the + # one-mapper-per-schema convention. + if _is_full_sap_assessment(data) and not _is_full_sap_label(schema): + _record_broken_schema_type(schema, data) + mapped = EpcPropertyDataMapper._from_full_sap(data) + elif schema == "RdSAP-Schema-21.0.1": from datatypes.epc.schema.rdsap_schema_21_0_1 import RdSapSchema21_0_1 mapped = EpcPropertyDataMapper.from_rdsap_schema_21_0_1( @@ -3321,15 +3323,57 @@ def _derive_built_form_16x(dwelling_type: Any) -> int: return 4 # flats / unstated form → modal built_form (SAP- and gate-neutral) -def _is_full_sap_cert(data: Dict[str, Any]) -> bool: - """Whether a `SAP-Schema-16.x` doc is structurally a full-SAP cert rather than - the reduced RdSAP shape: an as-designed/full SAP assessment - (`assessment_type == "SAP"`) that lodges fabric as measured openings - (`sap_opening_types`) instead of the reduced top-level count fields. Both - signals agree on the real corpus; requiring both avoids mis-routing a reduced - cert that happens to carry one.""" - return data.get("assessment_type") == "SAP" and bool( - data.get("sap_opening_types") +logger = logging.getLogger(__name__) + +# The recognised full-SAP schema_type labels: a full-SAP *shape* under one of +# these is correctly labelled, not a broken schema type. Anything else carrying +# the full-SAP shape is a mislabel (e.g. the LIG as-designed certs lodged as +# SAP-Schema-16.x). +_FULL_SAP_SCHEMA_LABELS: frozenset[str] = frozenset( + { + "SAP-Schema-17.0", + "SAP-Schema-17.1", + "SAP-Schema-18.0.0", + "SAP-Schema-19.1.0", + } +) + + +def _is_full_sap_assessment(data: Dict[str, Any]) -> bool: + """Whether a cert is a full-SAP assessment (vs reduced RdSAP), keyed on the + gov-API's own `assessment_type` declaration — the authoritative SAP-vs-RdSAP + classification, not a structural proxy. A full-SAP assessment lodges fabric + as measured openings + measured U-values; a reduced RdSAP one carries the + top-level count fields. + + `assessment_type` separates the entire cert corpus cleanly — every full-SAP + schema (SAP-Schema-17.x/18.x and the broken `LIG` 16.x) is `"SAP"`, every + reduced cert (RdSAP-Schema-* and the reduced SAP-Schema-16.x) is `"RdSAP"` — + and it is independent of the `schema_type` LABEL, which can be broken. The + structural signals (`data_type`, `sap_opening_types`) agree with it on the + corpus but are derived shape artifacts; `assessment_type` is the meaning.""" + return data.get("assessment_type") == "SAP" + + +def _is_full_sap_label(schema: str) -> bool: + """Whether `schema` is a recognised full-SAP `schema_type` — so a full-SAP + *shape* under it is correctly labelled, not a broken schema type.""" + return schema in _FULL_SAP_SCHEMA_LABELS + + +def _record_broken_schema_type(schema: str, data: Dict[str, Any]) -> None: + """Surface a *broken schema type*: a cert whose `schema_type` label disagrees + with its structure — the label is not a full-SAP schema, yet the cert is + structurally full-SAP. Logged rather than silently rerouted so the unreliable + labels and their frequency stay visible (mirrors the skipped-cohort-cert + capture), and coverage can grow as new mislabelled shapes surface. e.g. the + LIG as-designed certs lodged under SAP-Schema-16.x.""" + logger.warning( + "broken schema_type %r: structurally full-SAP " + "(assessment_type=%r) — routing to the full-SAP mapper (uprn=%s)", + schema, + data.get("assessment_type"), + data.get("uprn"), ) diff --git a/datatypes/epc/domain/tests/test_from_sap_schema.py b/datatypes/epc/domain/tests/test_from_sap_schema.py index 8a79591d..42630ca2 100644 --- a/datatypes/epc/domain/tests/test_from_sap_schema.py +++ b/datatypes/epc/domain/tests/test_from_sap_schema.py @@ -11,6 +11,7 @@ exercise the shape variation the design decisions hinge on """ import json +import logging import os from typing import Any, Dict @@ -684,9 +685,8 @@ class TestFullSapSchema16xRouting: assert epc.dwelling_type == "Detached house" def test_reduced_16_x_cert_unaffected_by_full_sap_routing(self) -> None: - # Arrange — a reduced 16.2 cert (assessment_type RdSAP, no - # sap_opening_types) must stay on the RdSAP path, keeping its top-level - # property_type. + # Arrange — a reduced 16.2 cert (assessment_type RdSAP) must stay on the + # RdSAP path, keeping its top-level property_type. data = load("sap_16_2.json") # Act @@ -694,3 +694,35 @@ class TestFullSapSchema16xRouting: # Assert assert epc.property_type is not None + + def test_broken_schema_type_is_recorded( + self, caplog: pytest.LogCaptureFixture + ) -> None: + # Arrange — a full-SAP cert mislabelled as SAP-Schema-16.0: the label + # disagrees with the assessment_type, so the mismatch must be surfaced + # (not silently rerouted). + data = load("sap_16_0_full.json") + + # Act + with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"): + EpcPropertyDataMapper.from_api_response(data) + + # Assert + assert any( + "broken schema_type" in r.message and "SAP-Schema-16.0" in r.message + for r in caplog.records + ) + + def test_correctly_labelled_full_sap_is_not_recorded_as_broken( + self, caplog: pytest.LogCaptureFixture + ) -> None: + # Arrange — a correctly-labelled full-SAP cert: assessment_type SAP AND a + # recognised full-SAP label, so no mismatch to record. + data = load("sap_17_1.json") + + # Act + with caplog.at_level(logging.WARNING, logger="datatypes.epc.domain.mapper"): + EpcPropertyDataMapper.from_api_response(data) + + # Assert + assert not any("broken schema_type" in r.message for r in caplog.records)