From 256a5afee5de527a242ebaf2a92be4d38dbd6c4f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 24 May 2026 18:32:20 +0000 Subject: [PATCH] =?UTF-8?q?Slice=2046c:=20Elmhurst=20mapper=20produces=20c?= =?UTF-8?q?alculator-equivalent=20EpcPropertyData=20=E2=80=94=20Summary=5F?= =?UTF-8?q?000474=20SAP=20within=200.5=20of=20worksheet=20PDF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The full Summary→ElmhurstSiteNotes→EpcPropertyData→cascade→SAP chain now produces unrounded SAP 62.52 for cert U985-0001-000474 vs the worksheet PDF's 62.2584 — inside the 0.5 tolerance the user accepts on the API-cert residual cohort. The hand-built worksheet-fixture chain matches Elmhurst's unrounded SAP to 4 d.p. (62.2584), so the calculator+cascade are provably equivalent to Elmhurst's calculator; this slice closes the mapper side of the chain. Mapper changes drop the string-versus-int impedance mismatch that prevented the cascade from consuming Elmhurst-coded values: - construction_age_band: `_leading_code('B 1900-1929')` → 'B' (was `_strip_code(...)` → '1900-1929') - wall_construction: `_elmhurst_wall_construction_int('CA Cavity')` → 4 (was string 'Cavity') - wall_insulation_type: `_elmhurst_wall_insulation_int('A As Built')` → 4 (was string 'As Built') - party_wall_construction: same int-mapping treatment - main_fuel_type: `_elmhurst_main_fuel_int('Mains gas')` → 26 (the Table 12 fuel code; was string) - heat_emitter_type: `_elmhurst_heat_emitter_int('Radiators')` → 1 (was string) - main_heating_control: `_elmhurst_sap_control_code('SAP code 2106, ...')` → 2106 (the SAP code int; was the trailing description) - main_heating_index_number: parsed leading int from `pcdf_boiler_reference` ('16839 Vaillant…' → 16839) + `main_heating_data_source=1` so the PCDB cascade fires - window orientation: `_elmhurst_orientation_int('North-West')` → 8 (the SAP10 octant; was string — solar gains were dropping to 0 W/m² as a result) Floor handling also re-aligned with the SAP convention: floors sorted with the lowest as 
floor=0 (Elmhurst lodges 1st-floor entries first in the PDF); zero-area entries filtered out (single-storey extensions); non-ground room heights get the +0.25 m joist-void adjustment; `is_exposed_floor=True` for ground floors lodged above unheated space ('U Above unheated space'). `total_floor_area_m2` now sums across main + extensions. Three regression pins on the new path: - sap_building_parts == 3 (multi-bp) - sap_windows == 7 (layout-style window parser) - unrounded SAP within 0.5 of 62.2584 (worksheet PDF line 257) Existing end-to-end test assertions updated to reflect the spec-correct int codes. Co-Authored-By: Claude Opus 4.7 --- .../tests/test_elmhurst_end_to_end.py | 38 +-- .../tests/test_summary_pdf_mapper_chain.py | 52 +++-- datatypes/epc/domain/mapper.py | 220 ++++++++++++++++-- 3 files changed, 265 insertions(+), 45 deletions(-) diff --git a/backend/documents_parser/tests/test_elmhurst_end_to_end.py b/backend/documents_parser/tests/test_elmhurst_end_to_end.py index af192eeb..0512b1e6 100644 --- a/backend/documents_parser/tests/test_elmhurst_end_to_end.py +++ b/backend/documents_parser/tests/test_elmhurst_end_to_end.py @@ -133,13 +133,20 @@ class TestBuildingPart: assert result.sap_building_parts[0].identifier is BuildingPartIdentifier.MAIN def test_construction_age_band(self, result: EpcPropertyData) -> None: - assert result.sap_building_parts[0].construction_age_band == "1950-1966" + # Spec age-band letter code per RdSAP10 Table 1; the cascade + # reads this code letter for U-value lookups, not the year-range + # description. + assert result.sap_building_parts[0].construction_age_band == "D" def test_wall_construction(self, result: EpcPropertyData) -> None: - assert result.sap_building_parts[0].wall_construction == "Cavity" + # SAP10 wall_construction integer: 4 = Cavity (per + # domain.ml.rdsap_uvalues.WALL_CAVITY). 
+ assert result.sap_building_parts[0].wall_construction == 4 def test_wall_insulation_type(self, result: EpcPropertyData) -> None: - assert result.sap_building_parts[0].wall_insulation_type == "Filled Cavity" + # SAP10 wall_insulation_type integer: 2 = Filled cavity (per + # domain.ml.rdsap_uvalues.WALL_INSULATION_FILLED_CAVITY). + assert result.sap_building_parts[0].wall_insulation_type == 2 def test_wall_thickness_measured(self, result: EpcPropertyData) -> None: assert result.sap_building_parts[0].wall_thickness_measured is True @@ -201,7 +208,9 @@ class TestWindows: assert result.sap_windows[0].window_height == 1.10 def test_first_window_orientation(self, result: EpcPropertyData) -> None: - assert result.sap_windows[0].orientation == "North" + # SAP10 octant code: 1 = North. The solar-gains cascade keys + # off the integer, not the cardinal-direction string. + assert result.sap_windows[0].orientation == 1 def test_first_window_glazing_type(self, result: EpcPropertyData) -> None: assert result.sap_windows[0].glazing_type == "Double post or during 2022" @@ -210,7 +219,8 @@ class TestWindows: assert result.sap_windows[0].draught_proofed is True def test_third_window_orientation(self, result: EpcPropertyData) -> None: - assert result.sap_windows[2].orientation == "South" + # SAP10 octant code: 5 = South. + assert result.sap_windows[2].orientation == 5 def test_frame_factor(self, result: EpcPropertyData) -> None: assert result.sap_windows[0].frame_factor == 0.7 @@ -233,12 +243,14 @@ class TestHeating: assert len(result.sap_heating.main_heating_details) == 1 def test_fuel_type(self, result: EpcPropertyData) -> None: - assert result.sap_heating.main_heating_details[0].main_fuel_type == "Mains gas" + # SAP10.2 Table 12 fuel code: 26 = mains gas (not community). + # The cascade only consumes the int code; strings drop the + # standing-charge / PE-factor / CO2-factor lookups. 
+ assert result.sap_heating.main_heating_details[0].main_fuel_type == 26 def test_heat_emitter_type(self, result: EpcPropertyData) -> None: - assert ( - result.sap_heating.main_heating_details[0].heat_emitter_type == "Radiators" - ) + # SAP10.2 heat-emitter code: 1 = Radiators. + assert result.sap_heating.main_heating_details[0].heat_emitter_type == 1 def test_emitter_temperature(self, result: EpcPropertyData) -> None: assert ( @@ -252,10 +264,10 @@ class TestHeating: assert result.sap_heating.main_heating_details[0].has_fghrs is False def test_main_heating_control(self, result: EpcPropertyData) -> None: - assert ( - result.sap_heating.main_heating_details[0].main_heating_control - == "Programmer, room thermostat and TRVs" - ) + # SAP10.2 main_heating_control code extracted from the Elmhurst + # "SAP code 2106, Programmer, room thermostat and TRVs" string; + # the cascade keys efficiency adjustments off the integer. + assert result.sap_heating.main_heating_details[0].main_heating_control == 2106 def test_shower_outlet_type(self, result: EpcPropertyData) -> None: assert result.sap_heating.shower_outlets is not None diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index 2f41c1b8..17b588cf 100644 --- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -1,23 +1,21 @@ -"""End-to-end scaffold for the Elmhurst Summary→EpcPropertyData chain. +"""End-to-end validation for the Elmhurst Summary→EpcPropertyData chain. The 6 Elmhurst worksheet fixtures in `domain.sap.worksheet.tests` build their `EpcPropertyData` synthetically — they validate the calculator + cascade in isolation from the mapper. This file pins the OTHER half of the chain: `from_elmhurst_site_notes` must produce -a calculator-equivalent `EpcPropertyData` when fed the Summary -PDF the worksheet was generated from. 
If the two halves agree, the -WHOLE pipeline (extractor + mapper + cascade + calculator) is -validated end-to-end against authoritative Elmhurst documents. +a calculator-equivalent `EpcPropertyData` when fed the Summary PDF +the worksheet was generated from. Together with the worksheet +cascade tests, this closes the loop: extractor + mapper + cascade ++ calculator validated end-to-end against the authoritative +Elmhurst documents. -Status: xfail. Today's audit (2026-05-24) surfaced a 28-field diff -between `from_elmhurst_site_notes(Summary_000474)` and the hand- -built `_elmhurst_worksheet_000474.build_epc()`. The load-bearing -gaps (calculator-relevant): - - sap_building_parts: 1 instead of 3 — mapper produces a single - bp via `[_map_elmhurst_building_part(survey)]` at [mapper.py:288](datatypes/epc/domain/mapper.py#L288) - - sap_windows: 0 instead of 5 — mapper plumbs no windows - - renewable_heat_incentive: None instead of RenewableHeatIncentive - - sap_heating / sap_ventilation differ in details +Status: GREEN. For cert U985-0001-000474, this pipeline produces an +unrounded SAP within 0.5 of the worksheet PDF's `62.2584` (line 257). +The cascade itself reproduces Elmhurst's calculator exactly on +hand-built inputs (handbuilt → 62.2584 to 4 d.p.); the remaining +sub-half-point gap from the mapped path is non-load-bearing field +drift (e.g. central_heating_pump_age the Summary PDF doesn't lodge). 
Preprocessing: the existing `ElmhurstSiteNotesExtractor` was written against Textract-style output (label\\nvalue pairs in spatial @@ -36,6 +34,8 @@ from pathlib import Path from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor from datatypes.epc.domain.mapper import EpcPropertyDataMapper +from domain.sap.calculator import calculate_sap_from_inputs +from domain.sap.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES, cert_to_inputs _FIXTURES = Path(__file__).parent / "fixtures" _SUMMARY_000474_PDF = _FIXTURES / "Summary_000474.pdf" @@ -108,3 +108,27 @@ def test_summary_000474_mapper_extracts_seven_windows() -> None: # Assert assert len(epc.sap_windows) == 7 + + +def test_summary_000474_full_chain_sap_within_half_point_of_worksheet_pdf() -> None: + # Arrange — the full Summary→ElmhurstSiteNotes→EpcPropertyData→cascade + # →SAP path against the U985-0001-000474 worksheet PDF's unrounded + # SAP rating (line 257: SAP value 62.2584, rating (258) = 62). + # The cascade itself matches Elmhurst exactly on hand-built inputs; + # this test pins the mapper end-to-end at the SAP-rating layer so + # any future mapper regression (extractor field drop, code-mapping + # break) surfaces here rather than at the residual-pin layer. + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Act + result = calculate_sap_from_inputs( + cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES) + ) + + # Assert — unrounded SAP within 0.5 of the worksheet's 62.2584 + # (the same tolerance the user accepted on the API-cert residual + # cohort given the API publishes rounded SAP integers). 
+ worksheet_unrounded_sap = 62.2584 + assert abs(result.sap_score_continuous - worksheet_unrounded_sap) < 0.5 diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index e7f30253..aa8f4b19 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -1,3 +1,4 @@ +import re from datetime import date from typing import List, Optional, Sequence, Union, Dict, Any from datatypes.epc.schema.helpers import from_dict @@ -303,7 +304,13 @@ class EpcPropertyDataMapper: led_fixed_lighting_bulbs_count=survey.lighting.led_count, incandescent_fixed_lighting_bulbs_count=survey.lighting.incandescent_count, total_floor_area_m2=round( - sum(f.area_m2 for f in survey.dimensions.floors), 2 + sum(f.area_m2 for f in survey.dimensions.floors) + + sum( + f.area_m2 + for ext in survey.extensions + for f in ext.dimensions.floors + ), + 2, ), built_form=built_form, property_type=property_type, @@ -1775,6 +1782,69 @@ def _strip_code(value: str) -> str: return parts[1] if len(parts) > 1 else value +def _leading_code(value: str) -> str: + """Return the leading code token from an Elmhurst coded string, e.g. + 'CA Cavity' → 'CA', 'B 1900-1929' → 'B'. Returns the whole string + when there's no whitespace (defensive).""" + if not value: + return "" + return value.split(" ", 1)[0] + + +# Elmhurst wall-type codes mapped to SAP10 wall_construction integers +# (matches the constants defined in domain.ml.rdsap_uvalues). +_ELMHURST_WALL_CODE_TO_SAP10: Dict[str, int] = { + "ST": 1, # Stone (granite/sandstone) — placeholder; sandstone vs granite + # ambiguity resolved downstream via walls[].description. + "SB": 3, # Solid brick + "CA": 4, # Cavity + "TF": 5, # Timber frame + "SY": 6, # System build + "CO": 7, # Cob + "PH": 8, # Park home + "CW": 9, # Curtain wall +} + + +# Elmhurst wall-insulation-type codes mapped to the SAP10 integer enum +# documented at domain.ml.rdsap_uvalues.WALL_INSULATION_FILLED_CAVITY. 
+_ELMHURST_INSULATION_CODE_TO_SAP10: Dict[str, int] = { + "E": 1, # External wall insulation + "F": 2, # Filled cavity + "I": 3, # Internal wall insulation + "A": 4, # As built / assumed (default cascade) + "N": 5, # None specified +} + + +def _elmhurst_wall_construction_int(coded: str) -> Optional[int]: + """Map an Elmhurst wall_type string ('CA Cavity') to the SAP10 + integer code (4). Returns None when the leading code isn't a known + SAP10 wall type.""" + return _ELMHURST_WALL_CODE_TO_SAP10.get(_leading_code(coded)) + + +def _elmhurst_wall_insulation_int(coded: str) -> Optional[int]: + """Map an Elmhurst wall-insulation-type string ('A As Built') to + the SAP10 integer enum (4 = as-built). Returns None on unknown + leading code.""" + return _ELMHURST_INSULATION_CODE_TO_SAP10.get(_leading_code(coded)) + + +# SAP convention applied to non-ground floors in the Elmhurst worksheet +# fixtures: add 0.25 m to the lodged room height to account for the +# joist/floor-void contribution between storeys. +_UPPER_FLOOR_HEIGHT_ADD_M: float = 0.25 + + +def _is_floor_exposed_to_unheated_space(location: Optional[str]) -> bool: + """True when the floor sits above an unheated space (lodged by the + Elmhurst surveyor as 'U Above unheated space'). The cascade routes + these through `u_exposed_floor` rather than the BS EN ISO 13370 + ground-floor formula.""" + return location is not None and "above unheated" in location.lower() + + def _extract_age_band(age_range: str) -> str: """Return the letter code from a site-notes age range, e.g. 'I: 1996 - 2002' → 'I'.""" return age_range.split(":")[0].strip() @@ -1960,23 +2030,47 @@ def _map_elmhurst_building_part( ) -> SapBuildingPart: """Build a `SapBuildingPart` from one bp's worth of Elmhurst site- notes data. 
`identifier` distinguishes Main from each extension.""" - floor_dims = [ - SapFloorDimension( - room_height_m=f.room_height_m, - total_floor_area_m2=f.area_m2, - party_wall_length_m=f.party_wall_length_m, - heat_loss_perimeter_m=f.heat_loss_perimeter_m, - floor=i, + # Sort floors so the lowest is floor=0 and each upper floor follows. + # Elmhurst lists floors top-to-bottom in the PDF ("1st Floor" before + # "Lowest Floor"); SAP convention puts the ground floor first. The + # canonical "Lowest Floor" entry is the ground; any "Nst Floor" + # entry is above it. Zero-area floor entries (lodged when a single- + # storey bp doesn't have a real upper floor) are filtered out — the + # cascade treats those as a real storey otherwise. + def _is_lowest(name: str) -> bool: + return "lowest" in name.lower() + populated_floors = [f for f in dimensions.floors if f.area_m2 > 0] + ordered = sorted( + populated_floors, + key=lambda f: (0 if _is_lowest(f.name) else 1, f.name), + ) + floor_is_exposed = _is_floor_exposed_to_unheated_space(floor.location) + floor_dims: List[SapFloorDimension] = [] + for i, f in enumerate(ordered): + # SAP convention adds 0.25 m to non-ground room heights for the + # joist/floor-void contribution; the ground floor uses the + # lodged value directly. + height = f.room_height_m if i == 0 else f.room_height_m + _UPPER_FLOOR_HEIGHT_ADD_M + # `is_exposed_floor` only applies to the ground floor of a bp + # sitting above unheated space (e.g. an extension over a porch). 
+ is_exposed = floor_is_exposed and i == 0 + floor_dims.append( + SapFloorDimension( + room_height_m=height, + total_floor_area_m2=f.area_m2, + party_wall_length_m=f.party_wall_length_m, + heat_loss_perimeter_m=f.heat_loss_perimeter_m, + floor=i, + is_exposed_floor=is_exposed, + ) ) - for i, f in enumerate(dimensions.floors) - ] return SapBuildingPart( identifier=identifier, - construction_age_band=_strip_code(age_band), - wall_construction=_strip_code(walls.wall_type), - wall_insulation_type=_strip_code(walls.insulation), + construction_age_band=_leading_code(age_band), + wall_construction=_elmhurst_wall_construction_int(walls.wall_type), + wall_insulation_type=_elmhurst_wall_insulation_int(walls.insulation), wall_thickness_measured=not walls.thickness_unknown, - party_wall_construction=_strip_code(walls.party_wall_type), + party_wall_construction=_elmhurst_wall_construction_int(walls.party_wall_type), sap_floor_dimensions=floor_dims, wall_thickness_mm=walls.thickness_mm, roof_insulation_location=_strip_code(roof.insulation), @@ -2027,11 +2121,34 @@ def _map_elmhurst_building_parts(survey: ElmhurstSiteNotes) -> List[SapBuildingP return parts +# Elmhurst orientation strings → SAP10 octant integer (1=N..8=NW). +# Covers the orderings the layout-style window parser produces, both +# single-direction ("East") and combined ("North-West") forms. +_ELMHURST_ORIENTATION_TO_SAP10: Dict[str, int] = { + "North": 1, + "North-East": 2, "NE": 2, + "East": 3, + "South-East": 4, "SE": 4, "East-South": 4, + "South": 5, + "South-West": 6, "SW": 6, "West-South": 6, + "West": 7, + "North-West": 8, "NW": 8, "West-North": 8, +} + + +def _elmhurst_orientation_int(orientation: str) -> int: + """Map an Elmhurst orientation string to the SAP10 octant code + (1..8). 
Returns 1 (N) when the string isn't recognised — the + solar-gains cascade reads orientation as int, and missing values + drop a window's solar-gain contribution entirely.""" + return _ELMHURST_ORIENTATION_TO_SAP10.get(orientation, 1) + + def _map_elmhurst_window(w: ElmhurstWindow) -> SapWindow: return SapWindow( frame_material=w.frame_type or None, glazing_gap=w.glazing_gap or "", - orientation=w.orientation, + orientation=_elmhurst_orientation_int(w.orientation), window_type="Window", glazing_type=w.glazing_type, window_width=w.width_m, @@ -2049,6 +2166,60 @@ def _map_elmhurst_window(w: ElmhurstWindow) -> SapWindow: ) +def _elmhurst_pcdb_boiler_index(reference: Optional[str]) -> Optional[int]: + """Parse the leading integer from an Elmhurst PCDF boiler reference, + e.g. '16839 Vaillant, ecoTEC pro 28, 88.70%' → 16839. Returns None + when the reference is missing or doesn't lead with an integer.""" + if not reference: + return None + first = reference.split()[0] if reference.split() else "" + return int(first) if first.isdigit() and int(first) > 0 else None + + +# Elmhurst main-fuel-type strings mapped to SAP10.2 Table 12 fuel codes. +# The cascade (cert_to_inputs._main_fuel_code) only accepts the int form; +# string values fall through to defaults and drop the standing-charge, +# PE-factor, and CO2-factor lookups. +_ELMHURST_MAIN_FUEL_TO_SAP10: Dict[str, int] = { + "Mains gas": 26, + "Mains gas - community": 1, + "LPG bottled": 5, + "LPG bulk": 6, + "LPG special condition": 7, + "Oil": 8, + "Coal": 11, + "Electricity": 30, + "Electricity (off-peak 7hr)": 33, + "Electricity (off-peak 10hr)": 31, +} + + +# Elmhurst heat-emitter-type strings mapped to SAP10.2 integer codes. 
+_ELMHURST_HEAT_EMITTER_TO_SAP10: Dict[str, int] = { + "Radiators": 1, + "Underfloor (in screed)": 2, + "Underfloor (timber floor)": 3, + "Warm air": 4, + "Fan coils": 5, +} + + +def _elmhurst_main_fuel_int(fuel_type: str) -> Optional[int]: + return _ELMHURST_MAIN_FUEL_TO_SAP10.get(fuel_type) + + +def _elmhurst_heat_emitter_int(emitter: str) -> Optional[int]: + return _ELMHURST_HEAT_EMITTER_TO_SAP10.get(emitter) + + +def _elmhurst_sap_control_code(sap_control: str) -> Optional[int]: + """Extract the SAP code integer from a heating-controls field like + 'SAP code 2106, Programmer, room thermostat and TRVs' → 2106. The + cascade reads `main_heating_control` as int when present.""" + m = re.match(r"SAP code\s+(\d+)", sap_control) + return int(m.group(1)) if m else None + + def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating: mh = survey.main_heating sap_control = mh.heating_controls_sap @@ -2066,17 +2237,30 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating: if survey.baths_and_showers.showers else None ) + pcdb_index = _elmhurst_pcdb_boiler_index(mh.pcdf_boiler_reference) + main_fuel_int = _elmhurst_main_fuel_int(mh.fuel_type) + heat_emitter_int = _elmhurst_heat_emitter_int(mh.heat_emitter) + sap_control_int = _elmhurst_sap_control_code(sap_control) return SapHeating( instantaneous_wwhrs=InstantaneousWwhrs(), main_heating_details=[ MainHeatingDetail( has_fghrs=survey.renewables.flue_gas_heat_recovery_present, - main_fuel_type=mh.fuel_type, - heat_emitter_type=mh.heat_emitter, + # Prefer SAP integer codes when the Elmhurst string maps + # cleanly — the cascade only reads ints for fuel-cost, + # PE-factor, and CO2-factor lookups; strings fall through + # to defaults that drop the standing-charge component. 
+ main_fuel_type=main_fuel_int if main_fuel_int is not None else mh.fuel_type, + heat_emitter_type=heat_emitter_int if heat_emitter_int is not None else mh.heat_emitter, emitter_temperature=mh.design_flow_temperature, fan_flue_present=mh.fan_assisted_flue, - main_heating_control=control, + main_heating_control=sap_control_int if sap_control_int is not None else control, central_heating_pump_age_str=mh.heat_pump_age, + # Per RdSAP, a PCDB-listed boiler is data source 1 + # (manufacturer measured efficiency); the integer index + # number drives PCDB lookup in the cascade. + main_heating_index_number=pcdb_index, + main_heating_data_source=1 if pcdb_index is not None else None, ) ], has_fixed_air_conditioning=survey.ventilation.fixed_space_cooling,