diff --git a/backend/documents_parser/tests/fixtures/Summary_000888.pdf b/backend/documents_parser/tests/fixtures/Summary_000888.pdf new file mode 100644 index 00000000..2a48320b Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_000888.pdf differ diff --git a/backend/documents_parser/tests/test_elmhurst_end_to_end.py b/backend/documents_parser/tests/test_elmhurst_end_to_end.py index f5b339bb..1ccd28c9 100644 --- a/backend/documents_parser/tests/test_elmhurst_end_to_end.py +++ b/backend/documents_parser/tests/test_elmhurst_end_to_end.py @@ -222,7 +222,12 @@ class TestWindows: assert result.sap_windows[0].orientation == 1 def test_first_window_glazing_type(self, result: EpcPropertyData) -> None: - assert result.sap_windows[0].glazing_type == "Double post or during 2022" + # SAP 10.2 Table U2 glazing-type code: 5 = double glazed (low-E + # argon). The Elmhurst Summary's "Double post or during 2022" + # label maps to code 5 via `_ELMHURST_GLAZING_LABEL_TO_SAP10` — + # the §5 daylight factor + §6 solar gains key off the integer + # not the string. + assert result.sap_windows[0].glazing_type == 5 def test_first_window_draught_proofed(self, result: EpcPropertyData) -> None: assert result.sap_windows[0].draught_proofed is True diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index d38e677c..69f09ccb 100644 --- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -878,6 +878,72 @@ def test_all_seven_ashp_cohort_certs_extract_without_unmapped_label_raise() -> N EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) +def test_summary_3336_triple_glazed_windows_route_to_code_6() -> None: + # Arrange — cert 3336-2825-9400-0512-8292's Summary §11 lodges + # "Triple post or during 2022" on every window; dr87-0001-000888 + # confirms "Window, Triple glazed" on every line. 
The Elmhurst + # mapper must surface SAP 10.2 Table U2 code 6 so the §5 (66).. + # (67) daylight factor uses Table 6b col light g_L = 0.70 instead + # of the default DG g_L = 0.80 — the +0.0274 SAP regression that + # this slice closes is driven by the daylight-factor offset that + # the default-DG silently masked. + pages = _summary_pdf_to_textract_style_pages( + _FIXTURES / "Summary_000888.pdf" + ) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + + # Act + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Assert — every window on cert 3336 is triple-glazed → code 6. + assert epc.sap_windows, "expected windows on cert 3336" + for w in epc.sap_windows: + assert w.glazing_type == 6 + + +def test_summary_000474_double_glazed_windows_route_to_code_3() -> None: + # Arrange — boiler-cohort cert (Summary_000474.pdf) lodges + # "Double between 2002 and 2021" / "Double with unknown install + # date" on every window. Both route to SAP 10.2 Table U2 code 3 + # (DG air-filled post-2002) per the `_ELMHURST_GLAZING_LABEL_TO + # _SAP10` dict — same Table 6b col light g_L = 0.80 as the + # default, so the cascade SAP is unchanged for these certs, but + # the integer pin guards against future cascade consumers that + # key on the subcode (e.g. a U-value default lookup for absent + # `WindowTransmissionDetails`). 
+ pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + + # Act + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Assert + assert epc.sap_windows, "expected windows on cert 000474" + for w in epc.sap_windows: + assert w.glazing_type == 3, ( + f"expected DG post-2002 code 3, got {w.glazing_type!r}" + ) + + +def test_summary_mapper_raises_on_unmapped_glazing_type_label() -> None: + # Arrange — same strict-coverage gate as the cylinder-size helper + # (Slice S0380.15 + S0380.16): silently routing an unknown glazing + # variant to a SAP default int hid the +0.05 SAP regression on 13 + # triple-glazed certs until the cohort-2 first-attempt probe. After + # this slice, an unrecognised lodging surfaces immediately at + # extraction time. + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000899_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + # Mutate the first window's glazing_type to an unmapped string. + site_notes.windows[0].glazing_type = "Quintuple glazed with helium" + + # Act / Assert + with pytest.raises(UnmappedElmhurstLabel) as excinfo: + EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + assert excinfo.value.field == "glazing_type" + assert excinfo.value.value == "Quintuple glazed with helium" + + def test_summary_2536_normal_cylinder_routes_to_code_2() -> None: # Arrange — cert 2536-2525-0600-0788-2292's Summary §15.1 lodges # "Cylinder Size: Normal". 
The dr87 worksheet lodges "Cylinder diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index fdcb2c8a..edb814e0 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -3201,7 +3201,7 @@ def _map_elmhurst_window(w: ElmhurstWindow) -> SapWindow: glazing_gap=w.glazing_gap or "", orientation=_elmhurst_orientation_int(w.orientation), window_type="Window", - glazing_type=w.glazing_type, + glazing_type=_elmhurst_glazing_type_code(w.glazing_type), # SapWindow's width × height is consumed across §3 (windows_w_per_ # k), §5 (daylight factor), and §6 (solar gains) — all summed as # the area product. The Elmhurst Summary PDF lodges W and H to @@ -3458,6 +3458,70 @@ def _elmhurst_cylinder_insulation_code( return code +# Elmhurst Summary §11 "Windows" lodged glazing-type strings mapped to +# the SAP 10.2 Table U2 glazing-type enum that +# `domain/sap10_calculator/worksheet/internal_gains._G_LIGHT_BY_GLAZING_CODE` +# keys ({1: single (g_L=0.90), 2: DG pre-2002 (0.80), 3: DG post-2002 +# (0.80), 5: DG low-E argon (0.80), 6: triple (0.70), 7: secondary +# (0.80)}). Only "Triple" vs everything-else materially affects the +# §5 (66)..(67) daylight factor (Table 6b col light: triple 0.70 vs +# double 0.80) for the Elmhurst path, because the worksheet-lodged +# U-value and g-value are passed through `WindowTransmissionDetails` +# directly — but the canonical SAP code is mapped for parity with the +# API path and forward-compatibility with any future cascade consumer +# that keys on the code. +# +# The exact-match lookup `_elmhurst_glazing_type_code` first strips a +# layout-noise prefix ("value value Proofed Shutters " or "Part value +# value Proofed Shutters ") and suffix (" Summary Information", +# " Alternative wall…") that the extractor occasionally folds into +# the glazing-type token before the cohort-2 dataset was first probed; +# fixing the upstream extractor is deferred to a future slice. 
+_ELMHURST_GLAZING_LABEL_TO_SAP10: Dict[str, int] = { + "Single": 1, + "Double pre 2002": 2, + "Double between 2002 and 2021": 3, + "Double with unknown install date": 3, + "Double with unknown 16 mm or install date more": 3, + "Double post or during 2022": 5, + "Triple post or during 2022": 6, + # One window in cert 2636 (Summary_000898.pdf) lodges the year- + # truncated form "Triple post or during" — the trailing " 2022 1" + # was consumed by an adjacent "Alternative wall" lodging in the + # PDF table cell the extractor joined into the glazing-type token. + # Treated as the same enum as the full form per worksheet + # "Triple glazed" lodging on cert 2636's dr87-0001-000898.pdf. + "Triple post or during": 6, + "Secondary": 7, +} + +_ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE: Final[re.Pattern[str]] = re.compile( + r"^(?:Part )?value value Proofed Shutters\s+" +) +_ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE: Final[re.Pattern[str]] = re.compile( + r"\s+Summary Information$|\s+Alternative wall.*$" +) + + +def _elmhurst_glazing_type_code(label: Optional[str]) -> int: + """Map an Elmhurst §11 lodged glazing-type label to the SAP 10.2 + Table U2 integer code. 
Raises `UnmappedElmhurstLabel` when the + label is missing OR present but not in + `_ELMHURST_GLAZING_LABEL_TO_SAP10` (the same strict-coverage gate + Slice S0380.15 established for cylinder labels — silently routing + an unknown variant to a SAP-default int hid the triple-glazed Δ + +0.05 SAP regression for 13 cohort-2 certs until extraction was + audited end-to-end).""" + if label is None: + raise UnmappedElmhurstLabel("glazing_type", "") + cleaned = _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE.sub("", label) + cleaned = _ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE.sub("", cleaned).strip() + code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(cleaned) + if code is None: + raise UnmappedElmhurstLabel("glazing_type", label) + return code + + def _elmhurst_main_heating_category( mh: ElmhurstMainHeating, pcdb_index: Optional[int] ) -> Optional[int]: