diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index 00aaf045..a8b1893f 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -801,6 +801,12 @@ class ElmhurstSiteNotesExtractor: "North", "South", "East", "West", "NE", "NW", "SE", "SW", }) _BP_INLINE_TOKENS = frozenset({"Main"}) # "Extension" only appears as suffix + # A room-in-roof window (rooflight) lodges its §11 "Location" cell as + # "Roof of Room in Roof", which the layout preprocessor wraps onto two + # tokens ("Roof of Room" in the prefix block, "in Roof" in the suffix). + # Detected so the window routes to a roof window (worksheet (27a)) + # and the tokens don't leak into the glazing-type phrase. + _ROOF_OF_ROOM_LOCATION_TOKENS = frozenset({"Roof of Room", "in Roof"}) # The Elmhurst Summary PDF lodges each window's glazing-type as a # capitalised phrase like "Double between 2002" / "Double with unknown" # / "Single" / "Triple" / "Secondary". The first token of that phrase @@ -1020,6 +1026,18 @@ class ElmhurstSiteNotesExtractor: before = [lines[j].strip() for j in range(before_start, data_idx) if lines[j].strip()] after = [lines[j].strip() for j in range(manuf_idx + 4, after_end) if lines[j].strip()] + # Room-in-roof windows lodge their location as "Roof of Room in + # Roof" (wrapped across the prefix/suffix blocks). Detect it, pull + # those tokens out so they don't contaminate the glazing-type + # phrase, and override the wall-keyed `location` with the roof-of- + # room marker the roof-window classifier keys on. 
+ if any( + t in self._ROOF_OF_ROOM_LOCATION_TOKENS for t in (*before, *after) + ): + location = "Roof of Room" + before = [t for t in before if t not in self._ROOF_OF_ROOM_LOCATION_TOKENS] + after = [t for t in after if t not in self._ROOF_OF_ROOM_LOCATION_TOKENS] + glazing_type, building_part, orientation = self._compose_window_descriptors( before=before, after=after, diff --git a/backend/documents_parser/tests/fixtures/Summary_001431_case6.pdf b/backend/documents_parser/tests/fixtures/Summary_001431_case6.pdf new file mode 100644 index 00000000..258f56ac Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_001431_case6.pdf differ diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index d779adb5..92373fbe 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -3837,6 +3837,11 @@ def _is_elmhurst_roof_window( """ if w.glazing_type.startswith("Single"): return False + # Explicit "Roof of Room" location lodging (simulated case 6): the + # surveyor placed the window on the room-in-roof, so it's a rooflight + # regardless of BP roof type or U-value. + if "roof of room" in (w.location or "").lower(): + return True bp_roof_type = _elmhurst_bp_roof_type(w, survey) if bp_roof_type is not None and bp_roof_type.startswith( _ELMHURST_BP_ROOF_TYPES_WITH_ROOFLIGHTS @@ -3852,6 +3857,12 @@ def _is_elmhurst_roof_window( # worksheet's (27a) line. The cohort exercises only "Double pre 2002". _ELMHURST_ROOF_WINDOW_U_BY_GLAZING: Dict[str, float] = { "Double pre 2002": 3.4, + # Simulated case 6 rooflights: the Summary lodges the already-inclined + # roof-window U=2.30 for DG-2002-2021 glazing (vs 2.00 vertical for the + # same glazing on a wall) — the worksheet bills it on (27a) at U_eff + # 2.1062 (= 2.30 with the §3.2 R=0.04 curtain transform). Keyed here so + # the inclination adjustment isn't double-applied. 
+ "Double between 2002 and 2021": 2.30, } diff --git a/sap worksheets/golden fixture debugging/simulated case 6/P960-0001-001431 - 2026-06-03T130227.971.pdf b/sap worksheets/golden fixture debugging/simulated case 6/P960-0001-001431 - 2026-06-03T130227.971.pdf new file mode 100644 index 00000000..13e0183a Binary files /dev/null and b/sap worksheets/golden fixture debugging/simulated case 6/P960-0001-001431 - 2026-06-03T130227.971.pdf differ diff --git a/sap worksheets/golden fixture debugging/simulated case 6/Summary_001431 (1).pdf b/sap worksheets/golden fixture debugging/simulated case 6/Summary_001431 (1).pdf new file mode 100644 index 00000000..258f56ac Binary files /dev/null and b/sap worksheets/golden fixture debugging/simulated case 6/Summary_001431 (1).pdf differ diff --git a/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431_case6.py b/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431_case6.py new file mode 100644 index 00000000..94b6d7e8 --- /dev/null +++ b/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431_case6.py @@ -0,0 +1,108 @@ +"""Mapper-driven cascade fixture for the Elmhurst P960-0001-001431 +"simulated case 6" worksheet — a DETACHED, dual-oil cousin of golden +cert 0240 carrying ROOM-IN-ROOF WINDOWS (rooflights). + +Routes the Summary PDF through ElmhurstSiteNotesExtractor + +from_elmhurst_site_notes (no hand-built EpcPropertyData) so the pin +exercises the whole extractor + mapper + calculator pipeline. + +Purpose: validate S0380.198/199 ROOF-WINDOW handling against a real +worksheet. Case 6 lodges 6 windows on the room-in-roof ("Roof of Room" +location); the worksheet bills them on line (27a) Roof Windows at +U_eff 2.1062 (= inclined 2.30 with the §3.2 R=0.04 curtain transform), +NOT on (27) as vertical glazing. This is the site-notes mirror of +0240's API `window_wall_type=4` roof windows (S0380.198). 
+ +This cert surfaced two site-notes gaps fixed in S0380.199: +- the extractor mangled the "Roof of Room in Roof" window-location cell + into the glazing-type phrase ("Double between 2002 Roof of Room and + 2021 in Roof" → UnmappedElmhurstLabel); `_parse_window_from_anchors` + now detects + strips those tokens and marks the window roof-of-room; +- `_is_elmhurst_roof_window` gained a "Roof of Room" location branch, + and `_ELMHURST_ROOF_WINDOW_U_BY_GLAZING` an entry for the + already-inclined "Double between 2002 and 2021" → 2.30 (so the + inclination adjustment isn't double-applied). + +SCOPE: this fixture pins only the §3 heat-transmission WINDOW line refs +(27)/(27a)/(31) — NOT the full SapResult. Case 6 has a DUAL main heating +system (51% radiators + 49% underfloor, oil), and `SapResult`'s +`main_heating_fuel_kwh_per_yr` / `pumps_fans_kwh_per_yr` aggregate the +two systems differently from the worksheet's per-system (211)/(231) +lines, so a full SapResult pin isn't apples-to-apples. Heating is also +SAP code 127 here vs 0240's code 130 condensing combi — so case 6 pins +to its OWN worksheet, not 0240's register. + +Source: user-simulated PDFs at `sap worksheets/golden fixture +debugging/simulated case 6/`. Summary mirrored into the tracked +`backend/documents_parser/tests/fixtures/Summary_001431_case6.pdf`. + +Worksheet §3 window pin targets (P960-0001-001431, Block 1): +- (27) Windows = 19.3704 (Main) + 3.3704 (Ext1) = 22.7408 W/K +- (27a) Roof Windows = 6.19 m² × 2.1062 ≈ 13.0375 W/K (the 6 rooflights) +- (31) Total external element area = 336.13 m² + +Per [[feedback-zero-error-strict]]: pins are abs=1e-4 against the PDF. 
+""" + +from __future__ import annotations + +import re +import subprocess +from pathlib import Path +from typing import Final + +from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.domain.mapper import EpcPropertyDataMapper + + +# parents[0]=worksheet/, [1]=sap10_calculator/, [2]=domain/, [3]=tests/, +# [4]=repo root. +_SUMMARY_PDF: Final[Path] = ( + Path(__file__).resolve().parents[4] + / "backend" / "documents_parser" / "tests" / "fixtures" + / "Summary_001431_case6.pdf" +) + +# Worksheet §3 window line refs (Block 1 — energy rating). +LINE_27_WINDOWS_W_PER_K: Final[float] = 22.7408 +LINE_27A_ROOF_WINDOWS_W_PER_K: Final[float] = 13.0375 +LINE_31_TOTAL_EXTERNAL_AREA_M2: Final[float] = 336.13 + + +def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]: + """Convert a Summary PDF into the per-page text format the + ElmhurstSiteNotesExtractor expects (mirror of the case-5 helper).""" + info = subprocess.run( + ["pdfinfo", str(pdf_path)], capture_output=True, text=True, check=True, + ).stdout + m = re.search(r"Pages:\s+(\d+)", info) + if m is None: + raise RuntimeError(f"Could not parse page count from {pdf_path}") + page_count = int(m.group(1)) + + pages: list[str] = [] + for i in range(1, page_count + 1): + layout = subprocess.run( + [ + "pdftotext", "-layout", "-f", str(i), "-l", str(i), + str(pdf_path), "-", + ], + capture_output=True, text=True, check=True, + ).stdout + tokens: list[str] = [] + for line in layout.splitlines(): + if not line.strip(): + tokens.append("") + continue + parts = [p for p in re.split(r"\s{2,}", line.strip()) if p] + tokens.extend(parts) + pages.append("\n".join(tokens)) + return pages + + +def build_epc() -> EpcPropertyData: + """Route the simulated case-6 Summary through extractor + mapper.""" + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_PDF) + site_notes = 
ElmhurstSiteNotesExtractor(pages).extract() + return EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) diff --git a/tests/domain/sap10_calculator/worksheet/test_section_cascade_pins.py b/tests/domain/sap10_calculator/worksheet/test_section_cascade_pins.py index b3480d2a..396247f6 100644 --- a/tests/domain/sap10_calculator/worksheet/test_section_cascade_pins.py +++ b/tests/domain/sap10_calculator/worksheet/test_section_cascade_pins.py @@ -42,6 +42,7 @@ from tests.domain.sap10_calculator.worksheet import ( _elmhurst_worksheet_000487 as _w000487, _elmhurst_worksheet_000490 as _w000490, _elmhurst_worksheet_000516 as _w000516, + _elmhurst_worksheet_001431_case6 as _w001431_case6, ) @@ -248,6 +249,35 @@ def test_section_3_line_refs_match_pdf( _pin(actual, expected, f"§3 {fixture_attr} {fixture_name}") +def test_section_3_roof_windows_case6_match_pdf() -> None: + """§3 (27a) roof-window pin for simulated case 6 — the 6 room-in-roof + rooflights (window_wall_type=4 on the API side / "Roof of Room" + location on the site-notes side) must bill on (27a) at U_eff 2.1062, + not on (27) as vertical glazing. Validates the S0380.198/199 roof- + window routing against a real worksheet. 
Case 6 is pinned only on the + §3 window line refs (not added to `_FIXTURES`) because its dual main + heating system makes the §10/§12 per-system lines non-comparable — + see the fixture module docstring.""" + # Arrange + epc = _w001431_case6.build_epc() + + # Act + ht = heat_transmission_section_from_cert(epc) + + # Assert + _pin(ht.windows_w_per_k, _w001431_case6.LINE_27_WINDOWS_W_PER_K, "§3 (27) case6") + _pin( + ht.roof_windows_w_per_k, + _w001431_case6.LINE_27A_ROOF_WINDOWS_W_PER_K, + "§3 (27a) case6", + ) + _pin( + ht.total_external_element_area_m2, + _w001431_case6.LINE_31_TOTAL_EXTERNAL_AREA_M2, + "§3 (31) case6", + ) + + # ============================================================================ # §4 Water heating — LINE_42..LINE_65 scalar + monthly tuples # ============================================================================