Slice 46b: Elmhurst extractor parses windows from layout-style Summary PDFs

The legacy `_extract_windows` regex anchors on "Permanent Shutters\n", a header that the pdftotext-layout preprocessor breaks across lines, so it never matches layout-style Summary PDFs. The new fallback `_extract_windows_from_layout` anchors on the two stable per-window markers — a "W H Area" data line and the "Manufacturer <U_value>" line a few lines further down — and tolerates the variable-order optional fields (glazing_gap, inline building_part, inline orientation) between them. Prefix/suffix tokens around the data block are re-joined into glazing_type / building_part / orientation strings. Cert U985-0001-000474's 7 windows across Main + 2 extensions now flow through the mapper to EpcPropertyData.sap_windows (was 0). Textract-style extraction (existing fixture) is unchanged — the legacy path runs first and only falls through to the fallback when its regex misses. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-07-27 23:35:01 +00:00 · 2026-05-24 18:03:29 +00:00 · 2026-05-24 18:03:29 +00:00 · 066dce19e3
commit 066dce19e3
parent 36f2c7bbdf
2 changed files with 247 additions and 1 deletions
--- a/backend/documents_parser/elmhurst_extractor.py
+++ b/backend/documents_parser/elmhurst_extractor.py
@ -336,13 +336,20 @@ class ElmhurstSiteNotesExtractor:
        return extensions

    def _extract_windows(self) -> List[Window]:
+        # Textract-style pages keep "Permanent\s+Shutters" adjacent in
+        # reading order and the windows table flows as one column-block
+        # the existing token-walker can step through. PDF-derived pages
+        # (Summary PDFs preprocessed from `pdftotext -layout`) break the
+        # header across lines, so this regex misses entirely and the
+        # `_extract_windows_from_layout` fallback below picks them up
+        # by anchoring on the W/H/Area data line.
        m = re.search(
            r"Permanent\s+Shutters\n(.*?)Draught Proofing",
            self._text,
            re.DOTALL,
        )
        if not m:
-            return []
+            return self._extract_windows_from_layout()
        tokens = [t.strip() for t in m.group(1).splitlines() if t.strip()]
        windows: List[Window] = []
        i = 0
@ -410,6 +417,229 @@ class ElmhurstSiteNotesExtractor:
            )
        return windows

+    # Anchors used by the layout-style window parser.
+    _WIDTH_HEIGHT_AREA_RE = re.compile(r"^(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)$")
+    _MANUFACTURER_RE = re.compile(r"^(Manufacturer|Default)\s+(\d+\.\d+)$")
+    _ORIENTATION_TOKENS = frozenset({
+        "North", "South", "East", "West", "NE", "NW", "SE", "SW",
+    })
+    _BP_INLINE_TOKENS = frozenset({"Main"})  # "Extension" only appears as suffix
+
+    def _extract_windows_from_layout(self) -> List[Window]:
+        """Fallback window parser for Summary PDFs preprocessed from
+        `pdftotext -layout`. Each window has two stable anchors:
+        a "W H Area" line and a "Manufacturer|Default <U_value>" line
+        within the next 11 lines. Everything between holds frame_type,
+        frame_factor, and a variable mix of glazing_gap, building_part,
+        location, and orientation (depending on which fields the
+        surveyor lodged); everything around the window holds glazing-
+        type/building-part/orientation prefix/suffix tokens split by
+        the layout preprocessor.
+        """
+        m = re.search(
+            r"11\.0 Windows:(.*?)(Draught Proofing|12\.0 Ventilation)",
+            self._text, re.DOTALL,
+        )
+        if not m:
+            return []
+        lines = m.group(1).splitlines()
+
+        # Locate every "W H Area" data line in document order. Each is
+        # a candidate window; its Manufacturer line is looked up below.
+        data_anchors: List[tuple[int, re.Match[str]]] = []
+        for i, line in enumerate(lines):
+            anchor = self._WIDTH_HEIGHT_AREA_RE.match(line.strip())
+            if anchor is not None:
+                data_anchors.append((i, anchor))
+
+        windows: List[Window] = []
+        for k, (data_idx, anchor) in enumerate(data_anchors):
+            manuf_idx = self._find_manufacturer_after(lines, data_idx)
+            if manuf_idx is None:
+                continue
+            prev_window_end = (
+                self._estimate_window_end(lines, data_anchors[k - 1][0])
+                if k > 0 else 0
+            )
+            next_window_start = (
+                data_anchors[k + 1][0] if k + 1 < len(data_anchors) else len(lines)
+            )
+            try:
+                window = self._parse_window_from_anchors(
+                    lines=lines,
+                    data_idx=data_idx,
+                    manuf_idx=manuf_idx,
+                    anchor=anchor,
+                    before_start=prev_window_end,
+                    after_end=next_window_start,
+                )
+            except (ValueError, IndexError):
+                continue
+            if window is not None:
+                windows.append(window)
+        return windows
+
+    def _find_manufacturer_after(self, lines: List[str], data_idx: int) -> Optional[int]:
+        for j in range(data_idx + 1, min(data_idx + 12, len(lines))):
+            if self._MANUFACTURER_RE.match(lines[j].strip()):
+                return j
+        return None
+
+    def _estimate_window_end(self, lines: List[str], data_idx: int) -> int:
+        """End-of-window index (exclusive) for the window whose data
+        line is at `data_idx`. Used to bound the "before" segment of
+        the *next* window so it starts past this window's suffix tokens."""
+        manuf_idx = self._find_manufacturer_after(lines, data_idx)
+        if manuf_idx is None:
+            return data_idx + 1
+        # Manufacturer + g_value + draught + shutters + ~3 suffix tokens
+        return manuf_idx + 7
+
+    def _parse_window_from_anchors(
+        self,
+        *,
+        lines: List[str],
+        data_idx: int,
+        manuf_idx: int,
+        anchor: re.Match[str],
+        before_start: int,
+        after_end: int,
+    ) -> Optional[Window]:
+        width = float(anchor.group(1))
+        height = float(anchor.group(2))
+        area = float(anchor.group(3))
+
+        # frame_type and frame_factor immediately follow the data line.
+        if data_idx + 2 >= len(lines):
+            return None
+        frame_type = lines[data_idx + 1].strip()
+        try:
+            frame_factor = float(lines[data_idx + 2].strip())
+        except ValueError:
+            return None
+        if not 0.0 < frame_factor <= 1.0:
+            return None
+
+        # Variable-order tokens between frame_factor and Manufacturer.
+        middle = [lines[j].strip() for j in range(data_idx + 3, manuf_idx)]
+        glazing_gap = next((t for t in middle if "mm" in t.lower()), None)
+        location = next((t for t in middle if "wall" in t.lower()), "External wall")
+        bp_inline = next((t for t in middle if t in self._BP_INLINE_TOKENS), None)
+        orient_inline = next(
+            (t for t in middle if t in self._ORIENTATION_TOKENS), None
+        )
+
+        # Manufacturer line carries data_source + u_value.
+        manuf_match = self._MANUFACTURER_RE.match(lines[manuf_idx].strip())
+        if manuf_match is None:
+            return None
+        data_source = manuf_match.group(1)
+        u_value = float(manuf_match.group(2))
+
+        # Post-manufacturer: g_value, draught, shutters.
+        if manuf_idx + 3 >= len(lines):
+            return None
+        try:
+            g_value = float(lines[manuf_idx + 1].strip())
+        except ValueError:
+            return None
+        draught_proofed = lines[manuf_idx + 2].strip().lower() == "yes"
+        permanent_shutters = lines[manuf_idx + 3].strip()
+
+        # Prefix / suffix tokens (variable count) carry the
+        # glazing-type, building-part, and orientation strings split by
+        # the layout preprocessor.
+        before = [lines[j].strip() for j in range(before_start, data_idx) if lines[j].strip()]
+        after = [lines[j].strip() for j in range(manuf_idx + 4, after_end) if lines[j].strip()]
+
+        glazing_type, building_part, orientation = self._compose_window_descriptors(
+            before=before,
+            after=after,
+            bp_inline=bp_inline,
+            orient_inline=orient_inline,
+        )
+
+        return Window(
+            width_m=width,
+            height_m=height,
+            area_m2=area,
+            glazing_type=glazing_type,
+            frame_factor=frame_factor,
+            building_part=building_part,
+            location=location,
+            orientation=orientation,
+            data_source=data_source,
+            u_value=u_value,
+            g_value=g_value,
+            draught_proofed=draught_proofed,
+            permanent_shutters=permanent_shutters,
+            frame_type=frame_type,
+            glazing_gap=glazing_gap,
+        )
+
+    def _compose_window_descriptors(
+        self,
+        *,
+        before: List[str],
+        after: List[str],
+        bp_inline: Optional[str],
+        orient_inline: Optional[str],
+    ) -> tuple[str, str, str]:
+        """Re-join the glazing-type / building-part / orientation tokens
+        split by the layout preprocessor (one fragment before the data
+        line, one after). An inline building part replaces its fragments;
+        an inline orientation leads and is hyphen-joined to a differing one."""
+        # before holds (in document order, possibly): glazing_prefix,
+        # bp_prefix, orient_prefix — bp/orient may be missing.
+        # after holds: glazing_suffix, bp_suffix, orient_suffix — same.
+        prefix = list(before[-3:])  # last 3 lines preceding data
+        suffix = list(after[:3])
+
+        def pop_if_orientation(tokens: List[str]) -> Optional[str]:
+            for t in tokens:
+                if t in self._ORIENTATION_TOKENS:
+                    tokens.remove(t)
+                    return t
+            return None
+
+        def pop_if_bp_fragment(tokens: List[str]) -> Optional[str]:
+            # Prefix fragments like "1st" / "2nd" — match digit-prefixed
+            # ordinals; suffix fragments are always "Extension".
+            for t in tokens:
+                if re.match(r"^\d+(?:st|nd|rd|th)$", t) or t == "Extension":
+                    tokens.remove(t)
+                    return t
+            return None
+
+        orient_prefix_token = pop_if_orientation(prefix)
+        orient_suffix_token = pop_if_orientation(suffix)
+        bp_prefix_frag = pop_if_bp_fragment(prefix)
+        bp_suffix_frag = pop_if_bp_fragment(suffix)
+
+        # Glazing type: remaining prefix + remaining suffix (joined).
+        glazing_type = " ".join([*prefix, *suffix]).strip()
+
+        # Building part: inline token wins; otherwise join prefix + suffix.
+        if bp_inline is not None:
+            building_part = bp_inline
+        else:
+            building_part = " ".join(
+                t for t in (bp_prefix_frag, bp_suffix_frag) if t
+            ).strip()
+
+        # Orientation: inline token wins for the primary direction;
+        # hyphen-join it with the first differing prefix/suffix fragment.
+        primary = orient_inline or orient_prefix_token or ""
+        secondary_candidates = [
+            t for t in (orient_prefix_token, orient_suffix_token) if t and t != primary
+        ]
+        if primary and secondary_candidates:
+            orientation = f"{primary}-{secondary_candidates[0]}"
+        else:
+            orientation = primary
+
+        return glazing_type, building_part, orientation
+
    def _extract_ventilation(self) -> VentilationAndCooling:
        return VentilationAndCooling(
            open_chimneys_count=self._int_val("No. of open chimneys"),
--- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
+++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
@ -92,3 +92,19 @@ def test_summary_000474_mapper_produces_three_building_parts() -> None:

    # Assert
    assert len(epc.sap_building_parts) == 3
+
+
+def test_summary_000474_mapper_extracts_seven_windows() -> None:
+    # Arrange — cert U985-0001-000474's §11 table lodges 7 windows
+    # across Main + 1st Extension + 2nd Extension. The legacy Textract-
+    # style window parser couldn't anchor on the Summary PDF's tabular
+    # layout; the new W/H/Area-plus-Manufacturer anchor pair picks them
+    # all up.
+    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
+    site_notes = ElmhurstSiteNotesExtractor(pages).extract()
+
+    # Act
+    epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
+
+    # Assert
+    assert len(epc.sap_windows) == 7