From 066dce19e365b436f2dfdf9869df5c14197a0fb2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 24 May 2026 18:03:29 +0000 Subject: [PATCH] Slice 46b: Elmhurst extractor parses windows from layout-style Summary PDFs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The legacy `_extract_windows` regex anchors on "Permanent Shutters\n" which is broken across lines by the pdftotext-layout preprocessor. New fallback `_extract_windows_from_layout` anchors on the two stable per-window markers — a "W H Area" data line and the "Manufacturer U-value" line a few lines further down — and tolerates the variable-order optional fields (glazing_gap, inline building_part, inline orientation) between them. Prefix/suffix tokens around the data block are re-joined into glazing_type / building_part / orientation strings. Cert U985-0001-000474's 7 windows across Main + 2 extensions now flow through the mapper to EpcPropertyData.sap_windows (was 0). Textract-style extraction (existing fixture) is unchanged — the legacy path runs first and only falls through when its regex misses. Co-Authored-By: Claude Opus 4.7 --- .../documents_parser/elmhurst_extractor.py | 232 +++++++++++++++++- .../tests/test_summary_pdf_mapper_chain.py | 16 ++ 2 files changed, 247 insertions(+), 1 deletion(-) diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index 40e2b5a8..c77d92cc 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -336,13 +336,20 @@ class ElmhurstSiteNotesExtractor: return extensions def _extract_windows(self) -> List[Window]: + # Textract-style pages keep "Permanent\s+Shutters" adjacent in + # reading order and the windows table flows as one column-block + # the existing token-walker can step through. 
PDF-derived pages + # (Summary PDFs preprocessed from `pdftotext -layout`) break the + # header across lines, so this regex misses entirely and the + # `_extract_windows_from_layout` fallback below picks them up + # by anchoring on the W/H/Area data line. m = re.search( r"Permanent\s+Shutters\n(.*?)Draught Proofing", self._text, re.DOTALL, ) if not m: - return [] + return self._extract_windows_from_layout() tokens = [t.strip() for t in m.group(1).splitlines() if t.strip()] windows: List[Window] = [] i = 0 @@ -410,6 +417,229 @@ class ElmhurstSiteNotesExtractor: ) return windows + # Anchors used by the layout-style window parser. + _WIDTH_HEIGHT_AREA_RE = re.compile(r"^(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)$") + _MANUFACTURER_RE = re.compile(r"^(Manufacturer|Default)\s+(\d+\.\d+)$") + _ORIENTATION_TOKENS = frozenset({ + "North", "South", "East", "West", "NE", "NW", "SE", "SW", + }) + _BP_INLINE_TOKENS = frozenset({"Main"}) # "Extension" only appears as suffix + + def _extract_windows_from_layout(self) -> List[Window]: + """Fallback window parser for Summary PDFs preprocessed from + `pdftotext -layout`. Each window has two stable anchors: + a "W H Area" line and a "Manufacturer " line a few + lines further down. Everything between holds frame_type, + frame_factor, and a variable mix of glazing_gap, building_part, + location, and orientation (depending on which fields the + surveyor lodged); everything around the window holds glazing- + type/building-part/orientation prefix/suffix tokens split by + the layout preprocessor. + """ + m = re.search( + r"11\.0 Windows:(.*?)(Draught Proofing|12\.0 Ventilation)", + self._text, re.DOTALL, + ) + if not m: + return [] + lines = m.group(1).splitlines() + + # Locate all (data_line, manufacturer_line) pairs in document + # order. Each pair is one window. 
+ data_anchors: List[tuple[int, re.Match[str]]] = [] + for i, line in enumerate(lines): + anchor = self._WIDTH_HEIGHT_AREA_RE.match(line.strip()) + if anchor is not None: + data_anchors.append((i, anchor)) + + windows: List[Window] = [] + for k, (data_idx, anchor) in enumerate(data_anchors): + manuf_idx = self._find_manufacturer_after(lines, data_idx) + if manuf_idx is None: + continue + prev_window_end = ( + self._estimate_window_end(lines, data_anchors[k - 1][0]) + if k > 0 else 0 + ) + next_window_start = ( + data_anchors[k + 1][0] if k + 1 < len(data_anchors) else len(lines) + ) + try: + window = self._parse_window_from_anchors( + lines=lines, + data_idx=data_idx, + manuf_idx=manuf_idx, + anchor=anchor, + before_start=prev_window_end, + after_end=next_window_start, + ) + except (ValueError, IndexError): + continue + if window is not None: + windows.append(window) + return windows + + def _find_manufacturer_after(self, lines: List[str], data_idx: int) -> Optional[int]: + for j in range(data_idx + 1, min(data_idx + 12, len(lines))): + if self._MANUFACTURER_RE.match(lines[j].strip()): + return j + return None + + def _estimate_window_end(self, lines: List[str], data_idx: int) -> int: + """End-of-window index (exclusive) for the window whose data + line is at `data_idx`. Used to bound the "before" segment of + the *next* window when extracting suffix tokens.""" + manuf_idx = self._find_manufacturer_after(lines, data_idx) + if manuf_idx is None: + return data_idx + 1 + # Manufacturer + g_value + draught + shutters + ~3 suffix tokens + return manuf_idx + 7 + + def _parse_window_from_anchors( + self, + *, + lines: List[str], + data_idx: int, + manuf_idx: int, + anchor: re.Match[str], + before_start: int, + after_end: int, + ) -> Optional[Window]: + width = float(anchor.group(1)) + height = float(anchor.group(2)) + area = float(anchor.group(3)) + + # frame_type and frame_factor immediately follow the data line. 
+ if data_idx + 2 >= len(lines): + return None + frame_type = lines[data_idx + 1].strip() + try: + frame_factor = float(lines[data_idx + 2].strip()) + except ValueError: + return None + if not 0.0 < frame_factor <= 1.0: + return None + + # Variable-order tokens between frame_factor and Manufacturer. + middle = [lines[j].strip() for j in range(data_idx + 3, manuf_idx)] + glazing_gap = next((t for t in middle if "mm" in t.lower()), None) + location = next((t for t in middle if "wall" in t.lower()), "External wall") + bp_inline = next((t for t in middle if t in self._BP_INLINE_TOKENS), None) + orient_inline = next( + (t for t in middle if t in self._ORIENTATION_TOKENS), None + ) + + # Manufacturer line carries data_source + u_value. + manuf_match = self._MANUFACTURER_RE.match(lines[manuf_idx].strip()) + if manuf_match is None: + return None + data_source = manuf_match.group(1) + u_value = float(manuf_match.group(2)) + + # Post-manufacturer: g_value, draught, shutters. + if manuf_idx + 3 >= len(lines): + return None + try: + g_value = float(lines[manuf_idx + 1].strip()) + except ValueError: + return None + draught_proofed = lines[manuf_idx + 2].strip().lower() == "yes" + permanent_shutters = lines[manuf_idx + 3].strip() + + # Prefix / suffix tokens (variable count) carry the + # glazing-type, building-part, and orientation strings split by + # the layout preprocessor. 
+ before = [lines[j].strip() for j in range(before_start, data_idx) if lines[j].strip()] + after = [lines[j].strip() for j in range(manuf_idx + 4, after_end) if lines[j].strip()] + + glazing_type, building_part, orientation = self._compose_window_descriptors( + before=before, + after=after, + bp_inline=bp_inline, + orient_inline=orient_inline, + ) + + return Window( + width_m=width, + height_m=height, + area_m2=area, + glazing_type=glazing_type, + frame_factor=frame_factor, + building_part=building_part, + location=location, + orientation=orientation, + data_source=data_source, + u_value=u_value, + g_value=g_value, + draught_proofed=draught_proofed, + permanent_shutters=permanent_shutters, + frame_type=frame_type, + glazing_gap=glazing_gap, + ) + + def _compose_window_descriptors( + self, + *, + before: List[str], + after: List[str], + bp_inline: Optional[str], + orient_inline: Optional[str], + ) -> tuple[str, str, str]: + """Re-join the glazing-type / building-part / orientation tokens + split by the layout preprocessor. Each is at most 2 fragments + (one before the data line, one after); inline tokens in the + between-segment win over prefix/suffix fragments.""" + # before holds (in document order, possibly): glazing_prefix, + # bp_prefix, orient_prefix — bp/orient may be missing. + # after holds: glazing_suffix, bp_suffix, orient_suffix — same. + prefix = list(before[-3:]) # last 3 lines preceding data + suffix = list(after[:3]) + + def pop_if_orientation(tokens: List[str]) -> Optional[str]: + for t in tokens: + if t in self._ORIENTATION_TOKENS: + tokens.remove(t) + return t + return None + + def pop_if_bp_fragment(tokens: List[str]) -> Optional[str]: + # Prefix fragments like "1st" / "2nd" — match digit-prefixed + # ordinals; suffix fragments are always "Extension". 
+ for t in tokens: + if re.match(r"^\d+(?:st|nd|rd|th)$", t) or t == "Extension": + tokens.remove(t) + return t + return None + + orient_prefix_token = pop_if_orientation(prefix) + orient_suffix_token = pop_if_orientation(suffix) + bp_prefix_frag = pop_if_bp_fragment(prefix) + bp_suffix_frag = pop_if_bp_fragment(suffix) + + # Glazing type: remaining prefix + remaining suffix (joined). + glazing_type = " ".join([*prefix, *suffix]).strip() + + # Building part: inline token wins; otherwise join prefix + suffix. + if bp_inline is not None: + building_part = bp_inline + else: + building_part = " ".join( + t for t in (bp_prefix_frag, bp_suffix_frag) if t + ).strip() + + # Orientation: inline token wins for the primary direction; + # combine with the opposite-direction fragment when present. + primary = orient_inline or orient_prefix_token or "" + secondary_candidates = [ + t for t in (orient_prefix_token, orient_suffix_token) if t and t != primary + ] + if primary and secondary_candidates: + orientation = f"{primary}-{secondary_candidates[0]}" + else: + orientation = primary + + return glazing_type, building_part, orientation + def _extract_ventilation(self) -> VentilationAndCooling: return VentilationAndCooling( open_chimneys_count=self._int_val("No. of open chimneys"), diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index f3f3b209..2f41c1b8 100644 --- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -92,3 +92,19 @@ def test_summary_000474_mapper_produces_three_building_parts() -> None: # Assert assert len(epc.sap_building_parts) == 3 + + +def test_summary_000474_mapper_extracts_seven_windows() -> None: + # Arrange — cert U985-0001-000474's §11 table lodges 7 windows + # across Main + 1st Extension + 2nd Extension. 
The legacy Textract- + # style window parser couldn't anchor on the Summary PDF's tabular + # layout; the new W/H/Area-plus-Manufacturer anchor pair picks them + # all up. + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + + # Act + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Assert + assert len(epc.sap_windows) == 7