Slice 46b: Elmhurst extractor parses windows from layout-style Summary PDFs

The legacy `_extract_windows` regex anchors on the header text "Permanent Shutters\n", but the pdftotext-layout preprocessor breaks that header across lines, so the regex never matches on layout-style Summary PDFs. The new fallback `_extract_windows_from_layout` anchors instead on the two stable per-window markers — a "W H Area" data line and the "Manufacturer <U_value>" (or "Default <U_value>") line a few lines further down — and tolerates the variable-order optional fields (glazing_gap, inline building_part, inline orientation) between them. Prefix/suffix tokens around the data block are re-joined into glazing_type / building_part / orientation strings.

Cert U985-0001-000474's 7 windows across Main + 2 extensions now flow through the mapper to EpcPropertyData.sap_windows (was 0). Textract-style extraction (existing fixture) is unchanged — the legacy path runs first and only falls through when its regex misses.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-24 18:03:29 +00:00
parent 36f2c7bbdf
commit 066dce19e3
2 changed files with 247 additions and 1 deletions

View file

@ -336,13 +336,20 @@ class ElmhurstSiteNotesExtractor:
return extensions
def _extract_windows(self) -> List[Window]:
# Textract-style pages keep "Permanent\s+Shutters" adjacent in
# reading order and the windows table flows as one column-block
# the existing token-walker can step through. PDF-derived pages
# (Summary PDFs preprocessed from `pdftotext -layout`) break the
# header across lines, so this regex misses entirely and the
# `_extract_windows_from_layout` fallback below picks them up
# by anchoring on the W/H/Area data line.
m = re.search(
r"Permanent\s+Shutters\n(.*?)Draught Proofing",
self._text,
re.DOTALL,
)
if not m:
return []
return self._extract_windows_from_layout()
tokens = [t.strip() for t in m.group(1).splitlines() if t.strip()]
windows: List[Window] = []
i = 0
@ -410,6 +417,229 @@ class ElmhurstSiteNotesExtractor:
)
return windows
# Line-level anchors and token vocabularies used by the layout-style
# window parser. Both patterns are applied to a single stripped line.

# Three decimals on one line: width, height, area.
_WIDTH_HEIGHT_AREA_RE = re.compile(
    r"^(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)$"
)
# Data-source keyword followed by the decimal U-value.
_MANUFACTURER_RE = re.compile(
    r"^(Manufacturer|Default)\s+(\d+\.\d+)$"
)
# Tokens recognised as orientation fragments.
_ORIENTATION_TOKENS = frozenset(
    ("North", "South", "East", "West", "NE", "NW", "SE", "SW")
)
# Building-part tokens that may sit between the data line and the
# Manufacturer line; "Extension" only appears as suffix.
_BP_INLINE_TOKENS = frozenset(("Main",))
def _extract_windows_from_layout(self) -> List[Window]:
    """Parse the "11.0 Windows:" section of layout-style Summary text.

    Fallback for Summary PDFs preprocessed from `pdftotext -layout`.
    A window is recognised by two anchors: a "W H Area" line and a
    "Manufacturer <U_value>" line a few lines further down. The lines
    between them hold frame_type, frame_factor and a variable mix of
    glazing_gap, building_part, location and orientation; the lines
    around the window hold the glazing-type / building-part /
    orientation prefix and suffix tokens split by the layout
    preprocessor.

    Returns the windows in document order. The result is empty when
    the section cannot be located; candidates that have no
    Manufacturer line, fail to parse, or raise ValueError/IndexError
    while parsing are skipped.
    """
    section = re.search(
        r"11\.0 Windows:(.*?)(Draught Proofing|12\.0 Ventilation)",
        self._text, re.DOTALL,
    )
    if section is None:
        return []
    lines = section.group(1).splitlines()

    # Every W/H/Area line is a candidate window start, kept in
    # document order together with its regex match.
    candidates = [
        (idx, hit)
        for idx, raw in enumerate(lines)
        if (hit := self._WIDTH_HEIGHT_AREA_RE.match(raw.strip())) is not None
    ]

    last_pos = len(candidates) - 1
    parsed: List[Window] = []
    for pos, (data_idx, hit) in enumerate(candidates):
        manuf_idx = self._find_manufacturer_after(lines, data_idx)
        if manuf_idx is None:
            continue

        # The prefix segment starts where the previous candidate is
        # estimated to end; the suffix segment stops at the next
        # candidate's data line (or the end of the section).
        if pos == 0:
            before_start = 0
        else:
            before_start = self._estimate_window_end(
                lines, candidates[pos - 1][0]
            )
        after_end = candidates[pos + 1][0] if pos < last_pos else len(lines)

        try:
            window = self._parse_window_from_anchors(
                lines=lines,
                data_idx=data_idx,
                manuf_idx=manuf_idx,
                anchor=hit,
                before_start=before_start,
                after_end=after_end,
            )
        except (ValueError, IndexError):
            # Best-effort: a malformed candidate must not abort the
            # remaining windows.
            continue
        if window is not None:
            parsed.append(window)
    return parsed
def _find_manufacturer_after(self, lines: List[str], data_idx: int) -> Optional[int]:
    """Return the index of the Manufacturer/Default line that belongs
    to the window whose "W H Area" data line is at `data_idx`.

    Scans at most the 11 lines following `data_idx` (indices
    `data_idx + 1` to `data_idx + 11`, clipped to the end of `lines`).
    The scan stops early at the next "W H Area" line: that line starts
    another window, so any Manufacturer line beyond it is not ours.
    Without this stop, a window with no Manufacturer line would be
    paired with the following window's line and yield a bogus record.

    Returns None when no matching line is found in that range.
    """
    scan_end = min(data_idx + 12, len(lines))
    for j in range(data_idx + 1, scan_end):
        candidate = lines[j].strip()
        if self._MANUFACTURER_RE.match(candidate):
            return j
        if self._WIDTH_HEIGHT_AREA_RE.match(candidate):
            # Reached the next window's data line.
            return None
    return None
def _estimate_window_end(self, lines: List[str], data_idx: int) -> int:
    """Estimate the exclusive end index of the window whose data line
    sits at `data_idx`.

    The caller uses the result as the lower bound of the *next*
    window's "before" segment, so that this window's suffix tokens are
    not read as the next window's prefix tokens.
    """
    manufacturer_at = self._find_manufacturer_after(lines, data_idx)
    # No Manufacturer line: the window is taken to end right after its
    # data line. Otherwise skip Manufacturer + g_value + draught +
    # shutters + ~3 suffix tokens.
    return data_idx + 1 if manufacturer_at is None else manufacturer_at + 7
def _parse_window_from_anchors(
    self,
    *,
    lines: List[str],
    data_idx: int,
    manuf_idx: int,
    anchor: re.Match[str],
    before_start: int,
    after_end: int,
) -> Optional[Window]:
    """Build one Window from its two anchor lines.

    Args:
        lines: Lines of the windows section.
        data_idx: Index of the "W H Area" line.
        manuf_idx: Index of the Manufacturer/Default line.
        anchor: Match of the "W H Area" line; groups 1-3 are width,
            height and area.
        before_start: First index of the prefix-token segment (the
            segment ends just before `data_idx`).
        after_end: Exclusive end index of the suffix-token segment
            (the segment starts at `manuf_idx + 4`).

    Returns:
        The parsed Window, or None when the expected lines are missing,
        frame_factor / g_value are not numeric, frame_factor lies
        outside (0, 1], or the Manufacturer line does not match.
    """
    width, height, area = (float(anchor.group(n)) for n in (1, 2, 3))
    total = len(lines)

    def stripped(start, stop):
        # Stripped text of lines[start:stop], indexed one by one.
        return [lines[j].strip() for j in range(start, stop)]

    # The two lines right after the data line: frame_type, frame_factor.
    if data_idx + 2 >= total:
        return None
    frame_type = lines[data_idx + 1].strip()
    try:
        frame_factor = float(lines[data_idx + 2].strip())
    except ValueError:
        return None
    if not (0.0 < frame_factor <= 1.0):
        return None

    # Optional fields between frame_factor and the Manufacturer line
    # come in no fixed order, so each is picked by content.
    between = stripped(data_idx + 3, manuf_idx)

    def first(wanted, default=None):
        return next((tok for tok in between if wanted(tok)), default)

    glazing_gap = first(lambda tok: "mm" in tok.lower())
    location = first(lambda tok: "wall" in tok.lower(), "External wall")
    bp_inline = first(lambda tok: tok in self._BP_INLINE_TOKENS)
    orient_inline = first(lambda tok: tok in self._ORIENTATION_TOKENS)

    # The Manufacturer line yields data_source and u_value.
    source_hit = self._MANUFACTURER_RE.match(lines[manuf_idx].strip())
    if source_hit is None:
        return None
    data_source, u_text = source_hit.group(1, 2)
    u_value = float(u_text)

    # Three lines after it: g_value, draught proofing, shutters.
    if manuf_idx + 3 >= total:
        return None
    g_text, draught_text, permanent_shutters = stripped(
        manuf_idx + 1, manuf_idx + 4
    )
    try:
        g_value = float(g_text)
    except ValueError:
        return None
    draught_proofed = draught_text.lower() == "yes"

    # Non-blank lines around the window carry the glazing-type,
    # building-part and orientation fragments split by the layout
    # preprocessor.
    before = [tok for tok in stripped(before_start, data_idx) if tok]
    after = [tok for tok in stripped(manuf_idx + 4, after_end) if tok]
    glazing_type, building_part, orientation = self._compose_window_descriptors(
        before=before,
        after=after,
        bp_inline=bp_inline,
        orient_inline=orient_inline,
    )

    return Window(
        width_m=width,
        height_m=height,
        area_m2=area,
        glazing_type=glazing_type,
        frame_factor=frame_factor,
        building_part=building_part,
        location=location,
        orientation=orientation,
        data_source=data_source,
        u_value=u_value,
        g_value=g_value,
        draught_proofed=draught_proofed,
        permanent_shutters=permanent_shutters,
        frame_type=frame_type,
        glazing_gap=glazing_gap,
    )
def _compose_window_descriptors(
    self,
    *,
    before: List[str],
    after: List[str],
    bp_inline: Optional[str],
    orient_inline: Optional[str],
) -> tuple[str, str, str]:
    """Rebuild the glazing-type, building-part and orientation strings
    from the fragments the layout preprocessor scattered around a
    window.

    Only the last three `before` tokens and the first three `after`
    tokens are considered. Each descriptor is made of at most two
    fragments (one from each side); an inline token found between the
    anchors takes precedence over prefix/suffix fragments. The input
    lists are not modified.

    Returns:
        `(glazing_type, building_part, orientation)`; a descriptor is
        an empty string when no fragment for it was found.
    """
    # Working copies, in document order (possibly): glazing fragment,
    # building-part fragment, orientation fragment — the last two may
    # be absent on either side.
    leading = [*before[-3:]]
    trailing = [*after[:3]]

    def take_first(tokens, wanted):
        # Remove and return the first token accepted by `wanted`.
        for pos, tok in enumerate(tokens):
            if wanted(tok):
                del tokens[pos]
                return tok
        return None

    def is_orientation(tok):
        return tok in self._ORIENTATION_TOKENS

    def is_bp_fragment(tok):
        # Prefix fragments are digit-prefixed ordinals ("1st", "2nd");
        # the suffix fragment is always "Extension".
        return bool(re.match(r"^\d+(?:st|nd|rd|th)$", tok)) or tok == "Extension"

    orient_before = take_first(leading, is_orientation)
    orient_after = take_first(trailing, is_orientation)
    bp_before = take_first(leading, is_bp_fragment)
    bp_after = take_first(trailing, is_bp_fragment)

    # Whatever is left on both sides is the glazing type.
    glazing_type = " ".join(leading + trailing).strip()

    if bp_inline is None:
        building_part = " ".join(filter(None, (bp_before, bp_after))).strip()
    else:
        building_part = bp_inline

    # The inline token (else the prefix token) is the primary
    # direction; the first differing prefix/suffix token, if any, is
    # appended after a hyphen.
    primary = orient_inline or orient_before or ""
    secondary = next(
        (tok for tok in (orient_before, orient_after) if tok and tok != primary),
        None,
    )
    orientation = f"{primary}-{secondary}" if primary and secondary else primary

    return glazing_type, building_part, orientation
def _extract_ventilation(self) -> VentilationAndCooling:
return VentilationAndCooling(
open_chimneys_count=self._int_val("No. of open chimneys"),

View file

@ -92,3 +92,19 @@ def test_summary_000474_mapper_produces_three_building_parts() -> None:
# Assert
assert len(epc.sap_building_parts) == 3
def test_summary_000474_mapper_extracts_seven_windows() -> None:
    # Arrange — the §11 table of cert U985-0001-000474 lodges 7 windows
    # spread over Main + 1st Extension + 2nd Extension. The legacy
    # Textract-style window parser could not anchor on the Summary
    # PDF's tabular layout; the W/H/Area-plus-Manufacturer anchor pair
    # is expected to pick all of them up.
    extractor = ElmhurstSiteNotesExtractor(
        _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
    )
    site_notes = extractor.extract()

    # Act
    mapped = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)

    # Assert
    assert len(mapped.sap_windows) == 7