mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Slice 46b: Elmhurst extractor parses windows from layout-style Summary PDFs
The legacy `_extract_windows` regex anchors on "Permanent Shutters\n" which is broken across lines by the pdftotext-layout preprocessor. New fallback `_extract_windows_from_layout` anchors on the two stable per-window markers — a "W H Area" data line and the "Manufacturer <U_value>" line a few lines further down — and tolerates the variable-order optional fields (glazing_gap, inline building_part, inline orientation) between them. Prefix/suffix tokens around the data block are re-joined into glazing_type / building_part / orientation strings. Cert U985-0001-000474's 7 windows across Main + 2 extensions now flow through the mapper to EpcPropertyData.sap_windows (was 0). Textract-style extraction (existing fixture) is unchanged — the legacy path runs first and only falls through when its regex misses. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
36f2c7bbdf
commit
066dce19e3
2 changed files with 247 additions and 1 deletions
|
|
@ -336,13 +336,20 @@ class ElmhurstSiteNotesExtractor:
|
|||
return extensions
|
||||
|
||||
def _extract_windows(self) -> List[Window]:
|
||||
# Textract-style pages keep "Permanent\s+Shutters" adjacent in
|
||||
# reading order and the windows table flows as one column-block
|
||||
# the existing token-walker can step through. PDF-derived pages
|
||||
# (Summary PDFs preprocessed from `pdftotext -layout`) break the
|
||||
# header across lines, so this regex misses entirely and the
|
||||
# `_extract_windows_from_layout` fallback below picks them up
|
||||
# by anchoring on the W/H/Area data line.
|
||||
m = re.search(
|
||||
r"Permanent\s+Shutters\n(.*?)Draught Proofing",
|
||||
self._text,
|
||||
re.DOTALL,
|
||||
)
|
||||
if not m:
|
||||
return []
|
||||
return self._extract_windows_from_layout()
|
||||
tokens = [t.strip() for t in m.group(1).splitlines() if t.strip()]
|
||||
windows: List[Window] = []
|
||||
i = 0
|
||||
|
|
@ -410,6 +417,229 @@ class ElmhurstSiteNotesExtractor:
|
|||
)
|
||||
return windows
|
||||
|
||||
# Anchors used by the layout-style window parser.
|
||||
_WIDTH_HEIGHT_AREA_RE = re.compile(r"^(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)$")
|
||||
_MANUFACTURER_RE = re.compile(r"^(Manufacturer|Default)\s+(\d+\.\d+)$")
|
||||
_ORIENTATION_TOKENS = frozenset({
|
||||
"North", "South", "East", "West", "NE", "NW", "SE", "SW",
|
||||
})
|
||||
_BP_INLINE_TOKENS = frozenset({"Main"}) # "Extension" only appears as suffix
|
||||
|
||||
def _extract_windows_from_layout(self) -> List[Window]:
|
||||
"""Fallback window parser for Summary PDFs preprocessed from
|
||||
`pdftotext -layout`. Each window has two stable anchors:
|
||||
a "W H Area" line and a "Manufacturer <U_value>" line a few
|
||||
lines further down. Everything between holds frame_type,
|
||||
frame_factor, and a variable mix of glazing_gap, building_part,
|
||||
location, and orientation (depending on which fields the
|
||||
surveyor lodged); everything around the window holds glazing-
|
||||
type/building-part/orientation prefix/suffix tokens split by
|
||||
the layout preprocessor.
|
||||
"""
|
||||
m = re.search(
|
||||
r"11\.0 Windows:(.*?)(Draught Proofing|12\.0 Ventilation)",
|
||||
self._text, re.DOTALL,
|
||||
)
|
||||
if not m:
|
||||
return []
|
||||
lines = m.group(1).splitlines()
|
||||
|
||||
# Locate all (data_line, manufacturer_line) pairs in document
|
||||
# order. Each pair is one window.
|
||||
data_anchors: List[tuple[int, re.Match[str]]] = []
|
||||
for i, line in enumerate(lines):
|
||||
anchor = self._WIDTH_HEIGHT_AREA_RE.match(line.strip())
|
||||
if anchor is not None:
|
||||
data_anchors.append((i, anchor))
|
||||
|
||||
windows: List[Window] = []
|
||||
for k, (data_idx, anchor) in enumerate(data_anchors):
|
||||
manuf_idx = self._find_manufacturer_after(lines, data_idx)
|
||||
if manuf_idx is None:
|
||||
continue
|
||||
prev_window_end = (
|
||||
self._estimate_window_end(lines, data_anchors[k - 1][0])
|
||||
if k > 0 else 0
|
||||
)
|
||||
next_window_start = (
|
||||
data_anchors[k + 1][0] if k + 1 < len(data_anchors) else len(lines)
|
||||
)
|
||||
try:
|
||||
window = self._parse_window_from_anchors(
|
||||
lines=lines,
|
||||
data_idx=data_idx,
|
||||
manuf_idx=manuf_idx,
|
||||
anchor=anchor,
|
||||
before_start=prev_window_end,
|
||||
after_end=next_window_start,
|
||||
)
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
if window is not None:
|
||||
windows.append(window)
|
||||
return windows
|
||||
|
||||
def _find_manufacturer_after(self, lines: List[str], data_idx: int) -> Optional[int]:
|
||||
for j in range(data_idx + 1, min(data_idx + 12, len(lines))):
|
||||
if self._MANUFACTURER_RE.match(lines[j].strip()):
|
||||
return j
|
||||
return None
|
||||
|
||||
def _estimate_window_end(self, lines: List[str], data_idx: int) -> int:
|
||||
"""End-of-window index (exclusive) for the window whose data
|
||||
line is at `data_idx`. Used to bound the "before" segment of
|
||||
the *next* window when extracting suffix tokens."""
|
||||
manuf_idx = self._find_manufacturer_after(lines, data_idx)
|
||||
if manuf_idx is None:
|
||||
return data_idx + 1
|
||||
# Manufacturer + g_value + draught + shutters + ~3 suffix tokens
|
||||
return manuf_idx + 7
|
||||
|
||||
def _parse_window_from_anchors(
|
||||
self,
|
||||
*,
|
||||
lines: List[str],
|
||||
data_idx: int,
|
||||
manuf_idx: int,
|
||||
anchor: re.Match[str],
|
||||
before_start: int,
|
||||
after_end: int,
|
||||
) -> Optional[Window]:
|
||||
width = float(anchor.group(1))
|
||||
height = float(anchor.group(2))
|
||||
area = float(anchor.group(3))
|
||||
|
||||
# frame_type and frame_factor immediately follow the data line.
|
||||
if data_idx + 2 >= len(lines):
|
||||
return None
|
||||
frame_type = lines[data_idx + 1].strip()
|
||||
try:
|
||||
frame_factor = float(lines[data_idx + 2].strip())
|
||||
except ValueError:
|
||||
return None
|
||||
if not 0.0 < frame_factor <= 1.0:
|
||||
return None
|
||||
|
||||
# Variable-order tokens between frame_factor and Manufacturer.
|
||||
middle = [lines[j].strip() for j in range(data_idx + 3, manuf_idx)]
|
||||
glazing_gap = next((t for t in middle if "mm" in t.lower()), None)
|
||||
location = next((t for t in middle if "wall" in t.lower()), "External wall")
|
||||
bp_inline = next((t for t in middle if t in self._BP_INLINE_TOKENS), None)
|
||||
orient_inline = next(
|
||||
(t for t in middle if t in self._ORIENTATION_TOKENS), None
|
||||
)
|
||||
|
||||
# Manufacturer line carries data_source + u_value.
|
||||
manuf_match = self._MANUFACTURER_RE.match(lines[manuf_idx].strip())
|
||||
if manuf_match is None:
|
||||
return None
|
||||
data_source = manuf_match.group(1)
|
||||
u_value = float(manuf_match.group(2))
|
||||
|
||||
# Post-manufacturer: g_value, draught, shutters.
|
||||
if manuf_idx + 3 >= len(lines):
|
||||
return None
|
||||
try:
|
||||
g_value = float(lines[manuf_idx + 1].strip())
|
||||
except ValueError:
|
||||
return None
|
||||
draught_proofed = lines[manuf_idx + 2].strip().lower() == "yes"
|
||||
permanent_shutters = lines[manuf_idx + 3].strip()
|
||||
|
||||
# Prefix / suffix tokens (variable count) carry the
|
||||
# glazing-type, building-part, and orientation strings split by
|
||||
# the layout preprocessor.
|
||||
before = [lines[j].strip() for j in range(before_start, data_idx) if lines[j].strip()]
|
||||
after = [lines[j].strip() for j in range(manuf_idx + 4, after_end) if lines[j].strip()]
|
||||
|
||||
glazing_type, building_part, orientation = self._compose_window_descriptors(
|
||||
before=before,
|
||||
after=after,
|
||||
bp_inline=bp_inline,
|
||||
orient_inline=orient_inline,
|
||||
)
|
||||
|
||||
return Window(
|
||||
width_m=width,
|
||||
height_m=height,
|
||||
area_m2=area,
|
||||
glazing_type=glazing_type,
|
||||
frame_factor=frame_factor,
|
||||
building_part=building_part,
|
||||
location=location,
|
||||
orientation=orientation,
|
||||
data_source=data_source,
|
||||
u_value=u_value,
|
||||
g_value=g_value,
|
||||
draught_proofed=draught_proofed,
|
||||
permanent_shutters=permanent_shutters,
|
||||
frame_type=frame_type,
|
||||
glazing_gap=glazing_gap,
|
||||
)
|
||||
|
||||
def _compose_window_descriptors(
|
||||
self,
|
||||
*,
|
||||
before: List[str],
|
||||
after: List[str],
|
||||
bp_inline: Optional[str],
|
||||
orient_inline: Optional[str],
|
||||
) -> tuple[str, str, str]:
|
||||
"""Re-join the glazing-type / building-part / orientation tokens
|
||||
split by the layout preprocessor. Each is at most 2 fragments
|
||||
(one before the data line, one after); inline tokens in the
|
||||
between-segment win over prefix/suffix fragments."""
|
||||
# before holds (in document order, possibly): glazing_prefix,
|
||||
# bp_prefix, orient_prefix — bp/orient may be missing.
|
||||
# after holds: glazing_suffix, bp_suffix, orient_suffix — same.
|
||||
prefix = list(before[-3:]) # last 3 lines preceding data
|
||||
suffix = list(after[:3])
|
||||
|
||||
def pop_if_orientation(tokens: List[str]) -> Optional[str]:
|
||||
for t in tokens:
|
||||
if t in self._ORIENTATION_TOKENS:
|
||||
tokens.remove(t)
|
||||
return t
|
||||
return None
|
||||
|
||||
def pop_if_bp_fragment(tokens: List[str]) -> Optional[str]:
|
||||
# Prefix fragments like "1st" / "2nd" — match digit-prefixed
|
||||
# ordinals; suffix fragments are always "Extension".
|
||||
for t in tokens:
|
||||
if re.match(r"^\d+(?:st|nd|rd|th)$", t) or t == "Extension":
|
||||
tokens.remove(t)
|
||||
return t
|
||||
return None
|
||||
|
||||
orient_prefix_token = pop_if_orientation(prefix)
|
||||
orient_suffix_token = pop_if_orientation(suffix)
|
||||
bp_prefix_frag = pop_if_bp_fragment(prefix)
|
||||
bp_suffix_frag = pop_if_bp_fragment(suffix)
|
||||
|
||||
# Glazing type: remaining prefix + remaining suffix (joined).
|
||||
glazing_type = " ".join([*prefix, *suffix]).strip()
|
||||
|
||||
# Building part: inline token wins; otherwise join prefix + suffix.
|
||||
if bp_inline is not None:
|
||||
building_part = bp_inline
|
||||
else:
|
||||
building_part = " ".join(
|
||||
t for t in (bp_prefix_frag, bp_suffix_frag) if t
|
||||
).strip()
|
||||
|
||||
# Orientation: inline token wins for the primary direction;
|
||||
# combine with the opposite-direction fragment when present.
|
||||
primary = orient_inline or orient_prefix_token or ""
|
||||
secondary_candidates = [
|
||||
t for t in (orient_prefix_token, orient_suffix_token) if t and t != primary
|
||||
]
|
||||
if primary and secondary_candidates:
|
||||
orientation = f"{primary}-{secondary_candidates[0]}"
|
||||
else:
|
||||
orientation = primary
|
||||
|
||||
return glazing_type, building_part, orientation
|
||||
|
||||
def _extract_ventilation(self) -> VentilationAndCooling:
|
||||
return VentilationAndCooling(
|
||||
open_chimneys_count=self._int_val("No. of open chimneys"),
|
||||
|
|
|
|||
|
|
@ -92,3 +92,19 @@ def test_summary_000474_mapper_produces_three_building_parts() -> None:
|
|||
|
||||
# Assert
|
||||
assert len(epc.sap_building_parts) == 3
|
||||
|
||||
|
||||
def test_summary_000474_mapper_extracts_seven_windows() -> None:
|
||||
# Arrange — cert U985-0001-000474's §11 table lodges 7 windows
|
||||
# across Main + 1st Extension + 2nd Extension. The legacy Textract-
|
||||
# style window parser couldn't anchor on the Summary PDF's tabular
|
||||
# layout; the new W/H/Area-plus-Manufacturer anchor pair picks them
|
||||
# all up.
|
||||
pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
|
||||
site_notes = ElmhurstSiteNotesExtractor(pages).extract()
|
||||
|
||||
# Act
|
||||
epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
|
||||
|
||||
# Assert
|
||||
assert len(epc.sap_windows) == 7
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue