fix(extractor): capture all 17 openable §11 windows on cert 001431

cert 001431's §11 lodges 17 windows but only 14 surfaced, via two distinct gaps:

1. Extractor (_extract_windows_from_layout): the one "Double glazing, known
   data" row whose §11 Data-Source cell is "BFRC data" was rejected — it is
   laid out as a standalone keyword line with the U-value on the next line
   and lodges no Frame Type/Factor/Gap cells, so it never matched the joined
   "<source> <U>" Manufacturer-line shape. Now anchored by a standalone
   data-source form, with the RdSAP 10 §3.7 default frame factor (0.7) for
   the absent frame cell.

2. Mapper (_is_elmhurst_roof_window): the two "Double pre 2002" rows
   (U 3.1 / 3.4 > 3.0) were reclassified as roof windows by the U-value
   backstop even though both are lodged on an "External wall". A window
   lodged on a wall is vertical by definition; guard the U-value backstop so
   it is skipped whenever the lodged location names a wall, leaving it to
   fire only when location/BP give neither a roof nor a wall signal.

With both closed: 17 sap_windows, 0 misrouted to sap_roof_windows.

Re-homed onto the mapper-validation line from feature/bill-derivation
(orig f68cea27); the modelling-only regression test
(tests/domain/modelling/test_window_extraction_001431.py) stays on
bill-derivation. KNOWN: the mapper guard breaks cert 000516's
test_summary_pdf_mapper_chain pins (W6 U=3.10 routing) — must be resolved
before this is PR'd to main.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-05 19:00:02 +00:00
parent d559298de2
commit 97f44b5364
2 changed files with 73 additions and 17 deletions

View file

@ -1,6 +1,6 @@
import re
from datetime import date, datetime
from typing import List, Optional
from typing import Final, List, Optional
from datatypes.epc.surveys.elmhurst_site_notes import (
AlternativeWall,
@ -811,6 +811,19 @@ class ElmhurstSiteNotesExtractor:
r"^(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)(?:\s+(\S.*?))?$"
)
_MANUFACTURER_RE = re.compile(r"^(Manufacturer|Default)\s+(\d+\.\d+)$")
# "Known data" rows (BFRC / SAP Table) lodge the §11 Data-Source cell on
# its own layout line with the U-value following on the next line — and
# carry no Frame Type / Frame Factor / Glazing Gap cells. The joined
# "<source> <U>" `_MANUFACTURER_RE` shape never matches them, so they are
# anchored by this standalone form instead (cert 001431 §11 has one
# "BFRC data" window). "Manufacturer"/"Default" are kept here only for
# symmetry; in practice they always arrive joined with the U-value on the
# same line (the `_MANUFACTURER_RE` shape).
_STANDALONE_DATA_SOURCE_RE = re.compile(
r"^(BFRC data|BFRC|SAP Table|Assessor|Manufacturer|Default)$"
)
# RdSAP 10 §3.7 default window frame factor, used for "known data" rows
# that lodge U and g directly and omit the frame-factor cell.
_DEFAULT_FRAME_FACTOR: Final[float] = 0.7
_ORIENTATION_TOKENS = frozenset({
"North", "South", "East", "West", "NE", "NW", "SE", "SW",
})
@ -900,7 +913,10 @@ class ElmhurstSiteNotesExtractor:
def _find_manufacturer_after(self, lines: List[str], data_idx: int) -> Optional[int]:
for j in range(data_idx + 1, min(data_idx + 12, len(lines))):
if self._MANUFACTURER_RE.match(lines[j].strip()):
stripped = lines[j].strip()
if self._MANUFACTURER_RE.match(stripped) or (
self._STANDALONE_DATA_SOURCE_RE.match(stripped)
):
return j
return None
@ -985,6 +1001,20 @@ class ElmhurstSiteNotesExtractor:
# would-be glazing-prefix scan.
inline_glazing_type = anchor.group(4) if anchor.lastindex and anchor.lastindex >= 4 else None
# The data-source line is either the joined "Manufacturer 4.80" shape
# (source keyword + U on one line) or a sparse standalone "BFRC data"
# / "SAP Table" shape (keyword alone, U on the next line, and no frame
# cells lodged). Resolve which up front: a sparse row has no frame
# type/factor to parse.
data_source_line = lines[manuf_idx].strip()
joined_match = self._MANUFACTURER_RE.match(data_source_line)
standalone_match = (
None if joined_match is not None
else self._STANDALONE_DATA_SOURCE_RE.match(data_source_line)
)
if joined_match is None and standalone_match is None:
return None
# frame_type and frame_factor immediately follow the data line.
# Layout-style cell joining sometimes collapses them onto a
# single "Wood 0.70" line; treat both shapes uniformly so the
@ -992,9 +1022,15 @@ class ElmhurstSiteNotesExtractor:
# field (glazing_gap / bp / location / orient).
if data_idx + 1 >= len(lines):
return None
frame_type, frame_factor, middle_start = self._parse_frame_type_and_factor(
lines, data_idx
)
if standalone_match is not None:
# Sparse "known data" row: no frame type/factor/glazing-gap cells;
# everything between W×H×A and the data-source is location/orient.
frame_type, frame_factor = None, self._DEFAULT_FRAME_FACTOR
middle_start = data_idx + 1
else:
frame_type, frame_factor, middle_start = self._parse_frame_type_and_factor(
lines, data_idx
)
if frame_factor is None or not 0.0 < frame_factor <= 1.0:
return None
@ -1017,28 +1053,40 @@ class ElmhurstSiteNotesExtractor:
(t for t in middle if t in self._ORIENTATION_TOKENS), None
)
# Manufacturer line carries data_source + u_value.
manuf_match = self._MANUFACTURER_RE.match(lines[manuf_idx].strip())
if manuf_match is None:
return None
data_source = manuf_match.group(1)
u_value = float(manuf_match.group(2))
# Data-source line carries the source keyword and U-value: joined on
# one line ("Manufacturer 4.80") or, for sparse rows, the keyword alone
# with the U-value on the next line ("BFRC data" / "1.00"). `post_idx`
# is where g_value / draught / shutters begin in either layout.
if joined_match is not None:
data_source = joined_match.group(1)
u_value = float(joined_match.group(2))
post_idx = manuf_idx + 1
else:
assert standalone_match is not None
data_source = standalone_match.group(1)
if manuf_idx + 1 >= len(lines):
return None
try:
u_value = float(lines[manuf_idx + 1].strip())
except ValueError:
return None
post_idx = manuf_idx + 2
# Post-manufacturer: g_value, draught, shutters.
if manuf_idx + 3 >= len(lines):
# Post-data-source: g_value, draught, shutters.
if post_idx + 2 >= len(lines):
return None
try:
g_value = float(lines[manuf_idx + 1].strip())
g_value = float(lines[post_idx].strip())
except ValueError:
return None
draught_proofed = lines[manuf_idx + 2].strip().lower() == "yes"
permanent_shutters = lines[manuf_idx + 3].strip()
draught_proofed = lines[post_idx + 1].strip().lower() == "yes"
permanent_shutters = lines[post_idx + 2].strip()
# Prefix / suffix tokens (variable count) carry the
# glazing-type, building-part, and orientation strings split by
# the layout preprocessor.
before = [lines[j].strip() for j in range(before_start, data_idx) if lines[j].strip()]
after = [lines[j].strip() for j in range(manuf_idx + 4, after_end) if lines[j].strip()]
after = [lines[j].strip() for j in range(post_idx + 3, after_end) if lines[j].strip()]
# Room-in-roof windows lodge their location as "Roof of Room in
# Roof" (wrapped across the prefix/suffix blocks). Detect it, pull

View file

@ -4116,6 +4116,14 @@ def _is_elmhurst_roof_window(
_ELMHURST_BP_ROOF_TYPES_WITH_ROOFLIGHTS
):
return True
# A window lodged on a wall is vertical by definition. The U-value
# backstop below only catches skylights whose location/BP gives no
# roof signal; without this guard a high-U *wall* window (e.g. an old
# "Double pre 2002" unit at U 3.1 / 3.4) is mis-routed to the roof-
# window list on U-value alone — cert 001431 §11 lodges two such
# External-wall windows that must remain vertical `sap_windows`.
if "wall" in (w.location or "").lower():
return False
return w.u_value > _ELMHURST_ROOF_WINDOW_U_THRESHOLD