diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index 16f32e07..44d5325e 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -1,6 +1,6 @@ import re from datetime import date, datetime -from typing import List, Optional +from typing import Final, List, Optional from datatypes.epc.surveys.elmhurst_site_notes import ( AlternativeWall, @@ -811,6 +811,19 @@ class ElmhurstSiteNotesExtractor: r"^(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)(?:\s+(\S.*?))?$" ) _MANUFACTURER_RE = re.compile(r"^(Manufacturer|Default)\s+(\d+\.\d+)$") + # "Known data" rows (BFRC / SAP Table) lodge the §11 Data-Source cell on + # its own layout line with the U-value following on the next line — and + # carry no Frame Type / Frame Factor / Glazing Gap cells. The joined + # "{source} {U-value}" `_MANUFACTURER_RE` shape never matches them, so they are + # anchored by this standalone form instead (cert 001431 §11 has one + # "BFRC data" window). "Manufacturer"/"Default" are kept here only for + # symmetry; in practice they always join with the U-value above. + _STANDALONE_DATA_SOURCE_RE = re.compile( + r"^(BFRC data|BFRC|SAP Table|Assessor|Manufacturer|Default)$" + ) + # RdSAP 10 §3.7 default window frame factor, used for "known data" rows + # that lodge U and g directly and omit the frame-factor cell. 
+ _DEFAULT_FRAME_FACTOR: Final[float] = 0.7 _ORIENTATION_TOKENS = frozenset({ "North", "South", "East", "West", "NE", "NW", "SE", "SW", }) @@ -900,7 +913,10 @@ class ElmhurstSiteNotesExtractor: def _find_manufacturer_after(self, lines: List[str], data_idx: int) -> Optional[int]: for j in range(data_idx + 1, min(data_idx + 12, len(lines))): - if self._MANUFACTURER_RE.match(lines[j].strip()): + stripped = lines[j].strip() + if self._MANUFACTURER_RE.match(stripped) or ( + self._STANDALONE_DATA_SOURCE_RE.match(stripped) + ): return j return None @@ -985,6 +1001,20 @@ class ElmhurstSiteNotesExtractor: # would-be glazing-prefix scan. inline_glazing_type = anchor.group(4) if anchor.lastindex and anchor.lastindex >= 4 else None + # The data-source line is either the joined "Manufacturer 4.80" shape + # (source keyword + U on one line) or a sparse standalone "BFRC data" + # / "SAP Table" shape (keyword alone, U on the next line, and no frame + # cells lodged). Resolve which up front: a sparse row has no frame + # type/factor to parse. + data_source_line = lines[manuf_idx].strip() + joined_match = self._MANUFACTURER_RE.match(data_source_line) + standalone_match = ( + None if joined_match is not None + else self._STANDALONE_DATA_SOURCE_RE.match(data_source_line) + ) + if joined_match is None and standalone_match is None: + return None + # frame_type and frame_factor immediately follow the data line. # Layout-style cell joining sometimes collapses them onto a # single "Wood 0.70" line; treat both shapes uniformly so the @@ -992,9 +1022,15 @@ class ElmhurstSiteNotesExtractor: # field (glazing_gap / bp / location / orient). if data_idx + 1 >= len(lines): return None - frame_type, frame_factor, middle_start = self._parse_frame_type_and_factor( - lines, data_idx - ) + if standalone_match is not None: + # Sparse "known data" row: no frame type/factor/glazing-gap cells; + # everything between W×H×A and the data-source is location/orient. 
+ frame_type, frame_factor = None, self._DEFAULT_FRAME_FACTOR + middle_start = data_idx + 1 + else: + frame_type, frame_factor, middle_start = self._parse_frame_type_and_factor( + lines, data_idx + ) if frame_factor is None or not 0.0 < frame_factor <= 1.0: return None @@ -1017,28 +1053,40 @@ class ElmhurstSiteNotesExtractor: (t for t in middle if t in self._ORIENTATION_TOKENS), None ) - # Manufacturer line carries data_source + u_value. - manuf_match = self._MANUFACTURER_RE.match(lines[manuf_idx].strip()) - if manuf_match is None: - return None - data_source = manuf_match.group(1) - u_value = float(manuf_match.group(2)) + # Data-source line carries the source keyword and U-value: joined on + # one line ("Manufacturer 4.80") or, for sparse rows, the keyword alone + # with the U-value on the next line ("BFRC data" / "1.00"). `post_idx` + # is where g_value / draught / shutters begin in either layout. + if joined_match is not None: + data_source = joined_match.group(1) + u_value = float(joined_match.group(2)) + post_idx = manuf_idx + 1 + else: + assert standalone_match is not None + data_source = standalone_match.group(1) + if manuf_idx + 1 >= len(lines): + return None + try: + u_value = float(lines[manuf_idx + 1].strip()) + except ValueError: + return None + post_idx = manuf_idx + 2 - # Post-manufacturer: g_value, draught, shutters. - if manuf_idx + 3 >= len(lines): + # Post-data-source: g_value, draught, shutters. + if post_idx + 2 >= len(lines): return None try: - g_value = float(lines[manuf_idx + 1].strip()) + g_value = float(lines[post_idx].strip()) except ValueError: return None - draught_proofed = lines[manuf_idx + 2].strip().lower() == "yes" - permanent_shutters = lines[manuf_idx + 3].strip() + draught_proofed = lines[post_idx + 1].strip().lower() == "yes" + permanent_shutters = lines[post_idx + 2].strip() # Prefix / suffix tokens (variable count) carry the # glazing-type, building-part, and orientation strings split by # the layout preprocessor. 
before = [lines[j].strip() for j in range(before_start, data_idx) if lines[j].strip()] - after = [lines[j].strip() for j in range(manuf_idx + 4, after_end) if lines[j].strip()] + after = [lines[j].strip() for j in range(post_idx + 3, after_end) if lines[j].strip()] # Room-in-roof windows lodge their location as "Roof of Room in # Roof" (wrapped across the prefix/suffix blocks). Detect it, pull diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index 83a0a9eb..b992254a 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -4116,6 +4116,14 @@ def _is_elmhurst_roof_window( _ELMHURST_BP_ROOF_TYPES_WITH_ROOFLIGHTS ): return True + # A window lodged on a wall is vertical by definition. The U-value + # backstop below only catches skylights whose location/BP gives no + # roof signal; without this guard a high-U *wall* window (e.g. an old + # "Double pre 2002" unit at U 3.1 / 3.4) is mis-routed to the roof- + # window list on U-value alone — cert 001431 §11 lodges two such + # External-wall windows that must remain vertical `sap_windows`. + if "wall" in (w.location or "").lower(): + return False return w.u_value > _ELMHURST_ROOF_WINDOW_U_THRESHOLD diff --git a/tests/domain/modelling/test_window_extraction_001431.py b/tests/domain/modelling/test_window_extraction_001431.py new file mode 100644 index 00000000..b7a81c19 --- /dev/null +++ b/tests/domain/modelling/test_window_extraction_001431.py @@ -0,0 +1,52 @@ +"""Window-extraction completeness pin for cert 001431. + +The Modelling glazing overlay's draught-proofing recompute (RdSAP 10 §8.1 — a +count over openable windows + doors) needs every openable §11 window captured +with its `draught_proofed` flag. The Elmhurst Summary §11 block lodges 17 +openable windows; two extraction gaps previously surfaced only 14: + + 1. 
The extractor rejected the one "Double glazing, known data" row whose + data-source cell is "BFRC data" (laid out as its own line, with no frame + factor) — it does not fit the `{source} {U-value}` Manufacturer-line shape. + 2. The mapper's `_is_elmhurst_roof_window` reclassified the two "Double pre + 2002" rows (U 3.1 / 3.4 > 3.0) as roof windows, even though both are + lodged on an "External wall" — a false positive of the U-value backstop. + +With both closed, all 17 windows are `sap_windows` (none mis-routed to +`sap_roof_windows`), and 14 carry `draught_proofed=True` — reconstructing +Elmhurst's lodged 84% draught-proofing (16/19 = (14 windows + 2 doors) / +(17 windows + 2 doors)). +""" + +from __future__ import annotations + +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from tests.domain.modelling._elmhurst_recommendation import ( + parse_recommendation_summary, +) + + +def test_all_17_openable_windows_captured_on_001431() -> None: + # Arrange / Act + epc: EpcPropertyData = parse_recommendation_summary( + "double_glazing_001431_before.pdf" + ) + + # Assert — every openable §11 window is captured as a vertical window; + # none of the wall-lodged rows leak into the roof-window list. + assert len(epc.sap_windows) == 17 + assert not epc.sap_roof_windows # None or empty — no wall window misrouted + + +def test_draughtproofing_count_reconstructs_lodged_84_percent() -> None: + # Arrange / Act + epc: EpcPropertyData = parse_recommendation_summary( + "double_glazing_001431_before.pdf" + ) + + # Assert — 14 of the 17 openable windows are draught-proofed, the numerator + # behind Elmhurst's lodged 84% (with the 2 lodged draught-proofed doors). + draughtproofed: int = sum( + 1 for window in epc.sap_windows if window.draught_proofed + ) + assert draughtproofed == 14