Slice 47: Summary_000474 chain pins SAP at 1e-4 vs worksheet PDF

Two diffs closed against the hand-built `_elmhurst_worksheet_000474` target (SAP 62.2584):

1. `pumps_fans_kwh_per_yr` (130 → 160). The cascade keys §4f pumps+fans electricity on `MainHeatingDetail.main_heating_category` (gas-fired boilers = cat 2 → 160 kWh/yr). `from_elmhurst_site_notes` wasn't populating the field, so it fell through to the default 130. Added `_elmhurst_main_heating_category`, which derives cat 2 for the gas/LPG PCDB-boiler branch; other categories are deferred until a fixture exercises them (consistent with the cascade lookup).

2. Window [4] orientation `East-South` → `East` and window [5] orientation `''` → `South-East`. The layout-style parser's `before_start = prev_manuf + 7` / `after_end = next_data` rule was over-grabbing prefix tokens of W_{k+1} as suffix tokens of W_k ('South' from W_5's prefix bled into W_4's suffix). Replaced with a symmetric partition on the first glazing-type-start token (`Single`/`Double`/`Triple`/`Secondary`) within the cross-window gap, used as the upper bound of W_k's suffix and the lower bound of W_{k+1}'s prefix. Same boundary on both sides — prefix tokens of the next window can no longer be attributed as suffix of the current one.

After both fixes, Summary_000474 → ElmhurstSiteNotes → EpcPropertyData → cascade → SAP matches the worksheet PDF's unrounded line 257 value to 1e-4 tolerance.

All 754 datatypes/epc/ + backend/documents_parser/ tests green; pyright net-zero on touched files.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-30 13:10:47 +00:00 · 2026-05-24 19:01:38 +00:00 · 2026-05-24 19:01:38 +00:00 · 29ab80b0e5
commit 29ab80b0e5
parent b6544e1cd1
2 changed files with 76 additions and 16 deletions
--- a/backend/documents_parser/elmhurst_extractor.py
+++ b/backend/documents_parser/elmhurst_extractor.py
@ -424,6 +424,15 @@ class ElmhurstSiteNotesExtractor:
        "North", "South", "East", "West", "NE", "NW", "SE", "SW",
    })
    _BP_INLINE_TOKENS = frozenset({"Main"})  # "Extension" only appears as suffix
    # The Elmhurst Summary PDF lodges each window's glazing-type as a
    # capitalised phrase like "Double between 2002" / "Double with unknown"
    # / "Single" / "Triple" / "Secondary". The first token of that phrase
    # marks the start of a new window's prefix block in the layout dump,
    # which is the only stable signal partitioning one window's suffix
    # from the next window's prefix.
    _GLAZING_TYPE_PREFIX_WORDS = frozenset({
        "Single", "Double", "Triple", "Secondary",
    })
    def _extract_windows_from_layout(self) -> List[Window]:
        """Fallback window parser for Summary PDFs preprocessed from
@ -457,21 +466,34 @@ class ElmhurstSiteNotesExtractor:
            manuf_idx = self._find_manufacturer_after(lines, data_idx)
            if manuf_idx is None:
                continue
-            prev_window_end = (
+            prev_manuf_idx = (
-                self._estimate_window_end(lines, data_anchors[k - 1][0])
+                self._find_manufacturer_after(lines, data_anchors[k - 1][0])
-                if k > 0 else 0
+                if k > 0 else None
            )
-            next_window_start = (
+            next_data_idx = (
                data_anchors[k + 1][0] if k + 1 < len(data_anchors) else len(lines)
            )
            # Partition the cross-window gap between this window's suffix
            # and the next window's prefix on the first glazing-type-start
            # token (Single/Double/Triple/Secondary). The same boundary
            # is used symmetrically — current window's `after_end` = next
            # window's `before_start` — so prefix tokens of W_{k+1} never
            # get attributed as suffix of W_k (which was the bug producing
            # orientation='East-South' for windows where 'South' actually
            # belonged to the next row).
            before_start = (
                self._partition_after_manuf(lines, prev_manuf_idx, data_idx)
                if prev_manuf_idx is not None else 0
            )
            after_end = self._partition_after_manuf(lines, manuf_idx, next_data_idx)
            try:
                window = self._parse_window_from_anchors(
                    lines=lines,
                    data_idx=data_idx,
                    manuf_idx=manuf_idx,
                    anchor=anchor,
-                    before_start=prev_window_end,
+                    before_start=before_start,
-                    after_end=next_window_start,
+                    after_end=after_end,
                )
            except (ValueError, IndexError):
                continue
@ -485,15 +507,22 @@ class ElmhurstSiteNotesExtractor:
                return j
        return None
-    def _estimate_window_end(self, lines: List[str], data_idx: int) -> int:
+    def _partition_after_manuf(
-        """End-of-window index (exclusive) for the window whose data
+        self, lines: List[str], manuf_idx: int, next_data_idx: int
-        line is at `data_idx`. Used to bound the "before" segment of
+    ) -> int:
-        the *next* window when extracting suffix tokens."""
+        """Return the exclusive upper bound for this window's suffix
-        manuf_idx = self._find_manufacturer_after(lines, data_idx)
+        block (and the inclusive lower bound for the next window's prefix
-        if manuf_idx is None:
+        block). After the manufacturer line come 3 fixed tokens (g_value,
-            return data_idx + 1
+        draught, shutters); the variable suffix lines start at manuf+4
-        # Manufacturer + g_value + draught + shutters + ~3 suffix tokens
+        and run until the next window's glazing-type-start token (e.g.
-        return manuf_idx + 7
+        'Double between 2002', 'Single', 'Triple ...') or until the
        next window's data line if no such token is present."""
        scan_start = manuf_idx + 4
        for j in range(scan_start, next_data_idx):
            first_word = lines[j].strip().split(" ", 1)[0]
            if first_word in self._GLAZING_TYPE_PREFIX_WORDS:
                return j
        return next_data_idx
    def _parse_window_from_anchors(
        self,
--- a/datatypes/epc/domain/mapper.py
+++ b/datatypes/epc/domain/mapper.py
@ -1,6 +1,6 @@
 import re
 from datetime import date
-from typing import List, Optional, Sequence, Union, Dict, Any
+from typing import Any, Dict, Final, List, Optional, Sequence, Union
 from datatypes.epc.schema.helpers import from_dict
 from datatypes.epc.domain.epc_property_data import (
@ -62,6 +62,7 @@ from datatypes.epc.surveys.elmhurst_site_notes import (
    BuildingPartDimensions as ElmhurstBuildingPartDimensions,
    ElmhurstSiteNotes,
    FloorDetails as ElmhurstFloorDetails,
    MainHeating as ElmhurstMainHeating,
    RoofDetails as ElmhurstRoofDetails,
    VentilationAndCooling as ElmhurstVentilation,
    WallDetails as ElmhurstWallDetails,
@ -2220,6 +2221,34 @@ def _elmhurst_sap_control_code(sap_control: str) -> Optional[int]:
    return int(m.group(1)) if m else None
 # SAP10.2 Table 4a main-heating-category codes. Currently only the
 # gas-fired-boiler branch is exercised by the Elmhurst cohort — the
 # cascade reads `main_heating_category` to key the §4f pumps+fans table
 # (160 kWh/yr for cat 2 = 115 central heating pump + 45 flue fan) and to
 # detect heat-network mains (cat 6). Other categories (heat pumps,
 # warm-air, electric storage, oil/biomass) are deferred until a fixture
 # exercises them.
 # Main-heating-category code returned for gas-fired boilers; currently the
 # only category `_elmhurst_main_heating_category` derives.
 _ELMHURST_HEATING_CATEGORY_GAS_BOILER: Final[int] = 2
 # Elmhurst `fuel_type` strings that count as gas (mains or LPG) when
 # `_elmhurst_main_heating_category` tests for the gas-boiler category.
 _ELMHURST_GAS_BOILER_FUEL_TYPES: Final[frozenset[str]] = frozenset({
    "Mains gas",
    "LPG bottled",
    "LPG bulk",
    "LPG special condition",
 })
 def _elmhurst_main_heating_category(
    mh: ElmhurstMainHeating, pcdb_index: Optional[int]
 ) -> Optional[int]:
    """Return the main-heating-category code for an Elmhurst main heating
    record, or None when no category can be derived.

    Only one branch is recognised: a system that carries a PCDB index and
    whose lodged `fuel_type` is one of `_ELMHURST_GAS_BOILER_FUEL_TYPES`
    maps to `_ELMHURST_HEATING_CATEGORY_GAS_BOILER`. Every other
    combination yields None, leaving the caller to apply whatever default
    it uses for an unset category.
    """
    # No PCDB reference: not the PCDB-boiler branch, so nothing to derive.
    # `mh.fuel_type` is deliberately not read in this case.
    if pcdb_index is None:
        return None
    # PCDB-referenced, but not burning mains gas / LPG.
    if mh.fuel_type not in _ELMHURST_GAS_BOILER_FUEL_TYPES:
        return None
    return _ELMHURST_HEATING_CATEGORY_GAS_BOILER
 def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
    mh = survey.main_heating
    sap_control = mh.heating_controls_sap
@ -2241,6 +2270,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
    main_fuel_int = _elmhurst_main_fuel_int(mh.fuel_type)
    heat_emitter_int = _elmhurst_heat_emitter_int(mh.heat_emitter)
    sap_control_int = _elmhurst_sap_control_code(sap_control)
    main_heating_category = _elmhurst_main_heating_category(mh, pcdb_index)
    return SapHeating(
        instantaneous_wwhrs=InstantaneousWwhrs(),
        main_heating_details=[
@ -2256,6 +2286,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
                fan_flue_present=mh.fan_assisted_flue,
                main_heating_control=sap_control_int if sap_control_int is not None else control,
                central_heating_pump_age_str=mh.heat_pump_age,
                main_heating_category=main_heating_category,
                # Per RdSAP, a PCDB-listed boiler is data source 1
                # (manufacturer measured efficiency); the integer index
                # number drives PCDB lookup in the cascade.