Slice 47: Summary_000474 chain pins SAP at 1e-4 vs worksheet PDF

Two diffs closed against the hand-built `_elmhurst_worksheet_000474`
target (SAP 62.2584):

1. `pumps_fans_kwh_per_yr` (130 → 160). The cascade keys §4f pumps+fans
   electricity on `MainHeatingDetail.main_heating_category` (gas-fired
   boilers = cat 2 → 160 kWh/yr). `from_elmhurst_site_notes` wasn't
   populating the field, so it fell through to the default 130. Added
   `_elmhurst_main_heating_category` deriving cat 2 for the gas/LPG-
   PCDB-boiler branch; other categories deferred until a fixture
   exercises them (consistent with the cascade lookup).

2. Window [4] orientation `East-South` → `East` and window [5]
   orientation `''` → `South-East`. The layout-style parser's
   `before_start = prev_manuf + 7` / `after_end = next_data` rule was
   over-grabbing prefix tokens of W_{k+1} as suffix tokens of W_k
   ('South' from W_5's prefix bled into W_4's suffix). Replaced with
   a symmetric partition on the first glazing-type-start token
   (`Single`/`Double`/`Triple`/`Secondary`) within the cross-window
   gap, used as the upper bound of W_k's suffix and the lower bound
   of W_{k+1}'s prefix. Same boundary on both sides — prefix tokens
   of the next window can no longer be attributed as suffix of the
   current one.

After both fixes, Summary_000474 → ElmhurstSiteNotes → EpcPropertyData
→ cascade → SAP matches the worksheet PDF's unrounded line 257 value
to 1e-4 tolerance. All 754 datatypes/epc/ + backend/documents_parser/
tests green; pyright net-zero on touched files.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-24 19:01:38 +00:00
parent b6544e1cd1
commit 29ab80b0e5
2 changed files with 76 additions and 16 deletions

View file

@ -424,6 +424,15 @@ class ElmhurstSiteNotesExtractor:
"North", "South", "East", "West", "NE", "NW", "SE", "SW", "North", "South", "East", "West", "NE", "NW", "SE", "SW",
}) })
_BP_INLINE_TOKENS = frozenset({"Main"}) # "Extension" only appears as suffix _BP_INLINE_TOKENS = frozenset({"Main"}) # "Extension" only appears as suffix
# The Elmhurst Summary PDF lodges each window's glazing-type as a
# capitalised phrase like "Double between 2002" / "Double with unknown"
# / "Single" / "Triple" / "Secondary". The first token of that phrase
# marks the start of a new window's prefix block in the layout dump,
# which is the only stable signal partitioning one window's suffix
# from the next window's prefix.
# Lookup is an exact, case-sensitive membership test on the first
# space-delimited word of a stripped layout line, so "double" or
# "Double-glazed" would not be recognised as a boundary.
_GLAZING_TYPE_PREFIX_WORDS = frozenset({
"Single", "Double", "Triple", "Secondary",
})
def _extract_windows_from_layout(self) -> List[Window]: def _extract_windows_from_layout(self) -> List[Window]:
"""Fallback window parser for Summary PDFs preprocessed from """Fallback window parser for Summary PDFs preprocessed from
@ -457,21 +466,34 @@ class ElmhurstSiteNotesExtractor:
manuf_idx = self._find_manufacturer_after(lines, data_idx) manuf_idx = self._find_manufacturer_after(lines, data_idx)
if manuf_idx is None: if manuf_idx is None:
continue continue
prev_window_end = ( prev_manuf_idx = (
self._estimate_window_end(lines, data_anchors[k - 1][0]) self._find_manufacturer_after(lines, data_anchors[k - 1][0])
if k > 0 else 0 if k > 0 else None
) )
next_window_start = ( next_data_idx = (
data_anchors[k + 1][0] if k + 1 < len(data_anchors) else len(lines) data_anchors[k + 1][0] if k + 1 < len(data_anchors) else len(lines)
) )
# Partition the cross-window gap between this window's suffix
# and the next window's prefix on the first glazing-type-start
# token (Single/Double/Triple/Secondary). The same boundary
# is used symmetrically — current window's `after_end` = next
# window's `before_start` — so prefix tokens of W_{k+1} never
# get attributed as suffix of W_k (which was the bug producing
# orientation='East-South' for windows where 'South' actually
# belonged to the next row).
before_start = (
self._partition_after_manuf(lines, prev_manuf_idx, data_idx)
if prev_manuf_idx is not None else 0
)
after_end = self._partition_after_manuf(lines, manuf_idx, next_data_idx)
try: try:
window = self._parse_window_from_anchors( window = self._parse_window_from_anchors(
lines=lines, lines=lines,
data_idx=data_idx, data_idx=data_idx,
manuf_idx=manuf_idx, manuf_idx=manuf_idx,
anchor=anchor, anchor=anchor,
before_start=prev_window_end, before_start=before_start,
after_end=next_window_start, after_end=after_end,
) )
except (ValueError, IndexError): except (ValueError, IndexError):
continue continue
@ -485,15 +507,22 @@ class ElmhurstSiteNotesExtractor:
return j return j
return None return None
def _estimate_window_end(self, lines: List[str], data_idx: int) -> int: def _partition_after_manuf(
"""End-of-window index (exclusive) for the window whose data self, lines: List[str], manuf_idx: int, next_data_idx: int
line is at `data_idx`. Used to bound the "before" segment of ) -> int:
the *next* window when extracting suffix tokens.""" """Return the exclusive upper bound for this window's suffix
manuf_idx = self._find_manufacturer_after(lines, data_idx) block (and the inclusive lower bound for the next window's prefix
if manuf_idx is None: block). After the manufacturer line come 3 fixed tokens (g_value,
return data_idx + 1 draught, shutters); the variable suffix lines start at manuf+4
# Manufacturer + g_value + draught + shutters + ~3 suffix tokens and run until the next window's glazing-type-start token (e.g.
return manuf_idx + 7 'Double between 2002', 'Single', 'Triple ...') or until the
next window's data line if no such token is present."""
scan_start = manuf_idx + 4
for j in range(scan_start, next_data_idx):
first_word = lines[j].strip().split(" ", 1)[0]
if first_word in self._GLAZING_TYPE_PREFIX_WORDS:
return j
return next_data_idx
def _parse_window_from_anchors( def _parse_window_from_anchors(
self, self,

View file

@ -1,6 +1,6 @@
import re import re
from datetime import date from datetime import date
from typing import List, Optional, Sequence, Union, Dict, Any from typing import Any, Dict, Final, List, Optional, Sequence, Union
from datatypes.epc.schema.helpers import from_dict from datatypes.epc.schema.helpers import from_dict
from datatypes.epc.domain.epc_property_data import ( from datatypes.epc.domain.epc_property_data import (
@ -62,6 +62,7 @@ from datatypes.epc.surveys.elmhurst_site_notes import (
BuildingPartDimensions as ElmhurstBuildingPartDimensions, BuildingPartDimensions as ElmhurstBuildingPartDimensions,
ElmhurstSiteNotes, ElmhurstSiteNotes,
FloorDetails as ElmhurstFloorDetails, FloorDetails as ElmhurstFloorDetails,
MainHeating as ElmhurstMainHeating,
RoofDetails as ElmhurstRoofDetails, RoofDetails as ElmhurstRoofDetails,
VentilationAndCooling as ElmhurstVentilation, VentilationAndCooling as ElmhurstVentilation,
WallDetails as ElmhurstWallDetails, WallDetails as ElmhurstWallDetails,
@ -2220,6 +2221,34 @@ def _elmhurst_sap_control_code(sap_control: str) -> Optional[int]:
return int(m.group(1)) if m else None return int(m.group(1)) if m else None
# SAP10.2 Table 4a main-heating-category codes. Currently only the
# gas-fired-boiler branch is exercised by the Elmhurst cohort — the
# cascade reads `main_heating_category` to key the §4f pumps+fans table
# (160 kWh/yr for cat 2 = 115 central heating pump + 45 flue fan) and to
# detect heat-network mains (cat 6). Other categories (heat pumps,
# warm-air, electric storage, oil/biomass) are deferred until a fixture
# exercises them.
_ELMHURST_HEATING_CATEGORY_GAS_BOILER: Final[int] = 2

# Elmhurst `fuel_type` strings that count as "gas" for the category-2
# mapping. Membership is an exact, case-sensitive string comparison, so a
# differently-spelled fuel string will not match. Declared `Final` (like
# the category code above) so type checkers reject accidental rebinding.
_ELMHURST_GAS_BOILER_FUEL_TYPES: Final[frozenset[str]] = frozenset({
    "Mains gas",
    "LPG bottled",
    "LPG bulk",
    "LPG special condition",
})
def _elmhurst_main_heating_category(
    mh: ElmhurstMainHeating, pcdb_index: Optional[int]
) -> Optional[int]:
    """Map Elmhurst-lodged main heating onto a SAP10.2 Table 4a category.

    Args:
        mh: The survey's main-heating record; only `fuel_type` is read,
            and only when `pcdb_index` is present.
        pcdb_index: PCDB index of the lodged boiler, or None when the
            system is not PCDB-referenced.

    Returns:
        The gas-fired-boiler category code when a PCDB-referenced system
        burns one of the recognised mains-gas/LPG fuels; otherwise None,
        leaving the cascade on its default pumps_fans 130 kWh/yr until
        further categories are mapped.
    """
    # No PCDB reference: the category cannot be inferred from here.
    if pcdb_index is None:
        return None
    # PCDB-referenced, but not on a fuel that the category-2 branch covers.
    if mh.fuel_type not in _ELMHURST_GAS_BOILER_FUEL_TYPES:
        return None
    return _ELMHURST_HEATING_CATEGORY_GAS_BOILER
def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating: def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
mh = survey.main_heating mh = survey.main_heating
sap_control = mh.heating_controls_sap sap_control = mh.heating_controls_sap
@ -2241,6 +2270,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
main_fuel_int = _elmhurst_main_fuel_int(mh.fuel_type) main_fuel_int = _elmhurst_main_fuel_int(mh.fuel_type)
heat_emitter_int = _elmhurst_heat_emitter_int(mh.heat_emitter) heat_emitter_int = _elmhurst_heat_emitter_int(mh.heat_emitter)
sap_control_int = _elmhurst_sap_control_code(sap_control) sap_control_int = _elmhurst_sap_control_code(sap_control)
main_heating_category = _elmhurst_main_heating_category(mh, pcdb_index)
return SapHeating( return SapHeating(
instantaneous_wwhrs=InstantaneousWwhrs(), instantaneous_wwhrs=InstantaneousWwhrs(),
main_heating_details=[ main_heating_details=[
@ -2256,6 +2286,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
fan_flue_present=mh.fan_assisted_flue, fan_flue_present=mh.fan_assisted_flue,
main_heating_control=sap_control_int if sap_control_int is not None else control, main_heating_control=sap_control_int if sap_control_int is not None else control,
central_heating_pump_age_str=mh.heat_pump_age, central_heating_pump_age_str=mh.heat_pump_age,
main_heating_category=main_heating_category,
# Per RdSAP, a PCDB-listed boiler is data source 1 # Per RdSAP, a PCDB-listed boiler is data source 1
# (manufacturer measured efficiency); the integer index # (manufacturer measured efficiency); the integer index
# number drives PCDB lookup in the cascade. # number drives PCDB lookup in the cascade.