Slice 47: Summary_000474 chain pins SAP at 1e-4 vs worksheet PDF

Two diffs closed against the hand-built `_elmhurst_worksheet_000474`
target (SAP 62.2584):

1. `pumps_fans_kwh_per_yr` (130 → 160). The cascade keys §4f pumps+fans
   electricity on `MainHeatingDetail.main_heating_category` (gas-fired
   boilers = cat 2 → 160 kWh/yr). `from_elmhurst_site_notes` wasn't
   populating the field, so it fell through to the default 130. Added
   `_elmhurst_main_heating_category`, which derives cat 2 for a
   PCDB-referenced boiler on mains gas or LPG; other categories are
   deferred until a fixture exercises them (consistent with the
   cascade lookup).

2. Window [4] orientation `East-South` → `East` and window [5]
   orientation `''` → `South-East`. The layout-style parser's
   `before_start = prev_manuf + 7` / `after_end = next_data` rule was
   over-grabbing prefix tokens of W_{k+1} as suffix tokens of W_k
   ('South' from W_5's prefix bled into W_4's suffix). Replaced with
   a symmetric partition on the first glazing-type-start token
   (`Single`/`Double`/`Triple`/`Secondary`) within the cross-window
   gap, used as the upper bound of W_k's suffix and the lower bound
   of W_{k+1}'s prefix. Same boundary on both sides — prefix tokens
   of the next window can no longer be attributed as suffix of the
   current one.

After both fixes, Summary_000474 → ElmhurstSiteNotes → EpcPropertyData
→ cascade → SAP matches the worksheet PDF's unrounded line 257 value
to 1e-4 tolerance. All 754 datatypes/epc/ + backend/documents_parser/
tests green; pyright net-zero on touched files.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-24 19:01:38 +00:00
parent b6544e1cd1
commit 29ab80b0e5
2 changed files with 76 additions and 16 deletions

View file

@ -424,6 +424,15 @@ class ElmhurstSiteNotesExtractor:
"North", "South", "East", "West", "NE", "NW", "SE", "SW",
})
_BP_INLINE_TOKENS = frozenset({"Main"}) # "Extension" only appears as suffix
# The Elmhurst Summary PDF lodges each window's glazing-type as a
# capitalised phrase like "Double between 2002" / "Double with unknown"
# / "Single" / "Triple" / "Secondary". The first token of that phrase
# marks the start of a new window's prefix block in the layout dump,
# which is the only stable signal partitioning one window's suffix
# from the next window's prefix.
_GLAZING_TYPE_PREFIX_WORDS = frozenset({
"Single", "Double", "Triple", "Secondary",
})
def _extract_windows_from_layout(self) -> List[Window]:
"""Fallback window parser for Summary PDFs preprocessed from
@ -457,21 +466,34 @@ class ElmhurstSiteNotesExtractor:
manuf_idx = self._find_manufacturer_after(lines, data_idx)
if manuf_idx is None:
continue
prev_window_end = (
self._estimate_window_end(lines, data_anchors[k - 1][0])
if k > 0 else 0
prev_manuf_idx = (
self._find_manufacturer_after(lines, data_anchors[k - 1][0])
if k > 0 else None
)
next_window_start = (
next_data_idx = (
data_anchors[k + 1][0] if k + 1 < len(data_anchors) else len(lines)
)
# Partition the cross-window gap between this window's suffix
# and the next window's prefix on the first glazing-type-start
# token (Single/Double/Triple/Secondary). The same boundary
# is used symmetrically — current window's `after_end` = next
# window's `before_start` — so prefix tokens of W_{k+1} never
# get attributed as suffix of W_k (which was the bug producing
# orientation='East-South' for windows where 'South' actually
# belonged to the next row).
before_start = (
self._partition_after_manuf(lines, prev_manuf_idx, data_idx)
if prev_manuf_idx is not None else 0
)
after_end = self._partition_after_manuf(lines, manuf_idx, next_data_idx)
try:
window = self._parse_window_from_anchors(
lines=lines,
data_idx=data_idx,
manuf_idx=manuf_idx,
anchor=anchor,
before_start=prev_window_end,
after_end=next_window_start,
before_start=before_start,
after_end=after_end,
)
except (ValueError, IndexError):
continue
@ -485,15 +507,22 @@ class ElmhurstSiteNotesExtractor:
return j
return None
def _estimate_window_end(self, lines: List[str], data_idx: int) -> int:
"""End-of-window index (exclusive) for the window whose data
line is at `data_idx`. Used to bound the "before" segment of
the *next* window when extracting suffix tokens."""
manuf_idx = self._find_manufacturer_after(lines, data_idx)
if manuf_idx is None:
return data_idx + 1
# Manufacturer + g_value + draught + shutters + ~3 suffix tokens
return manuf_idx + 7
def _partition_after_manuf(
self, lines: List[str], manuf_idx: int, next_data_idx: int
) -> int:
"""Return the exclusive upper bound for this window's suffix
block (and the inclusive lower bound for the next window's prefix
block). After the manufacturer line come 3 fixed tokens (g_value,
draught, shutters); the variable suffix lines start at manuf+4
and run until the next window's glazing-type-start token (e.g.
'Double between 2002', 'Single', 'Triple ...') or until the
next window's data line if no such token is present."""
scan_start = manuf_idx + 4
for j in range(scan_start, next_data_idx):
first_word = lines[j].strip().split(" ", 1)[0]
if first_word in self._GLAZING_TYPE_PREFIX_WORDS:
return j
return next_data_idx
def _parse_window_from_anchors(
self,

View file

@ -1,6 +1,6 @@
import re
from datetime import date
from typing import List, Optional, Sequence, Union, Dict, Any
from typing import Any, Dict, Final, List, Optional, Sequence, Union
from datatypes.epc.schema.helpers import from_dict
from datatypes.epc.domain.epc_property_data import (
@ -62,6 +62,7 @@ from datatypes.epc.surveys.elmhurst_site_notes import (
BuildingPartDimensions as ElmhurstBuildingPartDimensions,
ElmhurstSiteNotes,
FloorDetails as ElmhurstFloorDetails,
MainHeating as ElmhurstMainHeating,
RoofDetails as ElmhurstRoofDetails,
VentilationAndCooling as ElmhurstVentilation,
WallDetails as ElmhurstWallDetails,
@ -2220,6 +2221,34 @@ def _elmhurst_sap_control_code(sap_control: str) -> Optional[int]:
return int(m.group(1)) if m else None
# SAP10.2 Table 4a main-heating-category codes. Currently only the
# gas-fired-boiler branch is exercised by the Elmhurst cohort — the
# cascade reads `main_heating_category` to key the §4f pumps+fans table
# (160 kWh/yr for cat 2 = 115 central heating pump + 45 flue fan) and to
# detect heat-network mains (cat 6). Other categories (heat pumps,
# warm-air, electric storage, oil/biomass) are deferred until a fixture
# exercises them.
_ELMHURST_HEATING_CATEGORY_GAS_BOILER: Final[int] = 2
# Elmhurst `fuel_type` strings that qualify a PCDB-referenced boiler for
# the gas-fired-boiler category above: mains gas plus the LPG variants.
# Membership is an exact string match.
_ELMHURST_GAS_BOILER_FUEL_TYPES: Final[frozenset[str]] = frozenset({
"Mains gas",
"LPG bottled",
"LPG bulk",
"LPG special condition",
})
def _elmhurst_main_heating_category(
    mh: ElmhurstMainHeating, pcdb_index: Optional[int]
) -> Optional[int]:
    """Map Elmhurst-lodged main-heating data to a SAP10.2 Table 4a
    main-heating-category code.

    Only one case is recognised: a boiler with a PCDB index whose fuel
    type is mains gas or one of the LPG variants, which yields
    category 2 (gas-fired boilers). Every other combination returns
    None, leaving the cascade on its default pumps_fans figure
    (130 kWh/yr) until more categories are added.
    """
    # No PCDB reference: the fuel type is not consulted at all.
    if pcdb_index is None:
        return None
    if mh.fuel_type not in _ELMHURST_GAS_BOILER_FUEL_TYPES:
        return None
    return _ELMHURST_HEATING_CATEGORY_GAS_BOILER
def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
mh = survey.main_heating
sap_control = mh.heating_controls_sap
@ -2241,6 +2270,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
main_fuel_int = _elmhurst_main_fuel_int(mh.fuel_type)
heat_emitter_int = _elmhurst_heat_emitter_int(mh.heat_emitter)
sap_control_int = _elmhurst_sap_control_code(sap_control)
main_heating_category = _elmhurst_main_heating_category(mh, pcdb_index)
return SapHeating(
instantaneous_wwhrs=InstantaneousWwhrs(),
main_heating_details=[
@ -2256,6 +2286,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
fan_flue_present=mh.fan_assisted_flue,
main_heating_control=sap_control_int if sap_control_int is not None else control,
central_heating_pump_age_str=mh.heat_pump_age,
main_heating_category=main_heating_category,
# Per RdSAP, a PCDB-listed boiler is data source 1
# (manufacturer measured efficiency); the integer index
# number drives PCDB lookup in the cascade.