mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Slice 47: Summary_000474 chain pins SAP at 1e-4 vs worksheet PDF
Two diffs closed against the hand-built `_elmhurst_worksheet_000474`
target (SAP 62.2584):
1. `pumps_fans_kwh_per_yr` (130 → 160). The cascade keys §4f pumps+fans
electricity on `MainHeatingDetail.main_heating_category` (gas-fired
boilers = cat 2 → 160 kWh/yr). `from_elmhurst_site_notes` wasn't
populating the field, so it fell through to the default 130. Added
`_elmhurst_main_heating_category` deriving cat 2 for the gas/LPG-
PCDB-boiler branch; other categories deferred until a fixture
exercises them (consistent with the cascade lookup).
2. Window [4] orientation `East-South` → `East` and window [5]
orientation `''` → `South-East`. The layout-style parser's
`before_start = prev_manuf + 7` / `after_end = next_data` rule was
over-grabbing prefix tokens of W_{k+1} as suffix tokens of W_k
('South' from W_5's prefix bled into W_4's suffix). Replaced with
a symmetric partition on the first glazing-type-start token
(`Single`/`Double`/`Triple`/`Secondary`) within the cross-window
gap, used as the upper bound of W_k's suffix and the lower bound
of W_{k+1}'s prefix. Same boundary on both sides — prefix tokens
of the next window can no longer be attributed as suffix of the
current one.
After both fixes, Summary_000474 → ElmhurstSiteNotes → EpcPropertyData
→ cascade → SAP matches the worksheet PDF's unrounded line 257 value
to 1e-4 tolerance. All 754 datatypes/epc/ + backend/documents_parser/
tests green; pyright net-zero on touched files.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
b6544e1cd1
commit
29ab80b0e5
2 changed files with 76 additions and 16 deletions
|
|
@ -424,6 +424,15 @@ class ElmhurstSiteNotesExtractor:
|
|||
"North", "South", "East", "West", "NE", "NW", "SE", "SW",
|
||||
})
|
||||
_BP_INLINE_TOKENS = frozenset({"Main"}) # "Extension" only appears as suffix
|
||||
# The Elmhurst Summary PDF lodges each window's glazing-type as a
|
||||
# capitalised phrase like "Double between 2002" / "Double with unknown"
|
||||
# / "Single" / "Triple" / "Secondary". The first token of that phrase
|
||||
# marks the start of a new window's prefix block in the layout dump,
|
||||
# which is the only stable signal partitioning one window's suffix
|
||||
# from the next window's prefix.
|
||||
_GLAZING_TYPE_PREFIX_WORDS = frozenset({
|
||||
"Single", "Double", "Triple", "Secondary",
|
||||
})
|
||||
|
||||
def _extract_windows_from_layout(self) -> List[Window]:
|
||||
"""Fallback window parser for Summary PDFs preprocessed from
|
||||
|
|
@ -457,21 +466,34 @@ class ElmhurstSiteNotesExtractor:
|
|||
manuf_idx = self._find_manufacturer_after(lines, data_idx)
|
||||
if manuf_idx is None:
|
||||
continue
|
||||
prev_window_end = (
|
||||
self._estimate_window_end(lines, data_anchors[k - 1][0])
|
||||
if k > 0 else 0
|
||||
prev_manuf_idx = (
|
||||
self._find_manufacturer_after(lines, data_anchors[k - 1][0])
|
||||
if k > 0 else None
|
||||
)
|
||||
next_window_start = (
|
||||
next_data_idx = (
|
||||
data_anchors[k + 1][0] if k + 1 < len(data_anchors) else len(lines)
|
||||
)
|
||||
# Partition the cross-window gap between this window's suffix
|
||||
# and the next window's prefix on the first glazing-type-start
|
||||
# token (Single/Double/Triple/Secondary). The same boundary
|
||||
# is used symmetrically — current window's `after_end` = next
|
||||
# window's `before_start` — so prefix tokens of W_{k+1} never
|
||||
# get attributed as suffix of W_k (which was the bug producing
|
||||
# orientation='East-South' for windows where 'South' actually
|
||||
# belonged to the next row).
|
||||
before_start = (
|
||||
self._partition_after_manuf(lines, prev_manuf_idx, data_idx)
|
||||
if prev_manuf_idx is not None else 0
|
||||
)
|
||||
after_end = self._partition_after_manuf(lines, manuf_idx, next_data_idx)
|
||||
try:
|
||||
window = self._parse_window_from_anchors(
|
||||
lines=lines,
|
||||
data_idx=data_idx,
|
||||
manuf_idx=manuf_idx,
|
||||
anchor=anchor,
|
||||
before_start=prev_window_end,
|
||||
after_end=next_window_start,
|
||||
before_start=before_start,
|
||||
after_end=after_end,
|
||||
)
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
|
@ -485,15 +507,22 @@ class ElmhurstSiteNotesExtractor:
|
|||
return j
|
||||
return None
|
||||
|
||||
def _estimate_window_end(self, lines: List[str], data_idx: int) -> int:
    """Estimate where the window anchored at `data_idx` stops.

    The returned index is exclusive. It serves as the lower bound of
    the "before" segment of the *following* window when suffix tokens
    are extracted.
    """
    manufacturer_line = self._find_manufacturer_after(lines, data_idx)
    # With a manufacturer line: manufacturer + g_value + draught +
    # shutters + ~3 suffix tokens. Without one, the window is taken
    # to end right after its data line.
    return data_idx + 1 if manufacturer_line is None else manufacturer_line + 7
|
||||
def _partition_after_manuf(
|
||||
self, lines: List[str], manuf_idx: int, next_data_idx: int
|
||||
) -> int:
|
||||
"""Return the exclusive upper bound for this window's suffix
|
||||
block (and the inclusive lower bound for the next window's prefix
|
||||
block). After the manufacturer line come 3 fixed tokens (g_value,
|
||||
draught, shutters); the variable suffix lines start at manuf+4
|
||||
and run until the next window's glazing-type-start token (e.g.
|
||||
'Double between 2002', 'Single', 'Triple ...') or until the
|
||||
next window's data line if no such token is present."""
|
||||
scan_start = manuf_idx + 4
|
||||
for j in range(scan_start, next_data_idx):
|
||||
first_word = lines[j].strip().split(" ", 1)[0]
|
||||
if first_word in self._GLAZING_TYPE_PREFIX_WORDS:
|
||||
return j
|
||||
return next_data_idx
|
||||
|
||||
def _parse_window_from_anchors(
|
||||
self,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import re
|
||||
from datetime import date
|
||||
from typing import List, Optional, Sequence, Union, Dict, Any
|
||||
from typing import Any, Dict, Final, List, Optional, Sequence, Union
|
||||
from datatypes.epc.schema.helpers import from_dict
|
||||
|
||||
from datatypes.epc.domain.epc_property_data import (
|
||||
|
|
@ -62,6 +62,7 @@ from datatypes.epc.surveys.elmhurst_site_notes import (
|
|||
BuildingPartDimensions as ElmhurstBuildingPartDimensions,
|
||||
ElmhurstSiteNotes,
|
||||
FloorDetails as ElmhurstFloorDetails,
|
||||
MainHeating as ElmhurstMainHeating,
|
||||
RoofDetails as ElmhurstRoofDetails,
|
||||
VentilationAndCooling as ElmhurstVentilation,
|
||||
WallDetails as ElmhurstWallDetails,
|
||||
|
|
@ -2220,6 +2221,34 @@ def _elmhurst_sap_control_code(sap_control: str) -> Optional[int]:
|
|||
return int(m.group(1)) if m else None
|
||||
|
||||
|
||||
# SAP10.2 Table 4a main-heating-category codes. Currently only the
|
||||
# gas-fired-boiler branch is exercised by the Elmhurst cohort — the
|
||||
# cascade reads `main_heating_category` to key the §4f pumps+fans table
|
||||
# (160 kWh/yr for cat 2 = 115 central heating pump + 45 flue fan) and to
|
||||
# detect heat-network mains (cat 6). Other categories (heat pumps,
|
||||
# warm-air, electric storage, oil/biomass) are deferred until a fixture
|
||||
# exercises them.
|
||||
_ELMHURST_HEATING_CATEGORY_GAS_BOILER: Final[int] = 2
|
||||
_ELMHURST_GAS_BOILER_FUEL_TYPES: frozenset[str] = frozenset({
|
||||
"Mains gas",
|
||||
"LPG bottled",
|
||||
"LPG bulk",
|
||||
"LPG special condition",
|
||||
})
|
||||
|
||||
|
||||
def _elmhurst_main_heating_category(
    mh: ElmhurstMainHeating, pcdb_index: Optional[int]
) -> Optional[int]:
    """Map Elmhurst-lodged main-heating data to a SAP10.2 Table 4a
    main-heating-category code.

    Returns category 2 (gas-fired boilers) when a PCDB index is present
    and the lodged fuel type is one of the mains/LPG gas fuels. Every
    other combination returns None, leaving the cascade on its default
    pumps_fans 130 kWh/yr until more categories are added.
    """
    # No PCDB reference: not the PCDB-boiler branch; fuel type is not read.
    if pcdb_index is None:
        return None
    if mh.fuel_type not in _ELMHURST_GAS_BOILER_FUEL_TYPES:
        return None
    return _ELMHURST_HEATING_CATEGORY_GAS_BOILER
|
||||
|
||||
|
||||
def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
|
||||
mh = survey.main_heating
|
||||
sap_control = mh.heating_controls_sap
|
||||
|
|
@ -2241,6 +2270,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
|
|||
main_fuel_int = _elmhurst_main_fuel_int(mh.fuel_type)
|
||||
heat_emitter_int = _elmhurst_heat_emitter_int(mh.heat_emitter)
|
||||
sap_control_int = _elmhurst_sap_control_code(sap_control)
|
||||
main_heating_category = _elmhurst_main_heating_category(mh, pcdb_index)
|
||||
return SapHeating(
|
||||
instantaneous_wwhrs=InstantaneousWwhrs(),
|
||||
main_heating_details=[
|
||||
|
|
@ -2256,6 +2286,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
|
|||
fan_flue_present=mh.fan_assisted_flue,
|
||||
main_heating_control=sap_control_int if sap_control_int is not None else control,
|
||||
central_heating_pump_age_str=mh.heat_pump_age,
|
||||
main_heating_category=main_heating_category,
|
||||
# Per RdSAP, a PCDB-listed boiler is data source 1
|
||||
# (manufacturer measured efficiency); the integer index
|
||||
# number drives PCDB lookup in the cascade.
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue