diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index c77d92cc..822254ca 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -424,6 +424,15 @@ class ElmhurstSiteNotesExtractor: "North", "South", "East", "West", "NE", "NW", "SE", "SW", }) _BP_INLINE_TOKENS = frozenset({"Main"}) # "Extension" only appears as suffix + # The Elmhurst Summary PDF lodges each window's glazing-type as a + # capitalised phrase like "Double between 2002" / "Double with unknown" + # / "Single" / "Triple" / "Secondary". The first token of that phrase + # marks the start of a new window's prefix block in the layout dump, + # which is the only stable signal partitioning one window's suffix + # from the next window's prefix. + _GLAZING_TYPE_PREFIX_WORDS = frozenset({ + "Single", "Double", "Triple", "Secondary", + }) def _extract_windows_from_layout(self) -> List[Window]: """Fallback window parser for Summary PDFs preprocessed from @@ -457,21 +466,34 @@ class ElmhurstSiteNotesExtractor: manuf_idx = self._find_manufacturer_after(lines, data_idx) if manuf_idx is None: continue - prev_window_end = ( - self._estimate_window_end(lines, data_anchors[k - 1][0]) - if k > 0 else 0 + prev_manuf_idx = ( + self._find_manufacturer_after(lines, data_anchors[k - 1][0]) + if k > 0 else None ) - next_window_start = ( + next_data_idx = ( data_anchors[k + 1][0] if k + 1 < len(data_anchors) else len(lines) ) + # Partition the cross-window gap between this window's suffix + # and the next window's prefix on the first glazing-type-start + # token (Single/Double/Triple/Secondary). The same boundary + # is used symmetrically — current window's `after_end` = next + # window's `before_start` — so prefix tokens of W_{k+1} never + # get attributed as suffix of W_k (which was the bug producing + # orientation='East-South' for windows where 'South' actually + # belonged to the next row). 
+ before_start = ( + self._partition_after_manuf(lines, prev_manuf_idx, data_idx) + if prev_manuf_idx is not None else 0 + ) + after_end = self._partition_after_manuf(lines, manuf_idx, next_data_idx) try: window = self._parse_window_from_anchors( lines=lines, data_idx=data_idx, manuf_idx=manuf_idx, anchor=anchor, - before_start=prev_window_end, - after_end=next_window_start, + before_start=before_start, + after_end=after_end, ) except (ValueError, IndexError): continue @@ -485,15 +507,22 @@ class ElmhurstSiteNotesExtractor: return j return None - def _estimate_window_end(self, lines: List[str], data_idx: int) -> int: - """End-of-window index (exclusive) for the window whose data - line is at `data_idx`. Used to bound the "before" segment of - the *next* window when extracting suffix tokens.""" - manuf_idx = self._find_manufacturer_after(lines, data_idx) - if manuf_idx is None: - return data_idx + 1 - # Manufacturer + g_value + draught + shutters + ~3 suffix tokens - return manuf_idx + 7 + def _partition_after_manuf( + self, lines: List[str], manuf_idx: int, next_data_idx: int + ) -> int: + """Return the exclusive upper bound for this window's suffix + block (and the inclusive lower bound for the next window's prefix + block). After the manufacturer line come 3 fixed tokens (g_value, + draught, shutters); the variable suffix lines start at manuf+4 + and run until the next window's glazing-type-start token (e.g. 
+ 'Double between 2002', 'Single', 'Triple ...') or until the + next window's data line if no such token is present.""" + scan_start = manuf_idx + 4 + for j in range(scan_start, next_data_idx): + first_word = lines[j].strip().split(" ", 1)[0] + if first_word in self._GLAZING_TYPE_PREFIX_WORDS: + return j + return next_data_idx def _parse_window_from_anchors( self, diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index aa8f4b19..62dc74ea 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -1,6 +1,6 @@ import re from datetime import date -from typing import List, Optional, Sequence, Union, Dict, Any +from typing import Any, Dict, Final, List, Optional, Sequence, Union from datatypes.epc.schema.helpers import from_dict from datatypes.epc.domain.epc_property_data import ( @@ -62,6 +62,7 @@ from datatypes.epc.surveys.elmhurst_site_notes import ( BuildingPartDimensions as ElmhurstBuildingPartDimensions, ElmhurstSiteNotes, FloorDetails as ElmhurstFloorDetails, + MainHeating as ElmhurstMainHeating, RoofDetails as ElmhurstRoofDetails, VentilationAndCooling as ElmhurstVentilation, WallDetails as ElmhurstWallDetails, @@ -2220,6 +2221,34 @@ def _elmhurst_sap_control_code(sap_control: str) -> Optional[int]: return int(m.group(1)) if m else None +# SAP10.2 Table 4a main-heating-category codes. Currently only the +# gas-fired-boiler branch is exercised by the Elmhurst cohort — the +# cascade reads `main_heating_category` to key the §4f pumps+fans table +# (160 kWh/yr for cat 2 = 115 central heating pump + 45 flue fan) and to +# detect heat-network mains (cat 6). Other categories (heat pumps, +# warm-air, electric storage, oil/biomass) are deferred until a fixture +# exercises them. 
+_ELMHURST_HEATING_CATEGORY_GAS_BOILER: Final[int] = 2 +_ELMHURST_GAS_BOILER_FUEL_TYPES: frozenset[str] = frozenset({ + "Mains gas", + "LPG bottled", + "LPG bulk", + "LPG special condition", +}) + + +def _elmhurst_main_heating_category( + mh: ElmhurstMainHeating, pcdb_index: Optional[int] +) -> Optional[int]: + """Derive the SAP10.2 Table 4a main-heating-category from Elmhurst- + lodged data. A PCDB-referenced boiler on mains/LPG gas is category 2 + (gas-fired boilers); other system types fall through to None so the + cascade applies its default pumps_fans 130 kWh/yr until extended.""" + if pcdb_index is not None and mh.fuel_type in _ELMHURST_GAS_BOILER_FUEL_TYPES: + return _ELMHURST_HEATING_CATEGORY_GAS_BOILER + return None + + def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating: mh = survey.main_heating sap_control = mh.heating_controls_sap @@ -2241,6 +2270,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating: main_fuel_int = _elmhurst_main_fuel_int(mh.fuel_type) heat_emitter_int = _elmhurst_heat_emitter_int(mh.heat_emitter) sap_control_int = _elmhurst_sap_control_code(sap_control) + main_heating_category = _elmhurst_main_heating_category(mh, pcdb_index) return SapHeating( instantaneous_wwhrs=InstantaneousWwhrs(), main_heating_details=[ @@ -2256,6 +2286,7 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating: fan_flue_present=mh.fan_assisted_flue, main_heating_control=sap_control_int if sap_control_int is not None else control, central_heating_pump_age_str=mh.heat_pump_age, + main_heating_category=main_heating_category, # Per RdSAP, a PCDB-listed boiler is data source 1 # (manufacturer measured efficiency); the integer index # number drives PCDB lookup in the cascade.