From 729ee29c840d7292c17c2c07ed2f1bc494da5496 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 31 May 2026 08:26:24 +0000 Subject: [PATCH] =?UTF-8?q?Slice=20S0380.128:=20extractor=20=C2=A714.0=20c?= =?UTF-8?q?losure=20falls=20back=20to=20"14.1=20Community=20Heating"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Elmhurst Summary §14.0 Main Heating1 normally closes at "14.1 Main Heating2", but community-heated dwellings and "no system" certs lodge §14.0 followed directly by "14.1 Community Heating/Heat Network" (no second main system exists on a community-heated dwelling). Pre-slice the extractor's `_between("14.0 Main Heating1", "14.1 Main Heating2")` returned an empty string for these shapes — every §14.0 field (including `Main Heating SAP Code`) came back None, then the mapper strict-raised `UnmappedElmhurstLabel` with "§14.0 Main Heating1 has neither PCDF boiler reference (None) nor SAP code (None)". The fix adds a `_section_lines_first_end(start, ends)` helper that accepts a tuple of end-marker candidates and uses whichever appears first after `start`. `_extract_main_heating` now closes §14.0 at either "14.1 Main Heating2" or "14.1 Community Heating" — whichever Summary lodges. Impact on heating-systems corpus 001431 at `sap worksheets/heating systems examples/`: Variant Pre-S0380.128 -> Post-S0380.128 ------------------------ ------------------ ----------------- community heating 1 mapper-raise -> SAP code 301 OK community heating 2 mapper-raise -> SAP code 302 OK community heating 3 mapper-raise -> SAP code 304 OK community heating 4 mapper-raise -> SAP code 302 OK community heating 6 mapper-raise -> SAP code 302 OK no system mapper-raise -> SAP code 699 OK Corpus tally: **35/41 -> 41/41 cascade-OK**. With all populated variants now executing, the cascade-vs-worksheet residual cluster is fully visible for the first time. Notably community heating 6 surfaces the FIRST negative ΔSAP in the corpus (-6.87 — cascade undershooting the worksheet rather than overshooting), a distinct diagnostic shape worth investigating next. The fix is structural (extractor section bracketing) — no spec rule to cite. RdSAP 10 §17 page 85 row 1.0 ("Main Heating") + §17 row 10-1a ("Community Heat Source") confirm that community-heated certs have only one main heating system (no Main 2 block). Extended handover suite at HEAD post-slice: **832 pass, 0 fail** (was 831 + 1 new AAA test). Pyright net-zero on touched files (13 → 13 — pre-existing errors unrelated). Co-Authored-By: Claude Opus 4.7 --- .../documents_parser/elmhurst_extractor.py | 39 ++++++++++++++++++- .../tests/test_summary_pdf_mapper_chain.py | 34 ++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index aaaf0135..f1b5748b 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -128,6 +128,32 @@ class ElmhurstSiteNotesExtractor: text = self._between(start, end) return [l.strip() for l in text.splitlines() if l.strip()] + def _section_lines_first_end( + self, start: str, ends: tuple[str, ...], + ) -> List[str]: + """Like `_section_lines` but accepts multiple end-marker candidates + and uses whichever appears first after `start`. Defends against + Summary-shape variants where the next-section heading differs + (e.g. §14.0 Main Heating1 closes at "14.1 Main Heating2" on + boiler/HP certs but at "14.1 Community Heating" on community- + heated certs).""" + try: + s = self._text.index(start) + len(start) + except ValueError: + return [] + earliest: int | None = None + for end in ends: + try: + idx = self._text.index(end, s) + except ValueError: + continue + if earliest is None or idx < earliest: + earliest = idx + if earliest is None: + return [] + text = self._text[s:earliest] + return [l.strip() for l in text.splitlines() if l.strip()] + def _local_val(self, lines: List[str], label: str) -> Optional[str]: lb = label.rstrip(":") lc = lb + ":" @@ -1171,7 +1197,18 @@ class ElmhurstSiteNotesExtractor: ) def _extract_main_heating(self) -> MainHeating: - lines = self._section_lines("14.0 Main Heating1", "14.1 Main Heating2") + # Community-heated dwellings (e.g. SAP code 301 "Community heating + # scheme" per SAP10.2 Table 4a category 6) and "no system" certs + # (SAP code 699 "Electric heaters assumed where no system lodged") + # lodge §14.0 Main Heating1 directly followed by §14.1 Community + # Heating/Heat Network rather than §14.1 Main Heating2 — there is + # no second main system on a community-heated dwelling. Close the + # §14.0 block at whichever §14.1 form appears first so every + # Summary shape surfaces the SAP code. + lines = self._section_lines_first_end( + "14.0 Main Heating1", + ("14.1 Main Heating2", "14.1 Community Heating"), + ) pct_raw = self._local_val(lines, "Percentage of Heat") pct = int(pct_raw.split()[0]) if pct_raw else 0 # §14.0 "Main Heating SAP Code" identifies Main 1 by SAP 10.2 diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index 925c3294..1d556d78 100644 --- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -236,6 +236,40 @@ def test_summary_001479_mapper_extensions_count_matches_extension_bps() -> None: assert len(epc.sap_building_parts) == 3 +def test_summary_001431_community_heating_1_main_heating_sap_code_extracted_when_no_main_heating_2_block() -> None: + # Arrange — Heating-systems corpus fixture 001431 / "community heating 1" + # lodges §14.0 Main Heating1 directly followed by §14.1 Community + # Heating/Heat Network (no §14.1 Main Heating2 block, since community- + # heated dwellings don't have a second main system to lodge). The §14.0 + # block carries `Main Heating SAP Code: 301` (Community heating per + # SAP10.2 Table 4a category 6 — "Heat networks"). + # + # Pre-slice the extractor's `_section_lines("14.0 Main Heating1", + # "14.1 Main Heating2")` returned an empty list because the end marker + # was missing, so every §14.0 field (incl. `Main Heating SAP Code`) + # came back as None. The mapper then raised `UnmappedElmhurstLabel` + # with "§14.0 Main Heating1 has neither PCDF boiler reference (None) + # nor SAP code (None)" — blocking all 6 community-heated + "no system" + # corpus variants from cascade execution. + # + # The fix closes the §14.0 block at whichever §14.1 marker appears + # first ("14.1 Main Heating2" or "14.1 Community Heating"), so the + # SAP code surfaces correctly on every Summary shape. + summary_pdf = ( + Path(__file__).parents[3] + / "sap worksheets/heating systems examples/community heating 1/Summary_001431.pdf" + ) + pages = _summary_pdf_to_textract_style_pages(summary_pdf) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + + # Act + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Assert + main_1 = epc.sap_heating.main_heating_details[0] + assert main_1.sap_main_heating_code == 301 + + def test_summary_001431_pcdb_1_inaccessible_cylinder_resolves_to_normal_per_rdsap_10_table_28() -> None: # Arrange — Heating-systems corpus fixture 001431 / "pcdb 1" lodges # §15.1 "Cylinder Size: No Access" (the Elmhurst inaccessible-cylinder