diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py
index aaaf0135..f1b5748b 100644
--- a/backend/documents_parser/elmhurst_extractor.py
+++ b/backend/documents_parser/elmhurst_extractor.py
@@ -128,6 +128,32 @@ class ElmhurstSiteNotesExtractor:
         text = self._between(start, end)
         return [l.strip() for l in text.splitlines() if l.strip()]
 
+    def _section_lines_first_end(
+        self, start: str, ends: tuple[str, ...],
+    ) -> List[str]:
+        """Like `_section_lines` but accepts multiple end-marker candidates
+        and uses whichever appears first after `start`. Defends against
+        Summary-shape variants where the next-section heading differs
+        (e.g. §14.0 Main Heating1 closes at "14.1 Main Heating2" on
+        boiler/HP certs but at "14.1 Community Heating" on community-
+        heated certs)."""
+        try:
+            s = self._text.index(start) + len(start)
+        except ValueError:
+            return []
+        earliest: int | None = None
+        for end in ends:
+            try:
+                idx = self._text.index(end, s)
+            except ValueError:
+                continue
+            if earliest is None or idx < earliest:
+                earliest = idx
+        if earliest is None:
+            return []
+        text = self._text[s:earliest]
+        return [l.strip() for l in text.splitlines() if l.strip()]
+
     def _local_val(self, lines: List[str], label: str) -> Optional[str]:
         lb = label.rstrip(":")
         lc = lb + ":"
@@ -1171,7 +1197,18 @@ class ElmhurstSiteNotesExtractor:
         )
 
     def _extract_main_heating(self) -> MainHeating:
-        lines = self._section_lines("14.0 Main Heating1", "14.1 Main Heating2")
+        # Community-heated dwellings (e.g. SAP code 301 "Community heating
+        # scheme" per SAP10.2 Table 4a category 6) and "no system" certs
+        # (SAP code 699 "Electric heaters assumed where no system lodged")
+        # lodge §14.0 Main Heating1 directly followed by §14.1 Community
+        # Heating/Heat Network rather than §14.1 Main Heating2 — there is
+        # no second main system on a community-heated dwelling. Close the
+        # §14.0 block at whichever §14.1 form appears first so every
+        # Summary shape surfaces the SAP code.
+        lines = self._section_lines_first_end(
+            "14.0 Main Heating1",
+            ("14.1 Main Heating2", "14.1 Community Heating"),
+        )
         pct_raw = self._local_val(lines, "Percentage of Heat")
         pct = int(pct_raw.split()[0]) if pct_raw else 0
         # §14.0 "Main Heating SAP Code" identifies Main 1 by SAP 10.2
diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
index 925c3294..1d556d78 100644
--- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
+++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
@@ -236,6 +236,40 @@ def test_summary_001479_mapper_extensions_count_matches_extension_bps() -> None:
     assert len(epc.sap_building_parts) == 3
 
 
+def test_summary_001431_community_heating_1_main_heating_sap_code_extracted_when_no_main_heating_2_block() -> None:
+    # Arrange — Heating-systems corpus fixture 001431 / "community heating 1"
+    # lodges §14.0 Main Heating1 directly followed by §14.1 Community
+    # Heating/Heat Network (no §14.1 Main Heating2 block, since community-
+    # heated dwellings don't have a second main system to lodge). The §14.0
+    # block carries `Main Heating SAP Code: 301` (Community heating per
+    # SAP10.2 Table 4a category 6 — "Heat networks").
+    #
+    # Pre-slice the extractor's `_section_lines("14.0 Main Heating1",
+    # "14.1 Main Heating2")` returned an empty list because the end marker
+    # was missing, so every §14.0 field (incl. `Main Heating SAP Code`)
+    # came back as None. The mapper then raised `UnmappedElmhurstLabel`
+    # with "§14.0 Main Heating1 has neither PCDF boiler reference (None)
+    # nor SAP code (None)" — blocking all 6 community-heated + "no system"
+    # corpus variants from cascade execution.
+    #
+    # The fix closes the §14.0 block at whichever §14.1 marker appears
+    # first ("14.1 Main Heating2" or "14.1 Community Heating"), so the
+    # SAP code surfaces correctly on every Summary shape.
+    summary_pdf = (
+        Path(__file__).parents[3]
+        / "sap worksheets/heating systems examples/community heating 1/Summary_001431.pdf"
+    )
+    pages = _summary_pdf_to_textract_style_pages(summary_pdf)
+    site_notes = ElmhurstSiteNotesExtractor(pages).extract()
+
+    # Act
+    epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
+
+    # Assert
+    main_1 = epc.sap_heating.main_heating_details[0]
+    assert main_1.sap_main_heating_code == 301
+
+
 def test_summary_001431_pcdb_1_inaccessible_cylinder_resolves_to_normal_per_rdsap_10_table_28() -> None:
     # Arrange — Heating-systems corpus fixture 001431 / "pcdb 1" lodges
     # §15.1 "Cylinder Size: No Access" (the Elmhurst inaccessible-cylinder