class ElmhurstSiteNotesExtractor:
    # Excerpt: only the members added by this change are shown here; the
    # rest of the class lives in backend/documents_parser/elmhurst_extractor.py.

    # A "W H" pair on its own line (e.g. "5.79 2.00") whose Area cell the
    # layout preprocessor pushed onto the following line as a lone decimal
    # ("11.58"). Wider Area columns in the §11 grid trigger the 2+-space
    # split; narrower ones keep all three on one line (the 3-decimal anchor).
    _WIDTH_HEIGHT_RE = re.compile(r"^(\d+\.\d+)\s+(\d+\.\d+)$")
    _AREA_ONLY_RE = re.compile(r"^(\d+\.\d+)$")

    def _merge_split_dimension_lines(self, lines: List[str]) -> List[str]:
        """Re-join a "W H" line with a following bare-Area line.

        A line that (after stripping) is exactly two decimals, immediately
        followed by a line that is exactly one decimal, is collapsed into a
        single "W H Area" line — the shape the data anchor expects — when
        the lone decimal is within 0.05 of W × H. The product gate keeps an
        unrelated lone decimal below a "W H" line (a frame factor, g-value
        or U-value) from being absorbed. Lines that already carry three
        decimals never match the two-decimal pattern and pass through
        untouched.

        The gate is evaluated with ``Decimal`` on the lodged text rather
        than with floats: in binary floating point ``1.05 - 1.0`` is
        0.050000000000000044, so an Area exactly 0.05 off the product was
        accepted or rejected depending on representation error.

        Args:
            lines: Text lines of the window section, in document order.

        Returns:
            A new list. Merged pairs are replaced by one line holding the
            three original number strings joined by single spaces (the
            surrounding whitespace of the two source lines is dropped);
            every other line is copied unchanged.
        """
        # Function-scope import keeps this change self-contained.
        from decimal import Decimal

        tolerance = Decimal("0.05")
        merged: List[str] = []
        i = 0
        while i < len(lines):
            wh = self._WIDTH_HEIGHT_RE.match(lines[i].strip())
            # Only look ahead when the current line is a "W H" pair and a
            # next line actually exists.
            area = (
                self._AREA_ONLY_RE.match(lines[i + 1].strip())
                if wh is not None and i + 1 < len(lines)
                else None
            )
            if wh is not None and area is not None:
                width = Decimal(wh.group(1))
                height = Decimal(wh.group(2))
                lodged_area = Decimal(area.group(1))
                if abs(width * height - lodged_area) <= tolerance:
                    merged.append(f"{wh.group(1)} {wh.group(2)} {area.group(1)}")
                    # The Area line has been consumed together with "W H".
                    i += 2
                    continue
            merged.append(lines[i])
            i += 1
        return merged
def test_summary_001431_case20_extracts_all_five_section11_windows() -> None:
    # Arrange — §11 of sim case 20 lodges 5 windows, all labelled
    # "Double between 2002 and 2021". The label wraps onto a second PDF
    # line, so pdftotext interleaves the continuation ("and 2021") with
    # the cells of the following row; the window parser has to cope with
    # that layout.
    textract_pages = _summary_pdf_to_textract_style_pages(_SUMMARY_001431_CASE20_PDF)

    # Act
    extractor = ElmhurstSiteNotesExtractor(textract_pages)
    extracted = extractor.extract()

    # Assert
    assert len(extracted.windows) == 5