diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py
index 4e222bc8..012a9573 100644
--- a/backend/documents_parser/elmhurst_extractor.py
+++ b/backend/documents_parser/elmhurst_extractor.py
@@ -862,7 +862,17 @@ class ElmhurstSiteNotesExtractor:
         # Variable-order tokens between frame_factor and Manufacturer.
         middle = [lines[j].strip() for j in range(middle_start, manuf_idx)]
         glazing_gap = next((t for t in middle if "mm" in t.lower()), None)
-        location = next((t for t in middle if "wall" in t.lower()), "External wall")
+        # Wall-location lodging. Most rows put "External wall" in
+        # `middle`; alt-wall rows (cert 2636 window-4 / cert 9418 alt-
+        # wall window) put "Alternative wall" in the PRE-data slice
+        # (between the previous window's end and W×H×A). Search both
+        # slices so either layout resolves to the correct location.
+        pre_data = [lines[j].strip() for j in range(before_start, data_idx)]
+        location = (
+            next((t for t in middle if "wall" in t.lower()), None)
+            or next((t for t in pre_data if "wall" in t.lower()), None)
+            or "External wall"
+        )
         bp_inline = next((t for t in middle if t in self._BP_INLINE_TOKENS), None)
         orient_inline = next(
             (t for t in middle if t in self._ORIENTATION_TOKENS), None
diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
index f0020843..fa79dabe 100644
--- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
+++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
@@ -63,6 +63,7 @@ _SUMMARY_000903_PDF = _FIXTURES / "Summary_000903.pdf"
 _SUMMARY_000901_PDF = _FIXTURES / "Summary_000901.pdf"  # cert 3800
 _SUMMARY_000904_PDF = _FIXTURES / "Summary_000904.pdf"  # cert 9285
 _SUMMARY_000900_PDF = _FIXTURES / "Summary_000900.pdf"  # cert 2225
+_SUMMARY_000898_PDF = _FIXTURES / "Summary_000898.pdf"  # cert 2636
 
 # GOV.UK EPB API JSON for cert 001479 — the API-path counterpart of the
 # Summary_001479.pdf fixture. Together they drive the API ≡ Summary
@@ -714,6 +715,31 @@ def test_summary_0350_full_chain_sap_within_spec_floor_of_worksheet() -> None:
     assert abs(result.sap_score_continuous - worksheet_unrounded_sap) < _ASHP_COHORT_CHAIN_TOLERANCE
 
 
+def test_summary_2636_alt_wall_window_parses_alternative_wall_location() -> None:
+    # Arrange — cert 2636-0525-2600-0401-2296's §11 Windows block lodges
+    # one alt-wall window (the 1.19 m² north-facing one): the row's
+    # "Alternative wall" string appears BEFORE the W×H×A line, not
+    # after the frame_factor (the normal position for "External wall").
+    # The extractor's `_parse_window_from_anchors` was only scanning
+    # the post-frame_factor `middle` slice for wall-location tokens →
+    # defaulted to "External wall" for the alt-wall row → cascade
+    # allocated the window to the main wall instead of the alt-wall,
+    # leaving Main external walls W/K under-deducted by ~0.54 vs
+    # worksheet (29a). Fix: also scan the PRE-data slice
+    # `lines[before_start:data_idx]` for wall tokens.
+    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000898_PDF)
+    site_notes = ElmhurstSiteNotesExtractor(pages).extract()
+
+    # Act
+    epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
+
+    # Assert — the 1.19 m² window is recorded with wall_type =
+    # "Alternative wall"; all other windows stay on "External wall".
+    by_area = {round(w.window_width, 2): w.window_wall_type for w in epc.sap_windows}
+    assert by_area[1.19] == "Alternative wall"
+    assert by_area[2.25] == "External wall"  # main-wall windows unchanged
+
+
 def test_summary_2225_no_showers_lodged_resolves_to_zero_counts() -> None:
     # Arrange — cert 2225-3062-8205-2856-7204's Summary §1x Baths and
     # Showers block lodges 0 baths and ZERO showers (no shower rows at