From 03c4ea49215458adad559f8943f690a523177043 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 May 2026 21:27:47 +0000 Subject: [PATCH] Slice S0380.12: parse 'Alternative wall' window-location in pre-data slice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cert 2636-0525-2600-0401-2296's Summary §11 Windows block lodges one alt-wall window (1.19 m², north-facing). The PDF layout for alt-wall rows puts the "Alternative wall" string in the slot BEFORE the W×H×A data line — not after frame_factor where regular "External wall" rows put it. Without this fix the extractor's `_parse_window_from_anchors` only scanned the post-frame_factor `middle` slice for wall tokens, defaulted to "External wall" for the alt-wall row, and the cascade allocated the 1.19 m² opening to the main wall instead of the alt-wall — over-deducting from the main wall and leaving the alt-wall gross instead of net, which left the (29a) walls total ~0.54 W/K too high. Fix at `elmhurst_extractor.py:865`: also scan `lines[before_start:data_idx]` (the pre-data slice) for "wall" tokens. Search order: 1. `middle` — first preference (normal layout for regular rows) 2. `pre_data` — alt-wall rows (cert 2636) 3. "External wall" default — no wall lodging found Forcing function: cert 2636 walls_w_per_k moves from 20.5595 to **20.0240 — EXACT match against worksheet (29a) Main 11.9250 + alt.1 8.0990 = 20.0240**. (Header (29a) sum is now fabric-exact; the remaining +0.52 SAP residual on cert 2636 is in the ventilation cascade — HTC 153.97 vs API 159.02 vs worksheet (39) avg 158.85 — to be investigated in a follow-up slice.) Added focused unit test `test_summary_2636_alt_wall_window_parses_alternative_wall_location` that pins the by-area lookup: 1.19 m² → "Alternative wall"; the six 2.25 m² windows stay on "External wall". Guards against future window-location parser regressions. Pyright: 0 errors on the edited extractor + test files. 
Regression suite: 685 pass + 10 fail (handover baseline 669 + 10 + 16 new GREEN tests across S0380.2..S0380.12). Cohort status: cert Δ vs worksheet spec floor? 0380 +0.0594 ✓ 0350 +0.0458 ✓ 2225 +0.0441 ✓ 2636 +0.5167 ✗ (fabric exact; ventilation residual) 3800 +0.0442 ✓ 9285 +0.0502 ✓ 9418 +2.5973 ✗ (Daikin) Spec refs: - Slice 102f-prep.10 (commit 24a7351f) — API-path equivalent "Alt-wall opening allocation per window_wall_type". - SAP 10.2 §3.7 — opening (window + door) deduction from gross wall area, per-window allocated to the lodged wall type. Co-Authored-By: Claude Opus 4.7 --- .../documents_parser/elmhurst_extractor.py | 12 ++++++++- .../tests/test_summary_pdf_mapper_chain.py | 26 +++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index 4e222bc8..012a9573 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -862,7 +862,17 @@ class ElmhurstSiteNotesExtractor: # Variable-order tokens between frame_factor and Manufacturer. middle = [lines[j].strip() for j in range(middle_start, manuf_idx)] glazing_gap = next((t for t in middle if "mm" in t.lower()), None) - location = next((t for t in middle if "wall" in t.lower()), "External wall") + # Wall-location lodging. Most rows put "External wall" in + # `middle`; alt-wall rows (cert 2636 window-4 / cert 9418 alt- + # wall window) put "Alternative wall" in the PRE-data slice + # (between the previous window's end and W×H×A). Search both + # slices so either layout resolves to the correct location. 
+ pre_data = [lines[j].strip() for j in range(before_start, data_idx)] + location = ( + next((t for t in middle if "wall" in t.lower()), None) + or next((t for t in pre_data if "wall" in t.lower()), None) + or "External wall" + ) bp_inline = next((t for t in middle if t in self._BP_INLINE_TOKENS), None) orient_inline = next( (t for t in middle if t in self._ORIENTATION_TOKENS), None diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index f0020843..fa79dabe 100644 --- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -63,6 +63,7 @@ _SUMMARY_000903_PDF = _FIXTURES / "Summary_000903.pdf" _SUMMARY_000901_PDF = _FIXTURES / "Summary_000901.pdf" # cert 3800 _SUMMARY_000904_PDF = _FIXTURES / "Summary_000904.pdf" # cert 9285 _SUMMARY_000900_PDF = _FIXTURES / "Summary_000900.pdf" # cert 2225 +_SUMMARY_000898_PDF = _FIXTURES / "Summary_000898.pdf" # cert 2636 # GOV.UK EPB API JSON for cert 001479 — the API-path counterpart of the # Summary_001479.pdf fixture. Together they drive the API ≡ Summary @@ -714,6 +715,31 @@ def test_summary_0350_full_chain_sap_within_spec_floor_of_worksheet() -> None: assert abs(result.sap_score_continuous - worksheet_unrounded_sap) < _ASHP_COHORT_CHAIN_TOLERANCE +def test_summary_2636_alt_wall_window_parses_alternative_wall_location() -> None: + # Arrange — cert 2636-0525-2600-0401-2296's §11 Windows block lodges + # one alt-wall window (the 1.19 m² north-facing one): the row's + # "Alternative wall" string appears BEFORE the W×H×A line, not + # after the frame_factor (the normal position for "External wall"). 
+ # The extractor's `_parse_window_from_anchors` was only scanning + # the post-frame_factor `middle` slice for wall-location tokens → + # defaulted to "External wall" for the alt-wall row → cascade + # allocated the window to the main wall instead of the alt-wall, + # leaving Main external walls W/K under-deducted by ~0.54 vs + # worksheet (29a). Fix: also scan the PRE-data slice + # `lines[before_start:data_idx]` for wall tokens. + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000898_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + + # Act + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Assert — the 1.19 m² window is recorded with wall_type = + # "Alternative wall"; all other windows stay on "External wall". + by_area = {round(w.window_width, 2): w.window_wall_type for w in epc.sap_windows} + assert by_area[1.19] == "Alternative wall" + assert by_area[2.25] == "External wall" # main-wall windows unchanged + + def test_summary_2225_no_showers_lodged_resolves_to_zero_counts() -> None: # Arrange — cert 2225-3062-8205-2856-7204's Summary §1x Baths and # Showers block lodges 0 baths and ZERO showers (no shower rows at