diff --git a/backend/documents_parser/tests/fixtures/Summary_000565.pdf b/backend/documents_parser/tests/fixtures/Summary_000565.pdf new file mode 100644 index 00000000..8d31885b Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_000565.pdf differ diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index 348d27b9..1e5cec98 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -3724,10 +3724,23 @@ def _elmhurst_cylinder_insulation_code( # fixing the upstream extractor is deferred to a future slice. _ELMHURST_GLAZING_LABEL_TO_SAP10: Dict[str, int] = { "Single": 1, + # Elmhurst §11 lodgement variant of the bare "Single" form — surfaced + # on cert 000565 Window 3 (Wood frame, U=3.35, g=0.85). Same enum as + # "Single" per Table U2 code 1; g_L=0.90 / g⊥=0.85. + "Single glazing": 1, "Double pre 2002": 2, "Double between 2002 and 2021": 3, "Double with unknown install date": 3, "Double with unknown 16 mm or install date more": 3, + # Elmhurst §11 lodgement of RdSAP-21 schema row 7 "double, known + # data" — manufacturer U-value and g-value are lodged on the + # SapWindow's `WindowTransmissionDetails` so the cascade reads + # those directly. The glazing_type code only affects the §5 + # (66)..(67) daylight factor where g_L=0.80 across all DG variants + # ({2, 3, 13}) — grouped under code 3 with the other unknown-date + # DG variants for cascade-equivalence. First seen on cert 000565 + # Window 6 (Main, U=2.00, g=0.72). + "Double glazing, known data": 3, "Double post or during 2022": 5, "Triple post or during 2022": 6, # One window in cert 2636 (Summary_000898.pdf) lodges the year- @@ -3737,6 +3750,13 @@ _ELMHURST_GLAZING_LABEL_TO_SAP10: Dict[str, int] = { # Treated as the same enum as the full form per worksheet # "Triple glazed" lodging on cert 2636's dr87-0001-000898.pdf. "Triple post or during": 6, + # RdSAP-Schema-21 row "triple glazing, installed 2002-2022 in EAW" + # (epc_codes.csv code 9 — RdSAP-21 schema extension). First seen on + # cert 000565 Window 2 (Summary_000565.pdf §11 row 2, manufacturer + # U=2.00, g=0.72). Cascade's `_G_PERPENDICULAR_BY_GLAZING_TYPE` + # row 9 returns Table 6b triple-glazed g⊥=0.68; the lodged + # solar_transmittance=0.72 overrides per worksheet-pinned value. + "Triple between 2002 and 2021": 9, "Secondary": 7, } diff --git a/domain/sap10_calculator/worksheet/tests/_elmhurst_worksheet_000565.py b/domain/sap10_calculator/worksheet/tests/_elmhurst_worksheet_000565.py new file mode 100644 index 00000000..774b148c --- /dev/null +++ b/domain/sap10_calculator/worksheet/tests/_elmhurst_worksheet_000565.py @@ -0,0 +1,126 @@ +"""Mapper-driven cascade pin against Elmhurst U985-0001-000565. + +Unlike the 6 cohort fixtures (000474/000477/000480/000487/000490/ +000516), this fixture does NOT hand-build the EpcPropertyData. It +routes the Summary_000565.pdf through ElmhurstSiteNotesExtractor + +EpcPropertyDataMapper.from_elmhurst_site_notes so the SAP-result pin +grid exercises the WHOLE extractor + mapper + calculator pipeline. + +Failing SAP-result pins surface gaps in any of the three layers: +- Extractor: lodgement fields not parsed from the Summary PDF +- Mapper: code-to-int translations missing from the dispatch dicts +- Calculator: cascade gaps (e.g. CF cavity-filled party-wall U=0.20 + from Table 15 row 3 has no SAP10 wall_construction code today) + +Each failing pin localises to one of the three and becomes its own +slice. As more Elmhurst Summary PDFs land, the mapper will handle +them automatically rather than per-cert hand-building. + +Source: PDF supplied by user 2026-05-28 at `sap worksheets/extended +test case/`; mirrored into the tracked +`backend/documents_parser/tests/fixtures/Summary_000565.pdf` so the +test runs without depending on the unstaged user workspace. + +Cert shape (Summary §1-19): House, Enclosed End-Terrace, 4 heated +storeys, TFA 319.91 m², 5 building parts (Main + 4 extensions). Age +mix A→J. Heat pump SAP code 224 + gas combi (PCDB 15100 Vaillant +Ecotec plus 415) providing DHW only via water_heating_code 914 +("from second main system"). Solar HW (3 m² flat-panel, W, +30° elevation), FGHRS (Zenex SuperFlow index 60063), MEV +decentralised (PCDF 500755). Conservatory thermally separated WITH +fixed heaters. Curtain Wall Post-2023 (Ext2), basement walls +(Ext3+Ext4), CF cavity-filled party wall (Ext1), CU cavity-unfilled +party wall (Main). RR on every part with mixed age bands. + +Worksheet pin targets (U985-0001-000565.pdf, Block 1 — energy rating): +- SAP value 28.5087 (line 257) → SAP rating 29 (line 258) +- Energy cost factor 5.3866 (line 257) +- Total fuel cost £4680.2593 (line 255, Table 12 prices) +- CO2 6447.6263 kg/year (line 272) +- Space heating 59008.3499 kWh/year (line 98c) +- Main 1 fuel 34710.7941 kWh/year (line 211) — ASHP electricity +- Secondary fuel 0.0 (line 215) +- Hot water fuel 3755.0288 kWh/year (line 219) — gas combi via WHC 914 +- Lighting 1384.8353 kWh/year (line 232) +- Pumps/fans 252.5159 kWh/year (line 231) — MEV 127.5 + flue 45 + solar 80 + +Per [[feedback-zero-error-strict]] + [[feedback-e2e-validation- +philosophy]]: pins are abs=1e-4 against the worksheet PDF. Failing +pins are named extractor / mapper / calculator gaps to fix. +""" + +from __future__ import annotations + +import re +import subprocess +from pathlib import Path +from typing import Final + +from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.domain.mapper import EpcPropertyDataMapper + + +# Repo root → backend fixtures. parents[0]=tests/, parents[1]=worksheet/, +# parents[2]=sap10_calculator/, parents[3]=domain/, parents[4]=repo root. +_SUMMARY_PDF: Final[Path] = ( + Path(__file__).resolve().parents[4] + / "backend" / "documents_parser" / "tests" / "fixtures" + / "Summary_000565.pdf" +) + + +def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]: + """Convert a Summary PDF into the per-page text format the + ElmhurstSiteNotesExtractor expects (label\\nvalue sequences). + + Mirror of the helper in `backend/documents_parser/tests/ + test_summary_pdf_mapper_chain.py::_summary_pdf_to_textract_style_ + pages`. Duplicated here rather than imported across the test/ + fixture boundary; the canonical version lives next to its callers + and this fixture module is the only e2e harness consumer. + + `pdftotext -layout` preserves the spatial pairing of label and + value on each line; we split each line on 2+ spaces to surface + the label/value tokens, then concatenate them back into a single + newline-delimited stream per page. + """ + info = subprocess.run( + ["pdfinfo", str(pdf_path)], capture_output=True, text=True, check=True, + ).stdout + m = re.search(r"Pages:\s+(\d+)", info) + if m is None: + raise RuntimeError(f"Could not parse page count from {pdf_path}") + page_count = int(m.group(1)) + + pages: list[str] = [] + for i in range(1, page_count + 1): + layout = subprocess.run( + [ + "pdftotext", "-layout", "-f", str(i), "-l", str(i), + str(pdf_path), "-", + ], + capture_output=True, text=True, check=True, + ).stdout + tokens: list[str] = [] + for line in layout.splitlines(): + if not line.strip(): + tokens.append("") + continue + parts = [p for p in re.split(r"\s{2,}", line.strip()) if p] + tokens.extend(parts) + pages.append("\n".join(tokens)) + return pages + + +def build_epc() -> EpcPropertyData: + """Route Summary_000565.pdf through extractor + mapper. + + No hand-built EpcPropertyData. The Elmhurst extractor and the + mapper are part of the test target — failing SAP-result pins + surface gaps in any of the three layers (extractor, mapper, + calculator). Each gap becomes its own follow-up slice. + """ + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + return EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) diff --git a/domain/sap10_calculator/worksheet/tests/test_e2e_elmhurst_sap_score.py b/domain/sap10_calculator/worksheet/tests/test_e2e_elmhurst_sap_score.py index 514c9c41..514c438b 100644 --- a/domain/sap10_calculator/worksheet/tests/test_e2e_elmhurst_sap_score.py +++ b/domain/sap10_calculator/worksheet/tests/test_e2e_elmhurst_sap_score.py @@ -33,6 +33,7 @@ from domain.sap10_calculator.worksheet.tests import ( _elmhurst_worksheet_000487 as _w000487, _elmhurst_worksheet_000490 as _w000490, _elmhurst_worksheet_000516 as _w000516, + _elmhurst_worksheet_000565 as _w000565, ) from domain.sap10_calculator.worksheet.tests._elmhurst_fixtures import ( ALL_FIXTURES as _ELMHURST_FIXTURES, @@ -129,6 +130,20 @@ _FIXTURE_PINS: Final[dict[str, FixtureCascadePins]] = { lighting_kwh_per_yr=230.8853, pumps_fans_kwh_per_yr=160.0, ), + # Mapper-driven cohort entry — Summary_000565.pdf → extractor → + # mapper → calculator. 5 BPs, heat pump + gas combi DHW via WHC 914, + # solar HW, FGHRS, conservatory with heaters, curtain wall, basement + # walls. Pins are worksheet PDF Block 1 (energy-rating) line refs. + "000565": FixtureCascadePins( + sap_score=29, sap_score_continuous=28.5087, ecf=5.3866, + total_fuel_cost_gbp=4680.2593, co2_kg_per_yr=6447.6263, + space_heating_kwh_per_yr=59008.3499, + main_heating_fuel_kwh_per_yr=34710.7941, + secondary_heating_fuel_kwh_per_yr=0.0, + hot_water_kwh_per_yr=3755.0288, + lighting_kwh_per_yr=1384.8353, + pumps_fans_kwh_per_yr=252.5159, + ), } @@ -139,6 +154,7 @@ _FIXTURE_MODULES: Final[dict[str, ModuleType]] = { "000487": _w000487, "000490": _w000490, "000516": _w000516, + "000565": _w000565, }