diff --git a/backend/documents_parser/tests/fixtures/Summary_001431_gas_combi.pdf b/backend/documents_parser/tests/fixtures/Summary_001431_gas_combi.pdf new file mode 100644 index 00000000..1a15e3da Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_001431_gas_combi.pdf differ diff --git a/sap worksheets/golden fixture debugging/simulated case 1/P960-0001-001431 - 2026-06-02T221203.958.pdf b/sap worksheets/golden fixture debugging/simulated case 1/P960-0001-001431 - 2026-06-02T221203.958.pdf new file mode 100644 index 00000000..04de6151 Binary files /dev/null and b/sap worksheets/golden fixture debugging/simulated case 1/P960-0001-001431 - 2026-06-02T221203.958.pdf differ diff --git a/sap worksheets/golden fixture debugging/simulated case 1/Summary_001431 (1).pdf b/sap worksheets/golden fixture debugging/simulated case 1/Summary_001431 (1).pdf new file mode 100644 index 00000000..1a15e3da Binary files /dev/null and b/sap worksheets/golden fixture debugging/simulated case 1/Summary_001431 (1).pdf differ diff --git a/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431.py b/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431.py new file mode 100644 index 00000000..96609e90 --- /dev/null +++ b/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431.py @@ -0,0 +1,126 @@ +"""Mapper-driven cascade pin against the Elmhurst P960-0001-001431 +"simulated case 1" worksheet (gas-combi archetype). + +Like 000565, this fixture does NOT hand-build the EpcPropertyData. It +routes the Summary PDF through ElmhurstSiteNotesExtractor + +EpcPropertyDataMapper.from_elmhurst_site_notes so the SAP-result pin +grid exercises the WHOLE extractor + mapper + calculator pipeline. 
+ +This is the cert that motivated S0380.190 — the newer Elmhurst export +lodges the gas combi as §14.0 "Fuel Type" EMPTY + "Main Heating SAP +Code" 104 (condensing combi, EES "BGW"), with the carrier ("Mains +gas") only in §15.0 "Water Heating Fuel Type". Before S0380.190 the +mapper left `main_fuel_type=''` → `cert_to_inputs` raised +`MissingMainFuelType`; `_elmhurst_gas_boiler_main_fuel` now derives +mains gas (code 26) from §15.0 per SAP 10.2 Table 4b (rows 101-119 are +gas-family boilers; the §15.0 fuel disambiguates the carrier because +the combi heats space + water from one appliance). + +It is also the cert that motivated S0380.189 (thermal mass parameter +per RdSAP 10 §5.16 Table 22): solid brick WITH internal insulation → +TMP 100, not the previously-hardcoded 250. + +Source: user-simulated PDFs at `sap worksheets/golden fixture +debugging/simulated case 1/` (Summary_001431 (1).pdf input + +P960-0001-001431 - 2026-06-02T221203.958.pdf worksheet). The Summary +is mirrored into the tracked +`backend/documents_parser/tests/fixtures/Summary_001431_gas_combi.pdf` +(distinct name — the corpus reuses cert 001431 across every heating +variant) so the test runs without depending on the unstaged workspace. + +Cert shape (Summary §1-19): gas-combi mid-terrace, TFA 128 m², solid +brick WITH internal insulation (→ Table 22 TMP 100), no PV, no +secondary heating, no cylinder (combi instantaneous HW, WHC HWP / SAP +code 901). Condensing combi SAP code 104, EES "BGW". 
# Two or more consecutive whitespace characters separate a label from its
# value on a `pdftotext -layout` line. Compiled once because it is applied
# to every non-blank line of every page.
_COLUMN_GAP = re.compile(r"\s{2,}")


def _run_poppler_tool(argv: list[str]) -> str:
    """Run one poppler command-line tool and return its stdout as text.

    Raises:
        FileNotFoundError: the executable named by ``argv[0]`` is not on
            PATH (same exception type `subprocess.run` raises, with an
            actionable message).
        subprocess.CalledProcessError: the tool exited non-zero.
    """
    try:
        completed = subprocess.run(
            argv, capture_output=True, text=True, check=True,
        )
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"{argv[0]!r} was not found on PATH; the Summary-PDF fixtures "
            "need the poppler command-line tools (pdfinfo, pdftotext)"
        ) from err
    return completed.stdout


def _layout_page_to_label_value_text(layout: str) -> str:
    """Turn one page of `pdftotext -layout` output into label\\nvalue text.

    Each non-blank line is stripped and split on runs of 2+ whitespace
    characters; every resulting piece becomes its own output line. A
    blank (or whitespace-only) input line is kept as one empty output
    line so the vertical gaps between sections survive.
    """
    tokens: list[str] = []
    for line in layout.splitlines():
        stripped = line.strip()
        if not stripped:
            tokens.append("")
            continue
        tokens.extend(part for part in _COLUMN_GAP.split(stripped) if part)
    return "\n".join(tokens)


def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
    """Convert a Summary PDF into the per-page text format the
    ElmhurstSiteNotesExtractor expects (label\\nvalue sequences).

    Mirror of the helper in `backend/documents_parser/tests/
    test_summary_pdf_mapper_chain.py::_summary_pdf_to_textract_style_
    pages` (and `_elmhurst_worksheet_000565.py`). `pdftotext -layout`
    preserves the spatial label/value pairing on each line; we split on
    2+ spaces to surface the tokens, then rejoin newline-delimited.

    Args:
        pdf_path: Location of the Summary PDF to convert.

    Returns:
        One string per PDF page, in page order.

    Raises:
        FileNotFoundError: ``pdf_path`` is not an existing file, or the
            poppler tools are not installed.
        RuntimeError: `pdfinfo` output carries no ``Pages:`` line.
        subprocess.CalledProcessError: `pdfinfo` / `pdftotext` failed.
    """
    # Fail fast with the offending path instead of letting pdfinfo exit
    # non-zero, which would only surface as an opaque CalledProcessError.
    if not pdf_path.is_file():
        raise FileNotFoundError(f"Summary PDF fixture not found: {pdf_path}")

    info = _run_poppler_tool(["pdfinfo", str(pdf_path)])
    match = re.search(r"Pages:\s+(\d+)", info)
    if match is None:
        raise RuntimeError(f"Could not parse page count from {pdf_path}")
    page_count = int(match.group(1))

    # One pdftotext invocation per page (-f N -l N) so each element of the
    # result is exactly the text pdftotext emits for that single page.
    return [
        _layout_page_to_label_value_text(
            _run_poppler_tool(
                [
                    "pdftotext", "-layout", "-f", str(page), "-l", str(page),
                    str(pdf_path), "-",
                ]
            )
        )
        for page in range(1, page_count + 1)
    ]


def build_epc() -> "EpcPropertyData":
    """Route the simulated 001431 Summary through extractor + mapper.

    No hand-built EpcPropertyData — the extractor and mapper are part of
    the test target. Exercises the S0380.190 gas-combi fuel derivation
    (§14.0 Fuel Type empty + SAP code 104 → mains gas via §15.0).

    Returns:
        The EpcPropertyData produced by
        `EpcPropertyDataMapper.from_elmhurst_site_notes` for the tracked
        `_SUMMARY_PDF` fixture.
    """
    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_PDF)
    site_notes = ElmhurstSiteNotesExtractor(pages).extract()
    return EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
+ "001431": FixtureCascadePins( + sap_score=78, sap_score_continuous=77.6147, ecf=1.6047, + total_fuel_cost_gbp=660.9750, co2_kg_per_yr=3000.1664, + space_heating_kwh_per_yr=8987.7669, + main_heating_fuel_kwh_per_yr=10699.7225, + secondary_heating_fuel_kwh_per_yr=0.0, + hot_water_kwh_per_yr=3327.1592, + lighting_kwh_per_yr=283.2229, + pumps_fans_kwh_per_yr=86.0, + ), } @@ -158,6 +178,7 @@ _FIXTURE_MODULES: Final[dict[str, ModuleType]] = { "000490": _w000490, "000516": _w000516, "000565": _w000565, + "001431": _w001431, }