diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index b3fde06b..00aaf045 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -281,11 +281,7 @@ class ElmhurstSiteNotesExtractor: # with the §8 Roofs / §9 Floors blocks. None when the PDF # omits the line (no retrofit lodged). ins_thickness_raw = self._local_val(lines, "Insulation Thickness") - insulation_thickness_mm = ( - int(ins_thickness_raw.split()[0]) - if ins_thickness_raw and ins_thickness_raw.split()[0].isdigit() - else None - ) + insulation_thickness_mm = self._parse_thickness_mm(ins_thickness_raw) return WallDetails( wall_type=self._local_str(lines, "Type"), insulation=self._local_str(lines, "Insulation"), @@ -323,11 +319,7 @@ class ElmhurstSiteNotesExtractor: if area <= 0: continue thickness_raw = self._local_val(lines, f"Alternative Wall {n} Thickness") - thickness_mm = ( - int(thickness_raw.split()[0]) - if thickness_raw and thickness_raw.split()[0].isdigit() - else None - ) + thickness_mm = self._parse_thickness_mm(thickness_raw) result.append(AlternativeWall( area_m2=area, wall_type=self._local_str(lines, f"Alternative Wall {n} Type"), @@ -356,11 +348,25 @@ class ElmhurstSiteNotesExtractor: lines = [l.strip() for l in main_body.splitlines() if l.strip()] return self._wall_details_from_lines(lines) + @staticmethod + def _parse_thickness_mm(raw: Optional[str]) -> Optional[int]: + """Parse an Elmhurst "Insulation Thickness" cell ("100 mm", + "400+ mm") to integer mm. The bucket-cap "400+ mm" (Table 17/18 + max tabulated row) carries a trailing "+" that a bare + `.split()[0].isdigit()` test rejects — strip to the leading + digits so the cap parses through to the cascade with its numeric + value (simulated case 5: roof "400+ mm" was silently dropped → + u_roof fell back to the age-J default 0.16 instead of the + 300mm+ value 0.11). 
Returns None when the cell is absent or + carries no leading number ("As Built", "N None").""" + if not raw: + return None + match = re.match(r"\d+", raw.strip()) + return int(match.group()) if match else None + def _roof_details_from_lines(self, lines: List[str]) -> RoofDetails: thickness_raw = self._local_val(lines, "Insulation Thickness") - thickness_mm = ( - int(thickness_raw.split()[0]) if thickness_raw and thickness_raw.split()[0].isdigit() else None - ) + thickness_mm = self._parse_thickness_mm(thickness_raw) insulation = self._local_str(lines, "Insulation") # The Summary PDF omits the "Insulation Thickness" line entirely # when no retrofit insulation is lodged (e.g. "Insulation: N None" @@ -391,11 +397,7 @@ class ElmhurstSiteNotesExtractor: # via the per-thickness column. Mirror of the §8 roof extractor # at `_roof_details_from_lines`. thickness_raw = self._local_val(lines, "Insulation Thickness") - thickness_mm = ( - int(thickness_raw.split()[0]) - if thickness_raw and thickness_raw.split()[0].isdigit() - else None - ) + thickness_mm = self._parse_thickness_mm(thickness_raw) return FloorDetails( location=self._local_str(lines, "Location"), floor_type=self._local_str(lines, "Type"), diff --git a/backend/documents_parser/tests/fixtures/Summary_001431_case5.pdf b/backend/documents_parser/tests/fixtures/Summary_001431_case5.pdf new file mode 100644 index 00000000..335e6242 Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_001431_case5.pdf differ diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index 1f205786..143e734b 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -2090,6 +2090,10 @@ _ELMHURST_WALL_CODE_TO_SAP10: Dict[str, int] = { "SG": 1, # Stone: granite or whinstone (cert 000565 Ext1) — the # granite-specific Elmhurst variant of "ST"; same SAP10 # WALL_STONE_GRANITE=1 cascade entry. 
+ "SS": 2, # Stone: sandstone or limestone (simulated case 5 / cert + # 0240 archetype) — SAP10 WALL_STONE_SANDSTONE=2. The + # sandstone-specific Elmhurst variant; the API path lodges + # the same wall as integer wall_construction=2. "SB": 3, # Solid brick (cohort cert lodgement) "SO": 3, # Solid brick (newer Elmhurst PDF variant — same SAP10 # mapping; cert 9501 lodges "SO Solid Brick" where the diff --git a/sap worksheets/golden fixture debugging/simulated case 5/P960-0001-001431 - 2026-06-03T115608.865.pdf b/sap worksheets/golden fixture debugging/simulated case 5/P960-0001-001431 - 2026-06-03T115608.865.pdf new file mode 100644 index 00000000..55cb5940 Binary files /dev/null and b/sap worksheets/golden fixture debugging/simulated case 5/P960-0001-001431 - 2026-06-03T115608.865.pdf differ diff --git a/sap worksheets/golden fixture debugging/simulated case 5/Summary_001431 (1).pdf b/sap worksheets/golden fixture debugging/simulated case 5/Summary_001431 (1).pdf new file mode 100644 index 00000000..335e6242 Binary files /dev/null and b/sap worksheets/golden fixture debugging/simulated case 5/Summary_001431 (1).pdf differ diff --git a/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431_case5.py b/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431_case5.py new file mode 100644 index 00000000..c1f6e8f7 --- /dev/null +++ b/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431_case5.py @@ -0,0 +1,122 @@ +"""Mapper-driven cascade pin against the Elmhurst P960-0001-001431 +"simulated case 5" worksheet — a DETACHED, SANDSTONE-walled cousin of +golden cert 0240 (Main + Extension + room-in-roof, age band J). + +Like the other 001431 cases, this fixture does NOT hand-build the +EpcPropertyData: it routes the Summary PDF through +ElmhurstSiteNotesExtractor + from_elmhurst_site_notes so the SAP-result +pin grid exercises the WHOLE extractor + mapper + calculator pipeline. 
+ +Purpose: prove the calculator is spec-correct for a DETACHED room-in-roof +with one Exposed + one Party gable, validating S0380.196 (API Simplified +Type 1 RR gables deduct from the A_RR shell) against a real worksheet. +The worksheet prints the exact routing the cascade implements: + + Roof room Main Gable Wall 1 15.68 U=0.35 (29a) ← Exposed → walls @ main-wall U + Roof room Main remaining area 61.73 U=0.30 (30) ← A_RR shell − Σ gables (residual) + External roof Main 14.52 U=0.11 (30) ← loft residual + Roof room Main Gable Wall 2 15.68 U=0.25 (32) ← Party → party @ 0.25 + +gable area = 6.40 × 2.45 = 15.68 m² (the §3.9.1 default RR storey height). +A_RR remaining = 12.5√(83.2/1.5) − 2×15.68 = 93.09 − 31.36 = 61.73. + +This case surfaced two extractor/mapper gaps fixed in the same slice +(S0380.197): +- the sandstone wall label "SS Stone: sandstone or limestone" had no + `_ELMHURST_WALL_CODE_TO_SAP10` entry (→ WALL_STONE_SANDSTONE=2, matching + 0240's API `wall_construction=2`); +- the roof "Insulation Thickness 400+ mm" was silently dropped by the + extractor's `.split()[0].isdigit()` thickness parse (the trailing "+"), + so u_roof fell back to the age-J default 0.16 instead of 0.11 + (`_parse_thickness_mm` now strips to leading digits). + +Cert shape: Detached house, Main + Extension 1, sandstone insulated walls, +2 storeys + room-in-roof on the Main (floor 83.2 m², one Exposed + one +Party gable, L=6.40 each), oil community/boiler (SAP code 901 combi route, +control 2106), no PV, 20 low-energy lighting bulbs. + +Source: user-simulated PDFs at `sap worksheets/golden fixture +debugging/simulated case 5/`. The Summary is mirrored into the tracked +`backend/documents_parser/tests/fixtures/Summary_001431_case5.pdf`. 
def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
    """Render a Summary PDF as one newline-joined token string per page.

    This is the per-page text format the ElmhurstSiteNotesExtractor
    expects (label\\nvalue sequences). Mirror of the helper in
    `test_summary_pdf_mapper_chain.py` / `_elmhurst_worksheet_000565.py`.

    Args:
        pdf_path: Location of the Summary PDF on disk.

    Returns:
        One string per PDF page, in page order. Each layout line is
        split into cells wherever two or more whitespace characters
        occur; blank layout lines are kept as empty tokens.

    Raises:
        RuntimeError: `pdfinfo` output carries no "Pages:" count.
        subprocess.CalledProcessError: `pdfinfo` or `pdftotext` exits
            with a non-zero status (both run with check=True).
    """
    pdf_arg = str(pdf_path)

    def _stdout_of(argv: list[str]) -> str:
        # Every external call shares the same capture/text/check policy.
        finished = subprocess.run(
            argv, capture_output=True, text=True, check=True,
        )
        return finished.stdout

    page_count_match = re.search(
        r"Pages:\s+(\d+)", _stdout_of(["pdfinfo", pdf_arg]),
    )
    if page_count_match is None:
        raise RuntimeError(f"Could not parse page count from {pdf_path}")
    last_page = int(page_count_match.group(1))

    rendered_pages: list[str] = []
    # pdftotext page numbers are 1-based; extract one page per call.
    for page_no in range(1, last_page + 1):
        page_ref = str(page_no)
        layout_text = _stdout_of(
            [
                "pdftotext", "-layout", "-f", page_ref, "-l", page_ref,
                pdf_arg, "-",
            ]
        )
        cells: list[str] = []
        for raw_line in layout_text.splitlines():
            stripped = raw_line.strip()
            if stripped:
                # Runs of 2+ whitespace characters separate layout columns.
                cells.extend(
                    cell for cell in re.split(r"\s{2,}", stripped) if cell
                )
            else:
                # Keep blank lines as empty tokens.
                cells.append("")
        rendered_pages.append("\n".join(cells))
    return rendered_pages


def build_epc() -> EpcPropertyData:
    """Build the case-5 EpcPropertyData via the extractor and mapper.

    Nothing is hand-built: the tracked Summary PDF is converted to page
    text, parsed by ElmhurstSiteNotesExtractor, and mapped with
    EpcPropertyDataMapper.from_elmhurst_site_notes, so the extractor and
    mapper are part of the test target. Exercises the S0380.196 RR-gable
    deduction and the S0380.197 sandstone-wall-label and "400+ mm"
    roof-thickness fixes.
    """
    extractor = ElmhurstSiteNotesExtractor(
        _summary_pdf_to_textract_style_pages(_SUMMARY_PDF)
    )
    return EpcPropertyDataMapper.from_elmhurst_site_notes(extractor.extract())
+ "001431_case5": FixtureCascadePins( + sap_score=61, sap_score_continuous=61.3255, ecf=2.7724, + total_fuel_cost_gbp=1586.4549, co2_kg_per_yr=8387.6229, + space_heating_kwh_per_yr=12838.6489, + main_heating_fuel_kwh_per_yr=21397.7480, + secondary_heating_fuel_kwh_per_yr=0.0, + hot_water_kwh_per_yr=6498.2518, + lighting_kwh_per_yr=381.4601, + pumps_fans_kwh_per_yr=141.0, + ), } @@ -232,6 +252,7 @@ _FIXTURE_MODULES: Final[dict[str, ModuleType]] = { "001431_rr": _w001431_rr, "001431_rr8": _w001431_rr8, "001431_6035": _w001431_6035, + "001431_case5": _w001431_case5, }