Slice 46a: Elmhurst mapper handles multi-bp Summary PDFs — Summary_000474 chain test flips green

ElmhurstSiteNotes had no representation for extensions: the singular dimensions / walls / roof / floor fields could only describe the main building part (bp). Summary PDFs lodge "1st Extension" / "2nd Extension" subsections in §4, §7, §8 and §9, each with optional "As Main: Yes" inheritance. This slice:

- Adds `ExtensionPart` dataclass and `ElmhurstSiteNotes.extensions: List[ExtensionPart]`.
- Adds `_split_section_by_bp` helper + per-bp parsing of dimensions / walls / roof / floor in the extractor; "As Main" inherits from the main bp.
- Refactors `_map_elmhurst_building_part` into a parameterised builder; adds `_map_elmhurst_building_parts` that yields Main + one SapBuildingPart per extension (capped at 4 per RdSAP10 §1.2).
- Scaffold test `test_summary_000474_mapper_produces_three_building_parts` flips from strict-xfail to passing.

Single-bp behaviour is unchanged (the `extensions` list defaults to empty). The 752 existing tests stay green.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-24 17:55:13 +00:00
parent ccf7aa2118
commit 36f2c7bbdf
4 changed files with 234 additions and 44 deletions

View file

@ -6,6 +6,7 @@ from datatypes.epc.surveys.elmhurst_site_notes import (
BathsAndShowers,
BuildingPartDimensions,
ElmhurstSiteNotes,
ExtensionPart,
FloorDetails,
FloorDimension,
Lighting,
@ -79,6 +80,36 @@ class ElmhurstSiteNotesExtractor:
except ValueError:
return ""
# Multi-bp helpers: Summary PDFs subdivide §4/§7/§8/§9 with explicit
# "Main Property" / "1st Extension" / "2nd Extension" headers. The
# existing single-bp fixture also carries "Main Property" as a header
# before the body.
#
# This pattern matches one such header: a line consisting solely of
# "Main Property" or an ordinal followed by " Extension" (e.g.
# "3rd Extension"), with optional trailing whitespace. Group 1 is the
# header text, which `_split_section_by_bp` uses as the bp name when it
# cuts a section into per-bp chunks.
_BP_HEADER_RE = re.compile(
r"^(Main Property|\d+(?:st|nd|rd|th) Extension)\s*$",
re.MULTILINE,
)
def _split_section_by_bp(self, section_text: str) -> List[tuple[str, str]]:
    """Cut one section's text into per-building-part chunks.

    Every match of ``self._BP_HEADER_RE`` opens a chunk. The chunk's
    body starts right after that header match and stops where the next
    header match begins (or at the end of the text for the last one).

    Returns:
        ``[(bp_name, body), ...]`` in document order. When the text
        contains no header at all, the whole text is returned as a
        single ``("Main Property", section_text)`` entry so callers
        always receive at least one chunk.
    """
    headers = list(self._BP_HEADER_RE.finditer(section_text))
    if not headers:
        return [("Main Property", section_text)]
    # Body i ends where header i+1 starts; the final body runs to the
    # end of the section.
    stops = [nxt.start() for nxt in headers[1:]]
    stops.append(len(section_text))
    return [
        (header.group(1), section_text[header.end():stop])
        for header, stop in zip(headers, stops)
    ]
def _section_lines(self, start: str, end: str) -> List[str]:
    """Return the non-blank lines of ``self._between(start, end)``.

    Each line is stripped of surrounding whitespace; lines that are
    empty after stripping are dropped.
    """
    stripped = (raw.strip() for raw in self._between(start, end).splitlines())
    return [entry for entry in stripped if entry]
@ -151,14 +182,13 @@ class ElmhurstSiteNotesExtractor:
m = re.search(r"1\.0 Property type:\n[^\n]+\n([^\n]+)", self._text)
return " ".join(m.group(1).strip().split()) if m else ""
def _extract_dimensions(self) -> BuildingPartDimensions:
dim_type = self._str_val("Dimension type")
section = self._between("4.0 Dimensions:", "5.0 Conservatory:")
floor_matches = re.findall(
def _floors_from_dimensions_body(self, body: str) -> List[FloorDimension]:
"""Parse FloorDimension entries from a single bp's §4 body."""
matches = re.findall(
r"([A-Za-z ]+Floor):\n([\d.]+)\n([\d.]+)\n([\d.]+)\n([\d.]+)",
section,
body,
)
floors = [
return [
FloorDimension(
name=name.strip(),
area_m2=float(area),
@ -166,12 +196,22 @@ class ElmhurstSiteNotesExtractor:
heat_loss_perimeter_m=float(hlp),
party_wall_length_m=float(pwl),
)
for name, area, height, hlp, pwl in floor_matches
for name, area, height, hlp, pwl in matches
]
return BuildingPartDimensions(dimension_type=dim_type, floors=floors)
def _extract_walls(self) -> WallDetails:
lines = self._section_lines("7.0 Walls:", "8.0 Roofs:")
def _extract_dimensions(self) -> BuildingPartDimensions:
    """Build the Main-property dimensions from the §4 section.

    Only the first per-bp chunk of the text between "4.0 Dimensions:"
    and "5.0 Conservatory:" is parsed here; extension chunks are handled
    by `_extract_extensions`. If the splitter returns no chunks, the
    whole section is parsed instead.
    """
    dimension_type = self._str_val("Dimension type")
    section_text = self._between("4.0 Dimensions:", "5.0 Conservatory:")
    per_bp = self._split_section_by_bp(section_text)
    if per_bp:
        main_text = per_bp[0][1]
    else:
        main_text = section_text
    main_floors = self._floors_from_dimensions_body(main_text)
    return BuildingPartDimensions(
        dimension_type=dimension_type,
        floors=main_floors,
    )
def _wall_details_from_lines(self, lines: List[str]) -> WallDetails:
thickness_raw = self._local_val(lines, "Wall Thickness")
thickness_mm = (
int(thickness_raw.split()[0]) if thickness_raw else None
@ -185,11 +225,17 @@ class ElmhurstSiteNotesExtractor:
thickness_mm=thickness_mm,
)
def _extract_roof(self) -> RoofDetails:
lines = self._section_lines("8.0 Roofs:", "8.1 Rooms in Roof:")
def _extract_walls(self) -> WallDetails:
    """Parse the Main property's wall details from the §7 section.

    Takes the first per-bp chunk of the text between "7.0 Walls:" and
    "8.0 Roofs:" (or the whole section if the splitter returns no
    chunks), reduces it to stripped non-blank lines and hands those to
    `_wall_details_from_lines`.
    """
    walls_text = self._between("7.0 Walls:", "8.0 Roofs:")
    per_bp = self._split_section_by_bp(walls_text)
    if per_bp:
        main_text = per_bp[0][1]
    else:
        main_text = walls_text
    stripped = (raw.strip() for raw in main_text.splitlines())
    return self._wall_details_from_lines([entry for entry in stripped if entry])
def _roof_details_from_lines(self, lines: List[str]) -> RoofDetails:
thickness_raw = self._local_val(lines, "Insulation Thickness")
thickness_mm = (
int(thickness_raw.split()[0]) if thickness_raw else None
int(thickness_raw.split()[0]) if thickness_raw and thickness_raw.split()[0].isdigit() else None
)
return RoofDetails(
roof_type=self._local_str(lines, "Type"),
@ -198,8 +244,14 @@ class ElmhurstSiteNotesExtractor:
insulation_thickness_mm=thickness_mm,
)
def _extract_floor(self) -> FloorDetails:
lines = self._section_lines("9.0 Floors:", "10.0 Doors:")
def _extract_roof(self) -> RoofDetails:
    """Parse the Main property's roof details from the §8 section.

    Takes the first per-bp chunk of the text between "8.0 Roofs:" and
    "8.1 Rooms in Roof:" (or the whole section if the splitter returns
    no chunks), keeps its stripped non-blank lines and passes them to
    `_roof_details_from_lines`.
    """
    roofs_text = self._between("8.0 Roofs:", "8.1 Rooms in Roof:")
    per_bp = self._split_section_by_bp(roofs_text)
    main_text = roofs_text if not per_bp else per_bp[0][1]
    kept: List[str] = []
    for raw in main_text.splitlines():
        entry = raw.strip()
        if entry:
            kept.append(entry)
    return self._roof_details_from_lines(kept)
def _floor_details_from_lines(self, lines: List[str]) -> FloorDetails:
u_val_raw = self._local_val(lines, "Default U-value")
default_u = float(u_val_raw) if u_val_raw else None
return FloorDetails(
@ -210,6 +262,79 @@ class ElmhurstSiteNotesExtractor:
default_u_value=default_u,
)
def _extract_floor(self) -> FloorDetails:
    """Parse the Main property's floor details from the §9 section.

    Takes the first per-bp chunk of the text between "9.0 Floors:" and
    "10.0 Doors:" (or the whole section if the splitter returns no
    chunks), keeps its stripped non-blank lines and passes them to
    `_floor_details_from_lines`.
    """
    floors_text = self._between("9.0 Floors:", "10.0 Doors:")
    per_bp = self._split_section_by_bp(floors_text)
    main_text = per_bp[0][1] if per_bp else floors_text
    candidates = map(str.strip, main_text.splitlines())
    return self._floor_details_from_lines(list(filter(None, candidates)))
def _extract_extensions(self) -> List[ExtensionPart]:
    """Collect the non-Main building parts lodged on the cert.

    Extension names are taken, in document order, from the per-bp
    headers of the §4 dimensions section. The §7 walls, §8 roofs and §9
    floors sections are then cross-referenced by the same name. Within
    a section, when `_local_bool` reports the "As Main Wall" flag
    (walls) or the "As Main" flag (roof, floor), the extension reuses
    the main bp's already-parsed details object for that section;
    otherwise the extension's own body is parsed in isolation. A
    section that has no chunk for the extension is parsed from an empty
    line list.

    Returns:
        One ``ExtensionPart`` per distinct extension header found in
        §4; an empty list when §4 carries no extension headers.
    """
    # Gather per-section chunks once.
    dim_section = self._between("4.0 Dimensions:", "5.0 Conservatory:")
    wall_section = self._between("7.0 Walls:", "8.0 Roofs:")
    roof_section = self._between("8.0 Roofs:", "8.1 Rooms in Roof:")
    floor_section = self._between("9.0 Floors:", "10.0 Doors:")
    dim_type = self._str_val("Dimension type")

    dim_chunks = dict(self._split_section_by_bp(dim_section))
    wall_chunks = dict(self._split_section_by_bp(wall_section))
    roof_chunks = dict(self._split_section_by_bp(roof_section))
    floor_chunks = dict(self._split_section_by_bp(floor_section))

    main_walls = self._extract_walls()
    main_roof = self._extract_roof()
    main_floor = self._extract_floor()

    # Per-bp age-band lookup. Section 3 contains lines like
    # "1st Extension B 1900-1929" — the band sits after the name.
    age_band_re = re.compile(
        r"^(\d+(?:st|nd|rd|th) Extension)\s+([A-M] [^\n]+)$",
        re.MULTILINE,
    )
    age_bands = {
        m.group(1): m.group(2).strip()
        for m in age_band_re.finditer(self._text)
    }

    # Names come from the keys of the already-split dimensions chunks.
    # A dict keeps first-seen order, so this is still document order,
    # but a header that appears twice now yields one extension instead
    # of two entries built from the same (last-seen) body.
    names = [name for name in dim_chunks if name != "Main Property"]

    def non_blank_lines(body: str) -> List[str]:
        # Stripped lines of a chunk body, blank lines dropped.
        return [line.strip() for line in body.splitlines() if line.strip()]

    extensions: List[ExtensionPart] = []
    for name in names:
        wall_lines = non_blank_lines(wall_chunks.get(name, ""))
        roof_lines = non_blank_lines(roof_chunks.get(name, ""))
        floor_lines = non_blank_lines(floor_chunks.get(name, ""))

        if self._local_bool(wall_lines, "As Main Wall"):
            walls = main_walls
        else:
            walls = self._wall_details_from_lines(wall_lines)

        if self._local_bool(roof_lines, "As Main"):
            roof = main_roof
        else:
            roof = self._roof_details_from_lines(roof_lines)

        if self._local_bool(floor_lines, "As Main"):
            floor = main_floor
        else:
            floor = self._floor_details_from_lines(floor_lines)

        extensions.append(
            ExtensionPart(
                name=name,
                construction_age_band=age_bands.get(name, ""),
                dimensions=BuildingPartDimensions(
                    dimension_type=dim_type,
                    floors=self._floors_from_dimensions_body(
                        dim_chunks.get(name, "")
                    ),
                ),
                walls=walls,
                roof=roof,
                floor=floor,
            )
        )
    return extensions
def _extract_windows(self) -> List[Window]:
m = re.search(
r"Permanent\s+Shutters\n(.*?)Draught Proofing",
@ -448,4 +573,5 @@ class ElmhurstSiteNotesExtractor:
water_heating=self._extract_water_heating(),
baths_and_showers=self._extract_baths_and_showers(),
renewables=self._extract_renewables(),
extensions=self._extract_extensions(),
)

View file

@ -34,8 +34,6 @@ import re
import subprocess
from pathlib import Path
import pytest
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
@ -80,15 +78,6 @@ def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
return pages
@pytest.mark.xfail(
reason=(
"Elmhurst mapper `from_elmhurst_site_notes` currently produces a "
"single SapBuildingPart regardless of the cert's actual count; "
"cert 000474 lodges Main + Extension 1 + Extension 2 (3 bps). "
"See module docstring for full punch list."
),
strict=True,
)
def test_summary_000474_mapper_produces_three_building_parts() -> None:
# Arrange — cert U985-0001-000474 is a mid-terrace with 3 building
# parts (Main + 2 extensions) per the hand-built worksheet fixture

View file

@ -58,8 +58,12 @@ from datatypes.epc.schema.rdsap_schema_21_0_1 import (
EnergyElement as EnergyElement_21_0_1,
)
from datatypes.epc.surveys.elmhurst_site_notes import (
BuildingPartDimensions as ElmhurstBuildingPartDimensions,
ElmhurstSiteNotes,
FloorDetails as ElmhurstFloorDetails,
RoofDetails as ElmhurstRoofDetails,
VentilationAndCooling as ElmhurstVentilation,
WallDetails as ElmhurstWallDetails,
Window as ElmhurstWindow,
)
from datatypes.epc.surveys.pashub_rdsap_site_notes import (
@ -285,7 +289,7 @@ class EpcPropertyDataMapper:
wind_turbines_terrain_type=survey.renewables.wind_turbines_terrain_type,
electricity_smart_meter_present=survey.meters.electricity_smart_meter,
),
sap_building_parts=[_map_elmhurst_building_part(survey)],
sap_building_parts=_map_elmhurst_building_parts(survey),
solar_water_heating=survey.renewables.solar_water_heating,
has_hot_water_cylinder=survey.water_heating.hot_water_cylinder_present,
has_fixed_air_conditioning=survey.ventilation.fixed_space_cooling,
@ -1945,8 +1949,17 @@ def _map_sap_ventilation(ventilation: Ventilation) -> SapVentilation:
)
def _map_elmhurst_building_part(survey: ElmhurstSiteNotes) -> SapBuildingPart:
dims = survey.dimensions
def _map_elmhurst_building_part(
*,
identifier: BuildingPartIdentifier,
age_band: str,
dimensions: ElmhurstBuildingPartDimensions,
walls: ElmhurstWallDetails,
roof: ElmhurstRoofDetails,
floor: ElmhurstFloorDetails,
) -> SapBuildingPart:
"""Build a `SapBuildingPart` from one bp's worth of Elmhurst site-
notes data. `identifier` distinguishes Main from each extension."""
floor_dims = [
SapFloorDimension(
room_height_m=f.room_height_m,
@ -1955,26 +1968,65 @@ def _map_elmhurst_building_part(survey: ElmhurstSiteNotes) -> SapBuildingPart:
heat_loss_perimeter_m=f.heat_loss_perimeter_m,
floor=i,
)
for i, f in enumerate(dims.floors)
for i, f in enumerate(dimensions.floors)
]
return SapBuildingPart(
identifier=BuildingPartIdentifier.MAIN,
construction_age_band=_strip_code(survey.construction_age_band),
wall_construction=_strip_code(survey.walls.wall_type),
wall_insulation_type=_strip_code(survey.walls.insulation),
wall_thickness_measured=not survey.walls.thickness_unknown,
party_wall_construction=_strip_code(survey.walls.party_wall_type),
identifier=identifier,
construction_age_band=_strip_code(age_band),
wall_construction=_strip_code(walls.wall_type),
wall_insulation_type=_strip_code(walls.insulation),
wall_thickness_measured=not walls.thickness_unknown,
party_wall_construction=_strip_code(walls.party_wall_type),
sap_floor_dimensions=floor_dims,
wall_thickness_mm=survey.walls.thickness_mm,
roof_insulation_location=_strip_code(survey.roof.insulation),
roof_insulation_thickness=survey.roof.insulation_thickness_mm,
floor_type=_strip_code(survey.floor.location),
floor_construction_type=_strip_code(survey.floor.floor_type),
floor_insulation_type_str=_strip_code(survey.floor.insulation),
floor_u_value_known=survey.floor.u_value_known,
wall_thickness_mm=walls.thickness_mm,
roof_insulation_location=_strip_code(roof.insulation),
roof_insulation_thickness=roof.insulation_thickness_mm,
floor_type=_strip_code(floor.location),
floor_construction_type=_strip_code(floor.floor_type),
floor_insulation_type_str=_strip_code(floor.insulation),
floor_u_value_known=floor.u_value_known,
)
# RdSAP10 §1.2 caps extensions at 4 per dwelling, hence exactly four
# identifiers. `_map_elmhurst_building_parts` zips this tuple with
# `survey.extensions`, so the n-th lodged extension receives the n-th
# identifier and any extension beyond the fourth receives none.
_EXTENSION_IDENTIFIERS: tuple[BuildingPartIdentifier, ...] = (
BuildingPartIdentifier.EXTENSION_1,
BuildingPartIdentifier.EXTENSION_2,
BuildingPartIdentifier.EXTENSION_3,
BuildingPartIdentifier.EXTENSION_4,
)
def _map_elmhurst_building_parts(survey: ElmhurstSiteNotes) -> List[SapBuildingPart]:
    """Map the main dwelling plus each lodged extension to `SapBuildingPart`s.

    The first element is always the Main bp, built from the survey's
    top-level age band, dimensions, walls, roof and floor. Each entry of
    `survey.extensions` follows in order, paired with the next
    identifier from `_EXTENSION_IDENTIFIERS`. An empty
    `survey.extensions` therefore gives a single-element list, which
    keeps single-bp certs backward-compatible.

    Extensions beyond the number of available identifiers are not
    mapped. That cap is unchanged, but it is now reported with a logged
    warning instead of dropping the surplus silently.
    """
    # Function-scope import so this block does not depend on the
    # module's import section.
    import logging

    surplus = len(survey.extensions) - len(_EXTENSION_IDENTIFIERS)
    if surplus > 0:
        logging.getLogger(__name__).warning(
            "Elmhurst survey lodges %d extensions; only the first %d are "
            "mapped to building parts",
            len(survey.extensions),
            len(_EXTENSION_IDENTIFIERS),
        )

    parts: List[SapBuildingPart] = [
        _map_elmhurst_building_part(
            identifier=BuildingPartIdentifier.MAIN,
            age_band=survey.construction_age_band,
            dimensions=survey.dimensions,
            walls=survey.walls,
            roof=survey.roof,
            floor=survey.floor,
        )
    ]
    # zip stops at the shorter input, which enforces the identifier cap.
    parts.extend(
        _map_elmhurst_building_part(
            identifier=identifier,
            age_band=ext.construction_age_band,
            dimensions=ext.dimensions,
            walls=ext.walls,
            roof=ext.roof,
            floor=ext.floor,
        )
        for ext, identifier in zip(survey.extensions, _EXTENSION_IDENTIFIERS)
    )
    return parts
def _map_elmhurst_window(w: ElmhurstWindow) -> SapWindow:
return SapWindow(
frame_material=w.frame_type or None,

View file

@ -1,4 +1,4 @@
from dataclasses import dataclass
from dataclasses import dataclass, field
from datetime import date
from typing import List, Optional
@ -184,6 +184,21 @@ class Renewables:
hydro_electricity_generated_kwh: float
@dataclass
class ExtensionPart:
    """One additional building part on a multi-bp cert.

    Examples are the "1st Extension" and "2nd Extension" subsections of
    the Elmhurst Summary PDF. The fields mirror the per-bp fabric fields
    that the main dwelling carries at the top level of
    ElmhurstSiteNotes.
    """

    # Header label, e.g. "1st Extension", "2nd Extension".
    name: str
    # e.g. "B 1900-1929"; may differ from the main dwelling's band.
    construction_age_band: str
    dimensions: BuildingPartDimensions
    walls: WallDetails
    roof: RoofDetails
    floor: FloorDetails
@dataclass
class ElmhurstSiteNotes:
surveyor_info: SurveyorInfo
@ -245,3 +260,11 @@ class ElmhurstSiteNotes:
# Sections 16.0–22.0
renewables: Renewables
# Additional building parts beyond the main dwelling. The singular
# `dimensions`, `walls`, `roof`, `floor`, and `construction_age_band`
# fields above describe the "Main" property; each ExtensionPart in
# this list describes a discrete extension with its own age band,
# dimensions, and fabric details. Empty list = single-bp cert
# (preserves backward compatibility with the existing fixture).
extensions: List[ExtensionPart] = field(default_factory=lambda: []) # type: ignore[reportUnknownLambdaType]