# Multi-bp helpers: Summary PDFs subdivide §4/§7/§8/§9 with explicit
# "Main Property" / "1st Extension" / "2nd Extension" headers. The
# existing single-bp fixture also carries "Main Property" as a header
# before the body. A header must sit on a line of its own; spaces or
# tabs in front of it are tolerated so an indented header still splits.
_BP_HEADER_RE = re.compile(
    r"^[ \t]*(Main Property|\d+(?:st|nd|rd|th) Extension)\s*$",
    re.MULTILINE,
)


def _split_section_by_bp(self, section_text: str) -> List[tuple[str, str]]:
    """Split a section's text into per-bp subsections.

    Returns ``[(bp_name, body), ...]`` in document order. Body is the
    text between this bp's header and the next bp's header (exclusive);
    text that precedes the first header is not part of any body.
    Returns ``[("Main Property", section_text)]`` when no headers are
    found (defensive fallback for malformed PDFs), so the result always
    holds at least one entry.
    """
    headers = list(self._BP_HEADER_RE.finditer(section_text))
    if not headers:
        return [("Main Property", section_text)]
    # Every body stops where the following header starts; the last one
    # runs to the end of the section.
    stops = [header.start() for header in headers[1:]]
    stops.append(len(section_text))
    return [
        (header.group(1), section_text[header.end():stop])
        for header, stop in zip(headers, stops)
    ]


def _stripped_nonblank_lines(self, text: str) -> List[str]:
    """Return the lines of ``text`` stripped of surrounding whitespace,
    leaving out lines that are empty after stripping."""
    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]


def _main_bp_body(self, section_text: str) -> str:
    """Return the body of the "Main Property" chunk of a section.

    The chunk is selected by name rather than by position, so a section
    whose first header is an extension does not leak extension data
    into the main building part. When no chunk carries that name the
    first chunk is returned instead.
    """
    chunks = self._split_section_by_bp(section_text)
    for name, body in chunks:
        if name == "Main Property":
            return body
    # `_split_section_by_bp` never returns an empty list.
    return chunks[0][1]


def _section_lines(self, start: str, end: str) -> List[str]:
    """Return the stripped, non-blank lines of ``self._between(start, end)``."""
    return self._stripped_nonblank_lines(self._between(start, end))


def _extract_dimensions(self) -> BuildingPartDimensions:
    """Main-property dimensions only. Extensions are picked up by
    `_extract_extensions`."""
    dim_type = self._str_val("Dimension type")
    section = self._between("4.0 Dimensions:", "5.0 Conservatory:")
    floors = self._floors_from_dimensions_body(self._main_bp_body(section))
    return BuildingPartDimensions(dimension_type=dim_type, floors=floors)


def _extract_walls(self) -> WallDetails:
    """Main-property wall details only. Extensions are picked up by
    `_extract_extensions`."""
    section = self._between("7.0 Walls:", "8.0 Roofs:")
    lines = self._stripped_nonblank_lines(self._main_bp_body(section))
    return self._wall_details_from_lines(lines)


def _extract_roof(self) -> RoofDetails:
    """Main-property roof details only. Extensions are picked up by
    `_extract_extensions`."""
    section = self._between("8.0 Roofs:", "8.1 Rooms in Roof:")
    lines = self._stripped_nonblank_lines(self._main_bp_body(section))
    return self._roof_details_from_lines(lines)
+ def _floor_details_from_lines(self, lines: List[str]) -> FloorDetails: u_val_raw = self._local_val(lines, "Default U-value") default_u = float(u_val_raw) if u_val_raw else None return FloorDetails( @@ -210,6 +262,79 @@ class ElmhurstSiteNotesExtractor: default_u_value=default_u, ) + def _extract_floor(self) -> FloorDetails: + section = self._between("9.0 Floors:", "10.0 Doors:") + bp_chunks = self._split_section_by_bp(section) + main_body = bp_chunks[0][1] if bp_chunks else section + lines = [l.strip() for l in main_body.splitlines() if l.strip()] + return self._floor_details_from_lines(lines) + + def _extract_extensions(self) -> List[ExtensionPart]: + """Collect non-Main building parts. Cross-references the §4, §7, + §8, §9 per-bp subsections by extension name. "As Main: Yes" + within a section body inherits the main bp's data for that + section; otherwise the section body is parsed in isolation.""" + # Gather per-section chunks once. + dim_section = self._between("4.0 Dimensions:", "5.0 Conservatory:") + wall_section = self._between("7.0 Walls:", "8.0 Roofs:") + roof_section = self._between("8.0 Roofs:", "8.1 Rooms in Roof:") + floor_section = self._between("9.0 Floors:", "10.0 Doors:") + dim_type = self._str_val("Dimension type") + + dim_chunks = dict(self._split_section_by_bp(dim_section)) + wall_chunks = dict(self._split_section_by_bp(wall_section)) + roof_chunks = dict(self._split_section_by_bp(roof_section)) + floor_chunks = dict(self._split_section_by_bp(floor_section)) + + main_walls = self._extract_walls() + main_roof = self._extract_roof() + main_floor = self._extract_floor() + + # Per-bp age-band lookup. Section 3 contains lines like + # "1st Extension B 1900-1929" — the band sits after the name. 
+ age_band_re = re.compile( + r"^(\d+(?:st|nd|rd|th) Extension)\s+([A-M] [^\n]+)$", + re.MULTILINE, + ) + age_bands = {m.group(1): m.group(2).strip() for m in age_band_re.finditer(self._text)} + + # Collect names in document order from the dimensions section + # (excluding Main Property). + names = [ + name for name, _ in self._split_section_by_bp(dim_section) + if name != "Main Property" + ] + + extensions: List[ExtensionPart] = [] + for name in names: + dim_body = dim_chunks.get(name, "") + wall_body = wall_chunks.get(name, "") + roof_body = roof_chunks.get(name, "") + floor_body = floor_chunks.get(name, "") + + wall_lines = [l.strip() for l in wall_body.splitlines() if l.strip()] + roof_lines = [l.strip() for l in roof_body.splitlines() if l.strip()] + floor_lines = [l.strip() for l in floor_body.splitlines() if l.strip()] + + walls = main_walls if self._local_bool(wall_lines, "As Main Wall") else self._wall_details_from_lines(wall_lines) + roof = main_roof if self._local_bool(roof_lines, "As Main") else self._roof_details_from_lines(roof_lines) + floor = main_floor if self._local_bool(floor_lines, "As Main") else self._floor_details_from_lines(floor_lines) + + extensions.append( + ExtensionPart( + name=name, + construction_age_band=age_bands.get(name, ""), + dimensions=BuildingPartDimensions( + dimension_type=dim_type, + floors=self._floors_from_dimensions_body(dim_body), + ), + walls=walls, + roof=roof, + floor=floor, + ) + ) + return extensions + def _extract_windows(self) -> List[Window]: m = re.search( r"Permanent\s+Shutters\n(.*?)Draught Proofing", @@ -448,4 +573,5 @@ class ElmhurstSiteNotesExtractor: water_heating=self._extract_water_heating(), baths_and_showers=self._extract_baths_and_showers(), renewables=self._extract_renewables(), + extensions=self._extract_extensions(), ) diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index ed32dafc..f3f3b209 100644 --- 
a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -34,8 +34,6 @@ import re import subprocess from pathlib import Path -import pytest - from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor from datatypes.epc.domain.mapper import EpcPropertyDataMapper @@ -80,15 +78,6 @@ def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]: return pages -@pytest.mark.xfail( - reason=( - "Elmhurst mapper `from_elmhurst_site_notes` currently produces a " - "single SapBuildingPart regardless of the cert's actual count; " - "cert 000474 lodges Main + Extension 1 + Extension 2 (3 bps). " - "See module docstring for full punch list." - ), - strict=True, -) def test_summary_000474_mapper_produces_three_building_parts() -> None: # Arrange — cert U985-0001-000474 is a mid-terrace with 3 building # parts (Main + 2 extensions) per the hand-built worksheet fixture diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index a5c89914..e7f30253 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -58,8 +58,12 @@ from datatypes.epc.schema.rdsap_schema_21_0_1 import ( EnergyElement as EnergyElement_21_0_1, ) from datatypes.epc.surveys.elmhurst_site_notes import ( + BuildingPartDimensions as ElmhurstBuildingPartDimensions, ElmhurstSiteNotes, + FloorDetails as ElmhurstFloorDetails, + RoofDetails as ElmhurstRoofDetails, VentilationAndCooling as ElmhurstVentilation, + WallDetails as ElmhurstWallDetails, Window as ElmhurstWindow, ) from datatypes.epc.surveys.pashub_rdsap_site_notes import ( @@ -285,7 +289,7 @@ class EpcPropertyDataMapper: wind_turbines_terrain_type=survey.renewables.wind_turbines_terrain_type, electricity_smart_meter_present=survey.meters.electricity_smart_meter, ), - sap_building_parts=[_map_elmhurst_building_part(survey)], + sap_building_parts=_map_elmhurst_building_parts(survey), 
solar_water_heating=survey.renewables.solar_water_heating, has_hot_water_cylinder=survey.water_heating.hot_water_cylinder_present, has_fixed_air_conditioning=survey.ventilation.fixed_space_cooling, @@ -1945,8 +1949,17 @@ def _map_sap_ventilation(ventilation: Ventilation) -> SapVentilation: ) -def _map_elmhurst_building_part(survey: ElmhurstSiteNotes) -> SapBuildingPart: - dims = survey.dimensions +def _map_elmhurst_building_part( + *, + identifier: BuildingPartIdentifier, + age_band: str, + dimensions: ElmhurstBuildingPartDimensions, + walls: ElmhurstWallDetails, + roof: ElmhurstRoofDetails, + floor: ElmhurstFloorDetails, +) -> SapBuildingPart: + """Build a `SapBuildingPart` from one bp's worth of Elmhurst site- + notes data. `identifier` distinguishes Main from each extension.""" floor_dims = [ SapFloorDimension( room_height_m=f.room_height_m, @@ -1955,26 +1968,65 @@ def _map_elmhurst_building_part(survey: ElmhurstSiteNotes) -> SapBuildingPart: heat_loss_perimeter_m=f.heat_loss_perimeter_m, floor=i, ) - for i, f in enumerate(dims.floors) + for i, f in enumerate(dimensions.floors) ] return SapBuildingPart( - identifier=BuildingPartIdentifier.MAIN, - construction_age_band=_strip_code(survey.construction_age_band), - wall_construction=_strip_code(survey.walls.wall_type), - wall_insulation_type=_strip_code(survey.walls.insulation), - wall_thickness_measured=not survey.walls.thickness_unknown, - party_wall_construction=_strip_code(survey.walls.party_wall_type), + identifier=identifier, + construction_age_band=_strip_code(age_band), + wall_construction=_strip_code(walls.wall_type), + wall_insulation_type=_strip_code(walls.insulation), + wall_thickness_measured=not walls.thickness_unknown, + party_wall_construction=_strip_code(walls.party_wall_type), sap_floor_dimensions=floor_dims, - wall_thickness_mm=survey.walls.thickness_mm, - roof_insulation_location=_strip_code(survey.roof.insulation), - roof_insulation_thickness=survey.roof.insulation_thickness_mm, - 
# --- datatypes/epc/domain/mapper.py ---

# RdSAP10 §1.2 caps extensions at 4 per dwelling. Indexed allocation
# of identifiers in document order.
_EXTENSION_IDENTIFIERS: tuple[BuildingPartIdentifier, ...] = (
    BuildingPartIdentifier.EXTENSION_1,
    BuildingPartIdentifier.EXTENSION_2,
    BuildingPartIdentifier.EXTENSION_3,
    BuildingPartIdentifier.EXTENSION_4,
)


def _map_elmhurst_building_parts(survey: ElmhurstSiteNotes) -> List[SapBuildingPart]:
    """Produce a list of `SapBuildingPart` covering the main dwelling plus
    each lodged extension.

    Empty `survey.extensions` collapses to a single-element list (the
    Main bp) — backward-compatible with single-bp certs. Extensions are
    paired with `_EXTENSION_IDENTIFIERS` in order; any extension beyond
    the available identifiers is left out of the result, and a warning
    is logged so the truncation is not silent.
    """
    import logging

    extensions = survey.extensions
    capacity = len(_EXTENSION_IDENTIFIERS)
    if len(extensions) > capacity:
        logging.getLogger(__name__).warning(
            "Elmhurst survey lists %d extensions but only %d extension "
            "identifiers exist; dropping: %s",
            len(extensions),
            capacity,
            [ext.name for ext in extensions[capacity:]],
        )

    # The survey itself and every ExtensionPart expose the same per-bp
    # attributes, so Main and the extensions go through one code path.
    sources = [
        (BuildingPartIdentifier.MAIN, survey),
        *zip(_EXTENSION_IDENTIFIERS, extensions),
    ]
    return [
        _map_elmhurst_building_part(
            identifier=identifier,
            age_band=source.construction_age_band,
            dimensions=source.dimensions,
            walls=source.walls,
            roof=source.roof,
            floor=source.floor,
        )
        for identifier, source in sources
    ]


# --- datatypes/epc/surveys/elmhurst_site_notes.py ---


@dataclass
class ExtensionPart:
    """Additional building part on a multi-bp cert (e.g. "1st Extension",
    "2nd Extension" on the Elmhurst Summary PDF). Mirrors the per-bp
    fabric fields the main dwelling carries at the top-level
    ElmhurstSiteNotes."""

    name: str  # e.g. "1st Extension", "2nd Extension"
    construction_age_band: str  # e.g. "B 1900-1929" (may differ from main)
    dimensions: BuildingPartDimensions  # per-floor dimensions of this part
    walls: WallDetails
    roof: RoofDetails
    floor: FloorDetails
The singular + # `dimensions`, `walls`, `roof`, `floor`, and `construction_age_band` + # fields above describe the "Main" property; each ExtensionPart in + # this list describes a discrete extension with its own age band, + # dimensions, and fabric details. Empty list = single-bp cert + # (preserves backward compatibility with the existing fixture). + extensions: List[ExtensionPart] = field(default_factory=lambda: []) # type: ignore[reportUnknownLambdaType]