mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Slice 46a: Elmhurst mapper handles multi-bp Summary PDFs — Summary_000474 chain test flips green
ElmhurstSiteNotes had no representation for extensions: the singular dimensions / walls / roof / floor fields could only describe the main building part (bp). Summary PDFs lodge "1st Extension" / "2nd Extension" subsections in §4, §7, §8 and §9, with optional "As Main: Yes" inheritance.

This slice:
- Adds the `ExtensionPart` dataclass and `ElmhurstSiteNotes.extensions: List[ExtensionPart]`.
- Adds the `_split_section_by_bp` helper plus per-bp parsing of dimensions / walls / roof / floor in the extractor; "As Main" inherits from the main bp.
- Refactors `_map_elmhurst_building_part` into a parameterised builder and adds `_map_elmhurst_building_parts`, which yields Main plus one SapBuildingPart per extension (capped at 4 per RdSAP10 §1.2).
- Flips the scaffold test `test_summary_000474_mapper_produces_three_building_parts` from strict-xfail to passing.

Single-bp behaviour is unchanged (the extensions list defaults to empty). The 752 existing tests stay green.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
ccf7aa2118
commit
36f2c7bbdf
4 changed files with 234 additions and 44 deletions
|
|
@ -6,6 +6,7 @@ from datatypes.epc.surveys.elmhurst_site_notes import (
|
|||
BathsAndShowers,
|
||||
BuildingPartDimensions,
|
||||
ElmhurstSiteNotes,
|
||||
ExtensionPart,
|
||||
FloorDetails,
|
||||
FloorDimension,
|
||||
Lighting,
|
||||
|
|
@ -79,6 +80,36 @@ class ElmhurstSiteNotesExtractor:
|
|||
except ValueError:
|
||||
return ""
|
||||
|
||||
# Multi-bp helpers: Summary PDFs subdivide §4/§7/§8/§9 with explicit
|
||||
# "Main Property" / "1st Extension" / "2nd Extension" headers. The
|
||||
# existing single-bp fixture also carries "Main Property" as a header
|
||||
# before the body. This helper splits a section into per-bp chunks.
|
||||
_BP_HEADER_RE = re.compile(
|
||||
r"^(Main Property|\d+(?:st|nd|rd|th) Extension)\s*$",
|
||||
re.MULTILINE,
|
||||
)
|
||||
|
||||
def _split_section_by_bp(self, section_text: str) -> List[tuple[str, str]]:
|
||||
"""Split a section's text into per-bp subsections.
|
||||
|
||||
Returns ``[(bp_name, body), ...]`` in document order. Body is
|
||||
the text between this bp's header and the next bp's header
|
||||
(exclusive). Returns ``[("Main Property", section_text)]`` when
|
||||
no headers are found (defensive fallback for malformed PDFs).
|
||||
"""
|
||||
matches = list(self._BP_HEADER_RE.finditer(section_text))
|
||||
if not matches:
|
||||
return [("Main Property", section_text)]
|
||||
result: List[tuple[str, str]] = []
|
||||
for i, m in enumerate(matches):
|
||||
name = m.group(1)
|
||||
body_start = m.end()
|
||||
body_end = (
|
||||
matches[i + 1].start() if i + 1 < len(matches) else len(section_text)
|
||||
)
|
||||
result.append((name, section_text[body_start:body_end]))
|
||||
return result
|
||||
|
||||
def _section_lines(self, start: str, end: str) -> List[str]:
|
||||
text = self._between(start, end)
|
||||
return [l.strip() for l in text.splitlines() if l.strip()]
|
||||
|
|
@ -151,14 +182,13 @@ class ElmhurstSiteNotesExtractor:
|
|||
m = re.search(r"1\.0 Property type:\n[^\n]+\n([^\n]+)", self._text)
|
||||
return " ".join(m.group(1).strip().split()) if m else ""
|
||||
|
||||
def _extract_dimensions(self) -> BuildingPartDimensions:
|
||||
dim_type = self._str_val("Dimension type")
|
||||
section = self._between("4.0 Dimensions:", "5.0 Conservatory:")
|
||||
floor_matches = re.findall(
|
||||
def _floors_from_dimensions_body(self, body: str) -> List[FloorDimension]:
|
||||
"""Parse FloorDimension entries from a single bp's §4 body."""
|
||||
matches = re.findall(
|
||||
r"([A-Za-z ]+Floor):\n([\d.]+)\n([\d.]+)\n([\d.]+)\n([\d.]+)",
|
||||
section,
|
||||
body,
|
||||
)
|
||||
floors = [
|
||||
return [
|
||||
FloorDimension(
|
||||
name=name.strip(),
|
||||
area_m2=float(area),
|
||||
|
|
@ -166,12 +196,22 @@ class ElmhurstSiteNotesExtractor:
|
|||
heat_loss_perimeter_m=float(hlp),
|
||||
party_wall_length_m=float(pwl),
|
||||
)
|
||||
for name, area, height, hlp, pwl in floor_matches
|
||||
for name, area, height, hlp, pwl in matches
|
||||
]
|
||||
return BuildingPartDimensions(dimension_type=dim_type, floors=floors)
|
||||
|
||||
def _extract_walls(self) -> WallDetails:
|
||||
lines = self._section_lines("7.0 Walls:", "8.0 Roofs:")
|
||||
def _extract_dimensions(self) -> BuildingPartDimensions:
    """Return the main property's dimensions from §4.

    The text between "4.0 Dimensions:" and "5.0 Conservatory:" is split
    per building part and the chunk headed "Main Property" is selected
    by name, matching how `_extract_extensions` looks its chunks up.
    Extensions are not handled here; `_extract_extensions` picks them up.
    """
    dim_type = self._str_val("Dimension type")
    section = self._between("4.0 Dimensions:", "5.0 Conservatory:")
    bp_chunks = self._split_section_by_bp(section)
    # Fallback when no chunk is headed "Main Property": the first chunk
    # (the previous positional behaviour), or the whole section if the
    # splitter returned nothing.
    fallback = bp_chunks[0][1] if bp_chunks else section
    main_body = next(
        (body for name, body in bp_chunks if name == "Main Property"),
        fallback,
    )
    return BuildingPartDimensions(
        dimension_type=dim_type,
        floors=self._floors_from_dimensions_body(main_body),
    )
|
||||
|
||||
def _wall_details_from_lines(self, lines: List[str]) -> WallDetails:
|
||||
thickness_raw = self._local_val(lines, "Wall Thickness")
|
||||
thickness_mm = (
|
||||
int(thickness_raw.split()[0]) if thickness_raw else None
|
||||
|
|
@ -185,11 +225,17 @@ class ElmhurstSiteNotesExtractor:
|
|||
thickness_mm=thickness_mm,
|
||||
)
|
||||
|
||||
def _extract_roof(self) -> RoofDetails:
|
||||
lines = self._section_lines("8.0 Roofs:", "8.1 Rooms in Roof:")
|
||||
def _extract_walls(self) -> WallDetails:
|
||||
section = self._between("7.0 Walls:", "8.0 Roofs:")
|
||||
bp_chunks = self._split_section_by_bp(section)
|
||||
main_body = bp_chunks[0][1] if bp_chunks else section
|
||||
lines = [l.strip() for l in main_body.splitlines() if l.strip()]
|
||||
return self._wall_details_from_lines(lines)
|
||||
|
||||
def _roof_details_from_lines(self, lines: List[str]) -> RoofDetails:
|
||||
thickness_raw = self._local_val(lines, "Insulation Thickness")
|
||||
thickness_mm = (
|
||||
int(thickness_raw.split()[0]) if thickness_raw else None
|
||||
int(thickness_raw.split()[0]) if thickness_raw and thickness_raw.split()[0].isdigit() else None
|
||||
)
|
||||
return RoofDetails(
|
||||
roof_type=self._local_str(lines, "Type"),
|
||||
|
|
@ -198,8 +244,14 @@ class ElmhurstSiteNotesExtractor:
|
|||
insulation_thickness_mm=thickness_mm,
|
||||
)
|
||||
|
||||
def _extract_floor(self) -> FloorDetails:
|
||||
lines = self._section_lines("9.0 Floors:", "10.0 Doors:")
|
||||
def _extract_roof(self) -> RoofDetails:
|
||||
section = self._between("8.0 Roofs:", "8.1 Rooms in Roof:")
|
||||
bp_chunks = self._split_section_by_bp(section)
|
||||
main_body = bp_chunks[0][1] if bp_chunks else section
|
||||
lines = [l.strip() for l in main_body.splitlines() if l.strip()]
|
||||
return self._roof_details_from_lines(lines)
|
||||
|
||||
def _floor_details_from_lines(self, lines: List[str]) -> FloorDetails:
|
||||
u_val_raw = self._local_val(lines, "Default U-value")
|
||||
default_u = float(u_val_raw) if u_val_raw else None
|
||||
return FloorDetails(
|
||||
|
|
@ -210,6 +262,79 @@ class ElmhurstSiteNotesExtractor:
|
|||
default_u_value=default_u,
|
||||
)
|
||||
|
||||
def _extract_floor(self) -> FloorDetails:
|
||||
section = self._between("9.0 Floors:", "10.0 Doors:")
|
||||
bp_chunks = self._split_section_by_bp(section)
|
||||
main_body = bp_chunks[0][1] if bp_chunks else section
|
||||
lines = [l.strip() for l in main_body.splitlines() if l.strip()]
|
||||
return self._floor_details_from_lines(lines)
|
||||
|
||||
def _extract_extensions(self) -> List[ExtensionPart]:
|
||||
"""Collect non-Main building parts. Cross-references the §4, §7,
|
||||
§8, §9 per-bp subsections by extension name. "As Main: Yes"
|
||||
within a section body inherits the main bp's data for that
|
||||
section; otherwise the section body is parsed in isolation."""
|
||||
# Gather per-section chunks once.
|
||||
dim_section = self._between("4.0 Dimensions:", "5.0 Conservatory:")
|
||||
wall_section = self._between("7.0 Walls:", "8.0 Roofs:")
|
||||
roof_section = self._between("8.0 Roofs:", "8.1 Rooms in Roof:")
|
||||
floor_section = self._between("9.0 Floors:", "10.0 Doors:")
|
||||
dim_type = self._str_val("Dimension type")
|
||||
|
||||
dim_chunks = dict(self._split_section_by_bp(dim_section))
|
||||
wall_chunks = dict(self._split_section_by_bp(wall_section))
|
||||
roof_chunks = dict(self._split_section_by_bp(roof_section))
|
||||
floor_chunks = dict(self._split_section_by_bp(floor_section))
|
||||
|
||||
main_walls = self._extract_walls()
|
||||
main_roof = self._extract_roof()
|
||||
main_floor = self._extract_floor()
|
||||
|
||||
# Per-bp age-band lookup. Section 3 contains lines like
|
||||
# "1st Extension B 1900-1929" — the band sits after the name.
|
||||
age_band_re = re.compile(
|
||||
r"^(\d+(?:st|nd|rd|th) Extension)\s+([A-M] [^\n]+)$",
|
||||
re.MULTILINE,
|
||||
)
|
||||
age_bands = {m.group(1): m.group(2).strip() for m in age_band_re.finditer(self._text)}
|
||||
|
||||
# Collect names in document order from the dimensions section
|
||||
# (excluding Main Property).
|
||||
names = [
|
||||
name for name, _ in self._split_section_by_bp(dim_section)
|
||||
if name != "Main Property"
|
||||
]
|
||||
|
||||
extensions: List[ExtensionPart] = []
|
||||
for name in names:
|
||||
dim_body = dim_chunks.get(name, "")
|
||||
wall_body = wall_chunks.get(name, "")
|
||||
roof_body = roof_chunks.get(name, "")
|
||||
floor_body = floor_chunks.get(name, "")
|
||||
|
||||
wall_lines = [l.strip() for l in wall_body.splitlines() if l.strip()]
|
||||
roof_lines = [l.strip() for l in roof_body.splitlines() if l.strip()]
|
||||
floor_lines = [l.strip() for l in floor_body.splitlines() if l.strip()]
|
||||
|
||||
walls = main_walls if self._local_bool(wall_lines, "As Main Wall") else self._wall_details_from_lines(wall_lines)
|
||||
roof = main_roof if self._local_bool(roof_lines, "As Main") else self._roof_details_from_lines(roof_lines)
|
||||
floor = main_floor if self._local_bool(floor_lines, "As Main") else self._floor_details_from_lines(floor_lines)
|
||||
|
||||
extensions.append(
|
||||
ExtensionPart(
|
||||
name=name,
|
||||
construction_age_band=age_bands.get(name, ""),
|
||||
dimensions=BuildingPartDimensions(
|
||||
dimension_type=dim_type,
|
||||
floors=self._floors_from_dimensions_body(dim_body),
|
||||
),
|
||||
walls=walls,
|
||||
roof=roof,
|
||||
floor=floor,
|
||||
)
|
||||
)
|
||||
return extensions
|
||||
|
||||
def _extract_windows(self) -> List[Window]:
|
||||
m = re.search(
|
||||
r"Permanent\s+Shutters\n(.*?)Draught Proofing",
|
||||
|
|
@ -448,4 +573,5 @@ class ElmhurstSiteNotesExtractor:
|
|||
water_heating=self._extract_water_heating(),
|
||||
baths_and_showers=self._extract_baths_and_showers(),
|
||||
renewables=self._extract_renewables(),
|
||||
extensions=self._extract_extensions(),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -34,8 +34,6 @@ import re
|
|||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
|
||||
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
||||
|
||||
|
|
@ -80,15 +78,6 @@ def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
|
|||
return pages
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
reason=(
|
||||
"Elmhurst mapper `from_elmhurst_site_notes` currently produces a "
|
||||
"single SapBuildingPart regardless of the cert's actual count; "
|
||||
"cert 000474 lodges Main + Extension 1 + Extension 2 (3 bps). "
|
||||
"See module docstring for full punch list."
|
||||
),
|
||||
strict=True,
|
||||
)
|
||||
def test_summary_000474_mapper_produces_three_building_parts() -> None:
|
||||
# Arrange — cert U985-0001-000474 is a mid-terrace with 3 building
|
||||
# parts (Main + 2 extensions) per the hand-built worksheet fixture
|
||||
|
|
|
|||
|
|
@ -58,8 +58,12 @@ from datatypes.epc.schema.rdsap_schema_21_0_1 import (
|
|||
EnergyElement as EnergyElement_21_0_1,
|
||||
)
|
||||
from datatypes.epc.surveys.elmhurst_site_notes import (
|
||||
BuildingPartDimensions as ElmhurstBuildingPartDimensions,
|
||||
ElmhurstSiteNotes,
|
||||
FloorDetails as ElmhurstFloorDetails,
|
||||
RoofDetails as ElmhurstRoofDetails,
|
||||
VentilationAndCooling as ElmhurstVentilation,
|
||||
WallDetails as ElmhurstWallDetails,
|
||||
Window as ElmhurstWindow,
|
||||
)
|
||||
from datatypes.epc.surveys.pashub_rdsap_site_notes import (
|
||||
|
|
@ -285,7 +289,7 @@ class EpcPropertyDataMapper:
|
|||
wind_turbines_terrain_type=survey.renewables.wind_turbines_terrain_type,
|
||||
electricity_smart_meter_present=survey.meters.electricity_smart_meter,
|
||||
),
|
||||
sap_building_parts=[_map_elmhurst_building_part(survey)],
|
||||
sap_building_parts=_map_elmhurst_building_parts(survey),
|
||||
solar_water_heating=survey.renewables.solar_water_heating,
|
||||
has_hot_water_cylinder=survey.water_heating.hot_water_cylinder_present,
|
||||
has_fixed_air_conditioning=survey.ventilation.fixed_space_cooling,
|
||||
|
|
@ -1945,8 +1949,17 @@ def _map_sap_ventilation(ventilation: Ventilation) -> SapVentilation:
|
|||
)
|
||||
|
||||
|
||||
def _map_elmhurst_building_part(survey: ElmhurstSiteNotes) -> SapBuildingPart:
|
||||
dims = survey.dimensions
|
||||
def _map_elmhurst_building_part(
|
||||
*,
|
||||
identifier: BuildingPartIdentifier,
|
||||
age_band: str,
|
||||
dimensions: ElmhurstBuildingPartDimensions,
|
||||
walls: ElmhurstWallDetails,
|
||||
roof: ElmhurstRoofDetails,
|
||||
floor: ElmhurstFloorDetails,
|
||||
) -> SapBuildingPart:
|
||||
"""Build a `SapBuildingPart` from one bp's worth of Elmhurst site-
|
||||
notes data. `identifier` distinguishes Main from each extension."""
|
||||
floor_dims = [
|
||||
SapFloorDimension(
|
||||
room_height_m=f.room_height_m,
|
||||
|
|
@ -1955,26 +1968,65 @@ def _map_elmhurst_building_part(survey: ElmhurstSiteNotes) -> SapBuildingPart:
|
|||
heat_loss_perimeter_m=f.heat_loss_perimeter_m,
|
||||
floor=i,
|
||||
)
|
||||
for i, f in enumerate(dims.floors)
|
||||
for i, f in enumerate(dimensions.floors)
|
||||
]
|
||||
return SapBuildingPart(
|
||||
identifier=BuildingPartIdentifier.MAIN,
|
||||
construction_age_band=_strip_code(survey.construction_age_band),
|
||||
wall_construction=_strip_code(survey.walls.wall_type),
|
||||
wall_insulation_type=_strip_code(survey.walls.insulation),
|
||||
wall_thickness_measured=not survey.walls.thickness_unknown,
|
||||
party_wall_construction=_strip_code(survey.walls.party_wall_type),
|
||||
identifier=identifier,
|
||||
construction_age_band=_strip_code(age_band),
|
||||
wall_construction=_strip_code(walls.wall_type),
|
||||
wall_insulation_type=_strip_code(walls.insulation),
|
||||
wall_thickness_measured=not walls.thickness_unknown,
|
||||
party_wall_construction=_strip_code(walls.party_wall_type),
|
||||
sap_floor_dimensions=floor_dims,
|
||||
wall_thickness_mm=survey.walls.thickness_mm,
|
||||
roof_insulation_location=_strip_code(survey.roof.insulation),
|
||||
roof_insulation_thickness=survey.roof.insulation_thickness_mm,
|
||||
floor_type=_strip_code(survey.floor.location),
|
||||
floor_construction_type=_strip_code(survey.floor.floor_type),
|
||||
floor_insulation_type_str=_strip_code(survey.floor.insulation),
|
||||
floor_u_value_known=survey.floor.u_value_known,
|
||||
wall_thickness_mm=walls.thickness_mm,
|
||||
roof_insulation_location=_strip_code(roof.insulation),
|
||||
roof_insulation_thickness=roof.insulation_thickness_mm,
|
||||
floor_type=_strip_code(floor.location),
|
||||
floor_construction_type=_strip_code(floor.floor_type),
|
||||
floor_insulation_type_str=_strip_code(floor.insulation),
|
||||
floor_u_value_known=floor.u_value_known,
|
||||
)
|
||||
|
||||
|
||||
# RdSAP10 §1.2 caps extensions at 4 per dwelling. Indexed allocation
|
||||
# of identifiers in document order.
|
||||
_EXTENSION_IDENTIFIERS: tuple[BuildingPartIdentifier, ...] = (
|
||||
BuildingPartIdentifier.EXTENSION_1,
|
||||
BuildingPartIdentifier.EXTENSION_2,
|
||||
BuildingPartIdentifier.EXTENSION_3,
|
||||
BuildingPartIdentifier.EXTENSION_4,
|
||||
)
|
||||
|
||||
|
||||
def _map_elmhurst_building_parts(survey: ElmhurstSiteNotes) -> List[SapBuildingPart]:
    """Map an Elmhurst survey to one `SapBuildingPart` per building part.

    The first element is always the Main part, built from the survey's
    top-level age band, dimensions, walls, roof and floor. Each entry of
    `survey.extensions` follows in order, paired with the next identifier
    from `_EXTENSION_IDENTIFIERS`. An empty `survey.extensions` therefore
    yields a single-element list, as single-bp certs did before.

    Extensions beyond the available identifiers are not mapped; a
    warning is logged instead of dropping them silently.
    """
    import logging  # local import keeps this block self-contained

    unmapped = len(survey.extensions) - len(_EXTENSION_IDENTIFIERS)
    if unmapped > 0:
        logging.getLogger(__name__).warning(
            "Elmhurst survey lodges %d extensions; only the first %d are "
            "mapped and %d are dropped",
            len(survey.extensions),
            len(_EXTENSION_IDENTIFIERS),
            unmapped,
        )

    parts: List[SapBuildingPart] = [
        _map_elmhurst_building_part(
            identifier=BuildingPartIdentifier.MAIN,
            age_band=survey.construction_age_band,
            dimensions=survey.dimensions,
            walls=survey.walls,
            roof=survey.roof,
            floor=survey.floor,
        )
    ]
    # zip stops at the shorter input, which enforces the identifier cap.
    parts.extend(
        _map_elmhurst_building_part(
            identifier=identifier,
            age_band=ext.construction_age_band,
            dimensions=ext.dimensions,
            walls=ext.walls,
            roof=ext.roof,
            floor=ext.floor,
        )
        for ext, identifier in zip(survey.extensions, _EXTENSION_IDENTIFIERS)
    )
    return parts
|
||||
|
||||
|
||||
def _map_elmhurst_window(w: ElmhurstWindow) -> SapWindow:
|
||||
return SapWindow(
|
||||
frame_material=w.frame_type or None,
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
from dataclasses import dataclass
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import date
|
||||
from typing import List, Optional
|
||||
|
||||
|
|
@ -184,6 +184,21 @@ class Renewables:
|
|||
hydro_electricity_generated_kwh: float
|
||||
|
||||
|
||||
@dataclass
class ExtensionPart:
    """One additional building part lodged on a multi-bp cert.

    On the Elmhurst Summary PDF these appear as "1st Extension",
    "2nd Extension", and so on. The fields mirror the per-bp fabric
    fields that ElmhurstSiteNotes carries at top level for the main
    dwelling.
    """

    # Header text of the part, e.g. "1st Extension", "2nd Extension".
    name: str
    # Age band of this part, e.g. "B 1900-1929"; may differ from main.
    construction_age_band: str
    dimensions: BuildingPartDimensions
    walls: WallDetails
    roof: RoofDetails
    floor: FloorDetails
|
||||
|
||||
|
||||
@dataclass
|
||||
class ElmhurstSiteNotes:
|
||||
surveyor_info: SurveyorInfo
|
||||
|
|
@ -245,3 +260,11 @@ class ElmhurstSiteNotes:
|
|||
|
||||
# Sections 16.0–22.0
|
||||
renewables: Renewables
|
||||
|
||||
# Additional building parts beyond the main dwelling. The singular
|
||||
# `dimensions`, `walls`, `roof`, `floor`, and `construction_age_band`
|
||||
# fields above describe the "Main" property; each ExtensionPart in
|
||||
# this list describes a discrete extension with its own age band,
|
||||
# dimensions, and fabric details. Empty list = single-bp cert
|
||||
# (preserves backward compatibility with the existing fixture).
|
||||
extensions: List[ExtensionPart] = field(default_factory=lambda: []) # type: ignore[reportUnknownLambdaType]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue