fix(mapper): map Elmhurst "Value known" cylinder to measured volume (code 6)

The Elmhurst Summary §15.1 lodges "Cylinder Size: Value known" with the
measured volume in the "Cylinder Volume (l)" line — the Summary-path
equivalent of the gov-API "Exact" descriptor. The mapper had no entry for
"Value known" so `_elmhurst_cylinder_size_code` raised UnmappedElmhurstLabel,
and even once mapped the measured volume was never threaded through, so the
cascade dropped the cylinder storage loss (~468 kWh/yr) from (219) water
heating on every measured-volume-cylinder Summary.

Per RdSAP 10 §10.5 Table 28 (p.55) a measured cylinder volume is used
directly. Map "Value known" → cascade code 6 (Exact) and thread the §15.1
"Cylinder Volume (l)" value into SapHeating.cylinder_volume_measured_l, which
`_cylinder_volume_l_from_code` (cert_to_inputs.py:5281) already reads for
code 6 — mirroring the gov-API path (mapper.py:1575/1885).

Pins simulated case 39 (P960-0001-001431): an age-A mid-terrace on direct-
acting electric room heaters (SAP code 691, cat 10, control 2602) with
electric-immersion DHW off a 117 L "Value known" cylinder. The full
extractor→mapper→calculator cascade now reproduces the worksheet's SAP-rating
block EXACTLY — SAP value 36.6365 (band F) and (272) CO2 2056.0731 kg/yr,
with (219) water heating 2637.5049 and (255) total energy cost 1802.0039.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-06-15 23:57:25 +00:00
parent 1fe67fe814
commit b2b6f8e954
6 changed files with 214 additions and 0 deletions

View file

@ -1528,6 +1528,18 @@ class ElmhurstSiteNotesExtractor:
first = cylinder_ins_thickness_raw.split()[0]
if first.isdigit():
cylinder_insulation_thickness_mm = int(first)
# §15.1 "Cylinder Volume (l)" — the measured volume lodged alongside
# a "Value known" Cylinder Size. The value is written as a decimal
# ("117.00"); take the integer part for the cascade's measured-volume
# field (gov-API "Exact" descriptor, code 6).
cylinder_volume_raw = self._local_val(cylinder_lines, "Cylinder Volume (l)")
cylinder_volume_measured_l: Optional[int] = None
if cylinder_volume_raw:
first = cylinder_volume_raw.split()[0]
try:
cylinder_volume_measured_l = int(float(first))
except ValueError:
cylinder_volume_measured_l = None
cylinder_thermostat_raw = self._local_val(
cylinder_lines, "Cylinder Thermostat",
)
@ -1560,6 +1572,7 @@ class ElmhurstSiteNotesExtractor:
cylinder_size_label=cylinder_size_label,
cylinder_insulation_label=cylinder_insulation_label,
cylinder_insulation_thickness_mm=cylinder_insulation_thickness_mm,
cylinder_volume_measured_l=cylinder_volume_measured_l,
cylinder_thermostat=cylinder_thermostat,
immersion_type=immersion_type,
)

Binary file not shown.

View file

@ -5942,6 +5942,13 @@ def _elmhurst_cylinder_size_code(
Table 28 page 55."""
if not cylinder_present or cylinder_size_label is None:
return None
if cylinder_size_label == "Value known":
# Measured-volume cylinder — the Summary-path equivalent of the
# gov-API "Exact" descriptor. RdSAP 10 §10.5 Table 28 (p.55): when
# the cylinder volume is measured it is used directly. Cascade code
# 6 routes `_cylinder_volume_l_from_code` to the lodged
# `cylinder_volume_measured_l` (`cert_to_inputs.py:5281`).
return 6 # Exact / measured volume
if cylinder_size_label == "No Access":
if water_heating_fuel_label is None or meter_type_label is None:
raise UnmappedElmhurstLabel(
@ -6587,6 +6594,14 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating:
),
cylinder_insulation_type=cylinder_insulation_type_field,
cylinder_insulation_thickness_mm=cylinder_insulation_thickness_mm_field,
# §15.1 "Cylinder Volume (l)" — measured volume for a "Value known"
# cylinder (cascade code 6 / Exact). None unless a cylinder is
# present; the cascade reads it only when `cylinder_size == 6`.
cylinder_volume_measured_l=(
survey.water_heating.cylinder_volume_measured_l
if survey.water_heating.hot_water_cylinder_present
else None
),
# Cascade reads `cylinder_thermostat == "Y"` (string compare) per
# `cert_to_inputs.py:2252` / `:2218`. Map the bool to the Y/N
# string the cascade expects; None when no cylinder is present.

View file

@ -369,6 +369,11 @@ class WaterHeating:
cylinder_insulation_label: Optional[str] = None
# §15.1 "Insulation Thickness" lodging in mm (an integer or None).
cylinder_insulation_thickness_mm: Optional[int] = None
# §15.1 "Cylinder Volume (l)" lodging — the measured cylinder volume in
# litres, present when "Cylinder Size" is lodged as "Value known"
# (the Summary-path equivalent of the gov-API "Exact" descriptor,
# cascade code 6). None when no cylinder is present or the line is absent.
cylinder_volume_measured_l: Optional[int] = None
# §15.1 "Cylinder Thermostat" lodging (Yes / No). False or absent
# keeps the cascade's no-thermostat Table 2b temperature factor.
cylinder_thermostat: Optional[bool] = None

View file

@ -0,0 +1,60 @@
"""Mapper boundary: the Elmhurst §15.1 "Cylinder Size" label.
A cylinder lodged "Value known" carries a measured volume in the §15.1
"Cylinder Volume (l)" line — the Summary-path equivalent of the gov-API
"Exact" descriptor. Per RdSAP 10 §10.5 Table 28 (p.55) the measured volume
is used directly; cascade code 6 routes `_cylinder_volume_l_from_code` to
the lodged `cylinder_volume_measured_l`. Before this was mapped the label
raised `UnmappedElmhurstLabel`, blocking every measured-volume-cylinder
Summary.
"""
from datatypes.epc.domain.mapper import (
UnmappedElmhurstLabel,
_elmhurst_cylinder_size_code, # pyright: ignore[reportPrivateUsage]
)
def test_value_known_label_maps_to_exact_code_6() -> None:
    """A present cylinder lodged "Value known" maps to size code 6."""
    # Arrange
    size_label = "Value known"
    expected_code = 6

    # Act
    mapped = _elmhurst_cylinder_size_code(size_label, cylinder_present=True)

    # Assert
    assert mapped == expected_code
def test_value_known_label_with_no_cylinder_maps_to_none() -> None:
    """With no cylinder present, the "Value known" label yields None."""
    # Arrange
    size_label = "Value known"

    # Act
    mapped = _elmhurst_cylinder_size_code(size_label, cylinder_present=False)

    # Assert
    assert mapped is None
def test_normal_label_still_maps_to_code_2() -> None:
    """Regression guard: the "Normal" label keeps mapping to code 2."""
    # Arrange
    size_label = "Normal"
    expected_code = 2

    # Act
    mapped = _elmhurst_cylinder_size_code(size_label, cylinder_present=True)

    # Assert
    assert mapped == expected_code
def test_unknown_label_still_raises() -> None:
    """An unrecognised label must still raise UnmappedElmhurstLabel."""
    # Arrange
    bogus_label = "Spray-on unicorn cylinder"
    raised_unmapped = False

    # Act
    try:
        _elmhurst_cylinder_size_code(bogus_label, cylinder_present=True)
    except UnmappedElmhurstLabel:
        raised_unmapped = True

    # Assert — any other exception type propagates and fails the test.
    if not raised_unmapped:
        raise AssertionError("expected UnmappedElmhurstLabel for an unknown label")

View file

@ -0,0 +1,121 @@
"""Mapper-driven cascade pin against the Elmhurst P960-0001-001431
"simulated case 39" worksheet — an age-A (pre-1900) mid-terrace heated by
**direct-acting electric room heaters** (SAP code 691, category 10, control
2602 appliance thermostats), with an electric room-heater secondary (also
691) and electric-immersion DHW (WHC 903) off a **measured-volume hot-water
cylinder** ("Cylinder Size: Value known", 117 L, foam 38 mm), on a single
(standard) electricity meter.
This case was generated to probe the API-corpus's worst-served cohort
(category-10 direct-acting electric, 46% within-0.5). It exposed a real
Summary-path gap: the §15.1 "Cylinder Size: Value known" lodging (the
Summary equivalent of the gov-API "Exact" descriptor) was unmapped, so the
extractor/mapper raised `UnmappedElmhurstLabel` — and, once that was mapped,
the measured "Cylinder Volume (l)" was not threaded through, dropping the
cylinder storage loss (~468 kWh/yr) from (219) water heating. Wiring the
measured volume (cascade code 6 → `_cylinder_volume_l_from_code`) closes the
whole cascade EXACTLY.
Like 000565 / the _rr cases / case 20 / 21 / 38, this fixture does NOT hand-
build the EpcPropertyData: it routes the Summary PDF through
ElmhurstSiteNotesExtractor + from_elmhurst_site_notes so the pin exercises
the WHOLE extractor + mapper + calculator pipeline.
Source: user-simulated PDFs at `sap worksheets/golden fixture debugging/
simulated case 39/`. The Summary is mirrored into the tracked
`backend/documents_parser/tests/fixtures/Summary_001431_case39.pdf` so the
test runs without depending on the unstaged workspace.
Worksheet pin targets (P960-0001-001431, "11a. SAP rating" / "12a. CO2
emissions" block — the UK-average-climate rating block our cascade
reproduces; the P960's separate postcode-climate EPC block (272)=1803.19 is
a known regional-climate gap, not a SAP-rating divergence):
- SAP value (un-rounded, before (258) integer rounding) = 36.6365 (band F)
- (272) Total CO2, kg/year = 2056.0731
Per [[feedback-zero-error-strict]] + [[feedback-continuous-sap-tolerance]]:
pins are abs <= 1e-3 against the worksheet PDF (printed to 4 dp).
"""
from __future__ import annotations
import re
import subprocess
from pathlib import Path
from typing import Final
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from domain.sap10_calculator.calculator import calculate_sap_from_inputs
from domain.sap10_calculator.rdsap.cert_to_inputs import cert_to_inputs
# Path to the case-39 Summary PDF fixture, resolved relative to this file so
# it does not depend on the current working directory.
# parents[0]=worksheet/, [1]=sap10_calculator/, [2]=domain/, [3]=tests/,
# [4]=repo root.
_SUMMARY_PDF: Final[Path] = (
Path(__file__).resolve().parents[4]
/ "backend" / "documents_parser" / "tests" / "fixtures"
/ "Summary_001431_case39.pdf"
)
# Worksheet pin targets (see the module docstring): the un-rounded SAP value
# and the (272) total CO2 in kg/year that the cascade result is compared to.
LINE_258_SAP_VALUE_CONTINUOUS: Final[float] = 36.6365
LINE_272_TOTAL_CO2_KG_PER_YR: Final[float] = 2056.0731
# Absolute tolerance applied to both pins in the test below.
_PIN_ABS: Final[float] = 1e-3
def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
    """Render a Summary PDF as one newline-joined token string per page.

    The page count is read from ``pdfinfo``; each page is then dumped with
    ``pdftotext -layout`` and every line is split into cells on runs of two
    or more whitespace characters. A blank line contributes a single empty
    token. The result is the per-page text handed to
    ElmhurstSiteNotesExtractor.

    Raises:
        RuntimeError: if the ``pdfinfo`` output has no ``Pages:`` entry.
        subprocess.CalledProcessError: if either tool exits non-zero.
    """

    def _line_tokens(raw_line: str) -> list[str]:
        # A blank layout line becomes one empty token; any other line is
        # split into its non-empty cells.
        trimmed = raw_line.strip()
        if not trimmed:
            return [""]
        return [cell for cell in re.split(r"\s{2,}", trimmed) if cell]

    pdfinfo_output = subprocess.run(
        ["pdfinfo", str(pdf_path)], capture_output=True, text=True, check=True,
    ).stdout
    pages_match = re.search(r"Pages:\s+(\d+)", pdfinfo_output)
    if pages_match is None:
        raise RuntimeError(f"Could not parse page count from {pdf_path}")
    total_pages = int(pages_match.group(1))

    rendered_pages: list[str] = []
    for page_number in range(1, total_pages + 1):
        page_arg = str(page_number)
        # "-f N -l N" restricts pdftotext to one page; "-" writes to stdout.
        page_layout = subprocess.run(
            [
                "pdftotext", "-layout", "-f", page_arg, "-l", page_arg,
                str(pdf_path), "-",
            ],
            capture_output=True, text=True, check=True,
        ).stdout
        page_tokens = [
            token
            for raw_line in page_layout.splitlines()
            for token in _line_tokens(raw_line)
        ]
        rendered_pages.append("\n".join(page_tokens))
    return rendered_pages
def build_epc() -> EpcPropertyData:
    """Build the case-39 EpcPropertyData via the extractor and the mapper.

    Nothing is hand-built here — the extractor and mapper are themselves
    part of what the pin test exercises.
    """
    extractor = ElmhurstSiteNotesExtractor(
        _summary_pdf_to_textract_style_pages(_SUMMARY_PDF)
    )
    return EpcPropertyDataMapper.from_elmhurst_site_notes(extractor.extract())
def test_case39_measured_volume_cylinder_reproduces_the_worksheet_sap_and_co2() -> None:
    # Arrange — run the simulated case-39 Summary through the extractor and
    # mapper (direct-electric room heaters + electric immersion DHW off a
    # "Value known" 117 L measured-volume cylinder).
    property_data = build_epc()

    # Act — convert to calculator inputs and run the calculation.
    calculator_inputs = cert_to_inputs(property_data)
    outcome = calculate_sap_from_inputs(calculator_inputs)

    # Assert — both worksheet pins hold within the absolute tolerance.
    sap_error = abs(outcome.sap_score_continuous - LINE_258_SAP_VALUE_CONTINUOUS)
    assert sap_error <= _PIN_ABS
    co2_error = abs(outcome.co2_kg_per_yr - LINE_272_TOTAL_CO2_KG_PER_YR)
    assert co2_error <= _PIN_ABS