diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index 55dd04d6..11e94fba 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -1528,6 +1528,18 @@ class ElmhurstSiteNotesExtractor: first = cylinder_ins_thickness_raw.split()[0] if first.isdigit(): cylinder_insulation_thickness_mm = int(first) + # §15.1 "Cylinder Volume (l)" — the measured volume lodged alongside + # a "Value known" Cylinder Size. The value is written as a decimal + # ("117.00"); take the integer part for the cascade's measured-volume + # field (gov-API "Exact" descriptor, code 6). + cylinder_volume_raw = self._local_val(cylinder_lines, "Cylinder Volume (l)") + cylinder_volume_measured_l: Optional[int] = None + if cylinder_volume_raw: + first = cylinder_volume_raw.split()[0] + try: + cylinder_volume_measured_l = int(float(first)) + except ValueError: + cylinder_volume_measured_l = None cylinder_thermostat_raw = self._local_val( cylinder_lines, "Cylinder Thermostat", ) @@ -1560,6 +1572,7 @@ class ElmhurstSiteNotesExtractor: cylinder_size_label=cylinder_size_label, cylinder_insulation_label=cylinder_insulation_label, cylinder_insulation_thickness_mm=cylinder_insulation_thickness_mm, + cylinder_volume_measured_l=cylinder_volume_measured_l, cylinder_thermostat=cylinder_thermostat, immersion_type=immersion_type, ) diff --git a/backend/documents_parser/tests/fixtures/Summary_001431_case39.pdf b/backend/documents_parser/tests/fixtures/Summary_001431_case39.pdf new file mode 100644 index 00000000..137985f2 Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_001431_case39.pdf differ diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index 06771a7e..280806d1 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -5942,6 +5942,13 @@ def _elmhurst_cylinder_size_code( Table 28 page 55.""" if not cylinder_present 
or cylinder_size_label is None: return None + if cylinder_size_label == "Value known": + # Measured-volume cylinder — the Summary-path equivalent of the + # gov-API "Exact" descriptor. RdSAP 10 §10.5 Table 28 (p.55): when + # the cylinder volume is measured it is used directly. Cascade code + # 6 routes `_cylinder_volume_l_from_code` to the lodged + # `cylinder_volume_measured_l` (`cert_to_inputs.py:5281`). + return 6 # Exact / measured volume if cylinder_size_label == "No Access": if water_heating_fuel_label is None or meter_type_label is None: raise UnmappedElmhurstLabel( @@ -6587,6 +6594,14 @@ def _map_elmhurst_sap_heating(survey: ElmhurstSiteNotes) -> SapHeating: ), cylinder_insulation_type=cylinder_insulation_type_field, cylinder_insulation_thickness_mm=cylinder_insulation_thickness_mm_field, + # §15.1 "Cylinder Volume (l)" — measured volume for a "Value known" + # cylinder (cascade code 6 / Exact). None unless a cylinder is + # present; the cascade reads it only when `cylinder_size == 6`. + cylinder_volume_measured_l=( + survey.water_heating.cylinder_volume_measured_l + if survey.water_heating.hot_water_cylinder_present + else None + ), # Cascade reads `cylinder_thermostat == "Y"` (string compare) per # `cert_to_inputs.py:2252` / `:2218`. Map the bool to the Y/N # string the cascade expects; None when no cylinder is present. diff --git a/datatypes/epc/surveys/elmhurst_site_notes.py b/datatypes/epc/surveys/elmhurst_site_notes.py index 3d5b2b21..eded346f 100644 --- a/datatypes/epc/surveys/elmhurst_site_notes.py +++ b/datatypes/epc/surveys/elmhurst_site_notes.py @@ -369,6 +369,11 @@ class WaterHeating: cylinder_insulation_label: Optional[str] = None # §15.1 "Insulation Thickness" lodging in mm (an integer or None). 
cylinder_insulation_thickness_mm: Optional[int] = None + # §15.1 "Cylinder Volume (l)" lodging — the measured cylinder volume in + # litres, present when "Cylinder Size" is lodged as "Value known" + # (the Summary-path equivalent of the gov-API "Exact" descriptor, + # cascade code 6). None when no cylinder is present or the line is absent. + cylinder_volume_measured_l: Optional[int] = None # §15.1 "Cylinder Thermostat" lodging (Yes / No). False or absent # keeps the cascade's no-thermostat Table 2b temperature factor. cylinder_thermostat: Optional[bool] = None diff --git a/tests/datatypes/epc/domain/test_mapper_cylinder_size.py b/tests/datatypes/epc/domain/test_mapper_cylinder_size.py new file mode 100644 index 00000000..b8d03a9a --- /dev/null +++ b/tests/datatypes/epc/domain/test_mapper_cylinder_size.py @@ -0,0 +1,60 @@ +"""Mapper boundary: the Elmhurst §15.1 "Cylinder Size" label. + +A cylinder lodged "Value known" carries a measured volume in the §15.1 +"Cylinder Volume (l)" line — the Summary-path equivalent of the gov-API +"Exact" descriptor. Per RdSAP 10 §10.5 Table 28 (p.55) the measured volume +is used directly; cascade code 6 routes `_cylinder_volume_l_from_code` to +the lodged `cylinder_volume_measured_l`. Before this was mapped the label +raised `UnmappedElmhurstLabel`, blocking every measured-volume-cylinder +Summary. 
+"""
+
+from datatypes.epc.domain.mapper import (
+    UnmappedElmhurstLabel,
+    _elmhurst_cylinder_size_code,  # pyright: ignore[reportPrivateUsage]
+)
+
+
+def test_value_known_label_maps_to_exact_code_6() -> None:
+    # Arrange — the measured-volume lodging on a dwelling with a cylinder.
+    size_label = "Value known"
+
+    # Act
+    mapped = _elmhurst_cylinder_size_code(size_label, cylinder_present=True)
+
+    # Assert — 6 is the Exact / measured-volume code.
+    assert mapped == 6
+
+
+def test_value_known_label_with_no_cylinder_maps_to_none() -> None:
+    # Arrange — same label, but there is no cylinder to size.
+    size_label = "Value known"
+
+    # Act
+    mapped = _elmhurst_cylinder_size_code(size_label, cylinder_present=False)
+
+    # Assert — the absent-cylinder guard takes precedence over the label.
+    assert mapped is None
+
+
+def test_normal_label_still_maps_to_code_2() -> None:
+    # Arrange — a previously supported label, kept as a regression guard.
+    size_label = "Normal"
+
+    # Act
+    mapped = _elmhurst_cylinder_size_code(size_label, cylinder_present=True)
+
+    # Assert
+    assert mapped == 2
+
+
+def test_unknown_label_still_raises() -> None:
+    # Arrange
+    bogus_label = "Spray-on unicorn cylinder"
+
+    # Act / Assert — returning normally from the call is the failure case.
+    try:
+        _elmhurst_cylinder_size_code(bogus_label, cylinder_present=True)
+    except UnmappedElmhurstLabel:
+        return
+    raise AssertionError("expected UnmappedElmhurstLabel for an unknown label")
diff --git a/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431_case39.py b/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431_case39.py
new file mode 100644
index 00000000..88547fbc
--- /dev/null
+++ b/tests/domain/sap10_calculator/worksheet/_elmhurst_worksheet_001431_case39.py
@@ -0,0 +1,121 @@
+"""Mapper-driven cascade pin against the Elmhurst P960-0001-001431
+"simulated case 39" worksheet — an age-A (pre-1900) mid-terrace heated by
+**direct-acting electric room heaters** (SAP code 691, category 10, control
+2602 appliance thermostats), with an electric room-heater secondary (also
+691) and electric-immersion DHW (WHC 903) off a **measured-volume hot-water
+cylinder** ("Cylinder Size: Value known", 117 L, foam 38 mm), on a single
+(standard) electricity meter.
+ +This case was generated to probe the API-corpus's worst-served cohort +(category-10 direct-acting electric, 46% within-0.5). It exposed a real +Summary-path gap: the §15.1 "Cylinder Size: Value known" lodging (the +Summary equivalent of the gov-API "Exact" descriptor) was unmapped, so the +extractor/mapper raised `UnmappedElmhurstLabel` and — once that was mapped — +the measured "Cylinder Volume (l)" was not threaded through, dropping the +cylinder storage loss (~468 kWh/yr) from (219) water heating. Wiring the +measured volume (cascade code 6 → `_cylinder_volume_l_from_code`) closes the +whole cascade EXACTLY. + +Like 000565 / the _rr cases / case 20 / 21 / 38, this fixture does NOT hand- +build the EpcPropertyData: it routes the Summary PDF through +ElmhurstSiteNotesExtractor + from_elmhurst_site_notes so the pin exercises +the WHOLE extractor + mapper + calculator pipeline. + +Source: user-simulated PDFs at `sap worksheets/golden fixture debugging/ +simulated case 39/`. The Summary is mirrored into the tracked +`backend/documents_parser/tests/fixtures/Summary_001431_case39.pdf` so the +test runs without depending on the unstaged workspace. + +Worksheet pin targets (P960-0001-001431, "11a. SAP rating" / "12a. CO2 +emissions" block — the UK-average-climate rating block our cascade +reproduces; the P960's separate postcode-climate EPC block (272)=1803.19 is +a known regional-climate gap, not a SAP-rating divergence): +- SAP value (un-rounded, before (258) integer rounding) = 36.6365 (band F) +- (272) Total CO2, kg/year = 2056.0731 + +Per [[feedback-zero-error-strict]] + [[feedback-continuous-sap-tolerance]]: +pins are abs <= 1e-3 against the worksheet PDF (printed to 4 dp). 
+"""
+
+from __future__ import annotations
+
+import re
+import subprocess
+from pathlib import Path
+from typing import Final
+
+from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
+from datatypes.epc.domain.epc_property_data import EpcPropertyData
+from datatypes.epc.domain.mapper import EpcPropertyDataMapper
+from domain.sap10_calculator.calculator import calculate_sap_from_inputs
+from domain.sap10_calculator.rdsap.cert_to_inputs import cert_to_inputs
+
+# parents[0]=worksheet/, [1]=sap10_calculator/, [2]=domain/, [3]=tests/,
+# [4]=repo root.
+_SUMMARY_PDF: Final[Path] = (
+    Path(__file__).resolve().parents[4]
+    / "backend" / "documents_parser" / "tests" / "fixtures"
+    / "Summary_001431_case39.pdf"
+)
+
+LINE_258_SAP_VALUE_CONTINUOUS: Final[float] = 36.6365
+LINE_272_TOTAL_CO2_KG_PER_YR: Final[float] = 2056.0731
+_PIN_ABS: Final[float] = 1e-3
+
+
+def _summary_pdf_to_textract_style_pages(pdf_path: Path) -> list[str]:
+    """Convert a Summary PDF into the per-page text format the
+    ElmhurstSiteNotesExtractor expects (label/value token sequences).
+    Based on the helper in the other `_elmhurst_worksheet_*` fixtures, but
+    with the tool output pinned to UTF-8 (`-enc`) and decoded as UTF-8.
+    """
+    info = subprocess.run(
+        ["pdfinfo", "-enc", "UTF-8", str(pdf_path)],
+        capture_output=True, check=True, encoding="utf-8",
+    ).stdout
+    m = re.search(r"Pages:\s+(\d+)", info)
+    if m is None:
+        raise RuntimeError(f"Could not parse page count from {pdf_path}")
+    pages: list[str] = []
+    for page_no in range(1, int(m.group(1)) + 1):
+        page = str(page_no)
+        layout = subprocess.run(
+            ["pdftotext", "-layout", "-enc", "UTF-8", "-f", page, "-l", page,
+             str(pdf_path), "-"],
+            capture_output=True, check=True, encoding="utf-8",
+        ).stdout
+        tokens: list[str] = []
+        for line in layout.splitlines():
+            stripped = line.strip()
+            if not stripped:
+                tokens.append("")  # a blank layout line stays an empty token
+                continue
+            tokens.extend(p for p in re.split(r"\s{2,}", stripped) if p)
+        pages.append("\n".join(tokens))
+    return pages
+
+
+def build_epc() -> EpcPropertyData:
+    """Route the simulated case-39 Summary through extractor + mapper.
+    No hand-built EpcPropertyData — the extractor and mapper are part of
+    the test target."""
+    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_PDF)
+    site_notes = ElmhurstSiteNotesExtractor(pages).extract()
+    return EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
+
+
+def test_case39_measured_volume_cylinder_reproduces_the_worksheet_sap_and_co2() -> None:
+    # Arrange — the full extractor -> mapper -> calculator pipeline on the
+    # simulated case-39 Summary (direct-electric room heaters + electric
+    # immersion DHW off a "Value known" 117 L measured-volume cylinder).
+    epc = build_epc()
+
+    # Act
+    result = calculate_sap_from_inputs(cert_to_inputs(epc))
+
+    # Assert — the SAP-rating block reproduces the worksheet exactly.
+    assert (
+        abs(result.sap_score_continuous - LINE_258_SAP_VALUE_CONTINUOUS)
+        <= _PIN_ABS
+    )
+    assert abs(result.co2_kg_per_yr - LINE_272_TOTAL_CO2_KG_PER_YR) <= _PIN_ABS