diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index bd37d60d..c859ce4f 100644 --- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -28,14 +28,17 @@ Textract directly. from __future__ import annotations +import dataclasses import re import subprocess from pathlib import Path +from typing import cast from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor from datatypes.epc.domain.mapper import EpcPropertyDataMapper from domain.sap.calculator import calculate_sap_from_inputs from domain.sap.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES, cert_to_inputs +from domain.sap.worksheet.tests import _elmhurst_worksheet_000474 as _w000474 _FIXTURES = Path(__file__).parent / "fixtures" _SUMMARY_000474_PDF = _FIXTURES / "Summary_000474.pdf" @@ -378,3 +381,151 @@ def test_summary_001479_full_chain_sap_matches_worksheet_pdf_exactly() -> None: # `feedback_zero_error_strict`). worksheet_unrounded_sap = 69.0094 assert abs(result.sap_score_continuous - worksheet_unrounded_sap) < 1e-4 + + +# ============================================================================ +# Mapper-vs-hand-built EpcPropertyData diff tests +# ============================================================================ +# The 6 cohort hand-builts (_elmhurst_worksheet_NNNNNN.build_epc) are the +# 100%-correct calculator-input ground truth — each cascades to its +# worksheet PDF's lodged SAP at 1e-4. The chain tests above only assert +# cascade-output equivalence; the mapper can pass them by producing a +# *different* EpcPropertyData that happens to cascade to the same number. +# +# These tests pin the missing layer: the mapper's EpcPropertyData must +# match the hand-built's load-bearing fields exactly. Every divergence +# surfaced here is a mapper coverage gap to close as its own slice. 
#
# "Load-bearing" = the subset of EpcPropertyData fields that drive the
# SAP cascade or carry semantic cross-mapper meaning. Cert-metadata
# fields (address, registration dates, descriptive EnergyElement lists,
# tariff strings) are excluded because they don't change calculator
# output and vary by mapper pathway (the API publishes some, the
# Elmhurst Summary publishes others) without semantic disagreement.

_LOAD_BEARING_FIELDS: tuple[str, ...] = (
    # Cascade-driving structural fields
    "sap_building_parts",
    "sap_windows",
    "sap_roof_windows",
    "sap_heating",
    "sap_ventilation",
    "sap_energy_source",
    "total_floor_area_m2",
    # Building-classification fields driving default cascades
    "dwelling_type",
    "built_form",
    "property_type",
    "country_code",
    "postcode",
    # Counts and openings
    "door_count",
    "insulated_door_count",
    "insulated_door_u_value",
    "habitable_rooms_count",
    "heated_rooms_count",
    "wet_rooms_count",
    "extensions_count",
    "open_chimneys_count",
    "blocked_chimneys_count",
    "extract_fans_count",
    # Lighting
    "cfl_fixed_lighting_bulbs_count",
    "led_fixed_lighting_bulbs_count",
    "incandescent_fixed_lighting_bulbs_count",
    "low_energy_fixed_lighting_bulbs_count",
    "fixed_lighting_outlets_count",
    "low_energy_fixed_lighting_outlets_count",
    # HW / appliances
    "solar_water_heating",
    "has_hot_water_cylinder",
    "has_fixed_air_conditioning",
    "has_conservatory",
    "has_heated_separate_conservatory",
    # Envelope drivers
    "percent_draughtproofed",
    "mechanical_ventilation",
    "pressure_test",
    # Construction-detail flags
    "addendum",
    "lzc_energy_sources",
    "any_unheated_rooms",
    "number_of_storeys",
    "sap_flat_details",
)


def _is_plain_number(value: object) -> bool:
    """Return True for int/float values, excluding bool.

    `bool` subclasses `int`, so a bare `isinstance(value, (int, float))`
    would let `True` vs `1` slip through the numeric-type leniency.
    """
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _diff_load_bearing(
    mapped: object, hand_built: object, path: str = "",
) -> list[str]:
    """Recursively diff a mapped value against its hand-built counterpart.

    Dataclass instances are compared field by field and lists element by
    element; every other value is compared with `!=`. Int/float type
    differences with the same numeric value are not flagged, but a bool
    paired with a non-bool is always reported as a TYPE divergence.

    Strict-pyright posture: arguments typed `object` so each branch
    narrows via `isinstance` rather than threading `Any` through the
    recursion (which pyright can't reason about under
    `strict`/`typeCheckingMode = strict`).

    Args:
        mapped: Value taken from the mapper-produced object.
        hand_built: Value taken from the hand-built fixture.
        path: Dotted/indexed location of the value, used to prefix each
            reported line.

    Returns:
        One line per leaf divergence; an empty list when the two values
        are equivalent.
    """
    out: list[str] = []
    if type(mapped) is not type(hand_built):
        # Only a genuine int-vs-float pairing is tolerated; it falls
        # through to the value comparison at the bottom.
        if not (_is_plain_number(mapped) and _is_plain_number(hand_built)):
            out.append(
                f"{path}: TYPE {type(mapped).__name__} vs "
                f"{type(hand_built).__name__} mapped={mapped!r} "
                f"handbuilt={hand_built!r}"
            )
            return out
    if (
        dataclasses.is_dataclass(mapped)
        and not isinstance(mapped, type)
        and dataclasses.is_dataclass(hand_built)
        and not isinstance(hand_built, type)
    ):
        for fld in dataclasses.fields(mapped):
            out.extend(_diff_load_bearing(
                getattr(mapped, fld.name),
                getattr(hand_built, fld.name),
                f"{path}.{fld.name}" if path else fld.name,
            ))
        return out
    if isinstance(mapped, list) and isinstance(hand_built, list):
        mapped_list = cast("list[object]", mapped)
        hand_built_list = cast("list[object]", hand_built)
        if len(mapped_list) != len(hand_built_list):
            # Element-wise pairing is meaningless once lengths differ,
            # so report the length mismatch alone.
            out.append(f"{path}: LEN {len(mapped_list)} vs {len(hand_built_list)}")
            return out
        for i, (m_item, h_item) in enumerate(zip(mapped_list, hand_built_list)):
            out.extend(_diff_load_bearing(m_item, h_item, f"{path}[{i}]"))
        return out
    if mapped != hand_built:
        out.append(f"{path}: mapped={mapped!r} handbuilt={hand_built!r}")
    return out


def test_from_elmhurst_site_notes_matches_hand_built_000474() -> None:
    # Arrange — _elmhurst_worksheet_000474.build_epc() is the canonical
    # hand-built EpcPropertyData for cert U985-0001-000474; it cascades
    # to the worksheet PDF's `SAP value 62.2584` at 1e-4 (cohort SAP-
    # result pin). Routing the corresponding Summary PDF through the
    # Elmhurst mapper MUST produce a load-bearing-field-equivalent
    # EpcPropertyData; any divergence is a mapper-coverage gap.
    #
    # Tracer-bullet scope: cert 000474 only. Once GREEN, parametrize
    # over the 5 other cohort fixtures and add cert 001479 (after
    # `_elmhurst_worksheet_001479` lands at 1e-4 via Slice 62 iteration).
    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000474_PDF)
    site_notes = ElmhurstSiteNotesExtractor(pages).extract()
    mapped = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
    hand_built = _w000474.build_epc()

    # Act
    diffs: list[str] = []
    for field_name in _LOAD_BEARING_FIELDS:
        # A name absent from either object must surface as a divergence:
        # defaulting both sides to None would make a misspelled or
        # removed field compare as vacuously equal.
        missing_on = [
            label
            for label, obj in (("mapped", mapped), ("handbuilt", hand_built))
            if not hasattr(obj, field_name)
        ]
        if missing_on:
            diffs.append(
                f"{field_name}: MISSING attribute on {', '.join(missing_on)}"
            )
            continue
        diffs.extend(_diff_load_bearing(
            getattr(mapped, field_name),
            getattr(hand_built, field_name),
            field_name,
        ))

    # Assert
    assert not diffs, (
        f"{len(diffs)} load-bearing divergence(s) between mapped and "
        f"hand-built EpcPropertyData for cohort cert 000474:\n "
        + "\n ".join(diffs)
    )