From 09fb6f1b733ba4948cdea33dd91488d91be37844 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 26 May 2026 13:34:51 +0000 Subject: [PATCH] fix: address 22 project-wide test failures from previous sweep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three orthogonal issues surfaced by the full project test sweep: 1. Dockerfile.test: install poppler-utils alongside postgresql. The 20× `pdfinfo: No such file or directory` failures in test_summary_pdf_mapper_chain.py were traced to the CI test image missing the poppler-utils system package (pdfinfo + pdftotext). `_summary_pdf_to_textract_style_pages` shells out to these for layout-preserving PDF text extraction. Pure-Python alternatives (pymupdf, pypdf) don't reproduce pdftotext -layout's row-major table cell ordering, which the Elmhurst Summary extractor depends on. So system poppler is the right fix; added to apt-get install with an explanatory comment. 2. test_from_rdsap_schema.py::test_total_floor_area: expected 55.0, got 45.82. Slice 95 (commit f502db8c) changed the API mapper to compute total_floor_area_m2 from the precise sum of per-bp sap_floor_dimensions[*].total_floor_area rather than the lodged scalar. The synthetic 21_0_1.json fixture has lodged total_floor_area=55 plus a single floor dimension (fd) of 45.82 (per-bp sum doesn't match lodged). Updated the expected value to 45.82 with a comment explaining the Slice 95 per-bp-sum precedence. 3. test_elmhurst_end_to_end.py::test_emitter_temperature: expected "Unknown", got int 1. Pre-existing failure (confirmed by checking out commit 985a59e1 and reproducing). `_elmhurst_emitter_temperature_int` in datatypes/epc/domain/mapper.py converts the Elmhurst Summary §14 "Design flow temperature: Unknown" to SAP10.2 Table 4d code 1 (high-temp / ≥45 °C, worst-case for unmeasured boilers). The int encoding mirrors the API mapper's MainHeatingDetail.emitter_temperature for cross-mapper field parity. 
Test updated to expect 1 (with comment) since the conversion is the correct production behaviour. Verified: - Layer 4 1e-4 gate (test_api_001479_full_chain_sap_matches_worksheet_pdf_exactly) still GREEN. - Wider domain sweep (domain/sap10_calculator + domain/sap10_ml): 1654 passed / 20 failed, exact pre-fix baseline. - All three originally-failing tests now PASS. Co-Authored-By: Claude Opus 4.7 --- Dockerfile.test | 11 +++++++++-- .../tests/test_elmhurst_end_to_end.py | 12 +++++++++--- datatypes/epc/domain/tests/test_from_rdsap_schema.py | 9 ++++++++- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/Dockerfile.test b/Dockerfile.test index 802eb3a4..74bee47b 100644 --- a/Dockerfile.test +++ b/Dockerfile.test @@ -1,8 +1,15 @@ FROM python:3.11-slim -# Install PostgreSQL binaries — required by pytest-postgresql to spawn ephemeral test databases +# System binaries: +# - postgresql: pytest-postgresql spawns ephemeral test databases +# - poppler-utils: provides pdfinfo / pdftotext, used by +# backend/documents_parser/tests/test_summary_pdf_mapper_chain.py's +# `_summary_pdf_to_textract_style_pages` helper for layout-preserving +# PDF text extraction. Pure-Python alternatives (pymupdf, pypdf) don't +# reproduce pdftotext -layout's row-major table cell ordering, which +# the Elmhurst Summary extractor depends on. 
RUN apt-get update \ - && apt-get install -y --no-install-recommends postgresql \ + && apt-get install -y --no-install-recommends postgresql poppler-utils \ && rm -rf /var/lib/apt/lists/* WORKDIR /app diff --git a/backend/documents_parser/tests/test_elmhurst_end_to_end.py b/backend/documents_parser/tests/test_elmhurst_end_to_end.py index 133c7816..f5b339bb 100644 --- a/backend/documents_parser/tests/test_elmhurst_end_to_end.py +++ b/backend/documents_parser/tests/test_elmhurst_end_to_end.py @@ -262,9 +262,15 @@ class TestHeating: assert result.sap_heating.main_heating_details[0].heat_emitter_type == 1 def test_emitter_temperature(self, result: EpcPropertyData) -> None: - assert ( - result.sap_heating.main_heating_details[0].emitter_temperature == "Unknown" - ) + # The Elmhurst Summary §14 lodges "Design flow temperature: Unknown" + # for this cert. `_elmhurst_emitter_temperature_int` (mapper.py) + # converts that to SAP10.2 Table 4d code 1 (high-temp / ≥45 °C — + # the worst-case assumption for an unmeasured gas boiler). This + # int encoding mirrors the API mapper's `MainHeatingDetail. + # emitter_temperature` for cross-mapper field parity; the older + # behaviour of surfacing the raw "Unknown" string was replaced + # when the int conversion landed. 
+ assert result.sap_heating.main_heating_details[0].emitter_temperature == 1 def test_fan_flue_present(self, result: EpcPropertyData) -> None: assert result.sap_heating.main_heating_details[0].fan_flue_present is True diff --git a/datatypes/epc/domain/tests/test_from_rdsap_schema.py b/datatypes/epc/domain/tests/test_from_rdsap_schema.py index e5ef22c5..e91ca73a 100644 --- a/datatypes/epc/domain/tests/test_from_rdsap_schema.py +++ b/datatypes/epc/domain/tests/test_from_rdsap_schema.py @@ -367,7 +367,14 @@ class TestFromRdSapSchema21_0_1: assert result.inspection_date == date(2025, 4, 4) def test_total_floor_area(self, result: EpcPropertyData) -> None: - assert result.total_floor_area_m2 == 55.0 + # Slice 95 (commit f502db8c) changed the API mapper to compute + # `total_floor_area_m2` from the precise sum of per-bp + # `sap_floor_dimensions[*].total_floor_area` (here: 45.82, a + # single ground-floor dimension) rather than the lodged scalar + # (here: 55, an integer-rounded display value that doesn't + # match the per-bp geometry in this synthetic fixture). The + # worksheet uses per-bp sums and the mapper now mirrors that. + assert result.total_floor_area_m2 == 45.82 # --- property flags ---