diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index 14831ccf..07b02248 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -182,8 +182,24 @@ class ElmhurstSiteNotesExtractor: ) def _extract_attachment(self) -> str: + """Extract the Summary's "attachment" line — the §1.0 built-form + descriptor (e.g. "M Mid-Terrace", "D Detached") that sits + between the property-type value and the §2.0 section header + for HOUSES. + + Flats DON'T lodge an attachment line in the Elmhurst Summary; + the §2.0 Number of Storeys header follows immediately after + the "F Flat" property-type value. Detect that case and return + "" so the mapper's `built_form` doesn't capture section- + header noise. + """ m = re.search(r"1\.0 Property type:\n[^\n]+\n([^\n]+)", self._text) - return " ".join(m.group(1).strip().split()) if m else "" + if not m: + return "" + candidate = " ".join(m.group(1).strip().split()) + if re.match(r"^\d+\.\d+\s", candidate) or "Number of Storeys" in candidate: + return "" + return candidate def _floors_from_dimensions_body(self, body: str) -> List[FloorDimension]: """Parse FloorDimension entries from a single bp's §4 body.""" diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index 26df1543..2f65cd6e 100644 --- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -57,6 +57,7 @@ _SUMMARY_000490_PDF = _FIXTURES / "Summary_000490.pdf" _SUMMARY_000516_PDF = _FIXTURES / "Summary_000516.pdf" _SUMMARY_001479_PDF = _FIXTURES / "Summary_001479.pdf" _SUMMARY_000897_PDF = _FIXTURES / "Summary_000897.pdf" +_SUMMARY_000784_PDF = _FIXTURES / "Summary_000784.pdf" # GOV.UK EPB API JSON for cert 001479 — the API-path counterpart of the # Summary_001479.pdf fixture. Together they drive the API ≡ Summary @@ -296,6 +297,30 @@ def test_summary_001479_secondary_heating_routes_mains_gas_fuel() -> None: assert epc.sap_heating.secondary_fuel_type == 26 +def test_summary_9501_flat_has_no_built_form_in_summary_pdf() -> None: + # Arrange — cert 9501 (Summary_000784.pdf) is a flat. The Elmhurst + # Summary's §1.0 "Property type" section lodges the built-form + # descriptor (e.g. "M Mid-Terrace", "D Detached") only for houses; + # flats have no built-form line — the §2.0 "Number of Storeys" + # section follows immediately after the "F Flat" property type. + # + # The extractor's `_extract_attachment` regex previously captured + # the line immediately after the property-type value + # unconditionally, so cert 9501 ends up with attachment + # "2.0 Number of Storeys:" — pure section-header noise that the + # mapper then surfaces on EpcPropertyData.built_form, breaking the + # cascade's flat-exposure routing downstream. + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000784_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + + # Act + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Assert — built_form is empty for flats. Houses set it to their + # attachment descriptor; flats lodge no attachment. + assert epc.built_form == "" + + def test_summary_001479_full_chain_sap_matches_worksheet_pdf_exactly() -> None: # Arrange — cert 001479 (Summary_001479.pdf / P960-0001-001479.pdf) # is the first cohort cert with a real GOV.UK EPB API counterpart