diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py
index 01b50deb..e2a5f1e7 100644
--- a/backend/documents_parser/elmhurst_extractor.py
+++ b/backend/documents_parser/elmhurst_extractor.py
@@ -1727,6 +1727,25 @@ class ElmhurstSiteNotesExtractor:
             ))
         return arrays
 
+    def _extract_main_age_band(self) -> str:
+        """Read the Main Property age band from the §3.0 Date Built block.
+
+        "Main Property" also heads the §4 Dimensions table, so a global
+        `_str_val("Main Property")` collides with it: when the layout
+        glues "3.0 Date Built:" onto the "Main Property" row token (the
+        recommendation worksheets do), the first standalone "Main
+        Property" match is the dimensions header — yielding its next
+        token ("Floor") instead of the age band. Scope the read to the
+        Date-Built block and take the first age row — a line beginning
+        with a single A-M band letter + space (e.g. "C 1930-1949",
+        "A before 1900", "J 2003-2006"). Building-part name rows
+        ("Main Property", "1st Extension", "Main Prop. Room(s) in
+        Roof") never start that way, and the Main row precedes any
+        extension / room-in-roof rows."""
+        block = self._between("3.0 Date Built", "4.0 Dimensions")
+        m = re.search(r"^([A-M] .+)$", block, re.MULTILINE)
+        return " ".join(m.group(1).split()) if m else ""
+
     def extract(self) -> ElmhurstSiteNotes:
         emissions_raw = self._next_val("Emissions (t/year)")
         co2 = float(emissions_raw.split()[0]) if emissions_raw else 0.0
@@ -1744,7 +1763,7 @@ class ElmhurstSiteNotesExtractor:
             number_of_storeys=self._int_val("Storeys"),
             habitable_rooms=self._int_val("Habitable Rooms"),
             heated_habitable_rooms=self._int_val("Heated Habitable Rooms"),
-            construction_age_band=self._str_val("Main Property"),
+            construction_age_band=self._extract_main_age_band(),
             dimensions=self._extract_dimensions(),
             has_conservatory=self._bool_val("Is there a conservatory?"),
             walls=self._extract_walls(),
diff --git a/backend/documents_parser/tests/fixtures/Summary_001431_topfloor_flat.pdf b/backend/documents_parser/tests/fixtures/Summary_001431_topfloor_flat.pdf
new file mode 100644
index 00000000..e61f3466
Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_001431_topfloor_flat.pdf differ
diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
index 83d1e094..406729ad 100644
--- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
+++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py
@@ -83,6 +83,7 @@ _SUMMARY_000910_PDF = _FIXTURES / "Summary_000910.pdf" # cert 0036 (Flat, party
 _SUMMARY_000890_PDF = _FIXTURES / "Summary_000890.pdf" # cert 7800 (two electric showers)
 _SUMMARY_000565_PDF = _FIXTURES / "Summary_000565.pdf" # cert 000565 (5-bp Elmhurst-only)
 _SUMMARY_001431_CASE20_PDF = _FIXTURES / "Summary_001431_case20.pdf" # sim case 20 (storage heaters + RR type-2 + wrapped "Double between 2002 and 2021" glazing)
+_SUMMARY_001431_TOPFLOOR_PDF = _FIXTURES / "Summary_001431_topfloor_flat.pdf" # gas-boiler-upgrade recommendation "after" — top-floor flat, PS sloping roof; exercises the Date-Built age-band + flat-position layout regressions
 
 # GOV.UK EPB API JSON for cert 001479 — the API-path counterpart of the
 # Summary_001479.pdf fixture. Together they drive the API ≡ Summary
@@ -162,6 +163,23 @@ def test_summary_001431_case20_fabric_heat_loss_matches_worksheet_line_33() -> N
     assert abs(ht.fabric_heat_loss_w_per_k - 285.9847) <= 1e-4
 
 
+def test_summary_001431_topfloor_extracts_main_property_age_band() -> None:
+    # Arrange — the gas-boiler-upgrade recommendation "after" Summary
+    # renders "3.0 Date Built:" glued to its "Main Property" row header
+    # on one layout line, so the FIRST standalone "Main Property" token
+    # is the §4 dimensions-table header (followed by "Floor"). The
+    # extractor must read the age band from the Date-Built block, not the
+    # first global "Main Property" match — the worksheet lodges age band
+    # C (1930-1949).
+    pages = _summary_pdf_to_textract_style_pages(_SUMMARY_001431_TOPFLOOR_PDF)
+
+    # Act
+    survey = ElmhurstSiteNotesExtractor(pages).extract()
+
+    # Assert
+    assert survey.construction_age_band == "C 1930-1949"
+
+
 def test_summary_000474_mapper_produces_three_building_parts() -> None:
     # Arrange — cert U985-0001-000474 is a mid-terrace with 3 building
     # parts (Main + 2 extensions) per the hand-built worksheet fixture