mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
fix(elmhurst-extractor): read Main Property age band from §3.0 Date Built block
The Elmhurst Summary §3.0 "Date Built" lodges the per-building-part age
bands; the Main row reads "Main Property" / "C 1930-1949". But "Main
Property" ALSO heads the §4.0 Dimensions table, so the global
`_str_val("Main Property")` collides with it: when pdftotext renders
"3.0 Date Built:" glued onto its "Main Property" row token on one
layout line (as the recommendation worksheets do), the first standalone
"Main Property" match is the §4 dimensions header — returning its next
token "Floor" as the "age band".
That garbage age propagated to `u_roof`: for a "Pitched, sloping
ceiling" (PS) roof with no lodged insulation thickness, `u_roof` returns
the spec uninsulated U=2.3 for the correct age C but U=0.4 for the
unparseable "Floor" — collapsing the roof heat-loss term and inflating
SAP by ~14 points on the affected cert.
Scope the read to the Date-Built block (between "3.0 Date Built" and
"4.0 Dimensions") and take the first age row — a line beginning with a
single A-M band letter + space ("C 1930-1949", "A before 1900",
"J 2003-2006"). Building-part name rows never start that way, and the
Main row precedes any extension / room-in-roof rows.
Regression: the full sap10_calculator + documents_parser suite is green
except for the 3 pre-existing unrelated failures (2 stone-wall U tests,
test_total_floor_area); the multi-bp / "A before 1900" fixtures (000516,
001431_case*, 6035) keep their age bands.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
590cb97ef6
commit
1033526812
3 changed files with 38 additions and 1 deletions
|
|
@ -1727,6 +1727,25 @@ class ElmhurstSiteNotesExtractor:
|
|||
))
|
||||
return arrays
|
||||
|
||||
def _extract_main_age_band(self) -> str:
    """Return the Main Property age band lodged in the §3.0 Date Built block.

    A document-wide `_str_val("Main Property")` lookup is unreliable
    because "Main Property" is also the header of the §4 Dimensions
    table. On layouts that glue "3.0 Date Built:" onto the "Main
    Property" row token (the recommendation worksheets), the first
    standalone "Main Property" is that dimensions header, and the token
    after it ("Floor") would be returned in place of the age band.

    The search is therefore limited to the text between the
    "3.0 Date Built" and "4.0 Dimensions" markers, and the first age row
    found there is used. An age row is a line that starts with one band
    letter in the range A-M followed by a space, for example
    "C 1930-1949", "A before 1900" or "J 2003-2006". Building-part name
    rows ("Main Property", "1st Extension", "Main Prop. Room(s) in
    Roof") never begin like that, and the Main row comes before any
    extension / room-in-roof rows.

    Returns:
        The age-band row with internal whitespace runs collapsed to
        single spaces, or "" when the block contains no age row.
    """
    date_built_block = self._between("3.0 Date Built", "4.0 Dimensions")
    age_row = re.search(r"^([A-M] .+)$", date_built_block, re.MULTILINE)
    if not age_row:
        return ""
    # split() + join normalises the column padding left by the PDF layout.
    return " ".join(age_row.group(1).split())
|
||||
|
||||
def extract(self) -> ElmhurstSiteNotes:
|
||||
emissions_raw = self._next_val("Emissions (t/year)")
|
||||
co2 = float(emissions_raw.split()[0]) if emissions_raw else 0.0
|
||||
|
|
@ -1744,7 +1763,7 @@ class ElmhurstSiteNotesExtractor:
|
|||
number_of_storeys=self._int_val("Storeys"),
|
||||
habitable_rooms=self._int_val("Habitable Rooms"),
|
||||
heated_habitable_rooms=self._int_val("Heated Habitable Rooms"),
|
||||
construction_age_band=self._str_val("Main Property"),
|
||||
construction_age_band=self._extract_main_age_band(),
|
||||
dimensions=self._extract_dimensions(),
|
||||
has_conservatory=self._bool_val("Is there a conservatory?"),
|
||||
walls=self._extract_walls(),
|
||||
|
|
|
|||
BIN
backend/documents_parser/tests/fixtures/Summary_001431_topfloor_flat.pdf
vendored
Normal file
BIN
backend/documents_parser/tests/fixtures/Summary_001431_topfloor_flat.pdf
vendored
Normal file
Binary file not shown.
|
|
@ -83,6 +83,7 @@ _SUMMARY_000910_PDF = _FIXTURES / "Summary_000910.pdf" # cert 0036 (Flat, party
|
|||
_SUMMARY_000890_PDF = _FIXTURES / "Summary_000890.pdf" # cert 7800 (two electric showers)
|
||||
_SUMMARY_000565_PDF = _FIXTURES / "Summary_000565.pdf" # cert 000565 (5-bp Elmhurst-only)
|
||||
_SUMMARY_001431_CASE20_PDF = _FIXTURES / "Summary_001431_case20.pdf" # sim case 20 (storage heaters + RR type-2 + wrapped "Double between 2002 and 2021" glazing)
|
||||
_SUMMARY_001431_TOPFLOOR_PDF = _FIXTURES / "Summary_001431_topfloor_flat.pdf" # gas-boiler-upgrade recommendation "after" — top-floor flat, PS sloping roof; exercises the Date-Built age-band + flat-position layout regressions
|
||||
|
||||
# GOV.UK EPB API JSON for cert 001479 — the API-path counterpart of the
|
||||
# Summary_001479.pdf fixture. Together they drive the API ≡ Summary
|
||||
|
|
@ -162,6 +163,23 @@ def test_summary_001431_case20_fabric_heat_loss_matches_worksheet_line_33() -> N
|
|||
assert abs(ht.fabric_heat_loss_w_per_k - 285.9847) <= 1e-4
|
||||
|
||||
|
||||
def test_summary_001431_topfloor_extracts_main_property_age_band() -> None:
    # Arrange — in the gas-boiler-upgrade recommendation "after" Summary
    # the "3.0 Date Built:" heading shares one layout line with its
    # "Main Property" row header, which leaves the §4 dimensions-table
    # header (followed by "Floor") as the FIRST standalone "Main
    # Property" token. The age band must therefore come from the
    # Date-Built block rather than the first global "Main Property"
    # match — the worksheet lodges age band C (1930-1949).
    extractor = ElmhurstSiteNotesExtractor(
        _summary_pdf_to_textract_style_pages(_SUMMARY_001431_TOPFLOOR_PDF)
    )

    # Act
    site_notes = extractor.extract()

    # Assert
    assert site_notes.construction_age_band == "C 1930-1949"
|
||||
|
||||
|
||||
def test_summary_000474_mapper_produces_three_building_parts() -> None:
|
||||
# Arrange — cert U985-0001-000474 is a mid-terrace with 3 building
|
||||
# parts (Main + 2 extensions) per the hand-built worksheet fixture
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue