mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Slice 99a: Elmhurst extractor — no attachment line for flats
Cert 9501 (Summary_000784.pdf) is a flat. The Elmhurst Summary's
§1.0 "Property type" section lodges the built-form descriptor
("M Mid-Terrace", "D Detached", ...) only for houses — flats have no
attachment line, and the §2.0 "Number of Storeys" header follows
immediately after the "F Flat" property-type value.
The extractor's prior `_extract_attachment` regex captured the line
right after the property-type value unconditionally, so cert 9501
ended up with `attachment="2.0 Number of Storeys:"` — section-header
noise that the mapper surfaced on `EpcPropertyData.built_form`.
Downstream, this broke the cascade's `_dwelling_exposure` routing
(no prefix match → defaulted to fully-exposed houses), so the
cert 9501 Summary path came out Δ -5.25 SAP against the worksheet's 68.5252.
Detect section-header noise via the leading `<digit>.<digit> `
pattern and the "Number of Storeys" substring; return "" in that
case so flats produce empty `built_form`. Houses still pick up their
real attachment (cohort 0330's "M Mid-Terrace" remains correct).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
158c08f10f
commit
a76af2ec2f
2 changed files with 42 additions and 1 deletion
|
|
@@ -182,8 +182,24 @@ class ElmhurstSiteNotesExtractor:
|
|||
)
|
||||
|
||||
def _extract_attachment(self) -> str:
|
||||
"""Extract the Summary's "attachment" line — the §1.0 built-form
|
||||
descriptor (e.g. "M Mid-Terrace", "D Detached") that sits
|
||||
between the property-type value and the §2.0 section header
|
||||
for HOUSES.
|
||||
|
||||
Flats DON'T lodge an attachment line in the Elmhurst Summary;
|
||||
the §2.0 Number of Storeys header follows immediately after
|
||||
the "F Flat" property-type value. Detect that case and return
|
||||
"" so the mapper's `built_form` doesn't capture section-
|
||||
header noise.
|
||||
"""
|
||||
m = re.search(r"1\.0 Property type:\n[^\n]+\n([^\n]+)", self._text)
|
||||
return " ".join(m.group(1).strip().split()) if m else ""
|
||||
if not m:
|
||||
return ""
|
||||
candidate = " ".join(m.group(1).strip().split())
|
||||
if re.match(r"^\d+\.\d+\s", candidate) or "Number of Storeys" in candidate:
|
||||
return ""
|
||||
return candidate
|
||||
|
||||
def _floors_from_dimensions_body(self, body: str) -> List[FloorDimension]:
|
||||
"""Parse FloorDimension entries from a single bp's §4 body."""
|
||||
|
|
|
|||
|
|
@@ -57,6 +57,7 @@ _SUMMARY_000490_PDF = _FIXTURES / "Summary_000490.pdf"
|
|||
_SUMMARY_000516_PDF = _FIXTURES / "Summary_000516.pdf"
|
||||
_SUMMARY_001479_PDF = _FIXTURES / "Summary_001479.pdf"
|
||||
_SUMMARY_000897_PDF = _FIXTURES / "Summary_000897.pdf"
|
||||
_SUMMARY_000784_PDF = _FIXTURES / "Summary_000784.pdf"
|
||||
|
||||
# GOV.UK EPB API JSON for cert 001479 — the API-path counterpart of the
|
||||
# Summary_001479.pdf fixture. Together they drive the API ≡ Summary
|
||||
|
|
@@ -296,6 +297,30 @@ def test_summary_001479_secondary_heating_routes_mains_gas_fuel() -> None:
|
|||
assert epc.sap_heating.secondary_fuel_type == 26
|
||||
|
||||
|
||||
def test_summary_9501_flat_has_no_built_form_in_summary_pdf() -> None:
|
||||
# Arrange — cert 9501 (Summary_000784.pdf) is a flat. The Elmhurst
|
||||
# Summary's §1.0 "Property type" section lodges the built-form
|
||||
# descriptor (e.g. "M Mid-Terrace", "D Detached") only for houses;
|
||||
# flats have no built-form line — the §2.0 "Number of Storeys"
|
||||
# section follows immediately after the "F Flat" property type.
|
||||
#
|
||||
# The extractor's `_extract_attachment` regex previously captured
|
||||
# the line immediately after the property-type value
|
||||
# unconditionally, so cert 9501 ends up with attachment
|
||||
# "2.0 Number of Storeys:" — pure section-header noise that the
|
||||
# mapper then surfaces on EpcPropertyData.built_form, breaking the
|
||||
# cascade's flat-exposure routing downstream.
|
||||
pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000784_PDF)
|
||||
site_notes = ElmhurstSiteNotesExtractor(pages).extract()
|
||||
|
||||
# Act
|
||||
epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
|
||||
|
||||
# Assert — built_form is empty for flats. Houses set it to their
|
||||
# attachment descriptor; flats lodge no attachment.
|
||||
assert epc.built_form == ""
|
||||
|
||||
|
||||
def test_summary_001479_full_chain_sap_matches_worksheet_pdf_exactly() -> None:
|
||||
# Arrange — cert 001479 (Summary_001479.pdf / P960-0001-001479.pdf)
|
||||
# is the first cohort cert with a real GOV.UK EPB API counterpart
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue