mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
fix(elmhurst-mapper): strip interleaved Alternative-wall fragments from glazing label
When a property lodges an Alternative Wall, pdftotext interleaves the §11
"Location" column ("Alternative wall 1") into the wrapped glazing-TYPE cell,
producing labels like "Double between 2002 Alternative wall and 2021 1
Alternative wall" (cert 001431 storage-heater variants, simulated case 34).
The existing greedy trailing-suffix strip (\s+Alternative wall.*$) truncates
at the FIRST "Alternative wall", losing "and 2021" and yielding the
unmatchable "Double between 2002". Added a fallback that removes EVERY
"<External|Alternative|Party> wall [n]" fragment and any stray 1-2 digit
location index from the raw label, then retries the lookup. Loss-free: no
glazing-type key contains a wall-location phrase or a bare 1-2 digit number
(install-date years are 4 digits).
Unblocks the Summary cascade for any property with an Alternative Wall;
Summary-path only (the API path receives structured glazing codes, so the
API gauge is unaffected). The regression gate is green (one pre-existing,
unrelated failure).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
85d6f8468c
commit
b0a47cda05
2 changed files with 44 additions and 0 deletions
|
|
@ -1540,6 +1540,23 @@ def test_summary_mapper_raises_on_unmapped_cylinder_insulation_label() -> None:
|
|||
assert excinfo.value.value == "Polyester wool"
|
||||
|
||||
|
||||
def test_elmhurst_glazing_type_code_strips_interleaved_alternative_wall() -> None:
    """Interleaved wall-location fragments must not change the glazing code."""
    # Arrange — a property that lodges an Alternative Wall (cert 001431
    # storage-heater variants, "simulated case 34") makes pdftotext weave
    # the §11 "Alternative wall 1" location column into the wrapped
    # glazing-type cell. Those fragments are not part of the glazing type,
    # so the helper has to recover "Double between 2002 and 2021".
    interleaved_label = (
        "Double between 2002 Alternative wall and 2021 1 Alternative wall"
    )
    clean_label = "Double between 2002 and 2021"

    # Act
    actual_code = _elmhurst_glazing_type_code(interleaved_label)

    # Assert — the noisy label resolves to the same code as the clean one.
    expected_code = _elmhurst_glazing_type_code(clean_label)
    assert actual_code == expected_code
|
||||
|
||||
|
||||
def test_elmhurst_immersion_type_code_maps_dual_and_single() -> None:
|
||||
# Arrange — Elmhurst Summary §15.1 "Immersion Heater" lodges "Dual"
|
||||
# or "Single". RdSAP 10 §10.5 (PDF p.54): an immersion is "assumed
|
||||
|
|
|
|||
|
|
@ -5470,6 +5470,21 @@ _ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE: Final[re.Pattern[str]] = re.compile(
|
|||
# Trailing token at the very end of a label: an ordinal ("1st", "2nd",
# "3rd", "4th", ...) or the literal "Main", plus the whitespace before it.
# NOTE(review): "BP" presumably means "building part" — confirm.
_ELMHURST_GLAZING_LABEL_TRAILING_BP_RE: Final[re.Pattern[str]] = re.compile(
    r"\s+(?:\d+(?:st|nd|rd|th)|Main)$"
)
# Fallback only: when a property lodges an Alternative Wall, pdftotext
# INTERLEAVES the §11 location column ("Alternative wall 1") into the
# wrapped glazing-TYPE cell, e.g. "Double between 2002 Alternative wall
# and 2021 1 Alternative wall" (cert 001431 storage-heater variants,
# `simulated case 34`). The greedy trailing-suffix strip truncates at the
# first "Alternative wall" (losing "and 2021"), so remove EVERY
# wall-location fragment + any stray 1-2 digit location index globally and
# retry. Loss-free: no glazing-type key contains a wall-location phrase or
# a bare 1-2 digit number (install-date years are 4 digits).
#
# The optional index after "wall" is deliberately capped at 1-2 digits
# with a closing word boundary: an unbounded `\d+` would also swallow a
# 4-digit install year that lands right after an interleaved fragment
# ("... Alternative wall 2002 and 2021"), breaking the loss-free claim.
_ELMHURST_GLAZING_LABEL_EMBEDDED_WALL_RE: Final[re.Pattern[str]] = re.compile(
    r"\s*(?:External|Alternative|Party)\s+wall(?:\s+\d{1,2}\b)?"
)
# A standalone 1-2 digit number (word boundary on both sides). Longer
# digit runs such as 4-digit years never match because no word boundary
# exists between adjacent digits.
_ELMHURST_GLAZING_LABEL_STRAY_LOCATION_DIGIT_RE: Final[re.Pattern[str]] = re.compile(
    r"\b\d{1,2}\b"
)
|
||||
|
||||
|
||||
def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
|
||||
|
|
@ -5501,6 +5516,18 @@ def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
|
|||
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(debp)
|
||||
if code is not None:
|
||||
return code
|
||||
# Fallback: remove INTERLEAVED wall-location fragments from the raw
|
||||
# label (Alternative/External/Party wall + stray location index) and
|
||||
# collapse whitespace. Operates on `label`, not the greedily-truncated
|
||||
# `cleaned`, so "Double between 2002 ... and 2021" survives.
|
||||
dewall = _ELMHURST_GLAZING_LABEL_EMBEDDED_WALL_RE.sub(" ", label)
|
||||
dewall = _ELMHURST_GLAZING_LABEL_STRAY_LOCATION_DIGIT_RE.sub(" ", dewall)
|
||||
dewall = _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE.sub("", dewall)
|
||||
dewall = re.sub(r"\s+", " ", dewall).strip()
|
||||
if dewall != cleaned:
|
||||
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(dewall)
|
||||
if code is not None:
|
||||
return code
|
||||
raise UnmappedElmhurstLabel("glazing_type", label)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue