mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
S0380.216: extractor — handle wrapped glazing-gap column in §11 labels
pdftotext dumps of hand-entered Elmhurst worksheets wrap the §11 glazing-
GAP column ("16 mm or more") onto the glazing-TYPE token, yielding labels
like "Double between 2002 and 2021 16 mm or [1st]" that
`_elmhurst_glazing_type_code` didn't recognise → UnmappedElmhurstLabel,
blocking the whole Summary from parsing.
Added a fallback: when the lightly-cleaned label isn't a known key, strip a
trailing wrapped gap descriptor (`\s+\d+\s*mm\b.*$`) and retry. Applied
AFTER the direct lookup so explicitly-mapped interleaved variants (e.g.
"Double with unknown 16 mm or install date more", where the gap text lands in
the middle of the type label) are unaffected. The gap drives the API-path U-value lookup, not
the site-notes glazing-type enum, so dropping it is loss-free for the
cascade.
Unblocks running our cascade on hand-entered worksheet Summaries, which was
needed to validate the PV β-split against simulated case 18 (our split matches
the P960 worksheet exactly: gen 2684.17, onsite 970.77, export 1713.40).
Suite: 2391 passed, 1 skipped. Zero new pyright errors (mapper 32=32).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
2f5ca85854
commit
712cc6f3f8
2 changed files with 68 additions and 3 deletions
|
|
@ -4858,6 +4858,18 @@ _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE: Final[re.Pattern[str]] = re.compile(
|
|||
_ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE: Final[re.Pattern[str]] = re.compile(
|
||||
r"\s+Summary Information$|\s+Alternative wall.*$"
|
||||
)
|
||||
# Fallback only: pdftotext wraps the §11 glazing-GAP column ("6 mm" /
|
||||
# "12 mm" / "16 mm or more") onto the glazing-TYPE token on hand-entered
|
||||
# worksheets, e.g. "Double between 2002 and 2021 16 mm or [1st]". When the
|
||||
# lightly-cleaned label isn't a known key, strip the trailing gap
|
||||
# descriptor (and any building-part fragment after it) and retry. Applied
|
||||
# AFTER the direct lookup so explicitly-mapped interleaved variants (e.g.
|
||||
# "Double with unknown 16 mm or install date more") are unaffected. The
|
||||
# gap drives the API-path U-value lookup, not the site-notes glazing-type
|
||||
# enum, so dropping it here is loss-free for the cascade.
|
||||
_ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE: Final[re.Pattern[str]] = re.compile(
|
||||
r"\s+\d+\s*mm\b.*$"
|
||||
)
|
||||
|
||||
|
||||
def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
|
||||
|
|
@ -4874,9 +4886,15 @@ def _elmhurst_glazing_type_code(label: Optional[str]) -> int:
|
|||
cleaned = _ELMHURST_GLAZING_LABEL_NOISE_PREFIX_RE.sub("", label)
|
||||
cleaned = _ELMHURST_GLAZING_LABEL_NOISE_SUFFIX_RE.sub("", cleaned).strip()
|
||||
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(cleaned)
|
||||
if code is None:
|
||||
raise UnmappedElmhurstLabel("glazing_type", label)
|
||||
return code
|
||||
if code is not None:
|
||||
return code
|
||||
# Fallback: strip a trailing wrapped glazing-gap descriptor and retry.
|
||||
degapped = _ELMHURST_GLAZING_LABEL_TRAILING_GAP_RE.sub("", cleaned).strip()
|
||||
if degapped != cleaned:
|
||||
code = _ELMHURST_GLAZING_LABEL_TO_SAP10.get(degapped)
|
||||
if code is not None:
|
||||
return code
|
||||
raise UnmappedElmhurstLabel("glazing_type", label)
|
||||
|
||||
|
||||
def _elmhurst_main_heating_category(
|
||||
|
|
|
|||
|
|
@ -748,3 +748,50 @@ class TestApiResolveWallInsulationThickness:
|
|||
|
||||
# Assert
|
||||
assert resolved == "measured"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Glazing-type label cleaning — pdftotext gap-column wrap
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestElmhurstGlazingTypeWrappedGap:
|
||||
"""When a hand-entered Elmhurst worksheet is dumped via pdftotext, the
|
||||
glazing-GAP column ("16 mm or more") wraps onto the glazing-TYPE token,
|
||||
yielding labels like "Double between 2002 and 2021 16 mm or" (plus a
|
||||
trailing building-part fragment). The extractor must strip the trailing
|
||||
gap descriptor and map the clean type, not raise UnmappedElmhurstLabel."""
|
||||
|
||||
def test_trailing_gap_descriptor_stripped(self) -> None:
|
||||
# Arrange
|
||||
from datatypes.epc.domain.mapper import _elmhurst_glazing_type_code
|
||||
|
||||
# Act
|
||||
code = _elmhurst_glazing_type_code(
|
||||
"Double between 2002 and 2021 16 mm or"
|
||||
)
|
||||
|
||||
# Assert — clean "Double between 2002 and 2021" → SAP10 code 3
|
||||
assert code == 3
|
||||
|
||||
def test_trailing_gap_plus_building_part_fragment_stripped(self) -> None:
|
||||
# Arrange
|
||||
from datatypes.epc.domain.mapper import _elmhurst_glazing_type_code
|
||||
|
||||
# Act
|
||||
code = _elmhurst_glazing_type_code(
|
||||
"Double between 2002 and 2021 16 mm or 1st"
|
||||
)
|
||||
|
||||
# Assert
|
||||
assert code == 3
|
||||
|
||||
def test_clean_label_still_maps(self) -> None:
|
||||
# Arrange — regression guard: an un-wrapped label is unaffected.
|
||||
from datatypes.epc.domain.mapper import _elmhurst_glazing_type_code
|
||||
|
||||
# Act
|
||||
code = _elmhurst_glazing_type_code("Double pre 2002")
|
||||
|
||||
# Assert
|
||||
assert code == 2
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue