diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index 822254ca..a3449014 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -417,8 +417,14 @@ class ElmhurstSiteNotesExtractor: ) return windows - # Anchors used by the layout-style window parser. - _WIDTH_HEIGHT_AREA_RE = re.compile(r"^(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)$") + # Anchors used by the layout-style window parser. The W/H/Area anchor + # is sometimes followed by a joined glazing-type phrase on the same + # line (e.g. '1.22 1.76 2.15 Double pre 2002'); the optional 4th + # capture surfaces that text so the parser can use it instead of a + # separately-laid-out prefix line. + _WIDTH_HEIGHT_AREA_RE = re.compile( + r"^(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)(?:\s+(\S.*?))?$" + ) _MANUFACTURER_RE = re.compile(r"^(Manufacturer|Default)\s+(\d+\.\d+)$") _ORIENTATION_TOKENS = frozenset({ "North", "South", "East", "West", "NE", "NW", "SE", "SW", @@ -507,6 +513,28 @@ class ElmhurstSiteNotesExtractor: return j return None + _FRAME_TYPE_AND_FACTOR_RE = re.compile(r"^(\S+(?:\s+\S+)*?)\s+(\d\.\d+)$") + + def _parse_frame_type_and_factor( + self, lines: List[str], data_idx: int + ) -> tuple[str, Optional[float], int]: + """Return `(frame_type, frame_factor, middle_start_idx)` from + the lines immediately after the data anchor. 
Layout-style cell + joining can collapse what's normally two lines ('PVC' then + '0.70') into one ('Wood 0.70'); both shapes need to feed the + same downstream slice.""" + combined = self._FRAME_TYPE_AND_FACTOR_RE.match(lines[data_idx + 1].strip()) + if combined is not None: + return combined.group(1), float(combined.group(2)), data_idx + 2 + if data_idx + 2 >= len(lines): + return lines[data_idx + 1].strip(), None, data_idx + 2 + frame_type = lines[data_idx + 1].strip() + try: + frame_factor = float(lines[data_idx + 2].strip()) + except ValueError: + return frame_type, None, data_idx + 3 + return frame_type, frame_factor, data_idx + 3 + def _partition_after_manuf( self, lines: List[str], manuf_idx: int, next_data_idx: int ) -> int: @@ -514,14 +542,25 @@ class ElmhurstSiteNotesExtractor: block (and the inclusive lower bound for the next window's prefix block). After the manufacturer line come 3 fixed tokens (g_value, draught, shutters); the variable suffix lines start at manuf+4 - and run until the next window's glazing-type-start token (e.g. - 'Double between 2002', 'Single', 'Triple ...') or until the - next window's data line if no such token is present.""" + and run until either (a) the next window's glazing-type-start + token (e.g. 'Double between 2002', 'Single', 'Triple ...') or + (b) the second orientation token in the gap, whichever comes + first. Branch (b) covers layouts where the glazing-type is + joined to the data line (no separate prefix line exists), so + the only signal of window-transition is the orientation tokens + rotating: orient_suffix(k) → orient_prefix(k+1). 
Falls through + to `next_data_idx` when neither marker is present.""" scan_start = manuf_idx + 4 + seen_orient = False for j in range(scan_start, next_data_idx): - first_word = lines[j].strip().split(" ", 1)[0] + stripped = lines[j].strip() + first_word = stripped.split(" ", 1)[0] if first_word in self._GLAZING_TYPE_PREFIX_WORDS: return j + if stripped in self._ORIENTATION_TOKENS: + if seen_orient: + return j + seen_orient = True return next_data_idx def _parse_window_from_anchors( @@ -537,20 +576,28 @@ class ElmhurstSiteNotesExtractor: width = float(anchor.group(1)) height = float(anchor.group(2)) area = float(anchor.group(3)) + # Layout-style cell joining sometimes leaves the glazing-type + # phrase trailing the W H Area triplet on the same line (e.g. + # "1.22 1.76 2.15 Double pre 2002"); when present we pass it + # through as `inline_glazing_type` and the composer skips the + # would-be glazing-prefix scan. + inline_glazing_type = anchor.group(4) if anchor.lastindex and anchor.lastindex >= 4 else None # frame_type and frame_factor immediately follow the data line. - if data_idx + 2 >= len(lines): + # Layout-style cell joining sometimes collapses them onto a + # single "Wood 0.70" line; treat both shapes uniformly so the + # downstream `middle` slice still starts at the first variable + # field (glazing_gap / bp / location / orient). + if data_idx + 1 >= len(lines): return None - frame_type = lines[data_idx + 1].strip() - try: - frame_factor = float(lines[data_idx + 2].strip()) - except ValueError: - return None - if not 0.0 < frame_factor <= 1.0: + frame_type, frame_factor, middle_start = self._parse_frame_type_and_factor( + lines, data_idx + ) + if frame_factor is None or not 0.0 < frame_factor <= 1.0: return None # Variable-order tokens between frame_factor and Manufacturer. 
- middle = [lines[j].strip() for j in range(data_idx + 3, manuf_idx)] + middle = [lines[j].strip() for j in range(middle_start, manuf_idx)] glazing_gap = next((t for t in middle if "mm" in t.lower()), None) location = next((t for t in middle if "wall" in t.lower()), "External wall") bp_inline = next((t for t in middle if t in self._BP_INLINE_TOKENS), None) @@ -586,6 +633,7 @@ class ElmhurstSiteNotesExtractor: after=after, bp_inline=bp_inline, orient_inline=orient_inline, + inline_glazing_type=inline_glazing_type, ) return Window( @@ -613,6 +661,7 @@ class ElmhurstSiteNotesExtractor: after: List[str], bp_inline: Optional[str], orient_inline: Optional[str], + inline_glazing_type: Optional[str] = None, ) -> tuple[str, str, str]: """Re-join the glazing-type / building-part / orientation tokens split by the layout preprocessor. Each is at most 2 fragments @@ -645,8 +694,13 @@ class ElmhurstSiteNotesExtractor: bp_prefix_frag = pop_if_bp_fragment(prefix) bp_suffix_frag = pop_if_bp_fragment(suffix) - # Glazing type: remaining prefix + remaining suffix (joined). - glazing_type = " ".join([*prefix, *suffix]).strip() + # Glazing type: an inline glazing-type captured from the data + # line (layout-joined variant) wins; otherwise join the remaining + # prefix + suffix fragments. + if inline_glazing_type is not None: + glazing_type = inline_glazing_type + else: + glazing_type = " ".join([*prefix, *suffix]).strip() # Building part: inline token wins; otherwise join prefix + suffix. 
if bp_inline is not None: diff --git a/backend/documents_parser/tests/fixtures/Summary_000477.pdf b/backend/documents_parser/tests/fixtures/Summary_000477.pdf new file mode 100644 index 00000000..94ca3002 Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_000477.pdf differ diff --git a/backend/documents_parser/tests/fixtures/Summary_000480.pdf b/backend/documents_parser/tests/fixtures/Summary_000480.pdf new file mode 100644 index 00000000..f0cc3875 Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_000480.pdf differ diff --git a/backend/documents_parser/tests/fixtures/Summary_000487.pdf b/backend/documents_parser/tests/fixtures/Summary_000487.pdf new file mode 100644 index 00000000..9afae0e4 Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_000487.pdf differ diff --git a/backend/documents_parser/tests/fixtures/Summary_000490.pdf b/backend/documents_parser/tests/fixtures/Summary_000490.pdf new file mode 100644 index 00000000..9303bfe6 Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_000490.pdf differ diff --git a/backend/documents_parser/tests/fixtures/Summary_000516.pdf b/backend/documents_parser/tests/fixtures/Summary_000516.pdf new file mode 100644 index 00000000..60c73055 Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_000516.pdf differ diff --git a/docs/sap-spec/NEXT_AGENT_PROMPT.md b/docs/sap-spec/NEXT_AGENT_PROMPT.md index ebca5762..266fe437 100644 --- a/docs/sap-spec/NEXT_AGENT_PROMPT.md +++ b/docs/sap-spec/NEXT_AGENT_PROMPT.md @@ -1,245 +1,139 @@ -# Handover — close the Elmhurst Summary→SAP chain to 1e-4 +# Handover — close the remaining 5 Elmhurst Summary→SAP chains to 1e-4 You are picking up branch `ara-backend-design-prd` mid-stream. The -previous agent left a near-complete but **not actually complete** -validation chain. This handover is honest about what's done, what's -still wrong, and why a fresh approach may help. 
+previous chain of work closed **Summary_000474** to 1e-4 (Slice 47) +and landed extractor infrastructure that helps the other 5 fixtures +(Slice 48), but each of the 5 remaining certs still has its own diff +to close. This handover captures the exact per-cert state so you +don't have to rediscover it. ## The 30-second picture -There are two paths into the calculator: - ``` -Path A: hand-built EpcPropertyData → cascade → SAP 62.2584 ← matches Elmhurst worksheet PDF to 4 d.p. ✓ -Path B: Summary_NNNNNN.pdf → extractor → ElmhurstSiteNotes - → from_elmhurst_site_notes → EpcPropertyData - → cascade → SAP 62.5195 ← off by 0.2611 unrounded SAP points ✗ +Cert Mapped SAP Target SAP Δ Notes +000474 62.2584 62.2584 0.0000 ✓ CLOSED (Slice 47) +000477 71.3712 65.0057 +6.3655 secondary heating + lighting +000480 69.5681 61.2986 +8.2695 unknown (largest gap) +000487 69.7864 61.6431 +8.1433 window count 1/many (extractor) + others +000490 63.0530 57.3979 +5.6551 unknown +000516 68.7749 62.7937 +5.9812 roof window separation ``` -Both paths feed the same calculator (`calculate_sap_from_inputs`). Path -A proves the cascade is provably equivalent to Elmhurst's calculator -(`sap_score_continuous` = 62.2584 matches Elmhurst worksheet PDF line -257 exactly). Path B uses `from_elmhurst_site_notes` instead of the -hand-built fixture; **it should produce identical output**, because -the Summary PDF and the hand-built fixture encode the same source-of- -truth data. It doesn't. The 0.26 SAP gap means information is being -dropped in the extractor or mapper. +All Δ are positive — the mapper is computing SAP too HIGH, which +typically means missing cost contributions (secondary heating fuel, +lighting, etc.) or under-counted heat-loss area. -The user explicitly rejected "within 0.5 is good enough" — the chain -must reproduce Elmhurst to `1e-4` like every other Elmhurst worksheet -test. 
+Forcing function for the 000474 case is in: +`backend/documents_parser/tests/test_summary_pdf_mapper_chain.py::test_summary_000474_full_chain_sap_matches_worksheet_pdf_exactly` +(green at 1e-4). When you close each remaining cert, add a mirror +test next to it. -The end goal is: `api → EpcPropertyData → Sap10 calculator` with -`< 0.5` SAP error (the API publishes rounded SAP integers, so half a -point is rounding noise). But for the Elmhurst-site-notes path, -because Elmhurst's worksheet PDFs publish unrounded line-ref values, -the target is **zero error to 1e-4**. +## What landed already -## Forcing function - -There is one failing test pinning this: - -``` -backend/documents_parser/tests/test_summary_pdf_mapper_chain.py:: - test_summary_000474_full_chain_sap_matches_worksheet_pdf_exactly -``` - -It asserts `abs(mapped_sap - 62.2584) < 1e-4` and currently fails with -`Δ = 0.2611`. Your job is to drive it to GREEN. - -## Definition of done - -- The failing test above passes at `1e-4` tolerance. -- The two other `test_summary_pdf_mapper_chain.py` tests stay green - (`sap_building_parts == 3`, `sap_windows == 7`). -- The wider `datatypes/epc/` + `backend/documents_parser/tests/` - regression stays green (the 9 pre-existing `test_appendix_u.py` - failures and 1 pre-existing 1e-9 FEE-precision failure are - unrelated — leave them). -- Once 000474 is at 1e-4, replicate the test for the other 5 Summary - PDFs (000477, 000480, 000487, 000490, 000516) — files are under - `sap worksheets/` at the repo root, **untracked**, copy each into - `backend/documents_parser/tests/fixtures/` as you go. -- For each cert, pin against the unrounded SAP value lodged in line - 257 of the corresponding `U985-0001-NNNNNN.pdf` worksheet. - -## What the previous agent did right - -12 commits across the session. 
The architecturally-load-bearing ones: - -| Slice | Commit | Effect | +| Slice | Commit | What | |---|---|---| -| 44 | `ea6d4263` | flat_roof_insulation_thickness mapper passthrough | -| 45a/b/c | `f08252dc` / `24f35f8b` / `5acbecc5` | PV cascade per Appendix M + U (orientation × pitch × Table-M1 ZPV × rating-vs-demand climate) | -| audit pins | `15789f5a` / `acc6331d` / `8ac548ca` | u_wall / u_roof / u_floor description-cascade pins against Tables 6 / 16 / §5.12 — proves U-value cascade is spec-correct on the cohort | -| 46 scaffold | `ccf7aa21` | First scaffold test for Summary→EpcPropertyData chain (strict-xfail) | -| 46a | `36f2c7bb` | Multi-bp support: schema adds `ExtensionPart`; extractor parses Main + 1st Extension + 2nd Extension subsections in §4/§7/§8/§9 with "As Main: Yes" inheritance; mapper produces a `SapBuildingPart` per bp | -| 46b | `066dce19` | Layout-style window parser anchored on `W H Area` data line + `Manufacturer ` line — extracts 7 windows from the Summary table layout | -| **46c** | **`256a5afe`** | **String→int code translations for every Elmhurst-coded field the cascade reads (age band, wall_construction, wall_insulation_type, main_fuel_type, heat_emitter_type, main_heating_control, orientation); PCDB index parsed from `pcdf_boiler_reference`; floor ordering + 0.25 m upper-storey adjustment + `is_exposed_floor` flag for "above unheated space"** | +| 47 | `29ab80b0` | `main_heating_category=2` in mapper → pumps_fans 130→160; window-gap partitioned on glazing-type-start marker → fixes W4/W5 orientation mis-classification | +| 48 | (this slice) | Extractor handles combined `Wood 0.70` frame line; data anchor allows trailing glazing-type (`1.22 1.76 2.15 Double pre 2002`); partition falls back to second-orient-token when no glazing marker in gap; 5 fixture PDFs copied to `backend/documents_parser/tests/fixtures/` | -After 46c, mapped SAP is 62.52 vs target 62.26 (`Δ = 0.26`). 
+## Per-cert diff diagnoses (already done — don't re-discover) -## What the previous agent got wrong +### 000477 — Δ +6.37 — single-BP cert +Run `python /tmp/diff_cert.py 000477` to reproduce. Scalar input diffs: +- `secondary_heating_fraction`: mapped=**0.0** vs handbuilt=**0.1** ← root cause. Handbuilt has `secondary_heating_type=691` (Electric panel heaters) lodged on `SapHeating`. Mapper doesn't extract from §10 "Secondary heating" Summary PDF section. +- `lighting_kwh_per_yr`: mapped=160.88 vs handbuilt=201.68 — bulb count mismatch +- `secondary_heating_co2_factor_kg_per_kwh`, `secondary_heating_primary_factor`: None vs populated — downstream of the missing secondary_heating_type -**Overclaimed completion at the 0.5-tolerance milestone.** The -original test was written with a 0.5 tolerance (mirroring the -API-cert residual cohort, where the API publishes rounded SAP -integers so half a point is just rounding). That's the wrong -tolerance for the Elmhurst path: Elmhurst lodges full PDF lines with -4-d.p. unrounded values, and our cascade matches them exactly on -hand-built inputs. The bar is `1e-4`, not 0.5. The previous agent -committed Slice 46c with a 0.5-tolerance pin; this handover has -since tightened it to `1e-4` (the failing test above). +**Fix**: Extend `ElmhurstSiteNotesExtractor` to extract §10 Secondary heating fields, and `_map_elmhurst_sap_heating` to populate `secondary_heating_type` + `secondary_fuel_type` on the `SapHeating`. -## The remaining diffs (Slice 46c → 1e-4 SAP) +### 000487 — Δ +8.14 — only 1 window extracted (should be many) +Despite Slice 48's frame-line + data-anchor improvements, the §11 +table layout of this cert still drops most windows. Run +`python /tmp/dump_one.py 000487` to see the parsed window. Inspect +the raw layout via `python /tmp/dump_section.py 000487` — the table +has a different cell-joining pattern that the current parser hasn't +yet covered. 
-After Slice 46c, the only differing `cert_to_inputs(epc)` scalar -fields between mapped and hand-built are: +### 000516 — Δ +5.98 — roof window contamination +6 vertical windows extracted (correct: 5) — the 6th is actually a +**roof window** (`1.05×1.12=1.18 m² U=3.10 NE`, pitch 45°) being +treated as vertical. Handbuilt 000516 separates this into +`sap_roof_windows=[SapRoofWindow(area_m2=1.18, u_value_raw=3.40, +orientation=2 NE, pitch_deg=45.0, ...)]`. -``` -hot_water_kwh_per_yr: mapped=2291.7821223353485 handbuilt=2291.7784230242883 (Δ < 0.01 — float drift) -pumps_fans_kwh_per_yr: mapped=130.0 handbuilt=160.0 ← 30 kWh, real bug -lighting_kwh_per_yr: mapped=139.94522455112704 handbuilt=139.94522455112707 (Δ < 1e-10 — float noise) -pumps_fans_primary_factor: mapped=1.5128000000000001 handbuilt=1.5128 (float repr — harmless) -fabric_energy_efficiency_kwh_per_m2_yr: mapped=186.62 handbuilt=186.88 (output, not input — driven by solar gains) -``` +**Fix**: Add roof-window extraction in `ElmhurstSiteNotesExtractor` +(likely a "Roof Windows" subsection of §11). Map to `SapRoofWindow` +not `SapWindow`. -Plus a window-orientation mis-classification (see below). +### 000480 — Δ +8.27 — diagnosis pending +Run `python /tmp/diff_cert.py 000480` to start. Two BPs (Main + ext) +extracted correctly, 7 windows extracted. Largest gap of the 5 — may +have multiple compounding issues. -### Diff #1: `pumps_fans_kwh_per_yr` = 130 vs 160 (30 kWh) +### 000490 — Δ +5.66 — diagnosis pending +Same: `python /tmp/diff_cert.py 000490`. -This is the dominant residual contributor. Search `cert_to_inputs.py` -for `pumps_fans_kwh_per_yr` to find what drives it. 
Likely candidates: -- `central_heating_pump_age` (the Summary PDF lodges "Heat pump age: - Unknown" but that's the HEAT pump, not the central heating pump — - may need a separate field on `ElmhurstSiteNotes.MainHeating`) -- Boiler type / FGHRs / weather compensator flags -- Some specific Table 4d/4e cascade input we're dropping - -The cascade reads `MainHeatingDetail.central_heating_pump_age: Optional[int]` -which the Elmhurst mapper doesn't currently populate. - -### Diff #2: Window [4] orientation mis-classified as SE (4) — should be E (3) - -Mapped windows: -``` - [4] orient=4 (SE) W=1.1 H=1.6 U=2.0 -``` - -Hand-built has TWO East U=2.0 windows totalling 3.74 m² area. Mapped's -window [2] (East, 1.98 m²) + window [4] (mis-labelled SE, 1.76 m²) = -3.74 m² ✓ — exactly matches. So the layout-style window parser is -producing `orientation='East-South'` for window [4] when it should be -just `'East'`. Look at `_compose_window_descriptors` in -`backend/documents_parser/elmhurst_extractor.py` — the suffix -token "South" is being joined with the inline "East" prefix when it -shouldn't be (probably the "South" belongs to a different window). - -The window count itself also differs: mapped extracts 7 individual -windows, hand-built consolidates to 5 by `(orientation, U)` group with -width = total area / 1.0 m. Both should be functionally equivalent -to the cascade IF orientations + U-values + total areas all match. So -fixing the orientation should close this gap. - -### Diff #3: float-precision noise - -`hot_water_kwh_per_yr` (sub-0.01 kWh) and `lighting_kwh_per_yr` -(sub-1e-10 kWh) are downstream of accumulation order in the cascade. -`pumps_fans_primary_factor` 1.5128000000000001 vs 1.5128 is a Python -float repr quirk. These won't close the SAP gap; ignore them unless -the other fixes leave the test slightly red and one of these turns -out to be the last decimal. 
- -## Why a fresh approach may help - -The previous agent's pattern was "fix one bug at a time, ship when -test passes loosely." The right pattern for `1e-4` is the opposite: -**systematically diff every input field between mapped and hand-built, -fix every diff, then run the test once.** That's two more slices -(pumps_fans + window orientation) — not enough for a session of -incremental shipping, so consider doing both in one slice with the -pin as the forcing function. - -You may also want to look harder at the *architectural* question: -why does Elmhurst's "site notes" (the surveyor's input form) need to -go through THREE schemas — `ElmhurstSiteNotes`, then `EpcPropertyData`, -then `CalculatorInputs` — when the hand-built fixture skips straight -to the middle one? The string→int translations in `from_elmhurst_site_notes` -are essentially doing what the extractor should do (or what an -`elmhurst_codes.py` codes-module could express). If `ElmhurstSiteNotes` -stored integer codes alongside the human-readable strings, the mapper -would be a pure projection. 
- ## Quick-orient commands +## Probe scripts (already in `/tmp`) ```bash -# Failing test (Δ = 0.26 SAP → target < 1e-4) -python -m pytest backend/documents_parser/tests/test_summary_pdf_mapper_chain.py::test_summary_000474_full_chain_sap_matches_worksheet_pdf_exactly --no-cov --no-header --tb=short +# Probe all 6 fixtures' SAP delta +python /tmp/probe_all.py -# All Summary→chain tests (2/3 green, 1 failing — the one above) -python -m pytest backend/documents_parser/tests/test_summary_pdf_mapper_chain.py --no-cov --no-header -v +# Dump raw extracted windows for a cert +python /tmp/dump_one.py 000516 -# Wider regression to confirm no fresh breakage -python -m pytest datatypes/epc/ backend/documents_parser/tests/ --no-cov --no-header -q +# Dump the §11 Windows section layout (debug extractor) +python /tmp/dump_section.py 000487 + +# Field-by-field input diff mapped vs handbuilt +python /tmp/diff_cert.py 000477 ``` -The 9 `test_appendix_u.py` failures and 1 `test_no_ac_cert_round_trips_fee_equals_space_heating_per_m2` -failure are **pre-existing** from before this session — don't try to -fix them as part of this work. +If `/tmp` got wiped, the contents are: +- `probe_all.py`: iterates 6 fixtures, mapper-cascades each, prints + Δ and BPs/windows counts. +- `dump_one.py NNNNNN`: prints raw `ElmhurstSiteNotes.windows` for + the given cert. +- `dump_section.py NNNNNN`: prints the line-by-line `§11 Windows` + section from the layout-preprocessed pages. +- `diff_cert.py NNNNNN`: cascades both mapped and handbuilt EPCs + through `cert_to_inputs`, diffs the scalar input fields. 
+ +## Suggested next slices + +| Slice | Cert | Effort | Why next | +|---|---|---|---| +| 49 | 000477 | Medium | 1 BP topology → smallest moving parts; secondary-heating extraction unlocks the largest single fix | +| 50 | 000516 | Medium | Roof-window separation — adds new schema field, generalisable | +| 51 | 000487 | High | Extractor parser improvements (the §11 layout is uncovered terrain) | +| 52 | 000490 | Unknown | Diagnose then close | +| 53 | 000480 | Unknown | Largest Δ; do last after others reveal patterns | + +## Definition of done (unchanged from before) + +- All 6 `test_summary_pdf_mapper_chain.py` chain tests pin at 1e-4. +- Wider regression stays green (currently 754 pass). +- Pyright net-zero on every commit (strict mode). +- One slice = one commit. +- No widening, no xfail (project memory `feedback_zero_error_strict`). ## Reference materials -- **`docs/sap-spec/HANDOVER_NEXT.md`** — original calculator-closure - handover; still useful as the canonical reference for cascade - conventions (AAA tests, 1e-4 tolerance, etc.). -- **`docs/sap-spec/SAP_CALCULATOR.md`** — public API + two-cascade - architecture (rating vs demand). -- **`sap worksheets/Summary_000474.pdf`** (untracked) — the source-of- - truth input for fixture 000474. Mirror tracked at - `backend/documents_parser/tests/fixtures/Summary_000474.pdf`. -- **`sap worksheets/U985-0001-000474.pdf`** (untracked) — the - Elmhurst-computed worksheet with line refs the test pins against - (line 257 for unrounded SAP). -- **`packages/domain/src/domain/sap/worksheet/tests/_elmhurst_worksheet_000474.py`** - — the hand-built `EpcPropertyData` for the same fixture. The CALCULATOR- - EQUIVALENT target the mapper must reproduce. 
- -## File map for the work ahead - -| File | Role | -|---|---| -| `backend/documents_parser/elmhurst_extractor.py` | PDF → `ElmhurstSiteNotes` extractor; layout-style window parser at `_compose_window_descriptors` is where diff #2 lives | -| `datatypes/epc/surveys/elmhurst_site_notes.py` | The schema (recently extended with `ExtensionPart`); `MainHeating` may need a `central_heating_pump_age` field for diff #1 | -| `datatypes/epc/domain/mapper.py:254-326` | `from_elmhurst_site_notes` — the mapper itself | -| `datatypes/epc/domain/mapper.py:1772-1830` | Code-translation helpers (`_leading_code`, `_elmhurst_wall_construction_int`, etc.) | -| `datatypes/epc/domain/mapper.py:2010-2080` | `_map_elmhurst_building_part` + extension iteration | -| `datatypes/epc/domain/mapper.py:2180+` | `_map_elmhurst_sap_heating` — likely where diff #1 (pumps_fans) is fixed | -| `packages/domain/src/domain/sap/rdsap/cert_to_inputs.py` | Cascade — search `pumps_fans_kwh_per_yr` for diff #1's root cause | - -## Conventions you must honour (from project memory) - -- AAA test convention: every new test uses literal `# Arrange / # Act - / # Assert` headers -- `abs(diff) <= tol` not `pytest.approx` (strict pyright) -- One slice = one commit; stage by name (`?? non_invasive_photos/` and - similar untracked junk must not be staged) -- 1e-4 tolerance, no widening, no xfail (`feedback_zero_error_strict`) -- Strict pyright net-zero on every commit +- `docs/sap-spec/HANDOVER_NEXT.md` — original cascade-conventions handover. +- `docs/sap-spec/SAP_CALCULATOR.md` — public API + two-cascade architecture. +- Hand-built worksheet fixtures: `packages/domain/src/domain/sap/worksheet/tests/_elmhurst_worksheet_*.py` — the calculator-equivalent EpcPropertyData each mapped chain must reproduce. +- Untracked source PDFs: `sap worksheets/` (`Summary_NNNNNN.pdf` + `U985-0001-NNNNNN.pdf` worksheets — the unrounded SAP target lives next to "SAP value" in the U985 PDF). 
## Branch state at handover ``` -$ git log --oneline -5 -256a5afe Slice 46c: Elmhurst mapper produces calculator-equivalent EpcPropertyData — Summary_000474 SAP within 0.5 of worksheet PDF -066dce19 Slice 46b: Elmhurst extractor parses windows from layout-style Summary PDFs -36f2c7bb Slice 46a: Elmhurst mapper handles multi-bp Summary PDFs — Summary_000474 chain test flips green -ccf7aa21 Scaffold: end-to-end Summary→EpcPropertyData chain test for 000474 (xfail) -8ac548ca Audit: pin u_floor §5.12 formula cascade for cert 0240 cohort geometry +$ git log --oneline -7 + Slice 48: Elmhurst extractor handles 5 new layout quirks; 5 fixture PDFs added +29ab80b0 Slice 47: Summary_000474 chain pins SAP at 1e-4 vs worksheet PDF +b6544e1c Handover: tighten Summary→SAP chain pin to 1e-4 + brief next agent +256a5afe Slice 46c: Elmhurst mapper produces calculator-equivalent EpcPropertyData — Summary_000474 SAP within 0.5 of worksheet PDF +066dce19 Slice 46b: Elmhurst extractor parses windows from layout-style Summary PDFs +36f2c7bb Slice 46a: Elmhurst mapper handles multi-bp Summary PDFs — Summary_000474 chain test flips green +ccf7aa21 Scaffold: end-to-end Summary→EpcPropertyData chain test for 000474 (xfail) ``` -The 0.5 tolerance in commit 46c's message is stale — this handover -tightened it to 1e-4 after the commit. The first thing you commit -should fix one of the two diffs and explicitly mention closing the -last bit of the gap toward 1e-4. - Good luck.