diff --git a/backend/documents_parser/elmhurst_extractor.py b/backend/documents_parser/elmhurst_extractor.py index e8a90d91..4e222bc8 100644 --- a/backend/documents_parser/elmhurst_extractor.py +++ b/backend/documents_parser/elmhurst_extractor.py @@ -21,6 +21,7 @@ from datatypes.epc.surveys.elmhurst_site_notes import ( Shower, SurveyorInfo, VentilationAndCooling, + ElmhurstPvArray, WallDetails, WaterHeating, Window, @@ -1153,8 +1154,6 @@ class ElmhurstSiteNotesExtractor: hydro_raw = self._next_val("Electricity generated [kWh/year]") hydro = float(hydro_raw) if hydro_raw else 0.0 - pv = self._extract_pv_array_detail() - return Renewables( solar_water_heating=self._bool_val("Solar Water Heating"), wwhrs_present=self._bool_val("Is WWHRS present in the property?"), @@ -1164,69 +1163,94 @@ class ElmhurstSiteNotesExtractor: wind_turbine_present=self._bool_val("Wind turbine present?"), wind_turbines_terrain_type=terrain, hydro_electricity_generated_kwh=hydro, - pv_peak_power_kw=pv[0], - pv_orientation=pv[1], - pv_elevation_deg=pv[2], - pv_overshading=pv[3], + pv_arrays=self._extract_pv_arrays(), ) - def _extract_pv_array_detail( - self, - ) -> tuple[Optional[float], Optional[str], Optional[int], Optional[str]]: + def _extract_pv_arrays(self) -> List[ElmhurstPvArray]: """Parse the Elmhurst Summary §19.0 PV Panel section. Returns - (kw_peak, orientation, elevation_deg, overshading) when the cert - lodges measured PV; (None, None, None, None) when absent. + one `ElmhurstPvArray` per lodged array, or [] when absent. - The Summary's PV block looks like: + The Summary's PV block looks like (single-array, e.g. cert 0380): Photovoltaic panel details PV Cells kW Peak Orientation Elevation Overshading - 2.36 - South-West + 3.00 + South-East 45° None Or Little - — the 4 values follow the header block in a known order, one - per line. Anchor on "Photovoltaic panel details" → skip the - header lines → read 4 values. + Multi-array (e.g. cert 0350 lodges 2 arrays): + ... 
+ 1.50 + South-East + 45° + None Or Little + 1.50 + North-West + 45° + None Or Little + + — each array is 4 values in (kW Peak, Orientation, Elevation, + Overshading) order. Anchor on "Photovoltaic panel details", + skip header lines, then read values in 4-tuples until the + section breaks at the next §header or end-of-array tokens + (Batteries / Export / Capacity / etc.). """ anchor = "Photovoltaic panel details" try: idx = next(i for i, l in enumerate(self._lines) if l == anchor) except StopIteration: - return (None, None, None, None) - # The 4 header lines after the anchor are: - # "PV Cells kW Peak Orientation", "Elevation", "Overshading" - # followed by 4 value lines. Slice the next ~10 lines and - # filter the first 4 entries that look like values (not - # headers). - tail = self._lines[idx + 1 : idx + 12] + return [] + # The header lines after the anchor are: "PV Cells kW Peak + # Orientation", "Elevation", "Overshading". Subsequent lines + # carry values for one OR MORE arrays. Stop at the next + # §-header (a "20.0" or "21.0") or post-PV section tokens + # ("Batteries", "Connected to", "Diverter", "Capacity", etc.). header_tokens = {"pv cells", "kw peak", "orientation", "elevation", "overshading"} + stop_tokens = { + "batteries", "capacity known", "capacity", + "connected to the dwelling's meter", "diverter present", + "export capable meter", + } values: List[str] = [] - for line in tail: + for line in self._lines[idx + 1:]: stripped = line.strip() if not stripped: continue lower = stripped.lower() + if lower in stop_tokens: + break + # Next §-header (e.g. "20.0 Wind Turbine") closes the block — + # match "{1-2 digits}.{digit} {word}" so kWp values + # like "1.50" don't trip the close. 
+ if re.match(r"^\d{1,2}\.\d\s+\w", stripped): + break if any(h in lower for h in header_tokens): continue values.append(stripped) - if len(values) == 4: - break - if len(values) < 4: - return (None, None, None, None) - try: - kwp = float(values[0]) - except ValueError: - return (None, None, None, None) - orientation = values[1] - # Elevation lodged as "45°" — strip trailing degree symbol. - m = re.match(r"^(\d+)", values[2]) - elevation = int(m.group(1)) if m else None - overshading = values[3] - return (kwp, orientation, elevation, overshading) + # Walk values in 4-tuples; an incomplete trailing tuple is dropped. + arrays: List[ElmhurstPvArray] = [] + for i in range(0, len(values) - 3, 4): + try: + kwp = float(values[i]) + except ValueError: + continue + orientation = values[i + 1] + # Elevation lodged as "45°" — strip trailing degree symbol. + m = re.match(r"^(\d+)", values[i + 2]) + if m is None: + continue + elevation = int(m.group(1)) + overshading = values[i + 3] + arrays.append(ElmhurstPvArray( + peak_power_kw=kwp, + orientation=orientation, + elevation_deg=elevation, + overshading=overshading, + )) + return arrays def extract(self) -> ElmhurstSiteNotes: emissions_raw = self._next_val("Emissions (t/year)") diff --git a/backend/documents_parser/tests/fixtures/Summary_000903.pdf b/backend/documents_parser/tests/fixtures/Summary_000903.pdf new file mode 100644 index 00000000..0f376590 Binary files /dev/null and b/backend/documents_parser/tests/fixtures/Summary_000903.pdf differ diff --git a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py index 635b5308..4f402e68 100644 --- a/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py +++ b/backend/documents_parser/tests/test_summary_pdf_mapper_chain.py @@ -621,6 +621,34 @@ def test_summary_0380_cylinder_block_surfaces_full_15_1_lodging() -> None: assert epc.sap_heating.cylinder_thermostat == "Y" +def 
test_summary_0350_surfaces_two_pv_arrays() -> None: + # Arrange — cert 0350's Summary §19.0 Photovoltaic Panel block + # lodges TWO arrays (L 503-510): + # 1.50 kWp / South-East / 45° / None Or Little + # 1.50 kWp / North-West / 45° / None Or Little + # The Elmhurst extractor's `_extract_pv_array_detail` hardcodes a + # single 4-value reader (loop breaks at `len(values) == 4`) and + # the `Renewables` dataclass exposes only 4 scalar PV fields — + # together they cap output at one array regardless of how many the + # PDF lodges. Cert 0380 (single-array) is unaffected; cert 0350 + # is the first multi-array cohort cert. Without both arrays the + # cascade halves the PV export credit and the SAP score drops. + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000903_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + + # Act + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Assert + assert epc.sap_energy_source is not None + arrays = epc.sap_energy_source.photovoltaic_arrays + assert arrays is not None + assert len(arrays) == 2 + # Both arrays at 1.5 kWp; order matches PDF row order. + assert arrays[0].peak_power == 1.5 + assert arrays[1].peak_power == 1.5 + + def test_summary_0350_ext1_inherits_main_wall_insulation_thickness() -> None: # Arrange — cert 0350-2968-2650-2796-5255 is a multi-bp dwelling # (Main + 1st Extension). Its Summary §7 Walls block lodges @@ -650,6 +678,39 @@ def test_summary_0350_ext1_inherits_main_wall_insulation_thickness() -> None: assert ext1_bp.wall_insulation_thickness == "100mm" +def test_summary_0350_full_chain_sap_within_spec_floor_of_worksheet() -> None: + # Arrange — cert 0350-2968-2650-2796-5255 (Summary_000903.pdf / + # dr87-0001-000903.pdf) is the second heat-pump cert under per-cert + # Summary-path mapper validation and the first multi-bp cohort + # cert: Mitsubishi PUZ-WM50VHA ASHP (PCDB index 104568), main + # dwelling + 1 extension, 2 PV arrays (2x 1.5 kWp at SE / NW). 
+ # Worksheet PDF "SAP value" line lodges unrounded SAP **84.1367**. + # + # First-attempt closure (validating the structural-debt-amortizes + # hypothesis): after Slices S0380.2..S0380.6 (which were forced by + # cert 0380) the cohort HP routing + cylinder block were already + # in place; cert 0350 needed only TWO new slices: + # - Slice S0380.8: extension "As Main Wall" inheritance copies + # `insulation_thickness_mm` (cert 0380 was single-bp, didn't + # exercise the inheritance path). + # - Slice S0380.9: refactor Elmhurst `Renewables` to support + # multiple PV arrays per dwelling (cert 0380 was single-array, + # didn't exercise multi-array PV). + # Both fixes are structural and apply cohort-wide. + pages = _summary_pdf_to_textract_style_pages(_SUMMARY_000903_PDF) + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + epc = EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + # Act + result = calculate_sap_from_inputs( + cert_to_inputs(epc, prices=SAP_10_2_SPEC_PRICES) + ) + + # Assert — ±0.07 ASHP-cohort spec-floor tolerance. + worksheet_unrounded_sap = 84.1367 + assert abs(result.sap_score_continuous - worksheet_unrounded_sap) < _ASHP_COHORT_CHAIN_TOLERANCE + + def test_summary_0380_full_chain_sap_within_spec_floor_of_worksheet() -> None: # Arrange — cert 0380-2471-3250-2596-8761 (Summary_000899.pdf / # dr87-0001-000899.pdf) is the first heat-pump cert under per-cert diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index c381adea..d3ccbd83 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -3067,26 +3067,29 @@ def _elmhurst_pv_arrays( ) -> Optional[List[PhotovoltaicArray]]: """Build the Appendix M / Appendix U3.3 cost-offset cascade's input list from the Elmhurst Summary §19.0 PV detail. Returns None when - the cert hasn't lodged measured PV (no kW Peak value) — the cohort - PV-absent path the cascade already handles correctly. 
+ the cert hasn't lodged measured PV — the cohort PV-absent path the + cascade already handles correctly. - All four §19.0 inputs (kW peak + orientation + elevation + - overshading) are required for a meaningful Appendix M output; - missing any of them collapses to None so the cascade defers to - the legacy `photovoltaic_supply.percent_roof_area` fallback. + Each lodged §19.0 row (an `ElmhurstPvArray`) becomes one + `PhotovoltaicArray` entry. Single-array dwellings (cohort cert + 0380: 3 kWp) and multi-array dwellings (cohort cert 0350: 2x 1.5 + kWp at distinct orientations) go through the same iterator. """ - if renewables.pv_peak_power_kw is None or renewables.pv_peak_power_kw <= 0.0: + if not renewables.pv_arrays: return None - if renewables.pv_orientation is None or renewables.pv_elevation_deg is None: - return None - return [ - PhotovoltaicArray( - peak_power=renewables.pv_peak_power_kw, - pitch=_elmhurst_pv_pitch_code(renewables.pv_elevation_deg), - orientation=_elmhurst_orientation_int(renewables.pv_orientation), - overshading=_elmhurst_pv_overshading_int(renewables.pv_overshading), + out: List[PhotovoltaicArray] = [] + for arr in renewables.pv_arrays: + if arr.peak_power_kw <= 0.0: + continue + out.append( + PhotovoltaicArray( + peak_power=arr.peak_power_kw, + pitch=_elmhurst_pv_pitch_code(arr.elevation_deg), + orientation=_elmhurst_orientation_int(arr.orientation), + overshading=_elmhurst_pv_overshading_int(arr.overshading), + ) ) - ] + return out or None # RdSAP 10 §11.1 PV pitch enum (degrees → integer code consumed by diff --git a/datatypes/epc/surveys/elmhurst_site_notes.py b/datatypes/epc/surveys/elmhurst_site_notes.py index 85b63c07..27833d14 100644 --- a/datatypes/epc/surveys/elmhurst_site_notes.py +++ b/datatypes/epc/surveys/elmhurst_site_notes.py @@ -259,13 +259,24 @@ class Renewables: wind_turbines_terrain_type: str hydro_electricity_generated_kwh: float # PV array detail (Elmhurst Summary §19.0 "Photovoltaic Panel" - # block: kW Peak, 
Orientation, Elevation, Overshading). Populated - # when the cert lodges measured PV; absent (None / "" / 0.0) - # otherwise. Drives Appendix M / Appendix U3.3 cost-offset cascade. - pv_peak_power_kw: Optional[float] = None - pv_orientation: Optional[str] = None # e.g. "South-West" - pv_elevation_deg: Optional[int] = None # e.g. 45 - pv_overshading: Optional[str] = None # e.g. "None Or Little" + # block: a list of (kW Peak, Orientation, Elevation, Overshading) + # rows). Empty list when the cert hasn't lodged measured PV. + # Drives Appendix M / Appendix U3.3 cost-offset cascade — both the + # single-array (cohort cert 0380) and multi-array (cohort cert + # 0350: 2x 1.5 kWp) layouts go through the same list. + pv_arrays: List["ElmhurstPvArray"] = field( + default_factory=lambda: [] # type: ignore[reportUnknownLambdaType] + ) + + +@dataclass +class ElmhurstPvArray: + """One Photovoltaic array row from Summary §19.0. The four fields + match the columns in the PDF's PV Panel block.""" + peak_power_kw: float + orientation: str # e.g. "South-West" + elevation_deg: int # e.g. 45 + overshading: str # e.g. "None Or Little" @dataclass