added test for a 1000 examples

This commit is contained in:
Jun-te Kim 2026-06-09 16:02:21 +00:00
parent 06cb4f7b6e
commit 3b7d26fe34
10 changed files with 2490 additions and 252 deletions

3
.gitignore vendored
View file

@ -303,3 +303,6 @@ backlog/*
# Local Claude config files
.claude/*
modelling_cohort.csv
# Local EPC debug cache (scripts/eon)
scripts/eon/epc_cache.pkl

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -6,7 +6,6 @@ from typing import Final, List, Optional, Union
from datatypes.epc.domain.epc import Epc
_API_EXTENSION = re.compile(r"^Extension\s+(\d+)$")
@ -36,9 +35,7 @@ class BuildingPartIdentifier(Enum):
OTHER = "other"
@classmethod
def from_api_string(
cls, api_identifier: Optional[str]
) -> "BuildingPartIdentifier":
def from_api_string(cls, api_identifier: Optional[str]) -> "BuildingPartIdentifier":
"""Map a gov-EPC API `BuildingPart.identifier` to its canonical
member. "Main Dwelling" → MAIN; "Extension N" → EXTENSION_N
(for N in 1..4). `None` (permitted by the 21_0_1 schema) and
@ -76,6 +73,7 @@ class Addendum:
Present on ~43% of real RdSAP certs (stone-walls / system-build / a list of
numeric improvement codes the assessor wanted to call out).
"""
stone_walls: Optional[bool] = None
system_build: Optional[bool] = None
addendum_numbers: Optional[List[int]] = None
@ -184,10 +182,12 @@ class SapVentilation:
flueless_gas_fires_count: Optional[int] = None
ventilation_in_pcdf_database: Optional[bool] = None
# SAP10.2 §2 cert lodgements not previously surfaced on this type.
sheltered_sides: Optional[int] = None # (19) — cert assessor lodge, 0..4
sheltered_sides: Optional[int] = None # (19) — cert assessor lodge, 0..4
has_suspended_timber_floor: Optional[bool] = None # (12) gate
suspended_timber_floor_sealed: Optional[bool] = None
has_draught_lobby: Optional[bool] = None # (13) gate (overrides .draught_lobby for §2 cascade)
has_draught_lobby: Optional[bool] = (
None # (13) gate (overrides .draught_lobby for §2 cascade)
)
# SAP 10.2 §2 (17a) — air permeability at 4 Pa from the low-pressure
# Pulse pressure test, m³/h per m² of envelope area. When present the
# cascade routes (18) via the AP4 formula `0.263 × AP4^0.924 + (8)`.
@ -302,10 +302,11 @@ class PhotovoltaicArray:
measured PV configuration; `photovoltaic_supply` carries the fallback
`percent_roof_area` estimate when the surveyor could not confirm details.
"""
peak_power: float
pitch: int
orientation: int
overshading: int
orientation: Optional[int] = None
@dataclass
@ -515,7 +516,9 @@ class SapBuildingPart:
floor_u_value_known: Optional[bool] = None
roof_construction: Optional[int] = None
roof_construction_type: Optional[str] = None # str from site notes e.g. "PS Pitched, sloping ceiling"
roof_construction_type: Optional[str] = (
None # str from site notes e.g. "PS Pitched, sloping ceiling"
)
roof_insulation_location: Optional[Union[int, str]] = (
None # TODO: make enum/mapping?
)
@ -592,6 +595,7 @@ class RenewableHeatIncentive:
baseline `space_heating_kwh` and `hot_water_kwh` for SAP10 properties (used as ML
training targets per ADR-0007).
"""
space_heating_kwh: float
water_heating_kwh: float
impact_of_loft_insulation_kwh: Optional[float] = None

File diff suppressed because it is too large Load diff

View file

@ -307,6 +307,36 @@ class TestFromRdSapSchema21_0_0:
# photovoltaic_supply is None when the measured shape is present
assert result.sap_energy_source.photovoltaic_supply is None
def test_photovoltaic_array_orientation_nd_nulls_only_that_field(self) -> None:
    """An 'ND' orientation on one PV array nulls only that array's orientation."""
    # Arrange — three measured arrays; the middle one lodges the RdSAP 'ND'
    # ("Not Defined") sentinel as its orientation. Regression for the real
    # 21.0.1 cert 5236-4425-7600-0474-2292: the sentinel must blank ONLY that
    # array's orientation rather than crash the int() coercion and drop every
    # array (which happened when 'ND' was handled in the shared
    # _measurement_value coercer instead of field-scoped _pv_orientation).
    payload = load("21_0_0.json")
    payload["sap_energy_source"]["photovoltaic_supply"] = [
        [{"pitch": 3, "peak_power": 2.0, "orientation": 3, "overshading": 1}],
        [{"pitch": 1, "peak_power": 2.0, "orientation": "ND", "overshading": 1}],
        [{"pitch": 3, "peak_power": 2.0, "orientation": 7, "overshading": 1}],
    ]
    parsed = from_dict(RdSapSchema21_0_0, payload)

    # Act
    mapped = EpcPropertyDataMapper.from_rdsap_schema_21_0_0(parsed)

    # Assert — every array survives; only the 'ND' orientation becomes None,
    # while its sibling fields and the neighbouring arrays keep their values.
    pv_arrays = mapped.sap_energy_source.photovoltaic_arrays
    assert pv_arrays is not None
    assert len(pv_arrays) == 3
    orientations = [array.orientation for array in pv_arrays]
    assert orientations == [3, None, 7]

    middle = pv_arrays[1]
    assert middle.orientation is None
    assert middle.peak_power == 2.0
    assert middle.pitch == 1
    assert middle.overshading == 1
# ---------------------------------------------------------------------------
# Schema 21.0.1 (most comprehensive — full field coverage)

View file

@ -718,7 +718,7 @@ _HOURS_PER_DAY_OVER_1000: Final[float] = 0.024
_DAYS_PER_MONTH: Final[tuple[int, ...]] = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
def _pv_annual_s_kwh_per_m2(
orientation_code: int,
orientation_code: Optional[int],
pitch_code: int,
climate: "int | PostcodeClimate",
) -> float:
@ -727,8 +727,10 @@ def _pv_annual_s_kwh_per_m2(
monthly Appendix U3.2 surface flux over the year. `climate` selects
Table U3/U4 region (UK average = 0 for the rating cascade) or a
`PostcodeClimate` from PCDB Table 172 for the demand cascade.
Returns 0.0 for unrecognised orientation codes (cert octants outside
1..8) these PV arrays contribute nothing."""
Returns 0.0 for an unknown orientation (None when the cert lodged 'ND',
or a code outside 1..8) these PV arrays contribute nothing."""
if orientation_code is None:
return 0.0
orientation = ORIENTATION_BY_SAP10_CODE.get(orientation_code)
if orientation is None:
return 0.0
@ -2475,8 +2477,10 @@ def _pv_array_monthly_generation_kwh(
E_PV,m = 0.8 × kWp × ZPV × (days_m × S_m × 24 / 1000)
where S_m is the §U3.2 surface flux (W/m²). Returns a 12-zero tuple
for arrays whose orientation isn't mapped in
`ORIENTATION_BY_SAP10_CODE` (defensive current cert lodgements
always cover 1..8)."""
`ORIENTATION_BY_SAP10_CODE` (defensive None when the cert lodged
'ND', else a code outside 1..8)."""
if array.orientation is None:
return (0.0,) * 12
orientation = ORIENTATION_BY_SAP10_CODE.get(array.orientation)
if orientation is None:
return (0.0,) * 12

View file

@ -1229,7 +1229,7 @@ def _pv_aggregates(es: SapEnergySource) -> dict[str, Any]:
total_power += a.peak_power
weighted_pitch += a.pitch * a.peak_power
weighted_overshading += a.overshading * a.peak_power
if a.orientation in _OCTANT_NAMES:
if a.orientation is not None and a.orientation in _OCTANT_NAMES:
octant_power[_OCTANT_NAMES[a.orientation]] += a.peak_power
aggregates["has_pv"] = True
aggregates["pv_capacity_source"] = "measured"

View file

@ -0,0 +1,68 @@
"""Drive EpcPropertyDataMapper against the harvested cert corpus.
The corpus (backend/epc_api/json_samples/<schema>/corpus.jsonl, one file per
schema version) is a balanced sample of raw API certs, produced by
scripts/eon/harvest_certs.py. Each line is one cert in the exact shape
``from_api_response`` consumes.
* 21.0.0 / 21.0.1 — supported today; these are a regression guard (only the
schemas listed in ``SUPPORTED`` are exercised here).
* 20.0.0 — mapper is incomplete, so these are xfail. As
``from_rdsap_schema_20_0_0`` is built out they flip to
xpass; when the whole bucket passes, drop the xfail
marker and the strict guard below will keep it honest.
If the corpus hasn't been harvested yet, every parametrisation is skipped.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
import pytest
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
# Root directory of the harvested corpora, one sub-directory per schema.
# NOTE(review): this path is relative, so it resolves against the current
# working directory of the pytest run — confirm tests are run from the repo root.
SAMPLES: Path = Path("backend", "epc_api", "json_samples")

# Schemas whose certs are expected to map cleanly (regression guard).
SUPPORTED: set[str] = {"RdSAP-Schema-21.0.1"}

# Schemas whose mapper is still being built out (run as xfail).
WIP: set[str] = {"RdSAP-Schema-20.0.0"}
def _load(schema: str) -> list[dict[str, Any]]:
    """Load one schema's harvested corpus (json_samples/<schema>/corpus.jsonl).

    Args:
        schema: Name of the schema sub-directory under ``SAMPLES``.

    Returns:
        One parsed JSON value per non-blank line of the corpus file, in file
        order, or an empty list when the corpus file does not exist.
    """
    path = SAMPLES / schema / "corpus.jsonl"
    if not path.exists():
        return []
    # Decode explicitly as UTF-8 instead of the platform's locale encoding, and
    # iterate the file rather than using str.splitlines(): splitlines() also
    # breaks on U+2028/U+2029 and similar characters, which may appear
    # unescaped inside a JSON string and would cut one record in two.
    with path.open(encoding="utf-8") as lines:
        return [json.loads(line) for line in lines if line.strip()]
def _cases(schemas: set[str]) -> list[Any]:
    """Build the ``pytest.param`` list for every harvested cert of ``schemas``.

    Args:
        schemas: Schema names whose corpora should be loaded via ``_load``.

    Returns:
        One ``pytest.param`` per cert with the id ``"<schema>-<index>"``, or a
        single skipped placeholder parameter when no cert was found for any of
        the schemas.
    """
    # A set of strings iterates in hash order, which can differ from one
    # interpreter process to the next. Sort so parametrisation order and the
    # generated test IDs are stable across runs and across parallel workers.
    certs = [(s, c) for s in sorted(schemas) for c in _load(s)]
    if not certs:
        # Keep the test collectable but visibly skipped when nothing is harvested.
        return [
            pytest.param(
                None,
                marks=pytest.mark.skip(reason=f"no {schemas} corpus harvested"),
                id="empty",
            )
        ]
    return [pytest.param(c, id=f"{s}-{i}") for i, (s, c) in enumerate(certs)]
@pytest.mark.parametrize("cert", _cases(SUPPORTED))
def test_supported_schemas_map(cert: dict[str, Any]) -> None:
    """Each harvested cert of a supported schema maps to an EpcPropertyData."""
    mapped = EpcPropertyDataMapper.from_api_response(cert)

    assert isinstance(mapped, EpcPropertyData)
@pytest.mark.xfail(reason="RdSapSchema20.0.0 mapper is incomplete", strict=False)
@pytest.mark.parametrize("cert", _cases(WIP))
def test_wip_schema_20_maps(cert: dict[str, Any]) -> None:
    """Each harvested cert of a work-in-progress schema maps (non-strict xfail)."""
    mapped = EpcPropertyDataMapper.from_api_response(cert)

    assert isinstance(mapped, EpcPropertyData)

View file

@ -25,5 +25,6 @@ testpaths =
etl/epc_clean/tests
etl/hubspot/tests
etl/spatial/tests
infrastructure/epc_client/tests/
markers =
integration: mark a test as an integration test