slice 14l: bigger-run fixes — UCL guard, PV Measurement coercion, sMAPE

Three changes surfaced by the 25k 2026 run:
- transform._peui_ucl returns None for non-positive raw PEUI (net-exporters).
  apply_ucl_correction would otherwise raise ValueError on negative input.
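- PhotovoltaicArray scalars now accept Measurement wrappers in the schema
  (peak_power: Measurement | int | float; pitch, orientation, overshading:
  Measurement | int); mapper coerces via _measurement_value. SapWindow
  window_width / window_height are widened to Measurement | int | float too.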
- train_baseline reports sMAPE alongside MAPE — handles zero-actual rows
  (e.g. co2_emissions for net-zero certs) where MAPE explodes.

Results at N=25,000 RdSAP 2026 certs (~32s end-to-end):
  sap_score          MAPE=0.064  sMAPE=0.054  R^2=0.762
  co2_emissions      sMAPE=0.140  R^2=0.890
  peui_raw           MAPE=0.126  sMAPE=0.120  R^2=0.714
  peui_ucl           MAPE=0.114  sMAPE=0.108  R^2=0.736
  space_heating_kwh  MAPE=0.167  sMAPE=0.157  R^2=0.915
  hot_water_kwh      MAPE=0.089  sMAPE=0.086  R^2=0.737

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-16 21:15:37 +00:00
parent 8fddd25b9a
commit c496f345f8
4 changed files with 39 additions and 14 deletions

View file

@ -102,10 +102,10 @@ def _map_schema_21_pv(
if isinstance(es_pv_supply, list):
flattened = [
PhotovoltaicArray(
peak_power=array.peak_power,
pitch=array.pitch,
orientation=array.orientation,
overshading=array.overshading,
peak_power=_measurement_value(array.peak_power),
pitch=int(_measurement_value(array.pitch)),
orientation=int(_measurement_value(array.orientation)),
overshading=int(_measurement_value(array.overshading)),
)
for inner_list in es_pv_supply
for array in inner_list

View file

@ -111,12 +111,12 @@ class PhotovoltaicArray:
Modern SAP10 EPCs with measured PV carry `photovoltaic_supply` as a nested
list (`list[list[PhotovoltaicArray]]`) rather than the legacy wrapper dict
`PhotovoltaicSupply`. The Union type on SapEnergySource.photovoltaic_supply
accepts either shape.
accepts either shape. Some certs wrap the scalars in Measurement dicts.
"""
peak_power: float
pitch: int
orientation: int
overshading: int
peak_power: Union[Measurement, int, float]
pitch: Union[Measurement, int]
orientation: Union[Measurement, int]
overshading: Union[Measurement, int]
@dataclass
@ -147,8 +147,9 @@ class SapWindow:
orientation: int
window_type: int
glazing_type: int
window_width: float
window_height: float
# Real-API certs sometimes carry a Measurement dict for dimensions, not a plain float.
window_width: Union[Measurement, int, float]
window_height: Union[Measurement, int, float]
draught_proofed: str # TODO: make bool
window_location: int
window_wall_type: int

View file

@ -595,13 +595,20 @@ class EpcMlTransform:
def _peui_ucl(epc: EpcPropertyData) -> Optional[float]:
"""Apply the Few et al. per-band UCL correction to PEUI for training labels.
Returns None when either the raw PEUI or the SAP score is missing those rows
are unusable as `peui_ucl` training labels and should be dropped upstream.
Returns None when:
- either the raw PEUI or the SAP score is missing, or
- the raw PEUI is non-positive (e.g. net-exporter homes with negative PEUI)
so the UCL correction is undefined.
Those rows are unusable as `peui_ucl` training labels and should be dropped
upstream rather than crashing the transform.
"""
if epc.energy_consumption_current is None or epc.energy_rating_current is None:
return None
peui_raw = float(epc.energy_consumption_current)
if peui_raw <= 0:
return None
band = Epc.from_sap_score(epc.energy_rating_current)
return apply_ucl_correction(float(epc.energy_consumption_current), band)
return apply_ucl_correction(peui_raw, band)
def _pv_aggregates(es: SapEnergySource) -> dict[str, Any]:

View file

@ -64,6 +64,7 @@ def train_baseline(
metrics[target] = {
"mape": float(cast(float, mean_absolute_percentage_error(y_test, preds))),
"smape": _smape(y_test, preds),
"r2": float(cast(float, r2_score(y_test, preds))),
}
@ -79,3 +80,19 @@ def train_baseline(
json.dumps(metrics, indent=2).encode("utf-8"),
)
return metrics
def _smape(y_true: Any, y_pred: Any) -> float:
"""Symmetric MAPE: mean(|y - yhat| / ((|y| + |yhat|) / 2)).
Bounded in [0, 2] (often reported as 0-200%). Stable when |y| is near zero,
so it's a better summary than MAPE for low-magnitude targets like
`hot_water_kwh` in well-insulated homes.
"""
y_t = np.asarray(y_true, dtype=float)
y_p = np.asarray(y_pred, dtype=float)
denom = (np.abs(y_t) + np.abs(y_p)) / 2.0
mask = denom > 0
if not mask.any():
return 0.0
return float(np.mean(np.abs(y_t[mask] - y_p[mask]) / denom[mask]))