slice 14l: bigger-run fixes — UCL guard, PV Measurement coercion, sMAPE

Three changes surfaced by the 25k 2026 run:

- transform._peui_ucl returns None for non-positive raw PEUI (net-exporters).
  apply_ucl_correction would otherwise raise ValueError on negative input.
- PhotovoltaicArray scalars (peak_power, pitch, orientation, overshading) now
  accept a Measurement as well as a plain number in the schema
  (Measurement | int | float for peak_power, Measurement | int for the
  others); mapper coerces via _measurement_value.
- train_baseline reports sMAPE alongside MAPE — handles zero-actual rows
  (e.g. co2_emissions for net-zero certs) where MAPE explodes.

Results at N=25,000 RdSAP 2026 certs (~32s end-to-end):

  sap_score          MAPE=0.064  sMAPE=0.054  R^2=0.762
  co2_emissions                  sMAPE=0.140  R^2=0.890
  peui_raw           MAPE=0.126  sMAPE=0.120  R^2=0.714
  peui_ucl           MAPE=0.114  sMAPE=0.108  R^2=0.736
  space_heating_kwh  MAPE=0.167  sMAPE=0.157  R^2=0.915
  hot_water_kwh      MAPE=0.089  sMAPE=0.086  R^2=0.737

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-07-27 23:35:01 +00:00 · 2026-05-16 21:15:37 +00:00 · 2026-05-16 21:15:37 +00:00 · c496f345f8
commit c496f345f8
parent 8fddd25b9a
4 changed files with 39 additions and 14 deletions
--- a/datatypes/epc/domain/mapper.py
+++ b/datatypes/epc/domain/mapper.py
@ -102,10 +102,10 @@ def _map_schema_21_pv(
    if isinstance(es_pv_supply, list):
        flattened = [
            PhotovoltaicArray(
-                peak_power=array.peak_power,
-                pitch=array.pitch,
-                orientation=array.orientation,
-                overshading=array.overshading,
+                peak_power=_measurement_value(array.peak_power),
+                pitch=int(_measurement_value(array.pitch)),
+                orientation=int(_measurement_value(array.orientation)),
+                overshading=int(_measurement_value(array.overshading)),
            )
            for inner_list in es_pv_supply
            for array in inner_list
--- a/datatypes/epc/schema/rdsap_schema_21_0_1.py
+++ b/datatypes/epc/schema/rdsap_schema_21_0_1.py
@ -111,12 +111,12 @@ class PhotovoltaicArray:
    Modern SAP10 EPCs with measured PV carry `photovoltaic_supply` as a nested
    list (`list[list[PhotovoltaicArray]]`) rather than the legacy wrapper dict
    `PhotovoltaicSupply`. The Union type on SapEnergySource.photovoltaic_supply
-    accepts either shape.
+    accepts either shape. Some certs wrap the scalars in Measurement dicts.
    """
-    peak_power: float
-    pitch: int
-    orientation: int
-    overshading: int
+    peak_power: Union[Measurement, int, float]
+    pitch: Union[Measurement, int]
+    orientation: Union[Measurement, int]
+    overshading: Union[Measurement, int]


@dataclass
@ -147,8 +147,9 @@ class SapWindow:
    orientation: int
    window_type: int
    glazing_type: int
-    window_width: float
-    window_height: float
+    # Real-API certs sometimes carry a Measurement dict for dimensions, not a plain float.
+    window_width: Union[Measurement, int, float]
+    window_height: Union[Measurement, int, float]
    draught_proofed: str  # TODO: make bool
    window_location: int
    window_wall_type: int
--- a/packages/domain/src/domain/ml/transform.py
+++ b/packages/domain/src/domain/ml/transform.py
@ -595,13 +595,20 @@ class EpcMlTransform:
 def _peui_ucl(epc: EpcPropertyData) -> Optional[float]:
    """Apply the Few et al. per-band UCL correction to PEUI for training labels.

-    Returns None when either the raw PEUI or the SAP score is missing — those rows
-    are unusable as `peui_ucl` training labels and should be dropped upstream.
+    Returns None when:
+    - either the raw PEUI or the SAP score is missing, or
+    - the raw PEUI is non-positive (e.g. net-exporter homes with negative PEUI)
+      so the UCL correction is undefined.
+    Those rows are unusable as `peui_ucl` training labels and should be dropped
+    upstream rather than crashing the transform.
    """
    if epc.energy_consumption_current is None or epc.energy_rating_current is None:
        return None
+    peui_raw = float(epc.energy_consumption_current)
+    if peui_raw <= 0:
+        return None
    band = Epc.from_sap_score(epc.energy_rating_current)
-    return apply_ucl_correction(float(epc.energy_consumption_current), band)
+    return apply_ucl_correction(peui_raw, band)


 def _pv_aggregates(es: SapEnergySource) -> dict[str, Any]:
--- a/services/ml_training_data/src/ml_training_data/train_baseline.py
+++ b/services/ml_training_data/src/ml_training_data/train_baseline.py
@ -64,6 +64,7 @@ def train_baseline(

        metrics[target] = {
            "mape": float(cast(float, mean_absolute_percentage_error(y_test, preds))),
+            "smape": _smape(y_test, preds),
            "r2": float(cast(float, r2_score(y_test, preds))),
        }

@ -79,3 +80,19 @@ def train_baseline(
        json.dumps(metrics, indent=2).encode("utf-8"),
    )
    return metrics
+
+
+def _smape(y_true: Any, y_pred: Any) -> float:
+    """Symmetric MAPE: mean(|y - yhat| / ((|y| + |yhat|) / 2)).
+
+    Bounded in [0, 2] (often reported as 0-200%). Stable when |y| is near zero,
+    so it's a better summary than MAPE for low-magnitude targets like
+    `hot_water_kwh` in well-insulated homes.
+    """
+    y_t = np.asarray(y_true, dtype=float)
+    y_p = np.asarray(y_pred, dtype=float)
+    denom = (np.abs(y_t) + np.abs(y_p)) / 2.0
+    mask = denom > 0
+    if not mask.any():
+        return 0.0
+    return float(np.mean(np.abs(y_t[mask] - y_p[mask]) / denom[mask]))