From c496f345f8bf6a1ed181412cc96aef751400a880 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 May 2026 21:15:37 +0000 Subject: [PATCH] =?UTF-8?q?slice=2014l:=20bigger-run=20fixes=20=E2=80=94?= =?UTF-8?q?=20UCL=20guard,=20PV=20Measurement=20coercion,=20sMAPE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes surfaced by the 25k 2026 run: - transform._peui_ucl returns None for non-positive raw PEUI (net-exporters). apply_ucl_correction would otherwise raise ValueError on negative input. - PhotovoltaicArray scalars now accept a Measurement in the schema (peak_power: Measurement | int | float; pitch, orientation, overshading: Measurement | int); mapper coerces via _measurement_value. SapWindow.window_width / window_height are widened to Measurement | int | float in the same schema file. - train_baseline reports sMAPE alongside MAPE — handles zero-actual rows (e.g. co2_emissions for net-zero certs) where MAPE explodes. Results at N=25,000 RdSAP 2026 certs (~32s end-to-end): sap_score MAPE=0.064 sMAPE=0.054 R^2=0.762 co2_emissions sMAPE=0.140 R^2=0.890 peui_raw MAPE=0.126 sMAPE=0.120 R^2=0.714 peui_ucl MAPE=0.114 sMAPE=0.108 R^2=0.736 space_heating_kwh MAPE=0.167 sMAPE=0.157 R^2=0.915 hot_water_kwh MAPE=0.089 sMAPE=0.086 R^2=0.737 Co-Authored-By: Claude Opus 4.7 --- datatypes/epc/domain/mapper.py | 8 ++++---- datatypes/epc/schema/rdsap_schema_21_0_1.py | 15 ++++++++------- packages/domain/src/domain/ml/transform.py | 13 ++++++++++--- .../src/ml_training_data/train_baseline.py | 17 +++++++++++++++++ 4 files changed, 39 insertions(+), 14 deletions(-) diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index 26e87521..45e0cb3a 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -102,10 +102,10 @@ def _map_schema_21_pv( if isinstance(es_pv_supply, list): flattened = [ PhotovoltaicArray( - peak_power=array.peak_power, - pitch=array.pitch, - orientation=array.orientation, - overshading=array.overshading, + peak_power=_measurement_value(array.peak_power), 
pitch=int(_measurement_value(array.pitch)), + orientation=int(_measurement_value(array.orientation)), + overshading=int(_measurement_value(array.overshading)), ) for inner_list in es_pv_supply for array in inner_list diff --git a/datatypes/epc/schema/rdsap_schema_21_0_1.py b/datatypes/epc/schema/rdsap_schema_21_0_1.py index 5de41f6a..37498bb8 100644 --- a/datatypes/epc/schema/rdsap_schema_21_0_1.py +++ b/datatypes/epc/schema/rdsap_schema_21_0_1.py @@ -111,12 +111,12 @@ class PhotovoltaicArray: Modern SAP10 EPCs with measured PV carry `photovoltaic_supply` as a nested list (`list[list[PhotovoltaicArray]]`) rather than the legacy wrapper dict `PhotovoltaicSupply`. The Union type on SapEnergySource.photovoltaic_supply - accepts either shape. + accepts either shape. Some certs wrap the scalars in Measurement dicts. """ - peak_power: float - pitch: int - orientation: int - overshading: int + peak_power: Union[Measurement, int, float] + pitch: Union[Measurement, int] + orientation: Union[Measurement, int] + overshading: Union[Measurement, int] @dataclass @@ -147,8 +147,9 @@ class SapWindow: orientation: int window_type: int glazing_type: int - window_width: float - window_height: float + # Real-API certs sometimes carry a Measurement dict for dimensions, not a plain float. + window_width: Union[Measurement, int, float] + window_height: Union[Measurement, int, float] draught_proofed: str # TODO: make bool window_location: int window_wall_type: int diff --git a/packages/domain/src/domain/ml/transform.py b/packages/domain/src/domain/ml/transform.py index bfba7a43..8b6fe500 100644 --- a/packages/domain/src/domain/ml/transform.py +++ b/packages/domain/src/domain/ml/transform.py @@ -595,13 +595,20 @@ class EpcMlTransform: def _peui_ucl(epc: EpcPropertyData) -> Optional[float]: """Apply the Few et al. per-band UCL correction to PEUI for training labels. 
- Returns None when either the raw PEUI or the SAP score is missing — those rows - are unusable as `peui_ucl` training labels and should be dropped upstream. + Returns None when: + - either the raw PEUI or the SAP score is missing, or + - the raw PEUI is non-positive (e.g. net-exporter homes with negative PEUI) + so the UCL correction is undefined. + Those rows are unusable as `peui_ucl` training labels and should be dropped + upstream rather than crashing the transform. """ if epc.energy_consumption_current is None or epc.energy_rating_current is None: return None + peui_raw = float(epc.energy_consumption_current) + if peui_raw <= 0: + return None band = Epc.from_sap_score(epc.energy_rating_current) - return apply_ucl_correction(float(epc.energy_consumption_current), band) + return apply_ucl_correction(peui_raw, band) def _pv_aggregates(es: SapEnergySource) -> dict[str, Any]: diff --git a/services/ml_training_data/src/ml_training_data/train_baseline.py b/services/ml_training_data/src/ml_training_data/train_baseline.py index f29a0de6..7e80157c 100644 --- a/services/ml_training_data/src/ml_training_data/train_baseline.py +++ b/services/ml_training_data/src/ml_training_data/train_baseline.py @@ -64,6 +64,7 @@ def train_baseline( metrics[target] = { "mape": float(cast(float, mean_absolute_percentage_error(y_test, preds))), + "smape": _smape(y_test, preds), "r2": float(cast(float, r2_score(y_test, preds))), } @@ -79,3 +80,19 @@ def train_baseline( json.dumps(metrics, indent=2).encode("utf-8"), ) return metrics + + +def _smape(y_true: Any, y_pred: Any) -> float: + """Symmetric MAPE: mean(|y - yhat| / ((|y| + |yhat|) / 2)). + + Bounded in [0, 2] (often reported as 0-200%). Stable when |y| is near zero, + so it's a better summary than MAPE for low-magnitude targets like + `hot_water_kwh` in well-insulated homes. 
+ """ + y_t = np.asarray(y_true, dtype=float) + y_p = np.asarray(y_pred, dtype=float) + denom = (np.abs(y_t) + np.abs(y_p)) / 2.0 + mask = denom > 0 + if not mask.any(): + return 0.0 + return float(np.mean(np.abs(y_t[mask] - y_p[mask]) / denom[mask]))