diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py
index 26e87521..45e0cb3a 100644
--- a/datatypes/epc/domain/mapper.py
+++ b/datatypes/epc/domain/mapper.py
@@ -102,10 +102,10 @@ def _map_schema_21_pv(
     if isinstance(es_pv_supply, list):
         flattened = [
             PhotovoltaicArray(
-                peak_power=array.peak_power,
-                pitch=array.pitch,
-                orientation=array.orientation,
-                overshading=array.overshading,
+                peak_power=_measurement_value(array.peak_power),
+                pitch=int(_measurement_value(array.pitch)),
+                orientation=int(_measurement_value(array.orientation)),
+                overshading=int(_measurement_value(array.overshading)),
             )
             for inner_list in es_pv_supply
             for array in inner_list
diff --git a/datatypes/epc/schema/rdsap_schema_21_0_1.py b/datatypes/epc/schema/rdsap_schema_21_0_1.py
index 5de41f6a..37498bb8 100644
--- a/datatypes/epc/schema/rdsap_schema_21_0_1.py
+++ b/datatypes/epc/schema/rdsap_schema_21_0_1.py
@@ -111,12 +111,12 @@ class PhotovoltaicArray:
     Modern SAP10 EPCs with measured PV carry `photovoltaic_supply` as a nested
     list (`list[list[PhotovoltaicArray]]`) rather than the legacy wrapper dict
     `PhotovoltaicSupply`. The Union type on SapEnergySource.photovoltaic_supply
-    accepts either shape.
+    accepts either shape. Some certs wrap the scalars in Measurement dicts.
     """
-    peak_power: float
-    pitch: int
-    orientation: int
-    overshading: int
+    peak_power: Union[Measurement, int, float]
+    pitch: Union[Measurement, int]
+    orientation: Union[Measurement, int]
+    overshading: Union[Measurement, int]
 
 
 @dataclass
@@ -147,8 +147,9 @@ class SapWindow:
     orientation: int
     window_type: int
     glazing_type: int
-    window_width: float
-    window_height: float
+    # Real-API certs sometimes carry a Measurement dict for dimensions, not a plain float.
+    window_width: Union[Measurement, int, float]
+    window_height: Union[Measurement, int, float]
     draught_proofed: str  # TODO: make bool
     window_location: int
     window_wall_type: int
diff --git a/packages/domain/src/domain/ml/transform.py b/packages/domain/src/domain/ml/transform.py
index bfba7a43..8b6fe500 100644
--- a/packages/domain/src/domain/ml/transform.py
+++ b/packages/domain/src/domain/ml/transform.py
@@ -595,13 +595,20 @@ class EpcMlTransform:
 def _peui_ucl(epc: EpcPropertyData) -> Optional[float]:
     """Apply the Few et al. per-band UCL correction to PEUI for training labels.
 
-    Returns None when either the raw PEUI or the SAP score is missing — those rows
-    are unusable as `peui_ucl` training labels and should be dropped upstream.
+    Returns None when:
+    - either the raw PEUI or the SAP score is missing, or
+    - the raw PEUI is non-positive (e.g. net-exporter homes with negative PEUI)
+      so the UCL correction is undefined.
+    Those rows are unusable as `peui_ucl` training labels and should be dropped
+    upstream rather than crashing the transform.
     """
     if epc.energy_consumption_current is None or epc.energy_rating_current is None:
         return None
+    peui_raw = float(epc.energy_consumption_current)
+    if peui_raw <= 0:
+        return None
     band = Epc.from_sap_score(epc.energy_rating_current)
-    return apply_ucl_correction(float(epc.energy_consumption_current), band)
+    return apply_ucl_correction(peui_raw, band)
 
 
 def _pv_aggregates(es: SapEnergySource) -> dict[str, Any]:
diff --git a/services/ml_training_data/src/ml_training_data/train_baseline.py b/services/ml_training_data/src/ml_training_data/train_baseline.py
index f29a0de6..7e80157c 100644
--- a/services/ml_training_data/src/ml_training_data/train_baseline.py
+++ b/services/ml_training_data/src/ml_training_data/train_baseline.py
@@ -64,6 +64,7 @@ def train_baseline(
 
         metrics[target] = {
             "mape": float(cast(float, mean_absolute_percentage_error(y_test, preds))),
+            "smape": _smape(y_test, preds),
             "r2": float(cast(float, r2_score(y_test, preds))),
         }
 
@@ -79,3 +80,25 @@ def train_baseline(
         json.dumps(metrics, indent=2).encode("utf-8"),
     )
     return metrics
+
+
+def _smape(y_true: Any, y_pred: Any) -> float:
+    """Symmetric MAPE: mean(|y - yhat| / ((|y| + |yhat|) / 2)).
+
+    Bounded in [0, 2] (often reported as 0-200%). Stable when |y| is near zero,
+    so it's a better summary than MAPE for low-magnitude targets like
+    `hot_water_kwh` in well-insulated homes.
+
+    Pairs where both y and yhat are exactly zero are perfect predictions: they
+    count as zero error and stay in the mean rather than being dropped, so the
+    score is averaged over every sample. Empty input returns 0.0.
+    """
+    y_t = np.asarray(y_true, dtype=float)
+    y_p = np.asarray(y_pred, dtype=float)
+    abs_err = np.abs(y_t - y_p)
+    if abs_err.size == 0:
+        return 0.0
+    denom = (np.abs(y_t) + np.abs(y_p)) / 2.0
+    # 0/0 pairs keep the pre-filled 0.0 instead of producing NaN.
+    ratio = np.divide(abs_err, denom, out=np.zeros_like(abs_err), where=denom > 0)
+    return float(np.mean(ratio))