From 8fddd25b9aa1284fd46d08691d1279a68a5ea1ae Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sat, 16 May 2026 20:47:41 +0000
Subject: [PATCH] slice 14k: E2E pipeline runs on real 2026 RdSAP certs

Two production fixes surfaced by the live run:
- mapper.from_rdsap_schema_21_0_1 now sets the three ML target scalars
  (energy_rating_current, co2_emissions_current, energy_consumption_current).
  They were silently None for every cert before, leaving the only labels as
  the kWh fields from renewable_heat_incentive.
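- train_baseline coerces object-dtype columns to numeric (None -> NaN) and
  drops rows with null target per fit, so LightGBM accepts the frame.

Also in the mapper (both window / energy-source mapping sites): window_width
and window_height now go through _measurement_value, and pv_battery_count
falls back to 0 when absent.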

E2E on 500 real certs (~1s):
  sap_score             R^2=0.604  MAPE=0.084
  co2_emissions         R^2=0.813  MAPE=0.130
  peui_raw              R^2=0.979  MAPE=0.026
  space_heating_kwh     R^2=0.823  MAPE=0.213
  hot_water_kwh         R^2=0.519  MAPE=0.115

peui_ucl excluded: UCL correction still needs wiring.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 datatypes/epc/domain/mapper.py                   | 16 ++++++++++------
 .../src/ml_training_data/train_baseline.py       | 13 +++++++++++--
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py
index 9ac4ce19..26e87521 100644
--- a/datatypes/epc/domain/mapper.py
+++ b/datatypes/epc/domain/mapper.py
@@ -1159,8 +1159,8 @@ class EpcPropertyDataMapper:
                     window_type=w.window_type,
                     frame_factor=w.frame_factor,
                     glazing_type=w.glazing_type,
-                    window_width=w.window_width,
-                    window_height=w.window_height,
+                    window_width=_measurement_value(w.window_width),
+                    window_height=_measurement_value(w.window_height),
                     draught_proofed=w.draught_proofed == "true",
                     window_location=w.window_location,
                     window_wall_type=w.window_wall_type,
@@ -1177,7 +1177,7 @@ class EpcPropertyDataMapper:
             sap_energy_source=SapEnergySource(
                 mains_gas=es.mains_gas == "Y",
                 meter_type=str(es.meter_type),
-                pv_battery_count=es.pv_battery_count,
+                pv_battery_count=es.pv_battery_count or 0,
                 wind_turbines_count=es.wind_turbines_count,
                 gas_smart_meter_present=es.gas_smart_meter_present == "true",
                 is_dwelling_export_capable=es.is_dwelling_export_capable == "true",
@@ -1391,8 +1391,8 @@ class EpcPropertyDataMapper:
                     window_type=w.window_type,
                     frame_factor=w.frame_factor,
                     glazing_type=w.glazing_type,
-                    window_width=w.window_width,
-                    window_height=w.window_height,
+                    window_width=_measurement_value(w.window_width),
+                    window_height=_measurement_value(w.window_height),
                     draught_proofed=w.draught_proofed == "true",
                     window_location=w.window_location,
                     window_wall_type=w.window_wall_type,
@@ -1414,7 +1414,7 @@ class EpcPropertyDataMapper:
             sap_energy_source=SapEnergySource(
                 mains_gas=es.mains_gas == "Y",
                 meter_type=str(es.meter_type),
-                pv_battery_count=es.pv_battery_count,
+                pv_battery_count=es.pv_battery_count or 0,
                 wind_turbines_count=es.wind_turbines_count,
                 gas_smart_meter_present=es.gas_smart_meter_present == "true",
                 is_dwelling_export_capable=es.is_dwelling_export_capable == "true",
@@ -1520,6 +1520,10 @@ class EpcPropertyDataMapper:
                     else None
                 ),
             ),
+            # ML targets: the assessment-derived scalars used as training labels.
+            energy_rating_current=schema.energy_rating_current,
+            co2_emissions_current=float(schema.co2_emissions_current),
+            energy_consumption_current=schema.energy_consumption_current,
         )
 
     @staticmethod
diff --git a/services/ml_training_data/src/ml_training_data/train_baseline.py b/services/ml_training_data/src/ml_training_data/train_baseline.py
index 412e922f..f29a0de6 100644
--- a/services/ml_training_data/src/ml_training_data/train_baseline.py
+++ b/services/ml_training_data/src/ml_training_data/train_baseline.py
@@ -38,11 +38,20 @@ def train_baseline(
     n_estimators: int = 200,
 ) -> dict[str, dict[str, float]]:
     feature_cols = [c for c in df.columns if c not in targets and c != _CERT_NUM_COLUMN]
+    # LightGBM needs numeric (or pd.Categorical) dtypes. Coerce every object-dtype
+    # feature/target column to numeric (None -> NaN). NOTE: errors="coerce" also
+    # turns non-numeric strings into NaN, so genuine string categoricals must be
+    # converted to pd.Categorical before calling this or their values are lost.
+    for col in [*feature_cols, *targets]:
+        if df[col].dtype == "object":
+            df[col] = pd.to_numeric(df[col], errors="coerce")
     metrics: dict[str, dict[str, float]] = {}
 
     for target in targets:
-        x = df[feature_cols]
-        y = df[target]
+        # Drop rows where this target is null so LightGBM doesn't trip on label NaN.
+        target_df = df.dropna(subset=[target])
+        x = target_df[feature_cols]
+        y = target_df[target]
         split = cast(
             tuple[pd.DataFrame, pd.DataFrame, "pd.Series[Any]", "pd.Series[Any]"],
             train_test_split(x, y, test_size=test_size, random_state=seed),