diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index 9ac4ce19..26e87521 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -1159,8 +1159,8 @@ class EpcPropertyDataMapper: window_type=w.window_type, frame_factor=w.frame_factor, glazing_type=w.glazing_type, - window_width=w.window_width, - window_height=w.window_height, + window_width=_measurement_value(w.window_width), + window_height=_measurement_value(w.window_height), draught_proofed=w.draught_proofed == "true", window_location=w.window_location, window_wall_type=w.window_wall_type, @@ -1177,7 +1177,7 @@ class EpcPropertyDataMapper: sap_energy_source=SapEnergySource( mains_gas=es.mains_gas == "Y", meter_type=str(es.meter_type), - pv_battery_count=es.pv_battery_count, + pv_battery_count=es.pv_battery_count or 0, wind_turbines_count=es.wind_turbines_count, gas_smart_meter_present=es.gas_smart_meter_present == "true", is_dwelling_export_capable=es.is_dwelling_export_capable == "true", @@ -1391,8 +1391,8 @@ class EpcPropertyDataMapper: window_type=w.window_type, frame_factor=w.frame_factor, glazing_type=w.glazing_type, - window_width=w.window_width, - window_height=w.window_height, + window_width=_measurement_value(w.window_width), + window_height=_measurement_value(w.window_height), draught_proofed=w.draught_proofed == "true", window_location=w.window_location, window_wall_type=w.window_wall_type, @@ -1414,7 +1414,7 @@ class EpcPropertyDataMapper: sap_energy_source=SapEnergySource( mains_gas=es.mains_gas == "Y", meter_type=str(es.meter_type), - pv_battery_count=es.pv_battery_count, + pv_battery_count=es.pv_battery_count or 0, wind_turbines_count=es.wind_turbines_count, gas_smart_meter_present=es.gas_smart_meter_present == "true", is_dwelling_export_capable=es.is_dwelling_export_capable == "true", @@ -1520,6 +1520,10 @@ class EpcPropertyDataMapper: else None ), ), + # ML targets: the assessment-derived scalars used as training labels. 
+ energy_rating_current=schema.energy_rating_current, + co2_emissions_current=float(schema.co2_emissions_current), + energy_consumption_current=schema.energy_consumption_current, ) @staticmethod diff --git a/services/ml_training_data/src/ml_training_data/train_baseline.py b/services/ml_training_data/src/ml_training_data/train_baseline.py index 412e922f..f29a0de6 100644 --- a/services/ml_training_data/src/ml_training_data/train_baseline.py +++ b/services/ml_training_data/src/ml_training_data/train_baseline.py @@ -38,11 +38,20 @@ def train_baseline( n_estimators: int = 200, ) -> dict[str, dict[str, float]]: feature_cols = [c for c in df.columns if c not in targets and c != _CERT_NUM_COLUMN] + # LightGBM needs numeric (or pd.Categorical) dtypes. Coerce every object-dtype + # feature/target column to numeric: None and non-numeric strings both become NaN. + # String categoricals left as object dtype are therefore wiped out; cast them to + # pd.Categorical beforehand — those skip this loop and pass through LightGBM. + for col in [*feature_cols, *targets]: + if df[col].dtype == "object": + df[col] = pd.to_numeric(df[col], errors="coerce") metrics: dict[str, dict[str, float]] = {} for target in targets: - x = df[feature_cols] - y = df[target] + # Drop rows where this target is null so LightGBM doesn't trip on label NaN. + target_df = df.dropna(subset=[target]) + x = target_df[feature_cols] + y = target_df[target] split = cast( tuple[pd.DataFrame, pd.DataFrame, "pd.Series[Any]", "pd.Series[Any]"], train_test_split(x, y, test_size=test_size, random_state=seed),