From 700ff4640c99f1054b2e7df2632c888938ffd12c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 May 2026 12:06:13 +0000 Subject: [PATCH] slice 16g: LightGBM objective=mape for sap_score + peui_ucl Per ADR-0008: the v15 baseline reports MAPE but optimises MSE, which under-weights tail rows. Switching to objective='mape' applies gradient proportional to 1/|y| and lets the model focus where MAPE penalises. Targets co2_emissions, space_heating_kwh, hot_water_kwh, and peui_raw retain the default 'regression' objective (some rows have ~zero CO2 from heavy PV; MAPE objective destabilises near zero). Sample weights deferred to slice 16i if slice 16h's per-decile residuals still show tail bias after the objective switch. --- .../src/ml_training_data/train_baseline.py | 14 +++++++++- .../tests/unit/test_train_baseline.py | 28 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/services/ml_training_data/src/ml_training_data/train_baseline.py b/services/ml_training_data/src/ml_training_data/train_baseline.py index ed618a26..da8a3ac0 100644 --- a/services/ml_training_data/src/ml_training_data/train_baseline.py +++ b/services/ml_training_data/src/ml_training_data/train_baseline.py @@ -26,6 +26,15 @@ from ml_training_data.storage import Storage _CERT_NUM_COLUMN = "certificate_number" +# Per-target LightGBM objective overrides (ADR-0008, slice 16g). Defaults to +# 'regression' (MSE); we use 'mape' for sap_score and peui_ucl because the +# default MSE under-weights tail rows relative to the MAPE we report. +# co2_emissions cannot use 'mape' safely (some rows are ~0 from heavy PV). +_OBJECTIVE_OVERRIDES: dict[str, str] = { + "sap_score": "mape", + "peui_ucl": "mape", +} + def train_baseline( df: pd.DataFrame, @@ -58,7 +67,10 @@ def train_baseline( ) x_train, x_test, y_train, y_test = split - model: Any = lgb.LGBMRegressor(n_estimators=n_estimators, random_state=seed, verbose=-1) + objective = _OBJECTIVE_OVERRIDES.get(target, "regression") + model: Any = lgb.LGBMRegressor( + n_estimators=n_estimators, random_state=seed, verbose=-1, objective=objective, + ) model.fit(x_train, y_train) preds: np.ndarray[Any, Any] = np.asarray(model.predict(x_test)) diff --git a/services/ml_training_data/tests/unit/test_train_baseline.py b/services/ml_training_data/tests/unit/test_train_baseline.py index 835b5b8d..aae64092 100644 --- a/services/ml_training_data/tests/unit/test_train_baseline.py +++ b/services/ml_training_data/tests/unit/test_train_baseline.py @@ -122,6 +122,34 @@ def test_train_baseline_writes_per_decile_residuals_per_target(tmp_path: Path) - assert true_mins == sorted(true_mins) +def test_train_baseline_uses_mape_objective_for_sap_score_and_peui_ucl(tmp_path: Path) -> None: + # Arrange — sap_score + peui_ucl should use objective="mape" per ADR-0008. + # We can't directly inspect LGBMRegressor.objective post-fit reliably, so + # instead we verify the per-target override map is wired and that training + # completes (LightGBM raises if the objective name is unknown). + storage = LocalStorage(root=tmp_path) + df = _synthetic_dataset(n=300) + df["peui_ucl"] = df["sap_score"].astype(float) + 5.0 + + # Act + metrics = train_baseline( + df=df, + targets=["sap_score", "peui_ucl"], + storage=storage, + run_key="runs/2026-05-16/", + seed=42, + ) + + # Assert — both targets fit successfully under the mape objective. + assert "sap_score" in metrics + assert "peui_ucl" in metrics + # Verify the override map is present and contains both targets. + from ml_training_data.train_baseline import _OBJECTIVE_OVERRIDES # noqa: PLC0415 + assert _OBJECTIVE_OVERRIDES.get("sap_score") == "mape" + assert _OBJECTIVE_OVERRIDES.get("peui_ucl") == "mape" + assert _OBJECTIVE_OVERRIDES.get("co2_emissions") is None + + def test_train_baseline_residuals_emitted_per_target_independently(tmp_path: Path) -> None: # Arrange storage = LocalStorage(root=tmp_path)