mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
slice 16g: LightGBM objective=mape for sap_score + peui_ucl
Per ADR-0008: the v15 baseline reports MAPE but optimises MSE, which under-weights tail rows. Switching to objective='mape' applies gradient proportional to 1/|y| and lets the model focus where MAPE penalises. Targets co2_emissions, space_heating_kwh, hot_water_kwh, and peui_raw retain the default 'regression' objective (some rows have ~zero CO2 from heavy PV; MAPE objective destabilises near zero). Sample weights deferred to slice 16i if slice 16h's per-decile residuals still show tail bias after the objective switch.
This commit is contained in:
parent
5c20e323da
commit
700ff4640c
2 changed files with 41 additions and 1 deletions
|
|
@ -26,6 +26,15 @@ from ml_training_data.storage import Storage
|
|||
|
||||
_CERT_NUM_COLUMN = "certificate_number"
|
||||
|
||||
# Per-target LightGBM objective overrides (ADR-0008, slice 16g). Defaults to
|
||||
# 'regression' (MSE); we use 'mape' for sap_score and peui_ucl because the
|
||||
# default MSE under-weights tail rows relative to the MAPE we report.
|
||||
# co2_emissions cannot use 'mape' safely (some rows are ~0 from heavy PV).
|
||||
_OBJECTIVE_OVERRIDES: dict[str, str] = {
|
||||
"sap_score": "mape",
|
||||
"peui_ucl": "mape",
|
||||
}
|
||||
|
||||
|
||||
def train_baseline(
|
||||
df: pd.DataFrame,
|
||||
|
|
@ -58,7 +67,10 @@ def train_baseline(
|
|||
)
|
||||
x_train, x_test, y_train, y_test = split
|
||||
|
||||
model: Any = lgb.LGBMRegressor(n_estimators=n_estimators, random_state=seed, verbose=-1)
|
||||
objective = _OBJECTIVE_OVERRIDES.get(target, "regression")
|
||||
model: Any = lgb.LGBMRegressor(
|
||||
n_estimators=n_estimators, random_state=seed, verbose=-1, objective=objective,
|
||||
)
|
||||
model.fit(x_train, y_train)
|
||||
preds: np.ndarray[Any, Any] = np.asarray(model.predict(x_test))
|
||||
|
||||
|
|
|
|||
|
|
@ -122,6 +122,34 @@ def test_train_baseline_writes_per_decile_residuals_per_target(tmp_path: Path) -
|
|||
assert true_mins == sorted(true_mins)
|
||||
|
||||
|
||||
def test_train_baseline_uses_mape_objective_for_sap_score_and_peui_ucl(tmp_path: Path) -> None:
    """Check that sap_score and peui_ucl train under the 'mape' override (ADR-0008).

    The fitted regressor's objective is not inspected directly (the original
    author notes that doing so post-fit is not reliable). Instead the test
    verifies two things: training completes for both targets (per the original
    note, LightGBM raises if the objective name is unknown), and the per-target
    override map carries the expected entries.
    """
    # Arrange — synthetic frame with a peui_ucl column derived from sap_score.
    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset(n=300)
    df["peui_ucl"] = df["sap_score"].astype(float) + 5.0

    # Act
    metrics = train_baseline(
        df=df,
        targets=["sap_score", "peui_ucl"],
        storage=storage,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert — both targets fit successfully under the mape objective.
    assert "sap_score" in metrics
    assert "peui_ucl" in metrics

    # Verify the override map is present and contains both targets, and that
    # co2_emissions has no override (so it keeps the default objective).
    from ml_training_data.train_baseline import _OBJECTIVE_OVERRIDES  # noqa: PLC0415

    assert _OBJECTIVE_OVERRIDES.get("sap_score") == "mape"
    assert _OBJECTIVE_OVERRIDES.get("peui_ucl") == "mape"
    assert _OBJECTIVE_OVERRIDES.get("co2_emissions") is None
|
||||
|
||||
|
||||
def test_train_baseline_residuals_emitted_per_target_independently(tmp_path: Path) -> None:
|
||||
# Arrange
|
||||
storage = LocalStorage(root=tmp_path)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue