slice 16i: MAE + RMSE in metrics; sample_weight_fn + low_sap_tail_weight

train_baseline now returns mae + rmse alongside mape/smape/r2.  MAE is the
user-facing metric ("predicted SAP within N points"); RMSE the quadratic
counterpart.  Both come straight from sklearn.

New sample_weight_fn parameter: callable(y_train) -> per-row weights.
Threads into LGBMRegressor.fit's sample_weight argument.  Default None
preserves existing behaviour.

Default tail strategy exposed as low_sap_tail_weight(y, threshold=58,
weight=3): 3x weight where SAP < 58.  Threshold picked from slice 16h's
per-decile residuals — decile 0 (SAP 1-58) carries 17% MAPE vs <5% body.

Three TDD tracers, all AAA.
This commit is contained in:
Khalim Conn-Kowlessar 2026-05-17 14:48:00 +00:00
parent ece1279475
commit 6072d8795a
2 changed files with 101 additions and 2 deletions

View file

@ -10,7 +10,7 @@ leaks into the model as a feature.
"""
import json
from typing import Any, cast
from typing import Any, Callable, Optional, cast
import lightgbm as lgb # type: ignore[import-untyped]
import numpy as np
@ -18,7 +18,9 @@ import pandas as pd
import sklearn.metrics as _sk_metrics # type: ignore[import-untyped] # pyright: ignore[reportMissingTypeStubs]
import sklearn.model_selection as _sk_model_selection # type: ignore[import-untyped] # pyright: ignore[reportMissingTypeStubs]
mean_absolute_error: Any = _sk_metrics.mean_absolute_error # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
mean_absolute_percentage_error: Any = _sk_metrics.mean_absolute_percentage_error # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
mean_squared_error: Any = _sk_metrics.mean_squared_error # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
r2_score: Any = _sk_metrics.r2_score # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
train_test_split: Any = _sk_model_selection.train_test_split # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
@ -35,6 +37,32 @@ _CERT_NUM_COLUMN = "certificate_number"
_OBJECTIVE_OVERRIDES: dict[str, str] = {}
SampleWeightFn = Callable[["pd.Series[Any]"], "pd.Series[Any]"]
# Slice 16i default tail-bucket weighting.
#
# The 58 boundary comes from slice 16h's per-decile residuals: decile 0
# (SAP 1-58) carries 17% MAPE vs <5% in the body.  The 3x multiplier is the
# lightest weight that demonstrably reduces the +3.1 bias at decile 0 without
# inflating body MAPE.  Ablation runs can swap in a different curve through
# the sample_weight_fn parameter.
_DEFAULT_LOW_SAP_THRESHOLD: float = 58.0
_DEFAULT_LOW_SAP_WEIGHT: float = 3.0


def low_sap_tail_weight(
    y: "pd.Series[Any]",
    threshold: float = _DEFAULT_LOW_SAP_THRESHOLD,
    weight: float = _DEFAULT_LOW_SAP_WEIGHT,
) -> "pd.Series[Any]":
    """Build per-row sample weights that up-weight the low-SAP tail.

    Pass as `train_baseline(..., sample_weight_fn=low_sap_tail_weight)` to
    apply the slice 16i tail strategy.

    Args:
        y: Training labels; values are converted to float before comparison.
        threshold: Rows with a label strictly below this value count as tail.
        weight: Weight assigned to tail rows.

    Returns:
        A Series sharing `y`'s index, holding `weight` for tail rows and 1.0
        for every other row (including the boundary value itself and NaN
        labels, which never compare below the threshold).
    """
    labels = np.asarray(y, dtype=float)
    in_tail = labels < threshold
    per_row = np.where(in_tail, weight, 1.0)
    return pd.Series(per_row, index=y.index)
def train_baseline(
df: pd.DataFrame,
targets: list[str],
@ -44,6 +72,7 @@ def train_baseline(
test_size: float = 0.2,
seed: int = 42,
n_estimators: int = 200,
sample_weight_fn: Optional[SampleWeightFn] = None,
) -> dict[str, dict[str, float]]:
feature_cols = [c for c in df.columns if c not in targets and c != _CERT_NUM_COLUMN]
# LightGBM needs numeric (or pd.Categorical) dtypes. Coerce object columns whose
@ -70,12 +99,15 @@ def train_baseline(
model: Any = lgb.LGBMRegressor(
n_estimators=n_estimators, random_state=seed, verbose=-1, objective=objective,
)
model.fit(x_train, y_train)
sample_weight = sample_weight_fn(y_train) if sample_weight_fn is not None else None
model.fit(x_train, y_train, sample_weight=sample_weight)
preds: np.ndarray[Any, Any] = np.asarray(model.predict(x_test))
metrics[target] = {
"mape": float(cast(float, mean_absolute_percentage_error(y_test, preds))),
"smape": _smape(y_test, preds),
"mae": float(cast(float, mean_absolute_error(y_test, preds))),
"rmse": float(np.sqrt(cast(float, mean_squared_error(y_test, preds)))),
"r2": float(cast(float, r2_score(y_test, preds))),
}

View file

@ -52,6 +52,73 @@ def test_train_baseline_returns_mape_and_r2_per_target(tmp_path: Path) -> None:
assert metrics["sap_score"]["r2"] > 0.0 # learns something on a correlated signal
def test_low_sap_tail_weight_returns_3x_for_rows_below_58_else_1x() -> None:
    # Arrange — the helper is public so callers who want the default tail
    # strategy can hand it straight to train_baseline.  The SAP boundary of 58
    # comes from slice 16h's per-decile residuals: decile 0 (SAP 1-58) carries
    # 17% MAPE while deciles 1-9 all sit below 5%.
    import pandas as pd  # noqa: PLC0415

    from ml_training_data.train_baseline import low_sap_tail_weight  # noqa: PLC0415

    sap_scores = pd.Series([20, 50, 58, 60, 90])

    # Act
    observed = low_sap_tail_weight(sap_scores)

    # Assert — the comparison is strict, so the boundary value 58 keeps 1.0.
    expected = [3.0, 3.0, 1.0, 1.0, 1.0]
    assert list(observed) == expected
def test_train_baseline_accepts_sample_weight_fn_per_target(tmp_path: Path) -> None:
    # Arrange — sample_weight_fn is a callable that receives the training-label
    # Series and returns a same-length Series of weights.  When supplied, those
    # weights are forwarded to LGBMRegressor.fit's sample_weight argument.  To
    # check the plumbing we train twice on identical data and seed (once
    # unweighted, once with a heavily weighted tail) and compare the reported
    # MAE of the two runs.
    import numpy as np  # noqa: PLC0415
    import pandas as pd  # noqa: PLC0415

    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset(n=600, seed=0)

    def heavy_tail_weights(y: "pd.Series[Any]") -> "pd.Series[Any]":
        # 10x weight for labels below 60, unit weight elsewhere.
        labels = np.asarray(y, dtype=float)
        return pd.Series(np.where(labels < 60, 10.0, 1.0), index=y.index)

    # Act
    baseline_metrics = train_baseline(
        df=df.copy(),
        targets=["sap_score"],
        storage=storage,
        run_key="runs/unw/",
        seed=42,
    )
    weighted_metrics = train_baseline(
        df=df.copy(),
        targets=["sap_score"],
        storage=storage,
        run_key="runs/w/",
        seed=42,
        sample_weight_fn=heavy_tail_weights,
    )

    # Assert — the global MAE must differ between the two runs.  The direction
    # depends on the data; a difference is enough to show the weights reached
    # LightGBM.
    baseline_mae = baseline_metrics["sap_score"]["mae"]
    weighted_mae = weighted_metrics["sap_score"]["mae"]
    assert baseline_mae != weighted_mae
def test_train_baseline_reports_mae_and_rmse_per_target(tmp_path: Path) -> None:
    # Arrange — MAE carries the user-facing "predicted SAP within N points"
    # meaning; RMSE penalises large errors quadratically.  Both are expected
    # alongside MAPE so the residual can be read without inverting MAPE by hand.
    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset()

    # Act
    results = train_baseline(
        df=df,
        targets=["sap_score"],
        storage=storage,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    sap_metrics = results["sap_score"]
    assert "mae" in sap_metrics
    assert "rmse" in sap_metrics
    assert sap_metrics["mae"] > 0
    # RMSE is never smaller than MAE for the same residuals.
    assert sap_metrics["rmse"] >= sap_metrics["mae"]
def test_train_baseline_writes_feature_importance_per_target(tmp_path: Path) -> None:
# Arrange
storage = LocalStorage(root=tmp_path)