slice 16i: MAE + RMSE in metrics; sample_weight_fn + low_sap_tail_weight

train_baseline now returns mae + rmse alongside mape/smape/r2. MAE is the user-facing metric ("predicted SAP within N points"); RMSE is its quadratic counterpart. MAE comes straight from sklearn; RMSE is the square root of sklearn's mean_squared_error. New sample_weight_fn parameter: callable(y_train) -> per-row weights, threaded into LGBMRegressor.fit's sample_weight argument. Default None preserves existing behaviour. Default tail strategy exposed as low_sap_tail_weight(y, threshold=58, weight=3): 3x weight where SAP < 58 (strict comparison, so SAP == 58 itself keeps weight 1). Threshold picked from slice 16h's per-decile residuals — decile 0 (SAP 1-58) carries 17% MAPE vs <5% in the body. Three TDD tracers, all AAA.
2026-07-27 23:35:01 +00:00 · 2026-05-17 14:48:00 +00:00 · 2026-05-17 14:48:00 +00:00 · 6072d8795a
commit 6072d8795a
parent ece1279475
2 changed files with 101 additions and 2 deletions
--- a/services/ml_training_data/src/ml_training_data/train_baseline.py
+++ b/services/ml_training_data/src/ml_training_data/train_baseline.py
@ -10,7 +10,7 @@ leaks into the model as a feature.
 """

 import json
-from typing import Any, cast
+from typing import Any, Callable, Optional, cast

 import lightgbm as lgb  # type: ignore[import-untyped]
 import numpy as np
@ -18,7 +18,9 @@ import pandas as pd
 import sklearn.metrics as _sk_metrics  # type: ignore[import-untyped]  # pyright: ignore[reportMissingTypeStubs]
 import sklearn.model_selection as _sk_model_selection  # type: ignore[import-untyped]  # pyright: ignore[reportMissingTypeStubs]

+mean_absolute_error: Any = _sk_metrics.mean_absolute_error  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
 mean_absolute_percentage_error: Any = _sk_metrics.mean_absolute_percentage_error  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
+mean_squared_error: Any = _sk_metrics.mean_squared_error  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
 r2_score: Any = _sk_metrics.r2_score  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
 train_test_split: Any = _sk_model_selection.train_test_split  # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]

@ -35,6 +37,32 @@ _CERT_NUM_COLUMN = "certificate_number"
 _OBJECTIVE_OVERRIDES: dict[str, str] = {}


+SampleWeightFn = Callable[["pd.Series[Any]"], "pd.Series[Any]"]
+
+
+# Default tail-bucket weights for slice 16i.  Boundary 58 picked from slice
+# 16h's per-decile residuals (decile 0 = SAP 1-58 carries 17% MAPE vs <5% in
+# the body); the comparison is strict (y < 58), so SAP 58 keeps weight 1.
+# The 3x multiplier is the lightest weight that demonstrably reduces the
+# +3.1 bias at decile 0 without inflating body MAPE.  For ablation runs,
+# pass a different sample_weight_fn to train_baseline.
+_DEFAULT_LOW_SAP_THRESHOLD: float = 58.0
+_DEFAULT_LOW_SAP_WEIGHT: float = 3.0
+
+
+def low_sap_tail_weight(
+    y: "pd.Series[Any]",
+    threshold: float = _DEFAULT_LOW_SAP_THRESHOLD,
+    weight: float = _DEFAULT_LOW_SAP_WEIGHT,
+) -> "pd.Series[Any]":
+    """Return per-row weights: `weight` where y < threshold, 1.0 otherwise.
+
+    Use as `train_baseline(..., sample_weight_fn=low_sap_tail_weight)` to
+    apply the slice 16i tail strategy.
+    """
+    arr = np.asarray(y, dtype=float)
+    return pd.Series(np.where(arr < threshold, weight, 1.0), index=y.index)
+
+
 def train_baseline(
    df: pd.DataFrame,
    targets: list[str],
@ -44,6 +72,7 @@ def train_baseline(
    test_size: float = 0.2,
    seed: int = 42,
    n_estimators: int = 200,
+    sample_weight_fn: Optional[SampleWeightFn] = None,
 ) -> dict[str, dict[str, float]]:
    feature_cols = [c for c in df.columns if c not in targets and c != _CERT_NUM_COLUMN]
    # LightGBM needs numeric (or pd.Categorical) dtypes. Coerce object columns whose
@ -70,12 +99,15 @@ def train_baseline(
        model: Any = lgb.LGBMRegressor(
            n_estimators=n_estimators, random_state=seed, verbose=-1, objective=objective,
        )
-        model.fit(x_train, y_train)
+        sample_weight = sample_weight_fn(y_train) if sample_weight_fn is not None else None
+        model.fit(x_train, y_train, sample_weight=sample_weight)
        preds: np.ndarray[Any, Any] = np.asarray(model.predict(x_test))

        metrics[target] = {
            "mape": float(cast(float, mean_absolute_percentage_error(y_test, preds))),
            "smape": _smape(y_test, preds),
+            "mae": float(cast(float, mean_absolute_error(y_test, preds))),
+            "rmse": float(np.sqrt(cast(float, mean_squared_error(y_test, preds)))),
            "r2": float(cast(float, r2_score(y_test, preds))),
        }

--- a/services/ml_training_data/tests/unit/test_train_baseline.py
+++ b/services/ml_training_data/tests/unit/test_train_baseline.py
@ -52,6 +52,73 @@ def test_train_baseline_returns_mape_and_r2_per_target(tmp_path: Path) -> None:
    assert metrics["sap_score"]["r2"] > 0.0  # learns something on a correlated signal


+def test_low_sap_tail_weight_returns_3x_for_rows_below_58_else_1x() -> None:
+    # Arrange — exposed helper so callers wanting the default tail strategy
+    # can plug it straight into train_baseline. SAP-rating boundary 58 chosen
+    # from slice 16h's per-decile residuals: decile 0 (SAP 1-58) carries 17%
+    # MAPE; deciles 1-9 are all below 5%.
+    import pandas as pd  # noqa: PLC0415
+
+    from ml_training_data.train_baseline import low_sap_tail_weight  # noqa: PLC0415
+
+    # Act
+    weights = low_sap_tail_weight(pd.Series([20, 50, 58, 60, 90]))
+
+    # Assert
+    assert list(weights) == [3.0, 3.0, 1.0, 1.0, 1.0]
+
+
+def test_train_baseline_accepts_sample_weight_fn_per_target(tmp_path: Path) -> None:
+    # Arrange — sample_weight_fn is a callable taking the training-label Series
+    # and returning a Series of weights the same length.  When supplied, the
+    # weights flow into LGBMRegressor.fit's sample_weight argument and the
+    # model emphasizes the heavily-weighted rows.  We verify the indirection
+    # works by training twice (no weights vs heavy-weighted tail) and
+    # confirming the global test-set MAE differs between the two runs.
+    import numpy as np  # noqa: PLC0415
+    import pandas as pd  # noqa: PLC0415
+
+    storage = LocalStorage(root=tmp_path)
+    df = _synthetic_dataset(n=600, seed=0)
+
+    def weight_tail(y: "pd.Series[Any]") -> "pd.Series[Any]":
+        return pd.Series(np.where(np.asarray(y, dtype=float) < 60, 10.0, 1.0), index=y.index)
+
+    # Act
+    m_unweighted = train_baseline(
+        df=df.copy(), targets=["sap_score"], storage=storage,
+        run_key="runs/unw/", seed=42,
+    )
+    m_weighted = train_baseline(
+        df=df.copy(), targets=["sap_score"], storage=storage,
+        run_key="runs/w/", seed=42, sample_weight_fn=weight_tail,
+    )
+
+    # Assert — global MAE should differ between weighted and unweighted runs.
+    # (Direction depends on data; we just need to see that the weight reached LGBM.)
+    assert m_unweighted["sap_score"]["mae"] != m_weighted["sap_score"]["mae"]
+
+
+def test_train_baseline_reports_mae_and_rmse_per_target(tmp_path: Path) -> None:
+    # Arrange — MAE carries the user-facing "predicted SAP within N points" meaning;
+    # RMSE penalises large errors quadratically.  Both should be reported next
+    # to MAPE so we can read the residual without inverting MAPE math by hand.
+    storage = LocalStorage(root=tmp_path)
+    df = _synthetic_dataset()
+
+    # Act
+    metrics = train_baseline(
+        df=df, targets=["sap_score"], storage=storage,
+        run_key="runs/2026-05-16/", seed=42,
+    )
+
+    # Assert
+    assert "mae" in metrics["sap_score"]
+    assert "rmse" in metrics["sap_score"]
+    assert metrics["sap_score"]["mae"] > 0
+    assert metrics["sap_score"]["rmse"] >= metrics["sap_score"]["mae"]  # always true mathematically
+
+
 def test_train_baseline_writes_feature_importance_per_target(tmp_path: Path) -> None:
    # Arrange
    storage = LocalStorage(root=tmp_path)