From 6072d8795a7319d81269d065276e8b86a0ee575e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 May 2026 14:48:00 +0000 Subject: [PATCH] slice 16i: MAE + RMSE in metrics; sample_weight_fn + low_sap_tail_weight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit train_baseline now returns mae + rmse alongside mape/smape/r2. MAE is the user-facing metric ("predicted SAP within N points"); RMSE the quadratic counterpart. Both come straight from sklearn. New sample_weight_fn parameter: callable(y_train) -> per-row weights. Threads into LGBMRegressor.fit's sample_weight argument. Default None preserves existing behaviour. Default tail strategy exposed as low_sap_tail_weight(y, threshold=58, weight=3): 3x weight where SAP < 58. Threshold picked from slice 16h's per-decile residuals — decile 0 (SAP 1-58) carries 17% MAPE vs <5% body. Three TDD tracers, all AAA. --- .../src/ml_training_data/train_baseline.py | 36 +++++++++- .../tests/unit/test_train_baseline.py | 67 +++++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/services/ml_training_data/src/ml_training_data/train_baseline.py b/services/ml_training_data/src/ml_training_data/train_baseline.py index 7d6a1017..019c8bdd 100644 --- a/services/ml_training_data/src/ml_training_data/train_baseline.py +++ b/services/ml_training_data/src/ml_training_data/train_baseline.py @@ -10,7 +10,7 @@ leaks into the model as a feature. 
""" import json -from typing import Any, cast +from typing import Any, Callable, Optional, cast import lightgbm as lgb # type: ignore[import-untyped] import numpy as np @@ -18,7 +18,9 @@ import pandas as pd import sklearn.metrics as _sk_metrics # type: ignore[import-untyped] # pyright: ignore[reportMissingTypeStubs] import sklearn.model_selection as _sk_model_selection # type: ignore[import-untyped] # pyright: ignore[reportMissingTypeStubs] +mean_absolute_error: Any = _sk_metrics.mean_absolute_error # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] mean_absolute_percentage_error: Any = _sk_metrics.mean_absolute_percentage_error # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] +mean_squared_error: Any = _sk_metrics.mean_squared_error # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] r2_score: Any = _sk_metrics.r2_score # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] train_test_split: Any = _sk_model_selection.train_test_split # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType] @@ -35,6 +37,32 @@ _CERT_NUM_COLUMN = "certificate_number" _OBJECTIVE_OVERRIDES: dict[str, str] = {} +SampleWeightFn = Callable[["pd.Series[Any]"], "pd.Series[Any]"] + + +# Default tail-bucket weight curve for slice 16i. Boundary 58 picked from +# slice 16h's per-decile residuals (decile 0 = SAP 1-58 carries 17% MAPE +# vs <5% in the body); 3x multiplier is the lightest weight that demonstrably +# reduces the +3.1 bias at decile 0 without inflating body MAPE. Configurable +# via sample_weight_fn for ablation runs. +_DEFAULT_LOW_SAP_THRESHOLD: float = 58.0 +_DEFAULT_LOW_SAP_WEIGHT: float = 3.0 + + +def low_sap_tail_weight( + y: "pd.Series[Any]", + threshold: float = _DEFAULT_LOW_SAP_THRESHOLD, + weight: float = _DEFAULT_LOW_SAP_WEIGHT, +) -> "pd.Series[Any]": + """Return per-row weights: `weight` where y < threshold, 1.0 otherwise. 
+ + Use as `train_baseline(..., sample_weight_fn=low_sap_tail_weight)` to + apply the slice 16i tail strategy. + """ + arr = np.asarray(y, dtype=float) + return pd.Series(np.where(arr < threshold, weight, 1.0), index=y.index) + + def train_baseline( df: pd.DataFrame, targets: list[str], @@ -44,6 +72,7 @@ def train_baseline( test_size: float = 0.2, seed: int = 42, n_estimators: int = 200, + sample_weight_fn: Optional[SampleWeightFn] = None, ) -> dict[str, dict[str, float]]: feature_cols = [c for c in df.columns if c not in targets and c != _CERT_NUM_COLUMN] # LightGBM needs numeric (or pd.Categorical) dtypes. Coerce object columns whose @@ -70,12 +99,15 @@ def train_baseline( model: Any = lgb.LGBMRegressor( n_estimators=n_estimators, random_state=seed, verbose=-1, objective=objective, ) - model.fit(x_train, y_train) + sample_weight = sample_weight_fn(y_train) if sample_weight_fn is not None else None + model.fit(x_train, y_train, sample_weight=sample_weight) preds: np.ndarray[Any, Any] = np.asarray(model.predict(x_test)) metrics[target] = { "mape": float(cast(float, mean_absolute_percentage_error(y_test, preds))), "smape": _smape(y_test, preds), + "mae": float(cast(float, mean_absolute_error(y_test, preds))), + "rmse": float(np.sqrt(cast(float, mean_squared_error(y_test, preds)))), "r2": float(cast(float, r2_score(y_test, preds))), } diff --git a/services/ml_training_data/tests/unit/test_train_baseline.py b/services/ml_training_data/tests/unit/test_train_baseline.py index cc816714..969f3785 100644 --- a/services/ml_training_data/tests/unit/test_train_baseline.py +++ b/services/ml_training_data/tests/unit/test_train_baseline.py @@ -52,6 +52,73 @@ def test_train_baseline_returns_mape_and_r2_per_target(tmp_path: Path) -> None: assert metrics["sap_score"]["r2"] > 0.0 # learns something on a correlated signal +def test_low_sap_tail_weight_returns_3x_for_rows_below_58_else_1x() -> None: + # Arrange — exposed helper so callers wanting the default tail strategy + # can plug 
it straight into train_baseline. SAP-rating boundary 58 chosen + # from slice 16h's per-decile residuals: decile 0 (SAP 1-58) carries 17% + # MAPE; deciles 1-9 are all below 5%. + import pandas as pd # noqa: PLC0415 + + from ml_training_data.train_baseline import low_sap_tail_weight # noqa: PLC0415 + + # Act + weights = low_sap_tail_weight(pd.Series([20, 50, 58, 60, 90])) + + # Assert + assert list(weights) == [3.0, 3.0, 1.0, 1.0, 1.0] + + +def test_train_baseline_accepts_sample_weight_fn_per_target(tmp_path: Path) -> None: + # Arrange — sample_weight_fn is a callable taking the training-label Series + # and returning a Series of weights the same length. When supplied, the + # weights flow into LGBMRegressor.fit's sample_weight argument and the + # model emphasizes the heavily-weighted rows. We verify the indirection + # works by training twice (no weights vs heavy-weighted tail) and + # confirming the reported global MAE differs between the two runs. + import numpy as np # noqa: PLC0415 + import pandas as pd # noqa: PLC0415 + + storage = LocalStorage(root=tmp_path) + df = _synthetic_dataset(n=600, seed=0) + + def weight_tail(y: "pd.Series[Any]") -> "pd.Series[Any]": + return pd.Series(np.where(np.asarray(y, dtype=float) < 60, 10.0, 1.0), index=y.index) + + # Act + m_unweighted = train_baseline( + df=df.copy(), targets=["sap_score"], storage=storage, + run_key="runs/unw/", seed=42, + ) + m_weighted = train_baseline( + df=df.copy(), targets=["sap_score"], storage=storage, + run_key="runs/w/", seed=42, sample_weight_fn=weight_tail, + ) + + # Assert — global MAE should differ between weighted and unweighted runs. + # (Direction depends on data; we just need to see that the weight reached LGBM.) + assert m_unweighted["sap_score"]["mae"] != m_weighted["sap_score"]["mae"] + + +def test_train_baseline_reports_mae_and_rmse_per_target(tmp_path: Path) -> None: + # Arrange — MAE gives the user-facing "predicted SAP within N points" meaning; + # RMSE penalises large errors quadratically. 
Both should be reported next + # to MAPE so we can read the residual without inverting MAPE math by hand. + storage = LocalStorage(root=tmp_path) + df = _synthetic_dataset() + + # Act + metrics = train_baseline( + df=df, targets=["sap_score"], storage=storage, + run_key="runs/2026-05-16/", seed=42, + ) + + # Assert + assert "mae" in metrics["sap_score"] + assert "rmse" in metrics["sap_score"] + assert metrics["sap_score"]["mae"] > 0 + assert metrics["sap_score"]["rmse"] >= metrics["sap_score"]["mae"] # always true mathematically + + def test_train_baseline_writes_feature_importance_per_target(tmp_path: Path) -> None: # Arrange storage = LocalStorage(root=tmp_path)