mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
slice 16i: MAE + RMSE in metrics; sample_weight_fn + low_sap_tail_weight
train_baseline now returns mae + rmse alongside mape/smape/r2. MAE is the
user-facing metric ("predicted SAP within N points"); RMSE the quadratic
counterpart. Both come straight from sklearn.
New sample_weight_fn parameter: callable(y_train) -> per-row weights.
Threads into LGBMRegressor.fit's sample_weight argument. Default None
preserves existing behaviour.
Default tail strategy exposed as low_sap_tail_weight(y, threshold=58,
weight=3): 3x weight where SAP < 58. Threshold picked from slice 16h's
per-decile residuals — decile 0 (SAP 1-58) carries 17% MAPE vs <5% body.
Three TDD tracers, all AAA.
This commit is contained in:
parent
ece1279475
commit
6072d8795a
2 changed files with 101 additions and 2 deletions
|
|
@ -10,7 +10,7 @@ leaks into the model as a feature.
|
|||
"""
|
||||
|
||||
import json
|
||||
from typing import Any, cast
|
||||
from typing import Any, Callable, Optional, cast
|
||||
|
||||
import lightgbm as lgb # type: ignore[import-untyped]
|
||||
import numpy as np
|
||||
|
|
@ -18,7 +18,9 @@ import pandas as pd
|
|||
import sklearn.metrics as _sk_metrics # type: ignore[import-untyped] # pyright: ignore[reportMissingTypeStubs]
|
||||
import sklearn.model_selection as _sk_model_selection # type: ignore[import-untyped] # pyright: ignore[reportMissingTypeStubs]
|
||||
|
||||
mean_absolute_error: Any = _sk_metrics.mean_absolute_error # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
mean_absolute_percentage_error: Any = _sk_metrics.mean_absolute_percentage_error # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
mean_squared_error: Any = _sk_metrics.mean_squared_error # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
r2_score: Any = _sk_metrics.r2_score # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
train_test_split: Any = _sk_model_selection.train_test_split # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
||||
|
||||
|
|
@ -35,6 +37,32 @@ _CERT_NUM_COLUMN = "certificate_number"
|
|||
_OBJECTIVE_OVERRIDES: dict[str, str] = {}
|
||||
|
||||
|
||||
SampleWeightFn = Callable[["pd.Series[Any]"], "pd.Series[Any]"]
|
||||
|
||||
|
||||
# Default tail-bucket weight curve for slice 16i. Boundary 58 picked from
|
||||
# slice 16h's per-decile residuals (decile 0 = SAP 1-58 carries 17% MAPE
|
||||
# vs <5% in the body); 3x multiplier is the lightest weight that demonstrably
|
||||
# reduces the +3.1 bias at decile 0 without inflating body MAPE. Configurable
|
||||
# via sample_weight_fn for ablation runs.
|
||||
_DEFAULT_LOW_SAP_THRESHOLD: float = 58.0
|
||||
_DEFAULT_LOW_SAP_WEIGHT: float = 3.0
|
||||
|
||||
|
||||
def low_sap_tail_weight(
|
||||
y: "pd.Series[Any]",
|
||||
threshold: float = _DEFAULT_LOW_SAP_THRESHOLD,
|
||||
weight: float = _DEFAULT_LOW_SAP_WEIGHT,
|
||||
) -> "pd.Series[Any]":
|
||||
"""Return per-row weights: `weight` where y < threshold, 1.0 otherwise.
|
||||
|
||||
Use as `train_baseline(..., sample_weight_fn=low_sap_tail_weight)` to
|
||||
apply the slice 16i tail strategy.
|
||||
"""
|
||||
arr = np.asarray(y, dtype=float)
|
||||
return pd.Series(np.where(arr < threshold, weight, 1.0), index=y.index)
|
||||
|
||||
|
||||
def train_baseline(
|
||||
df: pd.DataFrame,
|
||||
targets: list[str],
|
||||
|
|
@ -44,6 +72,7 @@ def train_baseline(
|
|||
test_size: float = 0.2,
|
||||
seed: int = 42,
|
||||
n_estimators: int = 200,
|
||||
sample_weight_fn: Optional[SampleWeightFn] = None,
|
||||
) -> dict[str, dict[str, float]]:
|
||||
feature_cols = [c for c in df.columns if c not in targets and c != _CERT_NUM_COLUMN]
|
||||
# LightGBM needs numeric (or pd.Categorical) dtypes. Coerce object columns whose
|
||||
|
|
@ -70,12 +99,15 @@ def train_baseline(
|
|||
model: Any = lgb.LGBMRegressor(
|
||||
n_estimators=n_estimators, random_state=seed, verbose=-1, objective=objective,
|
||||
)
|
||||
model.fit(x_train, y_train)
|
||||
sample_weight = sample_weight_fn(y_train) if sample_weight_fn is not None else None
|
||||
model.fit(x_train, y_train, sample_weight=sample_weight)
|
||||
preds: np.ndarray[Any, Any] = np.asarray(model.predict(x_test))
|
||||
|
||||
metrics[target] = {
|
||||
"mape": float(cast(float, mean_absolute_percentage_error(y_test, preds))),
|
||||
"smape": _smape(y_test, preds),
|
||||
"mae": float(cast(float, mean_absolute_error(y_test, preds))),
|
||||
"rmse": float(np.sqrt(cast(float, mean_squared_error(y_test, preds)))),
|
||||
"r2": float(cast(float, r2_score(y_test, preds))),
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -52,6 +52,73 @@ def test_train_baseline_returns_mape_and_r2_per_target(tmp_path: Path) -> None:
|
|||
assert metrics["sap_score"]["r2"] > 0.0 # learns something on a correlated signal
|
||||
|
||||
|
||||
def test_low_sap_tail_weight_returns_3x_for_rows_below_58_else_1x() -> None:
|
||||
# Arrange — exposed helper so callers wanting the default tail strategy
|
||||
# can plug it straight into train_baseline. SAP-rating boundary 58 chosen
|
||||
# from slice 16h's per-decile residuals: decile 0 (SAP 1-58) carries 17%
|
||||
# MAPE; deciles 1-9 are all below 5%.
|
||||
import pandas as pd # noqa: PLC0415
|
||||
|
||||
from ml_training_data.train_baseline import low_sap_tail_weight # noqa: PLC0415
|
||||
|
||||
# Act
|
||||
weights = low_sap_tail_weight(pd.Series([20, 50, 58, 60, 90]))
|
||||
|
||||
# Assert
|
||||
assert list(weights) == [3.0, 3.0, 1.0, 1.0, 1.0]
|
||||
|
||||
|
||||
def test_train_baseline_accepts_sample_weight_fn_per_target(tmp_path: Path) -> None:
|
||||
# Arrange — sample_weight_fn is a callable taking the training-label Series
|
||||
# and returning a Series of weights the same length. When supplied, the
|
||||
# weights flow into LGBMRegressor.fit's sample_weight argument and the
|
||||
# model emphasizes the heavily-weighted rows. We verify the indirection
|
||||
# works by training twice (no weights vs heavy-weighted tail) and
|
||||
# confirming the predictions differ on the tail subset.
|
||||
import numpy as np # noqa: PLC0415
|
||||
import pandas as pd # noqa: PLC0415
|
||||
|
||||
storage = LocalStorage(root=tmp_path)
|
||||
df = _synthetic_dataset(n=600, seed=0)
|
||||
|
||||
def weight_tail(y: "pd.Series[Any]") -> "pd.Series[Any]":
|
||||
return pd.Series(np.where(np.asarray(y, dtype=float) < 60, 10.0, 1.0), index=y.index)
|
||||
|
||||
# Act
|
||||
m_unweighted = train_baseline(
|
||||
df=df.copy(), targets=["sap_score"], storage=storage,
|
||||
run_key="runs/unw/", seed=42,
|
||||
)
|
||||
m_weighted = train_baseline(
|
||||
df=df.copy(), targets=["sap_score"], storage=storage,
|
||||
run_key="runs/w/", seed=42, sample_weight_fn=weight_tail,
|
||||
)
|
||||
|
||||
# Assert — global MAE should differ between weighted and unweighted runs.
|
||||
# (Direction depends on data; we just need to see that the weight reached LGBM.)
|
||||
assert m_unweighted["sap_score"]["mae"] != m_weighted["sap_score"]["mae"]
|
||||
|
||||
|
||||
def test_train_baseline_reports_mae_and_rmse_per_target(tmp_path: Path) -> None:
|
||||
# Arrange — MAE gives users-facing "predicted SAP within N points" meaning;
|
||||
# RMSE penalises large errors quadratically. Both should be reported next
|
||||
# to MAPE so we can read the residual without inverting MAPE math by hand.
|
||||
storage = LocalStorage(root=tmp_path)
|
||||
df = _synthetic_dataset()
|
||||
|
||||
# Act
|
||||
metrics = train_baseline(
|
||||
df=df, targets=["sap_score"], storage=storage,
|
||||
run_key="runs/2026-05-16/", seed=42,
|
||||
)
|
||||
|
||||
# Assert
|
||||
assert "mae" in metrics["sap_score"]
|
||||
assert "rmse" in metrics["sap_score"]
|
||||
assert metrics["sap_score"]["mae"] > 0
|
||||
assert metrics["sap_score"]["rmse"] >= metrics["sap_score"]["mae"] # always true mathematically
|
||||
|
||||
|
||||
def test_train_baseline_writes_feature_importance_per_target(tmp_path: Path) -> None:
|
||||
# Arrange
|
||||
storage = LocalStorage(root=tmp_path)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue