From fd8d71eb05101523c4307a1aae77dba911c58f67 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Sun, 17 May 2026 11:18:40 +0000
Subject: [PATCH] slice 15e: per-decile residuals reporting in train_baseline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `_per_decile_residuals` and writes `residuals_<target>.json` next to
metrics.json. Buckets test-set rows by deciles of the true target value;
each bucket carries count + MAPE + MAE + mean residual + true_min/max.
Buckets are rank-balanced: their sizes differ by at most one row, so an
uneven test-set size does not inflate the top bucket.

Lets us tell whether errors concentrate in the tails of the true
distribution (e.g. SAP<40 / SAP>85) vs the mid-band — which the global
MAPE alone hides. Baseline for slice 16's MAPE-improvement ablations.
---
 .../src/ml_training_data/train_baseline.py | 68 +++++++++++++++++++
 .../tests/unit/test_train_baseline.py      | 48 +++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/services/ml_training_data/src/ml_training_data/train_baseline.py b/services/ml_training_data/src/ml_training_data/train_baseline.py
index 7e80157c..ed618a26 100644
--- a/services/ml_training_data/src/ml_training_data/train_baseline.py
+++ b/services/ml_training_data/src/ml_training_data/train_baseline.py
@@ -75,6 +75,12 @@ def train_baseline(
             json.dumps(importance, indent=2).encode("utf-8"),
         )
 
+        residuals = _per_decile_residuals(np.asarray(y_test, dtype=float), preds)
+        storage.write_bytes(
+            f"{run_key}residuals_{target}.json",
+            json.dumps({"buckets": residuals}, indent=2).encode("utf-8"),
+        )
+
     storage.write_bytes(
         f"{run_key}metrics.json",
         json.dumps(metrics, indent=2).encode("utf-8"),
@@ -96,3 +102,65 @@ def _smape(y_true: Any, y_pred: Any) -> float:
     if not mask.any():
         return 0.0
     return float(np.mean(np.abs(y_t[mask] - y_p[mask]) / denom[mask]))
+
+
+def _per_decile_residuals(
+    y_true: np.ndarray[Any, Any], y_pred: np.ndarray[Any, Any]
+) -> list[dict[str, float]]:
+    """Bucket the test set by deciles of the true target value, then report
+    MAPE / MAE / mean residual / count per bucket.
+
+    Lets us tell whether errors concentrate in the tails of the true distribution
+    (e.g. SAP<40 / SAP>85) vs the mid-band — which the global MAPE alone hides.
+
+    Rows are sorted by true value and split into 10 contiguous buckets whose
+    sizes differ by at most one row, so an uneven test-set size is spread over
+    the leading buckets instead of inflating the top one. Buckets left empty
+    (fewer than 10 rows) are omitted.
+
+    Args:
+        y_true: 1-D true target values.
+        y_pred: 1-D predictions, aligned row-for-row with ``y_true``.
+
+    Returns:
+        One dict per non-empty bucket, ascending by true value. Every value is a
+        float (``decile`` and ``count`` included). ``mean_residual`` is pred - true;
+        ``mape`` skips rows whose true value is 0 and is 0.0 if none remain.
+
+    Raises:
+        ValueError: if ``y_true`` and ``y_pred`` differ in length.
+    """
+    y_true = np.asarray(y_true, dtype=float).ravel()
+    y_pred = np.asarray(y_pred, dtype=float).ravel()
+    if y_true.size != y_pred.size:
+        raise ValueError(
+            f"y_true and y_pred must have the same length, got {y_true.size} and {y_pred.size}"
+        )
+
+    order = np.argsort(y_true, kind="stable")
+    # array_split hands the remainder out one row at a time to the leading buckets,
+    # so every bucket stays within one row of n / 10.
+    splits_t = np.array_split(y_true[order], 10)
+    splits_p = np.array_split(y_pred[order], 10)
+    buckets: list[dict[str, float]] = []
+    for i, (slice_t, slice_p) in enumerate(zip(splits_t, splits_p)):
+        count = len(slice_t)
+        if count == 0:
+            continue
+        abs_err = np.abs(slice_t - slice_p)
+        mae = float(np.mean(abs_err))
+        mean_residual = float(np.mean(slice_p - slice_t))
+        mape_mask = slice_t != 0
+        mape = float(np.mean(abs_err[mape_mask] / np.abs(slice_t[mape_mask]))) if mape_mask.any() else 0.0
+        buckets.append(
+            {
+                "decile": float(i),
+                "true_min": float(slice_t[0]),
+                "true_max": float(slice_t[-1]),
+                "count": float(count),
+                "mape": mape,
+                "mae": mae,
+                "mean_residual": mean_residual,
+            }
+        )
+    return buckets
diff --git a/services/ml_training_data/tests/unit/test_train_baseline.py b/services/ml_training_data/tests/unit/test_train_baseline.py
index 80920fb8..835b5b8d 100644
--- a/services/ml_training_data/tests/unit/test_train_baseline.py
+++ b/services/ml_training_data/tests/unit/test_train_baseline.py
@@ -92,3 +92,51 @@ def test_train_baseline_handles_multiple_targets_independently(tmp_path: Path) -
     assert storage.exists("runs/2026-05-16/importance_sap_score.json")
     assert storage.exists("runs/2026-05-16/importance_co2_emissions.json")
     assert storage.exists("runs/2026-05-16/metrics.json")
+
+
+def test_train_baseline_writes_per_decile_residuals_per_target(tmp_path: Path) -> None:
+    # Arrange
+    storage = LocalStorage(root=tmp_path)
+    df = _synthetic_dataset(n=500)
+
+    # Act
+    train_baseline(
+        df=df,
+        targets=["sap_score"],
+        storage=storage,
+        run_key="runs/2026-05-16/",
+        seed=42,
+    )
+
+    # Assert
+    residuals = json.loads(storage.read_bytes("runs/2026-05-16/residuals_sap_score.json"))
+    assert "buckets" in residuals
+    assert len(residuals["buckets"]) == 10
+    expected_keys = {"decile", "true_min", "true_max", "count", "mape", "mae", "mean_residual"}
+    for bucket in residuals["buckets"]:
+        assert expected_keys <= set(bucket.keys())
+    # The 10 bucket counts sum to the test-set size (20% of df).
+    assert sum(b["count"] for b in residuals["buckets"]) == int(len(df) * 0.2)
+    # Buckets are ordered by true_min ascending.
+    true_mins = [b["true_min"] for b in residuals["buckets"]]
+    assert true_mins == sorted(true_mins)
+
+
+def test_train_baseline_residuals_emitted_per_target_independently(tmp_path: Path) -> None:
+    # Arrange
+    storage = LocalStorage(root=tmp_path)
+    df = _synthetic_dataset(n=500)
+    df["co2_emissions"] = df["sap_score"] * 0.1 + 1.0
+
+    # Act
+    train_baseline(
+        df=df,
+        targets=["sap_score", "co2_emissions"],
+        storage=storage,
+        run_key="runs/2026-05-16/",
+        seed=42,
+    )
+
+    # Assert
+    assert storage.exists("runs/2026-05-16/residuals_sap_score.json")
+    assert storage.exists("runs/2026-05-16/residuals_co2_emissions.json")