From fd8d71eb05101523c4307a1aae77dba911c58f67 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Sun, 17 May 2026 11:18:40 +0000
Subject: [PATCH] slice 15e: per-decile residuals reporting in train_baseline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `_per_decile_residuals` and writes `residuals_<target>.json` next to
metrics.json. Buckets test-set rows by deciles of the true target value;
each bucket carries count + MAPE + MAE + mean residual + true_min/max.

Lets us tell whether errors concentrate in the tails of the true distribution
(e.g. SAP<40 / SAP>85) vs the mid-band — which the global MAPE alone hides.
Baseline for slice 16's MAPE-improvement ablations.
---
 .../src/ml_training_data/train_baseline.py    | 48 +++++++++++++++++++
 .../tests/unit/test_train_baseline.py         | 48 +++++++++++++++++++
 2 files changed, 96 insertions(+)
diff --git a/services/ml_training_data/src/ml_training_data/train_baseline.py b/services/ml_training_data/src/ml_training_data/train_baseline.py
index 7e80157c..ed618a26 100644
--- a/services/ml_training_data/src/ml_training_data/train_baseline.py
+++ b/services/ml_training_data/src/ml_training_data/train_baseline.py
@@ -75,6 +75,12 @@ def train_baseline(
             json.dumps(importance, indent=2).encode("utf-8"),
         )
 
+        residuals = _per_decile_residuals(np.asarray(y_test, dtype=float), preds)
+        storage.write_bytes(
+            f"{run_key}residuals_{target}.json",
+            json.dumps({"buckets": residuals}, indent=2).encode("utf-8"),
+        )
+
     storage.write_bytes(
         f"{run_key}metrics.json",
         json.dumps(metrics, indent=2).encode("utf-8"),
@@ -96,3 +102,45 @@ def _smape(y_true: Any, y_pred: Any) -> float:
     if not mask.any():
         return 0.0
     return float(np.mean(np.abs(y_t[mask] - y_p[mask]) / denom[mask]))
+
+
+def _per_decile_residuals(
+    y_true: np.ndarray[Any, Any], y_pred: np.ndarray[Any, Any]
+) -> list[dict[str, float]]:
+    """Report per-decile error statistics, bucketing rows by true target value.
+
+    Rows are sorted by ``y_true`` (stable) and cut into ten near-equal buckets
+    (sizes differ by at most one row). Each bucket reports ``decile``,
+    ``true_min``, ``true_max``, ``count``, ``mape``, ``mae`` and
+    ``mean_residual`` (mean of ``y_pred - y_true``). ``mape`` skips rows whose
+    true value is 0 and is 0.0 when every row is skipped. Empty buckets (only
+    possible when there are fewer than ten rows) are omitted.
+    """
+    order = np.argsort(y_true, kind="stable")
+    y_t, y_p = y_true[order], y_pred[order]
+    n = len(y_t)
+    buckets: list[dict[str, float]] = []
+    for i in range(10):
+        # Proportional cut points spread the remainder across buckets rather
+        # than piling up to nine extra rows into the top decile.
+        start, stop = (i * n) // 10, ((i + 1) * n) // 10
+        if stop == start:
+            continue
+        slice_t, slice_p = y_t[start:stop], y_p[start:stop]
+        abs_err = np.abs(slice_t - slice_p)
+        nonzero = slice_t != 0
+        mape = 0.0
+        if nonzero.any():
+            mape = float(np.mean(abs_err[nonzero] / np.abs(slice_t[nonzero])))
+        buckets.append(
+            {
+                "decile": float(i),
+                "true_min": float(slice_t[0]),
+                "true_max": float(slice_t[-1]),
+                "count": float(stop - start),
+                "mape": mape,
+                "mae": float(np.mean(abs_err)),
+                "mean_residual": float(np.mean(slice_p - slice_t)),
+            }
+        )
+    return buckets
diff --git a/services/ml_training_data/tests/unit/test_train_baseline.py b/services/ml_training_data/tests/unit/test_train_baseline.py
index 80920fb8..835b5b8d 100644
--- a/services/ml_training_data/tests/unit/test_train_baseline.py
+++ b/services/ml_training_data/tests/unit/test_train_baseline.py
@@ -92,3 +92,51 @@ def test_train_baseline_handles_multiple_targets_independently(tmp_path: Path) -
     assert storage.exists("runs/2026-05-16/importance_sap_score.json")
     assert storage.exists("runs/2026-05-16/importance_co2_emissions.json")
     assert storage.exists("runs/2026-05-16/metrics.json")
+
+
+def test_train_baseline_writes_per_decile_residuals_per_target(tmp_path: Path) -> None:
+    # Arrange
+    store = LocalStorage(root=tmp_path)
+    frame = _synthetic_dataset(n=500)
+    required = {"decile", "true_min", "true_max", "count", "mape", "mae", "mean_residual"}
+
+    # Act
+    train_baseline(
+        df=frame,
+        targets=["sap_score"],
+        storage=store,
+        run_key="runs/2026-05-16/",
+        seed=42,
+    )
+
+    # Assert
+    payload = json.loads(store.read_bytes("runs/2026-05-16/residuals_sap_score.json"))
+    assert "buckets" in payload
+    rows = payload["buckets"]
+    assert len(rows) == 10
+    assert all(required <= set(row.keys()) for row in rows)
+    # Counts add up to the held-out test set, i.e. 20% of the input rows.
+    assert sum(row["count"] for row in rows) == int(len(frame) * 0.2)
+    # true_min never decreases from one bucket to the next.
+    lows = [row["true_min"] for row in rows]
+    assert lows == sorted(lows)
+
+
+def test_train_baseline_residuals_emitted_per_target_independently(tmp_path: Path) -> None:
+    # Arrange: add a second target derived linearly from sap_score.
+    store = LocalStorage(root=tmp_path)
+    frame = _synthetic_dataset(n=500)
+    frame["co2_emissions"] = frame["sap_score"] * 0.1 + 1.0
+
+    # Act
+    train_baseline(
+        df=frame,
+        targets=["sap_score", "co2_emissions"],
+        storage=store,
+        run_key="runs/2026-05-16/",
+        seed=42,
+    )
+
+    # Assert: one residuals file exists per requested target.
+    expected = ("runs/2026-05-16/residuals_sap_score.json", "runs/2026-05-16/residuals_co2_emissions.json")
+    assert all(store.exists(key) for key in expected)