Model/services/ml_training_data/tests/unit/test_train_baseline.py
Khalim Conn-Kowlessar ece1279475 revert slice 16g: drop mape objective per 16h ablation
250k retrain showed objective='mape' loses ~0.6 percentage points of
global sap_score MAPE (3.92% with regression vs 4.50% with mape) and
~0.7 pts on peui_ucl. The mape objective over-weights the low-SAP tail
(weight ~1/y) and drags the body MAPE up by more than it gains in the
tail.

Body MAPE on v16 features is already strong (2.38% on deciles 1-8); the
remaining tail bias at decile 0 (SAP<58, +3.1 bias) needs a different
fix -- sample weights or stratified loss -- queued as slice 16i.
2026-05-17 14:34:04 +00:00

168 lines
5.4 KiB
Python

"""Tests for train_baseline() — fits one LightGBM regressor per target.
train_baseline produces the baseline metrics (MAPE + R^2) and dumps per-target
feature-importance JSON to storage. This is the only stage that pulls in
LightGBM + sklearn; downstream training repos read the metrics + parquet only.
"""
import json
from pathlib import Path
import numpy as np
import pandas as pd
from ml_training_data.storage import LocalStorage
from ml_training_data.train_baseline import train_baseline
def _synthetic_dataset(n: int = 200, seed: int = 0) -> pd.DataFrame:
    """Build a small deterministic dataset with a learnable ``sap_score`` signal.

    Args:
        n: Number of rows to generate.
        seed: Seed for the NumPy random generator; the same ``(n, seed)``
            pair always yields an identical frame.

    Returns:
        A DataFrame with an identifier column (``certificate_number``), two
        numeric columns (``total_floor_area_m2``, ``wall_count``) and an
        integer ``sap_score`` column computed from them plus Gaussian noise.
    """
    rng = np.random.default_rng(seed)
    # The draw order (uniform -> integers -> normal) determines the values
    # produced for a given seed, so keep these three calls in this order.
    floor_area = rng.uniform(40, 200, size=n)
    walls = rng.integers(1, 5, size=n)  # high bound is exclusive: values 1..4
    # sap_score correlates with floor_area + walls, plus noise.
    sap_score = (100 - 0.2 * floor_area + 3 * walls + rng.normal(0, 2, size=n)).astype(int)
    return pd.DataFrame(
        {
            "certificate_number": [f"CN-{i:04d}" for i in range(n)],
            "total_floor_area_m2": floor_area,
            "wall_count": walls,
            "sap_score": sap_score,
        }
    )
def test_train_baseline_returns_mape_and_r2_per_target(tmp_path: Path) -> None:
    """The returned metrics hold ``mape`` and ``r2`` entries for the requested target."""
    # Arrange
    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset()
    # Act
    metrics = train_baseline(
        df=df,
        targets=["sap_score"],
        storage=storage,
        run_key="runs/2026-05-16/",
        seed=42,
    )
    # Assert
    assert "sap_score" in metrics
    assert "mape" in metrics["sap_score"]
    assert "r2" in metrics["sap_score"]
    assert metrics["sap_score"]["r2"] > 0.0  # learns something on a correlated signal
def test_train_baseline_writes_feature_importance_per_target(tmp_path: Path) -> None:
    """A per-target importance JSON is stored under the run key.

    The file is expected to map exactly the two feature columns to numeric
    importances; the identifier and the target must not appear as keys.
    """
    # Arrange
    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset()
    # Act
    train_baseline(
        df=df,
        targets=["sap_score"],
        storage=storage,
        run_key="runs/2026-05-16/",
        seed=42,
    )
    # Assert
    importance = json.loads(storage.read_bytes("runs/2026-05-16/importance_sap_score.json"))
    assert set(importance.keys()) == {"total_floor_area_m2", "wall_count"}
    assert all(isinstance(v, (int, float)) for v in importance.values())
def test_train_baseline_handles_multiple_targets_independently(tmp_path: Path) -> None:
    """Two targets yield two metric entries, two importance files and one metrics.json."""
    # Arrange
    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset()
    df["co2_emissions"] = df["sap_score"] * 0.1 + 1.0  # second correlated target
    # Act
    metrics = train_baseline(
        df=df,
        targets=["sap_score", "co2_emissions"],
        storage=storage,
        run_key="runs/2026-05-16/",
        seed=42,
    )
    # Assert
    assert set(metrics.keys()) == {"sap_score", "co2_emissions"}
    assert storage.exists("runs/2026-05-16/importance_sap_score.json")
    assert storage.exists("runs/2026-05-16/importance_co2_emissions.json")
    assert storage.exists("runs/2026-05-16/metrics.json")
def test_train_baseline_writes_per_decile_residuals_per_target(tmp_path: Path) -> None:
    """The residuals JSON holds ten ordered buckets that cover the whole test split."""
    # Arrange
    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset(n=500)
    # Act
    train_baseline(
        df=df,
        targets=["sap_score"],
        storage=storage,
        run_key="runs/2026-05-16/",
        seed=42,
    )
    # Assert
    residuals = json.loads(storage.read_bytes("runs/2026-05-16/residuals_sap_score.json"))
    assert "buckets" in residuals
    assert len(residuals["buckets"]) == 10
    expected_keys = {"decile", "true_min", "true_max", "count", "mape", "mae", "mean_residual"}
    for bucket in residuals["buckets"]:
        # Subset check: buckets may carry extra keys beyond the required ones.
        assert expected_keys <= set(bucket.keys())
    # The 10 bucket counts sum to the test-set size (20% of df).
    assert sum(b["count"] for b in residuals["buckets"]) == int(len(df) * 0.2)
    # Buckets are ordered by true_min ascending.
    true_mins = [b["true_min"] for b in residuals["buckets"]]
    assert true_mins == sorted(true_mins)
def test_train_baseline_uses_default_regression_objective_per_slice_16h(tmp_path: Path) -> None:
    """Training succeeds for both targets and no per-target objective override remains."""
    # Arrange — slice 16g originally switched sap_score + peui_ucl to
    # objective='mape'; slice 16h's 250k ablation showed that lost ~0.6 pts
    # of global MAPE because mape over-weights the low-SAP tail. Reverted
    # to default 'regression' for all targets; tail strategy moves to
    # sample weights in slice 16i.
    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset(n=300)
    df["peui_ucl"] = df["sap_score"].astype(float) + 5.0
    # Act
    metrics = train_baseline(
        df=df,
        targets=["sap_score", "peui_ucl"],
        storage=storage,
        run_key="runs/2026-05-16/",
        seed=42,
    )
    # Assert
    assert "sap_score" in metrics
    assert "peui_ucl" in metrics
    # Imported here rather than at module top, keeping the private name local
    # to the one test that inspects it.
    from ml_training_data.train_baseline import _OBJECTIVE_OVERRIDES  # noqa: PLC0415

    assert _OBJECTIVE_OVERRIDES == {}
def test_train_baseline_residuals_emitted_per_target_independently(tmp_path: Path) -> None:
    """Each requested target gets its own residuals JSON under the run key."""
    # Arrange
    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset(n=500)
    df["co2_emissions"] = df["sap_score"] * 0.1 + 1.0
    # Act
    train_baseline(
        df=df,
        targets=["sap_score", "co2_emissions"],
        storage=storage,
        run_key="runs/2026-05-16/",
        seed=42,
    )
    # Assert
    assert storage.exists("runs/2026-05-16/residuals_sap_score.json")
    assert storage.exists("runs/2026-05-16/residuals_co2_emissions.json")