# Model/services/ml_training_data/tests/unit/test_train_baseline.py

"""Tests for train_baseline() — fits one LightGBM regressor per target.

train_baseline produces the baseline metrics (MAPE + R^2) and dumps per-target
feature-importance JSON to storage. This is the only stage that pulls in
LightGBM + sklearn; downstream training repos read the metrics + parquet only.
"""

import json
from pathlib import Path

import numpy as np
import pandas as pd

from ml_training_data.storage import LocalStorage
from ml_training_data.train_baseline import train_baseline


def _synthetic_dataset(n: int = 200, seed: int = 0) -> pd.DataFrame:
    """Build a reproducible toy frame with two features and a ``sap_score`` target.

    Args:
        n: Number of rows to generate.
        seed: Seed for the NumPy random generator; equal seeds give equal frames.

    Returns:
        A DataFrame with the columns ``certificate_number`` (``CN-0000`` style
        ids), ``total_floor_area_m2`` (uniform draws from 40 to 200),
        ``wall_count`` (integers 1 to 4) and ``sap_score`` (an integer-cast
        linear mix of both features plus Gaussian noise).
    """
    generator = np.random.default_rng(seed)
    # The draw order is part of the contract: reordering these three calls
    # would change the values produced for a given seed.
    area_m2 = generator.uniform(40, 200, size=n)
    wall_count = generator.integers(1, 5, size=n)
    noise = generator.normal(0, 2, size=n)

    # The target falls with floor area and rises with wall count, so a model
    # fitted on the two features has a real signal to pick up.
    raw_score = 100 - 0.2 * area_m2 + 3 * wall_count + noise

    columns = {
        "certificate_number": [f"CN-{i:04d}" for i in range(n)],
        "total_floor_area_m2": area_m2,
        "wall_count": wall_count,
        "sap_score": raw_score.astype(int),
    }
    return pd.DataFrame(columns)


def test_train_baseline_returns_mape_and_r2_per_target(tmp_path: Path) -> None:
    """train_baseline reports both MAPE and R^2 under the requested target."""
    # Arrange
    frame = _synthetic_dataset()
    local_store = LocalStorage(root=tmp_path)

    # Act
    result = train_baseline(
        df=frame,
        targets=["sap_score"],
        storage=local_store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    assert "sap_score" in result
    sap_metrics = result["sap_score"]
    assert "mape" in sap_metrics
    assert "r2" in sap_metrics
    # The synthetic target is derived from the features, so the fit must beat
    # the R^2 == 0 "predict the mean" baseline.
    assert sap_metrics["r2"] > 0.0


def test_low_sap_tail_weight_returns_3x_for_rows_below_58_else_1x() -> None:
    """low_sap_tail_weight yields 3.0 strictly below SAP 58 and 1.0 from 58 upward."""
    # Arrange — exposed helper so callers wanting the default tail strategy
    # can plug it straight into train_baseline. SAP-rating boundary 58 chosen
    # from slice 16h's per-decile residuals: decile 0 (SAP 1-58) carries 17%
    # MAPE; deciles 1-9 are all below 5%.
    from ml_training_data.train_baseline import low_sap_tail_weight  # noqa: PLC0415

    # 58 itself is part of the fixture to pin the boundary: it must get 1x.
    sap_scores = pd.Series([20, 50, 58, 60, 90])

    # Act
    weights = low_sap_tail_weight(sap_scores)

    # Assert
    assert list(weights) == [3.0, 3.0, 1.0, 1.0, 1.0]


def test_train_baseline_accepts_sample_weight_fn_per_target(tmp_path: Path) -> None:
    """Passing sample_weight_fn to train_baseline changes the reported MAE."""
    # Arrange — sample_weight_fn is a callable taking the training-label Series
    # and returning a Series of weights the same length.  When supplied, the
    # weights are expected to flow into LGBMRegressor.fit's sample_weight
    # argument.  We verify the indirection works by training twice on the same
    # data and seed (no weights vs a heavily weighted low-SAP tail) and
    # confirming the global MAE differs between the two runs.

    # `Any` is only referenced inside the string annotations of weight_tail;
    # importing it keeps those annotations resolvable for type checkers.
    from typing import Any  # noqa: PLC0415

    storage = LocalStorage(root=tmp_path)
    df = _synthetic_dataset(n=600, seed=0)

    def weight_tail(y: "pd.Series[Any]") -> "pd.Series[Any]":
        """Weight labels below 60 ten times heavier than the rest."""
        # Reuse y's index so the weights stay aligned with the label rows.
        return pd.Series(np.where(np.asarray(y, dtype=float) < 60, 10.0, 1.0), index=y.index)

    # Act — separate run keys keep the two runs' artefacts apart in storage.
    m_unweighted = train_baseline(
        df=df.copy(), targets=["sap_score"], storage=storage,
        run_key="runs/unw/", seed=42,
    )
    m_weighted = train_baseline(
        df=df.copy(), targets=["sap_score"], storage=storage,
        run_key="runs/w/", seed=42, sample_weight_fn=weight_tail,
    )

    # Assert — global MAE should differ between weighted and unweighted runs.
    # (Direction depends on data; we just need to see that the weight reached LGBM.)
    assert m_unweighted["sap_score"]["mae"] != m_weighted["sap_score"]["mae"]


def test_train_baseline_reports_mae_and_rmse_per_target(tmp_path: Path) -> None:
    """train_baseline reports MAE and RMSE for the requested target."""
    # Arrange — MAE carries the user-facing "predicted SAP within N points"
    # meaning, while RMSE penalises large errors quadratically.  Reporting both
    # next to MAPE lets us read the residual without inverting MAPE by hand.
    frame = _synthetic_dataset()
    local_store = LocalStorage(root=tmp_path)

    # Act
    result = train_baseline(
        df=frame,
        targets=["sap_score"],
        storage=local_store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    sap_metrics = result["sap_score"]
    assert "mae" in sap_metrics
    assert "rmse" in sap_metrics
    assert sap_metrics["mae"] > 0
    # RMSE can never be smaller than MAE on the same residuals.
    assert sap_metrics["rmse"] >= sap_metrics["mae"]


def test_train_baseline_writes_feature_importance_per_target(tmp_path: Path) -> None:
    """train_baseline stores a numeric importance value for each feature column."""
    # Arrange
    frame = _synthetic_dataset()
    local_store = LocalStorage(root=tmp_path)

    # Act — only the stored artefact matters here, so the return value is ignored.
    train_baseline(
        df=frame, targets=["sap_score"], storage=local_store,
        run_key="runs/2026-05-16/", seed=42,
    )

    # Assert — exactly the two feature columns are present: neither the
    # certificate id nor the target itself shows up as a feature.
    payload = storage_bytes = local_store.read_bytes("runs/2026-05-16/importance_sap_score.json")
    importance = json.loads(storage_bytes)
    assert set(importance.keys()) == {"total_floor_area_m2", "wall_count"}
    for value in importance.values():
        assert isinstance(value, (int, float))


def test_train_baseline_handles_multiple_targets_independently(tmp_path: Path) -> None:
    """Each requested target gets its own metrics entry and importance file."""
    # Arrange
    local_store = LocalStorage(root=tmp_path)
    frame = _synthetic_dataset()
    # Second target: a linear function of sap_score, so it is correlated too.
    frame["co2_emissions"] = frame["sap_score"] * 0.1 + 1.0

    # Act
    result = train_baseline(
        df=frame, targets=["sap_score", "co2_emissions"], storage=local_store,
        run_key="runs/2026-05-16/", seed=42,
    )

    # Assert — one metrics entry per target, one importance file per target,
    # plus the metrics.json file for the run.
    assert set(result.keys()) == {"sap_score", "co2_emissions"}
    expected_artifacts = (
        "runs/2026-05-16/importance_sap_score.json",
        "runs/2026-05-16/importance_co2_emissions.json",
        "runs/2026-05-16/metrics.json",
    )
    for artifact_key in expected_artifacts:
        assert local_store.exists(artifact_key)


def test_train_baseline_writes_per_decile_residuals_per_target(tmp_path: Path) -> None:
    """train_baseline stores ten ordered residual buckets covering the test split."""
    # Arrange
    frame = _synthetic_dataset(n=500)
    local_store = LocalStorage(root=tmp_path)

    # Act
    train_baseline(
        df=frame, targets=["sap_score"], storage=local_store,
        run_key="runs/2026-05-16/", seed=42,
    )

    # Assert
    report = json.loads(local_store.read_bytes("runs/2026-05-16/residuals_sap_score.json"))
    assert "buckets" in report
    buckets = report["buckets"]
    assert len(buckets) == 10

    # Every bucket carries at least these fields; extra fields are allowed.
    required_fields = {"decile", "true_min", "true_max", "count", "mape", "mae", "mean_residual"}
    for entry in buckets:
        assert required_fields <= set(entry.keys())

    # The 10 bucket counts sum to the test-set size (20% of the frame).
    total_rows = sum(entry["count"] for entry in buckets)
    assert total_rows == int(len(frame) * 0.2)

    # Buckets are ordered by true_min ascending.
    lower_bounds = [entry["true_min"] for entry in buckets]
    assert lower_bounds == sorted(lower_bounds)


def test_train_baseline_uses_default_regression_objective_per_slice_16h(tmp_path: Path) -> None:
    """No target carries an objective override after the slice 16h revert."""
    # Arrange — slice 16g originally switched sap_score + peui_ucl to
    # objective='mape'.  Slice 16h's 250k ablation showed that this lost about
    # 0.6 pts of global MAPE because mape over-weights the low-SAP tail, so all
    # targets went back to the default 'regression' objective; the tail
    # strategy moves to sample weights in slice 16i.
    local_store = LocalStorage(root=tmp_path)
    frame = _synthetic_dataset(n=300)
    frame["peui_ucl"] = frame["sap_score"].astype(float) + 5.0

    # Act
    result = train_baseline(
        df=frame, targets=["sap_score", "peui_ucl"], storage=local_store,
        run_key="runs/2026-05-16/", seed=42,
    )

    # Assert — both targets still train, and the override table is empty.
    assert "sap_score" in result
    assert "peui_ucl" in result
    # Imported after the run so the check sees the module's current binding.
    from ml_training_data.train_baseline import _OBJECTIVE_OVERRIDES  # noqa: PLC0415
    assert _OBJECTIVE_OVERRIDES == {}


def test_train_baseline_residuals_emitted_per_target_independently(tmp_path: Path) -> None:
    """Each requested target gets its own residuals file under the run key."""
    # Arrange
    local_store = LocalStorage(root=tmp_path)
    frame = _synthetic_dataset(n=500)
    # Second target: a linear function of sap_score.
    frame["co2_emissions"] = frame["sap_score"] * 0.1 + 1.0

    # Act
    train_baseline(
        df=frame, targets=["sap_score", "co2_emissions"], storage=local_store,
        run_key="runs/2026-05-16/", seed=42,
    )

    # Assert
    expected_artifacts = (
        "runs/2026-05-16/residuals_sap_score.json",
        "runs/2026-05-16/residuals_co2_emissions.json",
    )
    for artifact_key in expected_artifacts:
        assert local_store.exists(artifact_key)