"""Tests for train_baseline() — fits one LightGBM regressor per target.

train_baseline produces the baseline metrics (MAPE + R^2) and dumps
per-target feature-importance JSON to storage. This is the only stage that
pulls in LightGBM + sklearn; downstream training repos read the metrics +
parquet only.
"""

import json
from pathlib import Path

import numpy as np
import pandas as pd

from ml_training_data.storage import LocalStorage
from ml_training_data.train_baseline import train_baseline


def _synthetic_dataset(n: int = 200, seed: int = 0) -> pd.DataFrame:
    """Build a reproducible toy frame whose ``sap_score`` tracks its features.

    Args:
        n: Number of rows to generate.
        seed: Seed for the NumPy generator, so repeated calls agree.

    Returns:
        A frame with ``certificate_number``, ``total_floor_area_m2``,
        ``wall_count`` and an integer ``sap_score`` column.
    """
    rng = np.random.default_rng(seed)
    # The draw order (uniform, integers, normal) determines the generated
    # values, so it must stay exactly as is.
    area_m2 = rng.uniform(40, 200, size=n)
    wall_count = rng.integers(1, 5, size=n)
    noise = rng.normal(0, 2, size=n)
    # sap_score is linear in floor area and wall count, plus noise, cast to int.
    score = (100 - 0.2 * area_m2 + 3 * wall_count + noise).astype(int)

    columns = {
        "certificate_number": [f"CN-{i:04d}" for i in range(n)],
        "total_floor_area_m2": area_m2,
        "wall_count": wall_count,
        "sap_score": score,
    }
    return pd.DataFrame(columns)


def test_train_baseline_returns_mape_and_r2_per_target(tmp_path: Path) -> None:
    """A single-target fit reports both MAPE and R^2, with R^2 above zero."""
    # Arrange
    frame = _synthetic_dataset()
    store = LocalStorage(root=tmp_path)

    # Act
    metrics = train_baseline(
        df=frame,
        targets=["sap_score"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    assert "sap_score" in metrics
    sap_metrics = metrics["sap_score"]
    for metric_name in ("mape", "r2"):
        assert metric_name in sap_metrics
    # The target is derived from the features, so the model should learn something.
    assert sap_metrics["r2"] > 0.0


def test_train_baseline_writes_feature_importance_per_target(tmp_path: Path) -> None:
    """The importance JSON maps exactly the two feature columns to numbers."""
    # Arrange
    frame = _synthetic_dataset()
    store = LocalStorage(root=tmp_path)

    # Act
    train_baseline(
        df=frame,
        targets=["sap_score"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    payload = store.read_bytes("runs/2026-05-16/importance_sap_score.json")
    importance = json.loads(payload)
    assert importance.keys() == {"total_floor_area_m2", "wall_count"}
    for value in importance.values():
        assert isinstance(value, (int, float))


def test_train_baseline_handles_multiple_targets_independently(tmp_path: Path) -> None:
    """Two targets give two metric entries, two importance files and metrics.json."""
    # Arrange
    frame = _synthetic_dataset()
    # Second target, an affine function of sap_score, so it is also learnable.
    frame["co2_emissions"] = frame["sap_score"] * 0.1 + 1.0
    store = LocalStorage(root=tmp_path)

    # Act
    metrics = train_baseline(
        df=frame,
        targets=["sap_score", "co2_emissions"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    assert set(metrics.keys()) == {"sap_score", "co2_emissions"}
    expected_artifacts = (
        "runs/2026-05-16/importance_sap_score.json",
        "runs/2026-05-16/importance_co2_emissions.json",
        "runs/2026-05-16/metrics.json",
    )
    for artifact in expected_artifacts:
        assert store.exists(artifact)


def test_train_baseline_writes_per_decile_residuals_per_target(tmp_path: Path) -> None:
    """The residuals JSON holds ten ordered buckets covering the whole test split."""
    # Arrange
    frame = _synthetic_dataset(n=500)
    store = LocalStorage(root=tmp_path)

    # Act
    train_baseline(
        df=frame,
        targets=["sap_score"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    payload = store.read_bytes("runs/2026-05-16/residuals_sap_score.json")
    residuals = json.loads(payload)
    assert "buckets" in residuals
    buckets = residuals["buckets"]
    assert len(buckets) == 10

    required_fields = {
        "decile",
        "true_min",
        "true_max",
        "count",
        "mape",
        "mae",
        "mean_residual",
    }
    for bucket in buckets:
        assert required_fields.issubset(bucket.keys())

    # The 10 bucket counts sum to the test-set size (20% of df).
    total_count = sum(bucket["count"] for bucket in buckets)
    assert total_count == int(len(frame) * 0.2)

    # Buckets are ordered by true_min ascending.
    lower_bounds = [bucket["true_min"] for bucket in buckets]
    assert lower_bounds == sorted(lower_bounds)


def test_train_baseline_uses_mape_objective_for_sap_score_and_peui_ucl(tmp_path: Path) -> None:
    """sap_score and peui_ucl are mapped to the "mape" objective and both fit."""
    # Arrange — sap_score + peui_ucl should use objective="mape" per ADR-0008.
    # LGBMRegressor.objective can't be inspected reliably after fitting, so
    # this test checks that the per-target override map is wired and that
    # training completes (LightGBM raises if the objective name is unknown).
    frame = _synthetic_dataset(n=300)
    frame["peui_ucl"] = frame["sap_score"].astype(float) + 5.0
    store = LocalStorage(root=tmp_path)

    # Act
    metrics = train_baseline(
        df=frame,
        targets=["sap_score", "peui_ucl"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert — both targets fit successfully under the mape objective.
    mape_targets = ("sap_score", "peui_ucl")
    for target in mape_targets:
        assert target in metrics

    # The override map lists both mape targets and has no entry for co2_emissions.
    from ml_training_data.train_baseline import _OBJECTIVE_OVERRIDES  # noqa: PLC0415

    for target in mape_targets:
        assert _OBJECTIVE_OVERRIDES.get(target) == "mape"
    assert _OBJECTIVE_OVERRIDES.get("co2_emissions") is None


def test_train_baseline_residuals_emitted_per_target_independently(tmp_path: Path) -> None:
    """Each requested target gets its own residuals JSON file."""
    # Arrange
    frame = _synthetic_dataset(n=500)
    frame["co2_emissions"] = frame["sap_score"] * 0.1 + 1.0
    store = LocalStorage(root=tmp_path)

    # Act
    train_baseline(
        df=frame,
        targets=["sap_score", "co2_emissions"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    for artifact in (
        "runs/2026-05-16/residuals_sap_score.json",
        "runs/2026-05-16/residuals_co2_emissions.json",
    ):
        assert store.exists(artifact)