mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Per ADR-0008: the v15 baseline reports MAPE but optimises MSE, which under-weights tail rows. Switching to objective='mape' applies gradient proportional to 1/|y| and lets the model focus where MAPE penalises. Targets co2_emissions, space_heating_kwh, hot_water_kwh, and peui_raw retain the default 'regression' objective (some rows have ~zero CO2 from heavy PV; MAPE objective destabilises near zero). Sample weights deferred to slice 16i if slice 16h's per-decile residuals still show tail bias after the objective switch.
170 lines
5.6 KiB
Python
170 lines
5.6 KiB
Python
"""Tests for train_baseline() — fits one LightGBM regressor per target.
|
|
|
|
train_baseline produces the baseline metrics (MAPE + R^2) and dumps per-target
|
|
feature-importance JSON to storage. This is the only stage that pulls in
|
|
LightGBM + sklearn; downstream training repos read the metrics + parquet only.
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from ml_training_data.storage import LocalStorage
|
|
from ml_training_data.train_baseline import train_baseline
|
|
|
|
|
|
def _synthetic_dataset(n: int = 200, seed: int = 0) -> pd.DataFrame:
    """Build a small, reproducible frame with two features and one target.

    Args:
        n: Number of rows to generate.
        seed: Seed for the NumPy random generator; the same ``(n, seed)`` pair
            always yields the same frame.

    Returns:
        A frame with four columns:

        * ``certificate_number`` -- ``"CN-0000"``-style row identifiers.
        * ``total_floor_area_m2`` -- floats drawn uniformly from [40, 200).
        * ``wall_count`` -- integers drawn from 1 to 4 inclusive.
        * ``sap_score`` -- integer target derived from the two features.
    """
    rng = np.random.default_rng(seed)
    # Draw order (uniform -> integers -> normal) is part of the fixture's
    # determinism: reordering these calls changes every generated value.
    floor_area = rng.uniform(40, 200, size=n)
    walls = rng.integers(1, 5, size=n)
    noise = rng.normal(0, 2, size=n)
    # sap_score correlates with floor_area + walls, plus noise.
    sap_score = (100 - 0.2 * floor_area + 3 * walls + noise).astype(int)
    return pd.DataFrame(
        {
            "certificate_number": [f"CN-{i:04d}" for i in range(n)],
            "total_floor_area_m2": floor_area,
            "wall_count": walls,
            "sap_score": sap_score,
        }
    )
|
|
|
|
|
|
def test_train_baseline_returns_mape_and_r2_per_target(tmp_path: Path) -> None:
    """A single-target run reports both ``mape`` and ``r2`` for that target."""
    # Arrange
    frame = _synthetic_dataset()
    store = LocalStorage(root=tmp_path)

    # Act
    result = train_baseline(
        df=frame,
        targets=["sap_score"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    assert "sap_score" in result
    sap_metrics = result["sap_score"]
    assert "mape" in sap_metrics
    assert "r2" in sap_metrics
    # The synthetic target is derived from the features, so the model should
    # learn something and score a positive R^2.
    assert sap_metrics["r2"] > 0.0
|
|
|
|
|
|
def test_train_baseline_writes_feature_importance_per_target(tmp_path: Path) -> None:
    """Training stores a numeric feature-importance JSON for the target.

    The expected key set contains only the two feature columns, so the
    identifier column and the target itself must not appear in the file.
    """
    # Arrange
    frame = _synthetic_dataset()
    store = LocalStorage(root=tmp_path)

    # Act
    train_baseline(
        df=frame,
        targets=["sap_score"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    raw = store.read_bytes("runs/2026-05-16/importance_sap_score.json")
    importance = json.loads(raw)
    assert set(importance.keys()) == {"total_floor_area_m2", "wall_count"}
    for value in importance.values():
        assert isinstance(value, (int, float))
|
|
|
|
|
|
def test_train_baseline_handles_multiple_targets_independently(tmp_path: Path) -> None:
    """Two targets in one run each get metrics and their own importance file."""
    # Arrange
    store = LocalStorage(root=tmp_path)
    frame = _synthetic_dataset()
    # Second target: an affine function of sap_score, so it is correlated too.
    frame["co2_emissions"] = frame["sap_score"] * 0.1 + 1.0

    # Act
    result = train_baseline(
        df=frame,
        targets=["sap_score", "co2_emissions"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    assert set(result.keys()) == {"sap_score", "co2_emissions"}
    expected_artifacts = (
        "runs/2026-05-16/importance_sap_score.json",
        "runs/2026-05-16/importance_co2_emissions.json",
        "runs/2026-05-16/metrics.json",
    )
    for artifact_key in expected_artifacts:
        assert store.exists(artifact_key)
|
|
|
|
|
|
def test_train_baseline_writes_per_decile_residuals_per_target(tmp_path: Path) -> None:
    """The residuals JSON holds ten ordered buckets carrying the expected fields."""
    # Arrange
    frame = _synthetic_dataset(n=500)
    store = LocalStorage(root=tmp_path)

    # Act
    train_baseline(
        df=frame,
        targets=["sap_score"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    payload = json.loads(store.read_bytes("runs/2026-05-16/residuals_sap_score.json"))
    assert "buckets" in payload
    buckets = payload["buckets"]
    assert len(buckets) == 10

    required_fields = {"decile", "true_min", "true_max", "count", "mape", "mae", "mean_residual"}
    for bucket in buckets:
        # Extra fields are tolerated; only the required ones must be present.
        assert required_fields.issubset(bucket.keys())

    # The 10 bucket counts sum to the test-set size (this test expects the
    # held-out split to be 20% of the rows).
    total_rows = sum(bucket["count"] for bucket in buckets)
    assert total_rows == int(len(frame) * 0.2)

    # Buckets are ordered by true_min ascending.
    lower_bounds = [bucket["true_min"] for bucket in buckets]
    assert lower_bounds == sorted(lower_bounds)
|
|
|
|
|
|
def test_train_baseline_uses_mape_objective_for_sap_score_and_peui_ucl(tmp_path: Path) -> None:
    """sap_score and peui_ucl are mapped to the ``mape`` objective and still train.

    Per ADR-0008 these two targets should use objective="mape". The fitted
    regressor's objective can't be inspected reliably after the fit, so the
    test checks two things instead: training completes (LightGBM raises if the
    objective name is unknown) and the per-target override map is wired.
    """
    # Arrange
    store = LocalStorage(root=tmp_path)
    frame = _synthetic_dataset(n=300)
    frame["peui_ucl"] = frame["sap_score"].astype(float) + 5.0
    mape_targets = ("sap_score", "peui_ucl")

    # Act
    result = train_baseline(
        df=frame,
        targets=["sap_score", "peui_ucl"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert — both targets fit successfully under the mape objective.
    for target in mape_targets:
        assert target in result

    # Verify the override map is present and contains both targets.
    from ml_training_data.train_baseline import _OBJECTIVE_OVERRIDES  # noqa: PLC0415

    for target in mape_targets:
        assert _OBJECTIVE_OVERRIDES.get(target) == "mape"
    # co2_emissions has no override entry.
    assert _OBJECTIVE_OVERRIDES.get("co2_emissions") is None
|
|
|
|
|
|
def test_train_baseline_residuals_emitted_per_target_independently(tmp_path: Path) -> None:
    """Each requested target gets its own residuals JSON under the run key."""
    # Arrange
    frame = _synthetic_dataset(n=500)
    frame["co2_emissions"] = frame["sap_score"] * 0.1 + 1.0
    store = LocalStorage(root=tmp_path)

    # Act
    train_baseline(
        df=frame,
        targets=["sap_score", "co2_emissions"],
        storage=store,
        run_key="runs/2026-05-16/",
        seed=42,
    )

    # Assert
    for target in ("sap_score", "co2_emissions"):
        assert store.exists(f"runs/2026-05-16/residuals_{target}.json")
|