def _sample_certs(
    n: int,
    seed: int,
    *,
    min_score: int = 5,
    max_score: int = 99,
) -> dict[str, int]:
    """Draw a reproducible random sample of certificates with their SAP scores.

    Reads the ``certificate_number`` and ``sap_score`` columns from the
    parquet file at ``_PARQUET``, keeps the rows whose score lies in the
    closed interval ``[min_score, max_score]``, and samples ``n`` of them.

    Args:
        n: Number of rows to sample from the filtered frame.
        seed: Passed to ``DataFrame.sample`` as ``random_state`` so the same
            seed reproduces the same sample.
        min_score: Lowest ``sap_score`` kept (inclusive).
        max_score: Highest ``sap_score`` kept (inclusive).

    Returns:
        Mapping of ``certificate_number`` to ``sap_score`` cast to ``int``.
        NOTE(review): if ``certificate_number`` is not unique in the parquet
        file, duplicate keys collapse and the dict can hold fewer than ``n``
        entries -- confirm against the data.

    Raises:
        ValueError: Raised by pandas when ``n`` exceeds the number of rows
            left after filtering (sampling is without replacement).
    """
    df = pd.read_parquet(_PARQUET, columns=["certificate_number", "sap_score"])
    # Wide default window so the sample includes full-SAP new-builds at the
    # top of the scale and the deepest-tail heritage/anomaly certs at the
    # bottom. The previous 20-95 window dropped every score below 20 and
    # above 95, which is where the calculator's biggest spec gaps tend to
    # live. Both bounds are inclusive.
    df = df[df["sap_score"].between(min_score, max_score)]
    s = df.sample(n, random_state=seed)
    return dict(zip(s["certificate_number"], s["sap_score"].astype(int)))