From ac1aa56ab1ac692e071294301fd232779cb5bbba Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 19 May 2026 09:51:42 +0000
Subject: [PATCH] P2.1: extract predict_sap_for_cert; swap probe to SAP 10.2
 spec prices

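ADR-0010 P2 deletes the cert-calibration layer. As its first step
(P2.1), the probe now uses SAP_10_2_SPEC_PRICES (already defined in
cert_to_inputs.py) in place of cert_calibration_prices(). This patch
also adds a pure predict_sap_for_cert(cert_document, *, prices) -> int
helper that mirrors main()'s inline mapper -> cert_to_inputs ->
calculator pipeline, so the spec-prices path is unit-testable in
isolation. main() is not rewired to call the helper in this patch; the
helper is also reusable for P3's cohort-filtered probe variant.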

The pinned regression value (SAP=67 for cert 6035-7729 under spec
prices, vs the cert's lodged SAP of 73 under cert-cal prices) lives
in services/ml_training_data/tests/unit/test_sap_parity_probe.py.
It will drift as P4 (PCDB) and the section sweep land their fixes;
that's expected.

cert_calibration_prices is still imported by test_golden_fixtures.py,
and the table_12_cert_calibration module is intact. Both the import and
the module are retired in P2.2/P2.3.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../src/ml_training_data/sap_parity_probe.py  | 27 ++++++++--
 .../tests/unit/test_sap_parity_probe.py       | 50 +++++++++++++++++++
 2 files changed, 73 insertions(+), 4 deletions(-)
 create mode 100644 services/ml_training_data/tests/unit/test_sap_parity_probe.py

diff --git a/services/ml_training_data/src/ml_training_data/sap_parity_probe.py b/services/ml_training_data/src/ml_training_data/sap_parity_probe.py
index 35e6dcfa..beb64783 100644
--- a/services/ml_training_data/src/ml_training_data/sap_parity_probe.py
+++ b/services/ml_training_data/src/ml_training_data/sap_parity_probe.py
@@ -23,8 +23,11 @@ import pandas as pd
 
 from datatypes.epc.domain.mapper import EpcPropertyDataMapper
 from domain.sap.calculator import calculate_sap_from_inputs
-from domain.sap.rdsap.cert_to_inputs import cert_to_inputs
-from domain.sap.tables.table_12_cert_calibration import cert_calibration_prices
+from domain.sap.rdsap.cert_to_inputs import (
+    PriceTable,
+    SAP_10_2_SPEC_PRICES,
+    cert_to_inputs,
+)
 from ml_training_data.bulk_zip_reader import BulkZipReader
 from ml_training_data.storage import LocalStorage
 
@@ -35,6 +38,19 @@ _BULK = _REPO / "data" / "ml_training" / "bulk"
 _ZIP_KEYS = ("certificates-2025.json.zip", "certificates-2026.json.zip")
 
 
+def predict_sap_for_cert(
+    cert_document: dict[str, Any], *, prices: PriceTable
+) -> int:
+    """Run the mapper → cert_to_inputs → calculator pipeline on a single
+    cert document and return the rounded RdSAP-style SAP score. The
+    pure-function seam the corpus probe and any future per-cert dev
+    tools share."""
+    epc = EpcPropertyDataMapper.from_api_response(cert_document)
+    inputs = cert_to_inputs(epc, prices=prices)
+    result = calculate_sap_from_inputs(inputs)
+    return result.sap_score
+
+
 def _sample_certs(n: int, seed: int) -> dict[str, int]:
     df = pd.read_parquet(_PARQUET, columns=["certificate_number", "sap_score"])
     # Wide range so the sample includes full-SAP new-builds (sap_score 90+)
@@ -52,9 +68,12 @@ def main(argv: list[str] | None = None) -> None:
     seed = int(args[1]) if len(args) > 1 else 7
 
     targets = _sample_certs(n, seed)
-    print(f"Sampling {len(targets)} certs (seed={seed}) — using cert-calibration prices")
+    print(
+        f"Sampling {len(targets)} certs (seed={seed}) — using SAP 10.2 "
+        f"(14-03-2025) spec prices per ADR-0010"
+    )
     storage = LocalStorage(_BULK)
-    prices = cert_calibration_prices()
+    prices = SAP_10_2_SPEC_PRICES
     results: list[dict[str, Any]] = []
     errors: list[dict[str, Any]] = []
     remaining = set(targets)
diff --git a/services/ml_training_data/tests/unit/test_sap_parity_probe.py b/services/ml_training_data/tests/unit/test_sap_parity_probe.py
new file mode 100644
index 00000000..0403331d
--- /dev/null
+++ b/services/ml_training_data/tests/unit/test_sap_parity_probe.py
@@ -0,0 +1,50 @@
+"""Tests for sap_parity_probe — the per-cert SAP10 prediction pipeline
+exercised by the corpus-wide parity probe.
+
+P2.1 (ADR-0010): the probe now uses SAP 10.2 (14-03-2025) spec prices
+exclusively. The extracted `predict_sap_for_cert` pure function exists
+so the spec-prices path is unit-testable in isolation.
+"""
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, cast
+
+from domain.sap.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES
+from ml_training_data.sap_parity_probe import predict_sap_for_cert
+
+
+_REPO_ROOT = Path(__file__).resolve().parents[4]
+_GOLDEN_FIXTURES = (
+    _REPO_ROOT
+    / "packages/domain/src/domain/sap/rdsap/tests/fixtures/golden"
+)
+
+
+def _load_cert_document(cert_number: str) -> dict[str, Any]:
+    path = _GOLDEN_FIXTURES / f"{cert_number}.json"
+    return cast(dict[str, Any], json.loads(path.read_text()))
+
+
+def test_predict_sap_for_cert_returns_spec_prices_score_for_6035_7729() -> None:
+    # Arrange
+    # Cert 6035-7729-2309-0879-2296: mid-terrace, age band A, gas combi
+    # boiler code 104, TFA 128 m². One of the previously-retired golden
+    # fixtures; the cert JSON is preserved on disk as reference data
+    # (ADR-0010 §10), the test against it is new and pins the
+    # spec-prices SAP score (not the cert SAP).
+    cert_document = _load_cert_document("6035-7729-2309-0879-2296")
+
+    # Act
+    score = predict_sap_for_cert(cert_document, prices=SAP_10_2_SPEC_PRICES)
+
+    # Assert
+    # Pinned in GREEN of P2.1. If this drifts, either the calculator's
+    # spec-correct output for this fixture has genuinely moved (a real
+    # behavioural change worth investigating) or the SAP 10.2 spec
+    # prices in table_12.py have changed.
+    assert score == _EXPECTED_SCORE_6035_7729
+
+
+_EXPECTED_SCORE_6035_7729: int = 67