From ac1aa56ab1ac692e071294301fd232779cb5bbba Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 19 May 2026 09:51:42 +0000 Subject: [PATCH] P2.1: extract predict_sap_for_cert; swap probe to SAP 10.2 spec prices ADR-0010 P2: cert-calibration layer is deleted; the probe uses SAP_10_2_SPEC_PRICES (already defined in cert_to_inputs.py). Extracts a pure predict_sap_for_cert(cert_document, *, prices) -> int helper out of main()'s inline pipeline so the spec-prices path is unit-testable in isolation; the helper is also reusable for P3's cohort-filtered probe variant. The pinned regression value (SAP=67 for cert 6035-7729 under spec prices, vs the cert's lodged SAP of 73 under cert-cal prices) lives in services/ml_training_data/tests/unit/test_sap_parity_probe.py. It will drift as P4 (PCDB) and the section sweep land their fixes; that's expected. cert_calibration_prices is still imported by test_golden_fixtures.py and the table_12_cert_calibration module is intact. P2.2/P2.3 retire those. 
Co-Authored-By: Claude Opus 4.7 --- .../src/ml_training_data/sap_parity_probe.py | 27 ++++++++-- .../tests/unit/test_sap_parity_probe.py | 50 +++++++++++++++++++ 2 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 services/ml_training_data/tests/unit/test_sap_parity_probe.py diff --git a/services/ml_training_data/src/ml_training_data/sap_parity_probe.py b/services/ml_training_data/src/ml_training_data/sap_parity_probe.py index 35e6dcfa..beb64783 100644 --- a/services/ml_training_data/src/ml_training_data/sap_parity_probe.py +++ b/services/ml_training_data/src/ml_training_data/sap_parity_probe.py @@ -23,8 +23,11 @@ import pandas as pd from datatypes.epc.domain.mapper import EpcPropertyDataMapper from domain.sap.calculator import calculate_sap_from_inputs -from domain.sap.rdsap.cert_to_inputs import cert_to_inputs -from domain.sap.tables.table_12_cert_calibration import cert_calibration_prices +from domain.sap.rdsap.cert_to_inputs import ( + PriceTable, + SAP_10_2_SPEC_PRICES, + cert_to_inputs, +) from ml_training_data.bulk_zip_reader import BulkZipReader from ml_training_data.storage import LocalStorage @@ -35,6 +38,19 @@ _BULK = _REPO / "data" / "ml_training" / "bulk" _ZIP_KEYS = ("certificates-2025.json.zip", "certificates-2026.json.zip") +def predict_sap_for_cert( + cert_document: dict[str, Any], *, prices: PriceTable +) -> int: + """Run the mapper → cert_to_inputs → calculator pipeline on a single + cert document and return the rounded RdSAP-style SAP score. 
The + pure-function seam the corpus probe and any future per-cert dev + tools share.""" + epc = EpcPropertyDataMapper.from_api_response(cert_document) + inputs = cert_to_inputs(epc, prices=prices) + result = calculate_sap_from_inputs(inputs) + return result.sap_score + + def _sample_certs(n: int, seed: int) -> dict[str, int]: df = pd.read_parquet(_PARQUET, columns=["certificate_number", "sap_score"]) # Wide range so the sample includes full-SAP new-builds (sap_score 90+) @@ -52,9 +68,12 @@ def main(argv: list[str] | None = None) -> None: seed = int(args[1]) if len(args) > 1 else 7 targets = _sample_certs(n, seed) - print(f"Sampling {len(targets)} certs (seed={seed}) — using cert-calibration prices") + print( + f"Sampling {len(targets)} certs (seed={seed}) — using SAP 10.2 " + f"(14-03-2025) spec prices per ADR-0010" + ) storage = LocalStorage(_BULK) - prices = cert_calibration_prices() + prices = SAP_10_2_SPEC_PRICES results: list[dict[str, Any]] = [] errors: list[dict[str, Any]] = [] remaining = set(targets) diff --git a/services/ml_training_data/tests/unit/test_sap_parity_probe.py b/services/ml_training_data/tests/unit/test_sap_parity_probe.py new file mode 100644 index 00000000..0403331d --- /dev/null +++ b/services/ml_training_data/tests/unit/test_sap_parity_probe.py @@ -0,0 +1,50 @@ +"""Tests for sap_parity_probe — the per-cert SAP10 prediction pipeline +exercised by the corpus-wide parity probe. + +P2.1 (ADR-0010): the probe now uses SAP 10.2 (14-03-2025) spec prices +exclusively. The extracted `predict_sap_for_cert` pure function exists +so the spec-prices path is unit-testable in isolation. 
+""" +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any, cast + +from domain.sap.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES +from ml_training_data.sap_parity_probe import predict_sap_for_cert + + +_REPO_ROOT = Path(__file__).resolve().parents[4] +_GOLDEN_FIXTURES = ( + _REPO_ROOT + / "packages/domain/src/domain/sap/rdsap/tests/fixtures/golden" +) + + +def _load_cert_document(cert_number: str) -> dict[str, Any]: + path = _GOLDEN_FIXTURES / f"{cert_number}.json" + return cast(dict[str, Any], json.loads(path.read_text())) + + +def test_predict_sap_for_cert_returns_spec_prices_score_for_6035_7729() -> None: + # Arrange + # Cert 6035-7729-2309-0879-2296: mid-terrace, age band A, gas combi + # boiler code 104, TFA 128 m². One of the previously-retired golden + # fixtures; the cert JSON is preserved on disk as reference data + # (ADR-0010 §10), the test against it is new and pins the + # spec-prices SAP score (not the cert SAP). + cert_document = _load_cert_document("6035-7729-2309-0879-2296") + + # Act + score = predict_sap_for_cert(cert_document, prices=SAP_10_2_SPEC_PRICES) + + # Assert + # Pinned in GREEN of P2.1. If this drifts, either the calculator's + # spec-correct output for this fixture has genuinely moved (a real + # behavioural change worth investigating) or the SAP 10.2 spec + # prices in table_12.py have changed. + assert score == _EXPECTED_SCORE_6035_7729 + + +_EXPECTED_SCORE_6035_7729: int = 67