diff --git a/services/ml_training_data/src/ml_training_data/sap_parity_probe.py b/services/ml_training_data/src/ml_training_data/sap_parity_probe.py
index 35e6dcfa..beb64783 100644
--- a/services/ml_training_data/src/ml_training_data/sap_parity_probe.py
+++ b/services/ml_training_data/src/ml_training_data/sap_parity_probe.py
@@ -23,8 +23,11 @@ import pandas as pd
 
 from datatypes.epc.domain.mapper import EpcPropertyDataMapper
 from domain.sap.calculator import calculate_sap_from_inputs
-from domain.sap.rdsap.cert_to_inputs import cert_to_inputs
-from domain.sap.tables.table_12_cert_calibration import cert_calibration_prices
+from domain.sap.rdsap.cert_to_inputs import (
+    PriceTable,
+    SAP_10_2_SPEC_PRICES,
+    cert_to_inputs,
+)
 from ml_training_data.bulk_zip_reader import BulkZipReader
 from ml_training_data.storage import LocalStorage
 
@@ -35,6 +38,19 @@ _BULK = _REPO / "data" / "ml_training" / "bulk"
 _ZIP_KEYS = ("certificates-2025.json.zip", "certificates-2026.json.zip")
 
 
+def predict_sap_for_cert(
+    cert_document: dict[str, Any], *, prices: PriceTable
+) -> int:
+    """Return the rounded RdSAP-style SAP score predicted for one cert.
+
+    Runs the mapper → cert_to_inputs → calculator pipeline; the pure-function
+    seam the corpus probe and any future per-cert dev tools share."""
+    epc = EpcPropertyDataMapper.from_api_response(cert_document)
+    inputs = cert_to_inputs(epc, prices=prices)
+    result = calculate_sap_from_inputs(inputs)
+    return result.sap_score
+
+
 def _sample_certs(n: int, seed: int) -> dict[str, int]:
     df = pd.read_parquet(_PARQUET, columns=["certificate_number", "sap_score"])
     # Wide range so the sample includes full-SAP new-builds (sap_score 90+)
@@ -52,9 +68,12 @@ def main(argv: list[str] | None = None) -> None:
     seed = int(args[1]) if len(args) > 1 else 7
 
     targets = _sample_certs(n, seed)
-    print(f"Sampling {len(targets)} certs (seed={seed}) — using cert-calibration prices")
+    print(
+        f"Sampling {len(targets)} certs (seed={seed}) — using SAP 10.2 "
+        f"(14-03-2025) spec prices per ADR-0010"
+    )
     storage = LocalStorage(_BULK)
-    prices = cert_calibration_prices()
+    prices = SAP_10_2_SPEC_PRICES
     results: list[dict[str, Any]] = []
     errors: list[dict[str, Any]] = []
     remaining = set(targets)
diff --git a/services/ml_training_data/tests/unit/test_sap_parity_probe.py b/services/ml_training_data/tests/unit/test_sap_parity_probe.py
new file mode 100644
index 00000000..0403331d
--- /dev/null
+++ b/services/ml_training_data/tests/unit/test_sap_parity_probe.py
@@ -0,0 +1,50 @@
+"""Tests for sap_parity_probe — the per-cert SAP10 prediction pipeline
+exercised by the corpus-wide parity probe.
+
+P2.1 (ADR-0010): the probe now uses SAP 10.2 (14-03-2025) spec prices
+exclusively. The extracted `predict_sap_for_cert` pure function exists
+so the spec-prices path is unit-testable in isolation.
+"""
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any, cast
+
+from domain.sap.rdsap.cert_to_inputs import SAP_10_2_SPEC_PRICES
+from ml_training_data.sap_parity_probe import predict_sap_for_cert
+
+
+_REPO_ROOT = Path(__file__).resolve().parents[4]
+_GOLDEN_FIXTURES = (
+    _REPO_ROOT
+    / "packages/domain/src/domain/sap/rdsap/tests/fixtures/golden"
+)
+
+_EXPECTED_SCORE_6035_7729: int = 67
+
+
+def _load_cert_document(cert_number: str) -> dict[str, Any]:
+    """Load the golden-fixture cert JSON named ``<cert_number>.json``."""
+    path = _GOLDEN_FIXTURES / f"{cert_number}.json"
+    return cast(dict[str, Any], json.loads(path.read_text(encoding="utf-8")))
+
+
+def test_predict_sap_for_cert_returns_spec_prices_score_for_6035_7729() -> None:
+    # Arrange
+    # Cert 6035-7729-2309-0879-2296: mid-terrace, age band A, gas combi
+    # boiler code 104, TFA 128 m². One of the previously-retired golden
+    # fixtures; the cert JSON is preserved on disk as reference data
+    # (ADR-0010 §10), the test against it is new and pins the
+    # spec-prices SAP score (not the cert SAP).
+    cert_document = _load_cert_document("6035-7729-2309-0879-2296")
+
+    # Act
+    score = predict_sap_for_cert(cert_document, prices=SAP_10_2_SPEC_PRICES)
+
+    # Assert
+    # Pinned in GREEN of P2.1. If this drifts, either the calculator's
+    # spec-correct output for this fixture has genuinely moved (a real
+    # behavioural change worth investigating) or the SAP 10.2 spec
+    # prices in table_12.py have changed.
+    assert score == _EXPECTED_SCORE_6035_7729