Model/services/ml_training_data/tests/unit/test_build_features.py
Khalim Conn-Kowlessar 20fd55d5a1 slice 14d: build_features wires bulk reader -> mapper -> EpcMlTransform
ijson use_float fixes Decimal/float coercion when streaming JSON.
pyright extraPaths so the new pkg type-checks against domna-domain.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-16 18:38:41 +00:00

87 lines
3 KiB
Python

"""Tests for build_features() — JSON cert → EpcPropertyData → feature row.
build_features wires three existing pieces: BulkZipReader yields JSON cert dicts,
EpcPropertyDataMapper.from_api_response parses them into EpcPropertyData, and
EpcMlTransform.to_rows produces the feature+target DataFrame. The function adds a
leading `certificate_number` column so each row stays traceable to its source cert.
"""
import io
import json
import zipfile
from pathlib import Path
import pytest
from ml_training_data.bulk_zip_reader import BulkZipReader
from ml_training_data.build_features import build_features
from ml_training_data.storage import LocalStorage
# Template EPC certificate that _load_fixture_cert() parses for every test cert.
# The path is relative, so it resolves against the current working directory
# at the moment the file is read.
_FIXTURE_PATH = Path("datatypes/epc/schema/tests/fixtures/21_0_0.json")
def _load_fixture_cert(cert_number: str) -> dict[str, object]:
    """Parse the shared fixture cert and stamp it with ``cert_number``.

    Args:
        cert_number: Value stored under the ``certificate_number`` key of the
            returned dict, overwriting any value already present in the fixture.

    Returns:
        The parsed fixture. The file is re-read on every call, so each caller
        gets its own object and may mutate it without affecting other tests.
    """
    # Decode explicitly as UTF-8: a bare read_text() falls back to the
    # platform's locale encoding, which is not guaranteed to be UTF-8.
    cert = json.loads(_FIXTURE_PATH.read_text(encoding="utf-8"))
    cert["certificate_number"] = cert_number
    return cert
def _write_zip(
    storage: LocalStorage,
    key: str,
    entries: dict[str, list[dict[str, object]]],
) -> None:
    """Build an in-memory deflated zip and store it under ``key``.

    Args:
        storage: Destination; only its ``write_bytes(key, data)`` method is used.
        key: Storage key the finished archive is written to.
        entries: Maps each zip member name to the payload that is serialised
            as JSON into that member.
    """
    archive_bytes = io.BytesIO()
    with zipfile.ZipFile(
        archive_bytes, mode="w", compression=zipfile.ZIP_DEFLATED
    ) as archive:
        for member_name in entries:
            serialized = json.dumps(entries[member_name])
            archive.writestr(member_name, serialized)
    # Read the buffer only after the archive is closed, so the bytes handed
    # to storage form a complete zip file.
    storage.write_bytes(key, archive_bytes.getvalue())
def test_build_features_returns_one_row_per_supported_cert(tmp_path: Path) -> None:
    """One supported cert in the zip yields one row, led by its cert number."""
    # Arrange: a bulk zip holding a single fixture-based certificate.
    cert_number = "CERT-001"
    zip_key = "bulk.zip"
    local_storage = LocalStorage(root=tmp_path)
    _write_zip(
        local_storage,
        zip_key,
        {"certificates-2024.json": [_load_fixture_cert(cert_number)]},
    )
    bulk_reader = BulkZipReader(storage=local_storage, zip_key=zip_key)

    # Act
    features = build_features(bulk_reader, certificate_numbers={cert_number})

    # Assert: exactly one row, traceable to the cert, with the id column first.
    assert len(features) == 1
    assert features.iloc[0]["certificate_number"] == cert_number
    assert features.columns[0] == "certificate_number"
def test_build_features_skips_unsupported_schemas_by_default(tmp_path: Path) -> None:
    """With default arguments, only the supported cert appears in the output."""
    # Arrange: two certs in one zip entry, the second with a swapped schema.
    local_storage = LocalStorage(root=tmp_path)
    good_cert = _load_fixture_cert("OK-1")
    bad_cert = {
        **_load_fixture_cert("BAD-1"),
        "schema_type": "RdSAP-Schema-19.0.0",  # not in mapper dispatch
    }
    zip_entries = {"certificates-2024.json": [good_cert, bad_cert]}
    _write_zip(local_storage, "bulk.zip", zip_entries)
    bulk_reader = BulkZipReader(storage=local_storage, zip_key="bulk.zip")

    # Act
    features = build_features(bulk_reader, certificate_numbers={"OK-1", "BAD-1"})

    # Assert: the unsupported cert was dropped without raising.
    surviving_numbers = features["certificate_number"].tolist()
    assert surviving_numbers == ["OK-1"]
def test_build_features_raises_when_skip_unsupported_schemas_is_false(tmp_path: Path) -> None:
    """Passing skip_unsupported_schemas=False raises on an unsupported cert."""
    # Arrange: a zip whose only certificate carries the swapped schema type.
    local_storage = LocalStorage(root=tmp_path)
    bad_cert = {
        **_load_fixture_cert("BAD-1"),
        "schema_type": "RdSAP-Schema-19.0.0",
    }
    _write_zip(local_storage, "bulk.zip", {"certificates-2024.json": [bad_cert]})
    bulk_reader = BulkZipReader(storage=local_storage, zip_key="bulk.zip")

    # Act / Assert
    expected_error = pytest.raises(ValueError, match="Unsupported EPC schema")
    with expected_error:
        build_features(
            bulk_reader,
            certificate_numbers={"BAD-1"},
            skip_unsupported_schemas=False,
        )