Model/services/ml_training_data/tests/unit/test_storage.py
Khalim Conn-Kowlessar 0ff9d546b8 slice 14c: BulkZipReader streams certs from gov bulk JSON ZIP
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-16 18:27:24 +00:00

90 lines
2.3 KiB
Python

"""Tests for LocalStorage — fs-backed Storage protocol for the training pipeline.
Storage is the swap-point between local-dev (LocalStorage rooted at ./data/) and the
eventual S3-backed impl. Downstream stages (bulk_fetch, write_parquet) talk to the
Storage protocol only, not Path.
"""
from pathlib import Path
import pytest
from ml_training_data.storage import LocalStorage
def test_write_bytes_then_read_bytes_returns_same_data(tmp_path: Path) -> None:
    """Bytes written under a key are read back unchanged from that same key."""
    # Arrange
    store = LocalStorage(root=tmp_path)
    key = "greetings/hello.txt"
    expected = b"hello world"

    # Act
    store.write_bytes(key, expected)
    actual = store.read_bytes(key)

    # Assert
    assert actual == expected
def test_exists_is_false_before_write_and_true_after(tmp_path: Path) -> None:
    """exists() is False for a key before it is written and True afterwards."""
    # Arrange
    store = LocalStorage(root=tmp_path)
    key = "a/b.bin"

    # Act
    existed_before = store.exists(key)
    store.write_bytes(key, b"x")
    existed_after = store.exists(key)

    # Assert
    # Identity checks (not truthiness) so a non-bool return value fails the test.
    assert existed_before is False
    assert existed_after is True
def test_iter_keys_yields_every_written_key(tmp_path: Path) -> None:
    """iter_keys() with no arguments yields each key that was written."""
    # Arrange
    store = LocalStorage(root=tmp_path)
    fixtures = {
        "certs/a.json": b"1",
        "certs/b.json": b"2",
        "manifest.json": b"3",
    }
    for key, data in fixtures.items():
        store.write_bytes(key, data)

    # Act
    # Sort before comparing: the test does not depend on iteration order.
    found = list(store.iter_keys())
    found.sort()

    # Assert
    assert found == ["certs/a.json", "certs/b.json", "manifest.json"]
def test_iter_keys_filters_by_prefix(tmp_path: Path) -> None:
    """iter_keys(prefix=...) yields only the keys that start with the prefix."""
    # Arrange
    store = LocalStorage(root=tmp_path)
    fixtures = {
        "certs/a.json": b"1",
        "certs/b.json": b"2",
        "manifest.json": b"3",
    }
    for key, data in fixtures.items():
        store.write_bytes(key, data)

    # Act
    # Sort before comparing: the test does not depend on iteration order.
    matched = list(store.iter_keys(prefix="certs/"))
    matched.sort()

    # Assert
    assert matched == ["certs/a.json", "certs/b.json"]
def test_read_bytes_raises_filenotfound_for_missing_key(tmp_path: Path) -> None:
    """read_bytes() on a key that was never written raises FileNotFoundError."""
    # Arrange
    empty_store = LocalStorage(root=tmp_path)
    missing_key = "nope.bin"

    # Act / Assert
    with pytest.raises(FileNotFoundError):
        empty_store.read_bytes(missing_key)
def test_open_read_returns_seekable_binary_stream(tmp_path: Path) -> None:
    """open_read() gives a context-managed stream that supports seek() and read()."""
    # Arrange
    store = LocalStorage(root=tmp_path)
    key = "big.bin"
    store.write_bytes(key, b"abcdefghij")
    offset, length = 4, 3

    # Act
    with store.open_read(key) as stream:
        stream.seek(offset)
        window = stream.read(length)

    # Assert
    # Bytes 4..6 of b"abcdefghij" are b"efg".
    assert window == b"efg"