"""Tests for LocalStorage — fs-backed Storage protocol for the training pipeline. Storage is the swap-point between local-dev (LocalStorage rooted at ./data/) and the eventual S3-backed impl. Downstream stages (bulk_fetch, write_parquet) talk to the Storage protocol only, not Path. """ from pathlib import Path import pytest from ml_training_data.storage import LocalStorage def test_write_bytes_then_read_bytes_returns_same_data(tmp_path: Path) -> None: # Arrange storage = LocalStorage(root=tmp_path) payload = b"hello world" # Act storage.write_bytes("greetings/hello.txt", payload) out = storage.read_bytes("greetings/hello.txt") # Assert assert out == payload def test_exists_is_false_before_write_and_true_after(tmp_path: Path) -> None: # Arrange storage = LocalStorage(root=tmp_path) # Act before = storage.exists("a/b.bin") storage.write_bytes("a/b.bin", b"x") after = storage.exists("a/b.bin") # Assert assert before is False assert after is True def test_iter_keys_yields_every_written_key(tmp_path: Path) -> None: # Arrange storage = LocalStorage(root=tmp_path) storage.write_bytes("certs/a.json", b"1") storage.write_bytes("certs/b.json", b"2") storage.write_bytes("manifest.json", b"3") # Act keys = sorted(storage.iter_keys()) # Assert assert keys == ["certs/a.json", "certs/b.json", "manifest.json"] def test_iter_keys_filters_by_prefix(tmp_path: Path) -> None: # Arrange storage = LocalStorage(root=tmp_path) storage.write_bytes("certs/a.json", b"1") storage.write_bytes("certs/b.json", b"2") storage.write_bytes("manifest.json", b"3") # Act keys = sorted(storage.iter_keys(prefix="certs/")) # Assert assert keys == ["certs/a.json", "certs/b.json"] def test_read_bytes_raises_filenotfound_for_missing_key(tmp_path: Path) -> None: # Arrange storage = LocalStorage(root=tmp_path) # Act / Assert with pytest.raises(FileNotFoundError): storage.read_bytes("nope.bin") def test_open_read_returns_seekable_binary_stream(tmp_path: Path) -> None: # Arrange storage = LocalStorage(root=tmp_path) storage.write_bytes("big.bin", b"abcdefghij") # Act with storage.open_read("big.bin") as f: f.seek(4) chunk = f.read(3) # Assert assert chunk == b"efg"