site notes pdf to json 🟥

This commit is contained in:
Daniel Roth 2026-04-16 14:45:28 +00:00
parent 326ed20015
commit bc527a039f
3 changed files with 36 additions and 0 deletions

View file

@ -0,0 +1,5 @@
from typing import List
def pdf_to_text_list(pdf_bytes: bytes) -> List[str]:
    """Convert the raw bytes of a PDF document into a list of text strings.

    This is a placeholder: the extraction logic has not been written yet,
    so every call raises.

    Args:
        pdf_bytes: Raw contents of a PDF file.

    Returns:
        A list of strings extracted from the document (once implemented).

    Raises:
        NotImplementedError: Always, until the parser is implemented.
    """
    # Name the function in the message so a failing call site is easy to trace.
    raise NotImplementedError("pdf_to_text_list is not implemented yet")

Binary file not shown.

View file

@ -0,0 +1,31 @@
import json
import os
import pytest
from backend.documents_parser.pdf import pdf_to_text_list
# Test inputs live in a "fixtures" directory next to this test module.
_FIXTURES_DIR = os.path.join(os.path.dirname(__file__), "fixtures")

# Example PDF that is fed to the parser under test.
PDF_PATH = os.path.join(_FIXTURES_DIR, "ExampleSiteNotes.pdf")

# JSON file holding the output the parser is expected to produce for that PDF.
FIXTURE_PATH = os.path.join(_FIXTURES_DIR, "site_notes_example_text.json")
@pytest.fixture
def pdf_bytes() -> bytes:
    """Provide the raw bytes of the PDF file located at ``PDF_PATH``."""
    with open(PDF_PATH, mode="rb") as pdf_file:
        contents = pdf_file.read()
    return contents
class TestPdfToTextList:
    """Checks for ``pdf_to_text_list`` using the PDF supplied by ``pdf_bytes``."""

    def test_returns_list(self, pdf_bytes: bytes) -> None:
        """The parser returns a ``list``."""
        result = pdf_to_text_list(pdf_bytes)
        assert isinstance(result, list)

    def test_all_elements_are_strings(self, pdf_bytes: bytes) -> None:
        """Every element of the returned collection is a ``str``."""
        result = pdf_to_text_list(pdf_bytes)
        assert all(isinstance(t, str) for t in result)

    def test_matches_fixture(self, pdf_bytes: bytes) -> None:
        """The parser output equals the JSON content stored at ``FIXTURE_PATH``."""
        # JSON is UTF-8 by specification; pin the encoding so the comparison
        # does not depend on the platform's default locale encoding.
        with open(FIXTURE_PATH, encoding="utf-8") as f:
            expected = json.load(f)
        result = pdf_to_text_list(pdf_bytes)
        assert result == expected