diff --git a/backend/documents_parser/pdf.py b/backend/documents_parser/pdf.py
new file mode 100644
index 00000000..73119f07
--- /dev/null
+++ b/backend/documents_parser/pdf.py
@@ -0,0 +1,22 @@
+"""PDF-to-text conversion for the documents parser."""
+
+from typing import List
+
+
+def pdf_to_text_list(pdf_bytes: bytes) -> List[str]:
+    """Return the text content of a PDF as a list of strings.
+
+    Placeholder: extraction is not implemented yet, so every call raises.
+    NOTE(review): the granularity of the returned strings (per page, per
+    line, ...) is not defined here -- confirm against the JSON fixture.
+
+    Args:
+        pdf_bytes: Raw bytes of the PDF document.
+
+    Returns:
+        A list of strings extracted from the document.
+
+    Raises:
+        NotImplementedError: Always, until the parser is implemented.
+    """
+    raise NotImplementedError("pdf_to_text_list is not implemented yet")
diff --git a/backend/documents_parser/tests/fixtures/ExampleSiteNotes.pdf b/backend/documents_parser/tests/fixtures/ExampleSiteNotes.pdf
new file mode 100644
index 00000000..402d38aa
Binary files /dev/null and b/backend/documents_parser/tests/fixtures/ExampleSiteNotes.pdf differ
diff --git a/backend/documents_parser/tests/test_pdf.py b/backend/documents_parser/tests/test_pdf.py
new file mode 100644
index 00000000..3a6dd2fb
--- /dev/null
+++ b/backend/documents_parser/tests/test_pdf.py
@@ -0,0 +1,42 @@
+"""Tests for ``pdf_to_text_list`` using the bundled example site-notes PDF."""
+
+import json
+import os
+
+import pytest
+
+from backend.documents_parser.pdf import pdf_to_text_list
+
+# Input PDF and the expected extraction result, both stored next to this module.
+PDF_PATH = os.path.join(os.path.dirname(__file__), "fixtures", "ExampleSiteNotes.pdf")
+FIXTURE_PATH = os.path.join(os.path.dirname(__file__), "fixtures", "site_notes_example_text.json")
+
+
+@pytest.fixture
+def pdf_bytes() -> bytes:
+    """Return the raw bytes of the example PDF fixture."""
+    with open(PDF_PATH, "rb") as f:
+        return f.read()
+
+
+class TestPdfToTextList:
+    """Checks the return type and content of ``pdf_to_text_list``."""
+
+    def test_returns_list(self, pdf_bytes: bytes) -> None:
+        """The result is a ``list``."""
+        result = pdf_to_text_list(pdf_bytes)
+        assert isinstance(result, list)
+
+    def test_all_elements_are_strings(self, pdf_bytes: bytes) -> None:
+        """Every element of the result is a ``str``."""
+        result = pdf_to_text_list(pdf_bytes)
+        assert all(isinstance(t, str) for t in result)
+
+    def test_matches_fixture(self, pdf_bytes: bytes) -> None:
+        """The result equals the expected value stored in the JSON fixture."""
+        # JSON is UTF-8; pin the encoding so the test does not depend on the
+        # platform's default locale encoding.
+        with open(FIXTURE_PATH, encoding="utf-8") as f:
+            expected = json.load(f)
+        result = pdf_to_text_list(pdf_bytes)
+        assert result == expected