site notes pdf to json 🟩

This commit is contained in:
Daniel Roth 2026-04-16 14:46:54 +00:00
parent bc527a039f
commit 16a8363a2a

View file

@@ -1,5 +1,12 @@
from typing import List
import pymupdf
def pdf_to_text_list(pdf_bytes: bytes) -> List[str]:
    """Return the text of a PDF as a flat list of lines.

    The document is opened from memory with ``pymupdf`` and each page's
    extracted text is split on newline characters. The pieces from all
    pages are concatenated in page order.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        One string per newline-separated piece of page text. Empty strings
        produced by the split are kept, not filtered out.
    """
    tokens: List[str] = []
    # The context manager closes the document even if extraction fails.
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            tokens.extend(page.get_text().split("\n"))
    return tokens