Model/backend/documents_parser/pdf.py
2026-04-16 14:46:54 +00:00

12 lines
318 B
Python

from typing import List
import pymupdf
def pdf_to_text_list(pdf_bytes: bytes) -> List[str]:
    """Return the text of an in-memory PDF as one flat list of lines.

    The document is opened from ``pdf_bytes`` with ``pymupdf.open`` (as a
    stream with ``filetype="pdf"``). Each page's ``get_text()`` output is
    split on ``"\\n"`` and the pieces are collected in page order.

    No filtering or stripping is applied: every piece produced by the
    split is kept, including empty strings.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The lines of every page, concatenated in page order.
    """
    collected: List[str] = []
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as document:
        for page in document:
            # One extend per page keeps the page order and every split piece.
            collected.extend(page.get_text().split("\n"))
    return collected