mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
17 lines
482 B
Python
17 lines
482 B
Python
from typing import List
|
|
|
|
import pymupdf
|
|
|
|
|
|
def pdf_to_text_list(pdf_bytes: bytes) -> List[str]:
    """Return the text of a PDF as one flat list of lines, in page order.

    The document is opened from the in-memory ``pdf_bytes``. Each page's
    ``get_text()`` output is split on ``"\\n"`` and the pieces from all
    pages are concatenated into a single list.

    Splitting is done on ``"\\n"`` only and nothing is filtered out, so
    empty strings (for example from blank lines or a trailing newline in
    a page's text) are kept in the result.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        All line fragments of all pages, in document order.
    """
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as document:
        return [
            fragment
            for pdf_page in document
            for fragment in pdf_page.get_text().split("\n")
        ]
|
|
|
|
|
|
def pdf_to_pages(pdf_bytes: bytes) -> List[str]:
    """Return the text of each page of a PDF, one string per page.

    The document is opened from the in-memory ``pdf_bytes`` and
    ``get_text()`` is called on every page in iteration order.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        A list whose i-th element is the ``get_text()`` output of the
        i-th page yielded by the document.
    """
    page_texts: List[str] = []
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as document:
        for pdf_page in document:
            page_texts.append(pdf_page.get_text())
    return page_texts
|