from typing import List

import pymupdf


def pdf_to_text_list(pdf_bytes: bytes) -> List[str]:
    """Return the text lines of a PDF supplied as raw bytes.

    The document is opened from memory with ``pymupdf`` and each page's
    extracted text is split on ``"\n"``. The pieces are returned in page
    order as one flat list. Nothing is filtered out, so any empty strings
    produced by the split are kept.

    Args:
        pdf_bytes: The complete contents of a PDF file.

    Returns:
        Every newline-separated piece of text from every page, in order.
    """
    # The context manager closes the document once extraction is finished.
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as document:
        return [
            text_line
            for pdf_page in document
            for text_line in pdf_page.get_text().split("\n")
        ]