From 16a8363a2a753db874a92638652cb209d63f3ffb Mon Sep 17 00:00:00 2001
From: Daniel Roth
Date: Thu, 16 Apr 2026 14:46:54 +0000
Subject: [PATCH] =?UTF-8?q?site=20notes=20pdf=20to=20json=20=F0=9F=9F=A9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 backend/documents_parser/pdf.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/backend/documents_parser/pdf.py b/backend/documents_parser/pdf.py
index 73119f07..dfa07300 100644
--- a/backend/documents_parser/pdf.py
+++ b/backend/documents_parser/pdf.py
@@ -1,5 +1,28 @@
 from typing import List
 
+import pymupdf
+
 
 def pdf_to_text_list(pdf_bytes: bytes) -> List[str]:
-    raise NotImplementedError
+    """Return the text of a PDF as one flat list of lines.
+
+    The document is opened from memory with ``pymupdf.open`` and each page
+    yielded by iterating it contributes its ``get_text()`` output, split on
+    newline characters. All pieces go into a single list, so the result
+    carries no page boundaries.
+
+    ``str.split`` keeps empty strings: a blank line in a page's text, or a
+    trailing newline, appears as ``""`` in the result.
+
+    Args:
+        pdf_bytes: Raw bytes of the PDF document.
+
+    Returns:
+        The lines of every page, concatenated in iteration order.
+    """
+    lines: List[str] = []
+    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as document:
+        for pdf_page in document:
+            # extend() adds every split piece, empty strings included.
+            lines.extend(pdf_page.get_text().split("\n"))
+    return lines