From d25d699000aaf4e37d500a551f6868247b2ab130 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 4 Mar 2025 12:32:23 +0000 Subject: [PATCH] made it into a pretty list --- etl/src/etl/pdfReader/pdfReaderToText.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/etl/src/etl/pdfReader/pdfReaderToText.py b/etl/src/etl/pdfReader/pdfReaderToText.py index a050a34..1dfa2de 100644 --- a/etl/src/etl/pdfReader/pdfReaderToText.py +++ b/etl/src/etl/pdfReader/pdfReaderToText.py @@ -7,13 +7,16 @@ class pdfReaderToText(): def __init__(self, file_path): self.source_path = file_path self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger() - self.text = "" + self.all_text = "" def get_text_from_pdf_file(self): self.logger.debug(f"Extrating text from {self.source_path}") pdf = pymupdf.open(self.source_path) for page in pdf: - text = page.get_text().encode("utf8") - self.logger('###') - self.logger.info(text) \ No newline at end of file + text = page.get_text() + self.all_text += text + + + from pprint import pprint + pprint(self.all_text.split('\n')) \ No newline at end of file