diff --git a/etl/src/etl/pdfReader/pdfReaderToText.py b/etl/src/etl/pdfReader/pdfReaderToText.py index a050a34..1dfa2de 100644 --- a/etl/src/etl/pdfReader/pdfReaderToText.py +++ b/etl/src/etl/pdfReader/pdfReaderToText.py @@ -7,13 +7,16 @@ class pdfReaderToText(): def __init__(self, file_path): self.source_path = file_path self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger() - self.text = "" + self.all_text = "" def get_text_from_pdf_file(self): self.logger.debug(f"Extrating text from {self.source_path}") pdf = pymupdf.open(self.source_path) for page in pdf: - text = page.get_text().encode("utf8") - self.logger('###') - self.logger.info(text) \ No newline at end of file + text = page.get_text() + self.all_text += text + + + from pprint import pprint + pprint(self.all_text.split('\n')) \ No newline at end of file