survey-extraction/etl/pdfReader/pdfReaderToText.py
2025-03-08 06:38:48 +00:00

26 lines
No EOL
710 B
Python

from etl.utils.logger import Logger
import logging
import pymupdf
class pdfReaderToText():
    """Extract the text of a PDF file at construction time.

    After construction, ``all_text`` holds the text of every page
    concatenated in page order, and ``text_list`` holds that same text
    split on newline characters.
    """

    def __init__(self, file_path):
        """Store *file_path*, set up the logger, and extract the text.

        Args:
            file_path: Location of the PDF; passed unchanged to
                ``pymupdf.open``.
        """
        self.source_path = file_path
        self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger()
        self.all_text = ""
        self.text_list = []
        self.get_text_from_pdf_file()

    def get_text_from_pdf_file(self) -> None:
        """Read every page of the PDF and refresh ``all_text``/``text_list``.

        The text is rebuilt from scratch on each call, so calling this
        method again does not append a second copy of the document.
        """
        self.logger.debug(f"Extrating text from {self.source_path}")
        # Open the document as a context manager so it is closed even if
        # text extraction raises part-way through.
        with pymupdf.open(self.source_path) as pdf:
            page_texts = [page.get_text() for page in pdf]
        # Join once instead of growing the string page by page.
        self.all_text = "".join(page_texts)
        self.text_list = self.all_text.split('\n')

    def get_list_of_test(self) -> list:
        """Return the extracted text as a list of lines.

        The method name (including its spelling) is kept as-is so existing
        callers keep working.
        """
        return self.text_list