mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-30 13:10:56 +00:00
26 lines
No EOL
710 B
Python
26 lines
No EOL
710 B
Python
from etl.utils.logger import Logger
|
|
import logging
|
|
import pymupdf
|
|
|
|
|
|
class pdfReaderToText():
    """Extract the text of a PDF file and expose it as a list of lines.

    The text is extracted eagerly when the object is constructed.

    Attributes:
        source_path: The value passed as ``file_path``; handed unchanged to
            ``pymupdf.open``.
        logger: Logger obtained from the project ``Logger`` helper.
        all_text: Text of every page, concatenated in page order.
        text_list: ``all_text`` split on ``'\\n'``.
    """

    def __init__(self, file_path):
        """Store the source path and immediately extract the document text.

        Args:
            file_path: Location of the PDF to read.
        """
        self.source_path = file_path
        self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger()
        self.all_text = ""
        self.text_list = []
        self.get_text_from_pdf_file()

    def get_text_from_pdf_file(self) -> None:
        """Read every page of the PDF and refresh ``all_text`` / ``text_list``.

        The text is rebuilt from scratch on each call, so calling this method
        again re-reads the file instead of appending a second copy of the
        document to the text already stored.
        """
        self.logger.debug(f"Extrating text from {self.source_path}")

        # The context manager closes the document even when a page fails to
        # extract, so the underlying file handle is not leaked.
        with pymupdf.open(self.source_path) as pdf:
            self.all_text = "".join(page.get_text() for page in pdf)

        self.text_list = self.all_text.split('\n')

    def get_list_of_test(self) -> list:
        """Return the extracted text as a list of lines.

        The method name is kept as-is so existing callers keep working.
        """
        return self.text_list