pdf reader works

This commit is contained in:
Jun-te Kim 2025-03-04 12:38:31 +00:00
parent d25d699000
commit c9ee97d70a
2 changed files with 10 additions and 5 deletions

View file

@ -1,13 +1,15 @@
import os
from filePathValidator.retrohome import RetroHomeFileStructureValidator
from pdfReader.pdfReaderToText import pdfReaderToText
from pprint import pprint
# Base data directory; the sample PDF below lives underneath it.
DATA_LOC = "/workspaces/survey-extraction/data/"
# Sample PDF handed to the reader in main(). Built from DATA_LOC so the two
# paths cannot drift apart if the base directory is ever changed.
INTERESTING_FILE_LOC = (
    DATA_LOC
    + "first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
)
def main():
    """Read the sample PDF and pretty-print its text as a list of lines."""
    # RetroHomeFileStructureValidator(DATA_LOC)
    # One reader is enough: pdfReaderToText calls get_text_from_pdf_file()
    # while it is being set up, so calling it again here (on a throwaway
    # instance) only re-parsed the PDF and discarded the result.
    reader = pdfReaderToText(INTERESTING_FILE_LOC)
    pprint(reader.get_list_of_test())
if __name__ == "__main__":

View file

@ -8,7 +8,9 @@ class pdfReaderToText():
self.source_path = file_path
self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger()
self.all_text = ""
self.text_list = []
self.get_text_from_pdf_file()
def get_text_from_pdf_file(self):
self.logger.debug(f"Extrating text from {self.source_path}")
pdf = pymupdf.open(self.source_path)
@ -17,6 +19,7 @@ class pdfReaderToText():
text = page.get_text()
self.all_text += text
from pprint import pprint
pprint(self.all_text.split('\n'))
self.text_list = self.all_text.split('\n')
def get_list_of_test(self):
    """Return the list stored in ``self.text_list``.

    The same list object is handed back (no copy is made), so changes the
    caller makes to it are visible on the instance.
    """
    lines = self.text_list
    return lines