mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-08 11:17:29 +00:00
pdf reader works
This commit is contained in:
parent
d25d699000
commit
c9ee97d70a
2 changed files with 10 additions and 5 deletions
|
|
@ -1,13 +1,15 @@
|
|||
import os
|
||||
from filePathValidator.retrohome import RetroHomeFileStructureValidator
|
||||
from pdfReader.pdfReaderToText import pdfReaderToText
|
||||
from pprint import pprint
|
||||
DATA_LOC = "/workspaces/survey-extraction/data/"
|
||||
INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
|
||||
|
||||
def main():
    """Extract the text of the sample PDF and pretty-print it line by line.

    A single reader instance is used. The reader appears to run the
    extraction while it is being constructed, so calling
    ``get_text_from_pdf_file()`` again on a separate throwaway instance only
    re-parsed the same PDF and discarded the result.
    """
    # Left disabled, as in the original.
    # RetroHomeFileStructureValidator(DATA_LOC)

    reader = pdfReaderToText(INTERESTING_FILE_LOC)
    pprint(reader.get_list_of_test())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -8,7 +8,9 @@ class pdfReaderToText():
|
|||
self.source_path = file_path
|
||||
self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger()
|
||||
self.all_text = ""
|
||||
|
||||
self.text_list = []
|
||||
self.get_text_from_pdf_file()
|
||||
|
||||
def get_text_from_pdf_file(self):
|
||||
self.logger.debug(f"Extrating text from {self.source_path}")
|
||||
pdf = pymupdf.open(self.source_path)
|
||||
|
|
@ -17,6 +19,7 @@ class pdfReaderToText():
|
|||
text = page.get_text()
|
||||
self.all_text += text
|
||||
|
||||
|
||||
from pprint import pprint
|
||||
pprint(self.all_text.split('\n'))
|
||||
self.text_list = self.all_text.split('\n')
|
||||
|
||||
def get_list_of_test(self) -> list:
    """Return the list currently stored in ``self.text_list``.

    The very same list object is handed back (no copy is made).
    """
    lines = self.text_list
    return lines
|
||||
Loading…
Add table
Reference in a new issue