From c9ee97d70af79fc20ae58a6816d8b7fb9e77a38f Mon Sep 17 00:00:00 2001
From: Jun-te Kim
Date: Tue, 4 Mar 2025 12:38:31 +0000
Subject: [PATCH] pdf reader works

---
 etl/src/etl/main.py                      |  4 +++-
 etl/src/etl/pdfReader/pdfReaderToText.py | 11 +++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py
index ebdcf10..34c7b9c 100644
--- a/etl/src/etl/main.py
+++ b/etl/src/etl/main.py
@@ -1,13 +1,15 @@
 import os
 from filePathValidator.retrohome import RetroHomeFileStructureValidator
 from pdfReader.pdfReaderToText import pdfReaderToText
+from pprint import pprint
 
 DATA_LOC = "/workspaces/survey-extraction/data/"
 INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
 
 def main():
     # RetroHomeFileStructureValidator(DATA_LOC)
-    pdfReaderToText(INTERESTING_FILE_LOC).get_text_from_pdf_file()
+    list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
+    pprint(list_)
 
 
 if __name__ == "__main__":
diff --git a/etl/src/etl/pdfReader/pdfReaderToText.py b/etl/src/etl/pdfReader/pdfReaderToText.py
index 1dfa2de..f10fc65 100644
--- a/etl/src/etl/pdfReader/pdfReaderToText.py
+++ b/etl/src/etl/pdfReader/pdfReaderToText.py
@@ -8,7 +8,9 @@ class pdfReaderToText():
         self.source_path = file_path
         self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger()
         self.all_text = ""
-
+        self.text_list = []
+        self.get_text_from_pdf_file()
+
     def get_text_from_pdf_file(self):
         self.logger.debug(f"Extrating text from {self.source_path}")
         pdf = pymupdf.open(self.source_path)
@@ -17,6 +19,7 @@ class pdfReaderToText():
             text = page.get_text()
             self.all_text += text
 
-
-        from pprint import pprint
-        pprint(self.all_text.split('\n'))
\ No newline at end of file
+        self.text_list = self.all_text.split('\n')
+
+    def get_list_of_test(self):
+        return self.text_list
\ No newline at end of file