From af338dd02b2e8b033c2ec17abf64645bef536354 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 11 Mar 2025 11:06:43 +0000 Subject: [PATCH] added new files to allow data extraction --- etl/main.py | 13 ++++++++----- etl/pdfReader/pdfReaderToText.py | 22 +++++++++++++++++++--- etl/pdfReader/reportType.py | 5 +++++ etl/pdfReader/sitenotes.py | 11 +++++++++++ etl/scraper/scraper.py | 2 +- 5 files changed, 44 insertions(+), 9 deletions(-) create mode 100644 etl/pdfReader/reportType.py create mode 100644 etl/pdfReader/sitenotes.py diff --git a/etl/main.py b/etl/main.py index 08c1284..08c163e 100644 --- a/etl/main.py +++ b/etl/main.py @@ -4,11 +4,7 @@ from etl.scraper.scraper import SharePointScraper, SharePointInstaller from pprint import pprint, pformat import logging from etl.utils.logger import Logger - from etl.validator.validator import DomnaSharePointValidator - -DATA_LOC = "/workspaces/survey-extraction/data/" -INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf" logger = Logger(name="main.py", level=logging.DEBUG).get_logger() @@ -28,10 +24,17 @@ def main(): # list_of_house_ass_names = south_coast_scraper.get_housing_association_names() # logger.info(pformat(list_of_house_ass_names)) - # POC of work completed + # POC of downloading each file south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) south_coast_scraper.download_file_for_each_address() + # POC of pdf reader + DATA_LOC = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf" + pdfReader = pdfReaderToText(DATA_LOC) + siteNoteReader = pdfReader.get_reader() + logger.warning(siteNoteReader.type) + + # logger.info(south_coast_scraper.surveyor_to_housing_assosications) diff --git a/etl/pdfReader/pdfReaderToText.py b/etl/pdfReader/pdfReaderToText.py index 2b0790d..c27b721 100644 --- a/etl/pdfReader/pdfReaderToText.py +++ b/etl/pdfReader/pdfReaderToText.py @@ -1,7 +1,8 @@ from etl.utils.logger import Logger import logging import pymupdf - +from etl.pdfReader.sitenotes import QuidosSiteNotes +from etl.pdfReader.reportType import ReportType class pdfReaderToText(): @@ -11,6 +12,7 @@ class pdfReaderToText(): self.all_text = "" self.text_list = [] self.get_text_from_pdf_file() + self.type = None def get_text_from_pdf_file(self): self.logger.debug(f"Extrating text from {self.source_path}") @@ -22,5 +24,19 @@ class pdfReaderToText(): self.text_list = self.all_text.split('\n') - def get_list_of_test(self): - return self.text_list \ No newline at end of file + def get_list_of_text(self): + return self.text_list + + def get_file_type(self): + if len(self.text_list) > 1: + if "Quidos Ltd using Argyle software BRE approved calculator".lower() in self.text_list[0].lower(): + self.type = ReportType.QUIDOS_SITE_NOTE + return self.type + else: + raise NotImplementedError("New type of file - please contact Jun-te Kim") + + def get_reader(self): + self.get_file_type() + + if self.type.name == ReportType.QUIDOS_SITE_NOTE.name: + return QuidosSiteNotes(self.text_list) \ No newline at end of file diff --git a/etl/pdfReader/reportType.py b/etl/pdfReader/reportType.py new file mode 100644 index 0000000..09f11ef --- /dev/null +++ b/etl/pdfReader/reportType.py @@ -0,0 +1,5 @@ +from enum import Enum + + +class ReportType(Enum): + QUIDOS_SITE_NOTE = 1 \ No newline at end of file diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py new file mode 100644 index 0000000..260761f --- /dev/null +++ b/etl/pdfReader/sitenotes.py @@ -0,0 +1,11 @@ +from etl.pdfReader.reportType import ReportType + +class SiteNotes(): + def __init__(self, data_list): + self.raw_data = data_list + + +class QuidosSiteNotes(SiteNotes): + def __init__(self, data_list): + super().__init__(data_list) + self.type = ReportType.QUIDOS_SITE_NOTE \ No newline at end of file diff --git a/etl/scraper/scraper.py b/etl/scraper/scraper.py index 7867c7f..c1fc0fd 100644 --- a/etl/scraper/scraper.py +++ b/etl/scraper/scraper.py @@ -229,7 +229,7 @@ class SharePointScraper(): def create_temp_file(self, content, path): # Ensure the path is under /tmp/ - path = os.path.join("/tmp", path) + path = os.path.join("/tmp/sharepoint/", path) # Ensure the parent directory exists os.makedirs(os.path.dirname(path), exist_ok=True)