added new files to allow data extraction

This commit is contained in:
Jun-te Kim 2025-03-11 11:06:43 +00:00
parent 298ccdbc38
commit af338dd02b
5 changed files with 44 additions and 9 deletions

View file

@ -4,11 +4,7 @@ from etl.scraper.scraper import SharePointScraper, SharePointInstaller
from pprint import pprint, pformat
import logging
from etl.utils.logger import Logger
from etl.validator.validator import DomnaSharePointValidator
DATA_LOC = "/workspaces/survey-extraction/data/"
INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
@ -28,10 +24,17 @@ def main():
# list_of_house_ass_names = south_coast_scraper.get_housing_association_names()
# logger.info(pformat(list_of_house_ass_names))
# POC of work completed
# POC of downloading each file
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
south_coast_scraper.download_file_for_each_address()
# POC of pdf reader
DATA_LOC = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf"
pdfReader = pdfReaderToText(DATA_LOC)
siteNoteReader = pdfReader.get_reader()
logger.warning(siteNoteReader.type)
# logger.info(south_coast_scraper.surveyor_to_housing_assosications)

View file

@ -1,7 +1,8 @@
from etl.utils.logger import Logger
import logging
import pymupdf
from etl.pdfReader.sitenotes import QuidosSiteNotes
from etl.pdfReader.reportType import ReportType
class pdfReaderToText():
@ -11,6 +12,7 @@ class pdfReaderToText():
self.all_text = ""
self.text_list = []
self.get_text_from_pdf_file()
self.type = None
def get_text_from_pdf_file(self):
self.logger.debug(f"Extrating text from {self.source_path}")
@ -22,5 +24,19 @@ class pdfReaderToText():
self.text_list = self.all_text.split('\n')
def get_list_of_test(self):
    """Return the stored list of text lines (``self.text_list``).

    The method name is kept exactly as spelt so existing callers keep working.
    """
    lines = self.text_list
    return lines
def get_list_of_text(self):
    """Return the stored list of text lines (``self.text_list``).

    The same list object is returned, not a copy.
    """
    lines = self.text_list
    return lines
def get_file_type(self):
    """Classify the document by inspecting its first line of text.

    Returns:
        ``ReportType.QUIDOS_SITE_NOTE`` (also stored on ``self.type``) when the
        first line contains the Quidos marker text, compared case-insensitively.
        ``None`` when ``self.text_list`` holds one line or fewer; ``self.type``
        is left unchanged in that case.

    Raises:
        NotImplementedError: the document has more than one line but its first
            line does not contain a known marker.
    """
    # Documents with one line or fewer are not classified at all.
    if len(self.text_list) <= 1:
        return None

    marker = "Quidos Ltd using Argyle software BRE approved calculator".lower()
    first_line = self.text_list[0].lower()
    if marker not in first_line:
        raise NotImplementedError("New type of file - please contact Jun-te Kim")

    self.type = ReportType.QUIDOS_SITE_NOTE
    return self.type
def get_reader(self):
    """Return the site-notes reader matching this document's report type.

    ``get_file_type`` is called first so that ``self.type`` is populated from
    the current ``self.text_list``.

    Returns:
        A ``QuidosSiteNotes`` built from ``self.text_list`` when the report
        type is ``ReportType.QUIDOS_SITE_NOTE``.

    Raises:
        NotImplementedError: the report type could not be determined
            (``self.type`` is still ``None``) or no reader exists for it.
            ``get_file_type`` raises the same exception type for files it
            does not recognise.
    """
    self.get_file_type()

    # get_file_type returns without setting self.type when the document has
    # one line or fewer; fail with a clear message rather than an
    # AttributeError from dereferencing None.
    if self.type is None:
        raise NotImplementedError(
            "Could not determine the report type - no reader available"
        )

    # Enum members are singletons, so compare by identity rather than by name.
    if self.type is ReportType.QUIDOS_SITE_NOTE:
        return QuidosSiteNotes(self.text_list)

    # Make a missing reader explicit instead of silently returning None.
    raise NotImplementedError(
        f"No reader implemented for report type {self.type}"
    )

View file

@ -0,0 +1,5 @@
from enum import Enum
class ReportType(Enum):
    """Kinds of report document that can be identified."""

    # Site notes identified by the Quidos marker text.
    QUIDOS_SITE_NOTE = 1

View file

@ -0,0 +1,11 @@
from etl.pdfReader.reportType import ReportType
class SiteNotes:
    """Base container that holds the raw data list of a site-notes document."""

    def __init__(self, data_list):
        """Store ``data_list`` unchanged as ``self.raw_data``."""
        self.raw_data = data_list
class QuidosSiteNotes(SiteNotes):
    """Site notes tagged with ``ReportType.QUIDOS_SITE_NOTE``."""

    def __init__(self, data_list):
        """Store ``data_list`` via the base class and record the report type."""
        super().__init__(data_list)
        # Tag the instance with its report type.
        self.type = ReportType.QUIDOS_SITE_NOTE

View file

@ -229,7 +229,7 @@ class SharePointScraper():
def create_temp_file(self, content, path):
# Ensure the path is under /tmp/
path = os.path.join("/tmp", path)
path = os.path.join("/tmp/sharepoint/", path)
# Ensure the parent directory exists
os.makedirs(os.path.dirname(path), exist_ok=True)