mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-08 11:17:29 +00:00
added new files to allow data extraction
This commit is contained in:
parent
298ccdbc38
commit
af338dd02b
5 changed files with 44 additions and 9 deletions
13
etl/main.py
13
etl/main.py
|
|
@ -4,11 +4,7 @@ from etl.scraper.scraper import SharePointScraper, SharePointInstaller
|
|||
from pprint import pprint, pformat
|
||||
import logging
|
||||
from etl.utils.logger import Logger
|
||||
|
||||
from etl.validator.validator import DomnaSharePointValidator
|
||||
|
||||
DATA_LOC = "/workspaces/survey-extraction/data/"
|
||||
INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
|
||||
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
|
||||
|
||||
|
||||
|
|
@ -28,10 +24,17 @@ def main():
|
|||
# list_of_house_ass_names = south_coast_scraper.get_housing_association_names()
|
||||
# logger.info(pformat(list_of_house_ass_names))
|
||||
|
||||
# POC of work completed
|
||||
# POC of downloading each file
|
||||
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
||||
south_coast_scraper.download_file_for_each_address()
|
||||
|
||||
# POC of pdf reader
|
||||
DATA_LOC = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf"
|
||||
pdfReader = pdfReaderToText(DATA_LOC)
|
||||
siteNoteReader = pdfReader.get_reader()
|
||||
logger.warning(siteNoteReader.type)
|
||||
|
||||
|
||||
|
||||
# logger.info(south_coast_scraper.surveyor_to_housing_assosications)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
from etl.utils.logger import Logger
|
||||
import logging
|
||||
import pymupdf
|
||||
|
||||
from etl.pdfReader.sitenotes import QuidosSiteNotes
|
||||
from etl.pdfReader.reportType import ReportType
|
||||
|
||||
class pdfReaderToText():
|
||||
|
||||
|
|
@ -11,6 +12,7 @@ class pdfReaderToText():
|
|||
self.all_text = ""
|
||||
self.text_list = []
|
||||
self.get_text_from_pdf_file()
|
||||
self.type = None
|
||||
|
||||
def get_text_from_pdf_file(self):
|
||||
self.logger.debug(f"Extrating text from {self.source_path}")
|
||||
|
|
@ -22,5 +24,19 @@ class pdfReaderToText():
|
|||
|
||||
self.text_list = self.all_text.split('\n')
|
||||
|
||||
def get_list_of_test(self):
|
||||
return self.text_list
|
||||
def get_list_of_text(self):
    """Return the extracted PDF text as a list of lines.

    Returns:
        list: the same ``self.text_list`` object held by the reader (not a
        copy).
    """
    lines = self.text_list
    return lines
|
||||
|
||||
def get_file_type(self):
    """Classify the extracted PDF text and store the result on ``self.type``.

    The first extracted line is searched, case-insensitively, for the Quidos
    marker sentence. Detection still requires more than one extracted line,
    as before.

    Returns:
        ReportType: ``ReportType.QUIDOS_SITE_NOTE`` when the marker is found.

    Raises:
        NotImplementedError: if the text has fewer than two lines or the
            first line does not contain a known marker.
    """
    quidos_marker = "Quidos Ltd using Argyle software BRE approved calculator".lower()

    if len(self.text_list) > 1 and quidos_marker in self.text_list[0].lower():
        self.type = ReportType.QUIDOS_SITE_NOTE
        return self.type

    # Every unrecognised document fails the same way. Previously one of the
    # two "not recognised" paths fell through and returned None, which only
    # surfaced later as an AttributeError in get_reader().
    raise NotImplementedError("New type of file - please contact Jun-te Kim")
|
||||
|
||||
def get_reader(self):
    """Build the site-notes reader that matches the detected report type.

    Calls ``get_file_type()`` first so ``self.type`` reflects the current
    text, then dispatches on it.

    Returns:
        QuidosSiteNotes: a reader built from ``self.text_list`` when the
        report is a Quidos site note.

    Raises:
        NotImplementedError: if the report type could not be determined or
            no reader exists for it.
    """
    self.get_file_type()

    # self.type may still be None if detection did not recognise the file.
    # Fail with a clear message instead of an AttributeError on ``.name``.
    if self.type is None:
        raise NotImplementedError("New type of file - please contact Jun-te Kim")

    if self.type.name == ReportType.QUIDOS_SITE_NOTE.name:
        return QuidosSiteNotes(self.text_list)

    # Never hand back an implicit None: callers read attributes of the
    # returned reader straight away.
    raise NotImplementedError(f"No reader available for report type {self.type}")
|
||||
5
etl/pdfReader/reportType.py
Normal file
5
etl/pdfReader/reportType.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
from enum import Enum
|
||||
|
||||
|
||||
class ReportType(Enum):
    """Kinds of report document the PDF reader can tell apart."""

    # Only member defined so far; its value is 1.
    QUIDOS_SITE_NOTE = 1
|
||||
11
etl/pdfReader/sitenotes.py
Normal file
11
etl/pdfReader/sitenotes.py
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
from etl.pdfReader.reportType import ReportType
|
||||
|
||||
class SiteNotes:
    """Base container for the text lines extracted from a site-notes PDF."""

    def __init__(self, data_list):
        """Store ``data_list`` unchanged (same object, no copy) as ``raw_data``."""
        self.raw_data = data_list
|
||||
|
||||
|
||||
class QuidosSiteNotes(SiteNotes):
    """Site notes tagged as the Quidos report type."""

    def __init__(self, data_list):
        """Pass ``data_list`` to ``SiteNotes`` and set ``type`` to QUIDOS_SITE_NOTE."""
        super().__init__(data_list)
        self.type = ReportType.QUIDOS_SITE_NOTE
|
||||
|
|
@ -229,7 +229,7 @@ class SharePointScraper():
|
|||
|
||||
def create_temp_file(self, content, path):
|
||||
# Ensure the path is under /tmp/
|
||||
path = os.path.join("/tmp", path)
|
||||
path = os.path.join("/tmp/sharepoint/", path)
|
||||
|
||||
# Ensure the parent directory exists
|
||||
os.makedirs(os.path.dirname(path), exist_ok=True)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue