mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-08 11:17:29 +00:00
58 lines
No EOL
2.4 KiB
Python
58 lines
No EOL
2.4 KiB
Python
from etl.utils.logger import Logger
|
|
import logging
|
|
import pymupdf
|
|
from etl.pdfReader.sitenotes import QuidosSiteNotesExtractor, CSR, ConditionReport
|
|
from etl.pdfReader.reportType import ReportType
|
|
|
|
class pdfReaderToText():
    """Read the text of a PDF and work out which kind of report it contains.

    Constructing an instance immediately opens the PDF with PyMuPDF, collects
    the text of every page, splits it into lines and runs report-type
    detection on those lines.

    Attributes (all assigned in ``__init__``):
        source_path: the ``file_path`` argument, stored unchanged.
        logger: logger returned by the project's ``Logger`` wrapper.
        all_text: text of all pages concatenated in page order.
        text_list: ``all_text`` split on ``'\\n'``.
        type: the detected ``ReportType`` member, or ``None`` when no marker
            matched.
    """

    def __init__(self, file_path):
        """Load ``file_path`` and detect its report type.

        Args:
            file_path: location of the PDF; handed to ``pymupdf.open`` as-is.
        """
        self.source_path = file_path
        self.logger = Logger(name='pdfReader', level=logging.INFO).get_logger()
        self.all_text = ""
        self.text_list = []
        self.get_text_from_pdf_file()
        self.type = None
        self.get_file_type()

    def get_text_from_pdf_file(self):
        """Populate ``all_text`` and ``text_list`` from the PDF at ``source_path``.

        The document is opened in a ``with`` block so the underlying file
        handle is released even if text extraction raises. Calling this method
        again replaces the stored text rather than appending a second copy.
        """
        self.logger.debug(f"Extrating text from {self.source_path}")

        with pymupdf.open(self.source_path) as pdf:
            # One join instead of repeated ``+=`` on a growing string.
            self.all_text = "".join(page.get_text() for page in pdf)

        self.text_list = self.all_text.split('\n')

    def get_list_of_text(self):
        """Return the extracted text as a list of lines."""
        return self.text_list

    def get_file_type(self):
        """Detect the report type from marker strings in the extracted lines.

        Detection only runs when more than one line of text is available.
        Checks are tried in a fixed order and the first match wins; when
        nothing matches, ``self.type`` keeps its previous value.

        Returns:
            The value of ``self.type`` after detection (a ``ReportType``
            member or ``None``).
        """
        if len(self.text_list) > 1:
            # Lower-case once; the substring checks below are case-insensitive.
            first_line = self.text_list[0].lower()
            second_line = self.text_list[1].lower()

            if "Quidos Ltd using Argyle software BRE approved calculator".lower() in first_line:
                self.type = ReportType.QUIDOS_PRESITE_NOTE
            # NOTE(review): "unĮlled" looks like a PDF-extracted "fi" ligature
            # ("unfilled"); kept verbatim because it presumably mirrors what
            # the extractor yields for these documents - confirm.
            elif "Wall pre - Masonry cavity wall-unĮlled".lower() in first_line:
                self.type = ReportType.U_VALUE_CALCULATOR_REPORT
            # The marker must be lower-cased as well: the line it is compared
            # against is lower-cased, so the mixed-case literal could never
            # match.
            elif "Overwriting U-Values for EPRs for ECO4 and GBIS:".lower() in second_line:
                self.type = ReportType.OVERWRITING_U_VALUE_DECLARATION_FORM
            # The next two checks are exact, case-sensitive matches against a
            # whole line (list membership), not substring searches.
            elif "Energy Performance Report" in self.text_list:
                self.type = ReportType.ENERGY_PERFORMANCE_REPORT
            elif "Chartered Surveyor Report: Recommending Extraction of Defective Cavity Wall Insulation " in self.text_list:
                self.type = ReportType.CHARTED_SURVEYOR_REPORT
            elif "Osmosis ACD NEW PAS 2035 Condition Report".lower() in first_line:
                self.type = ReportType.OSMOSIS_CONDITION_PAS_2035_REPORT

        return self.type

    def get_reader(self):
        """Return the extractor object matching the detected report type.

        Report-type detection is re-run first.

        Returns:
            A ``QuidosSiteNotesExtractor``, ``CSR`` or ``ConditionReport``
            built from ``text_list``, or ``None`` when the type is unknown or
            has no extractor wired up here.
        """
        report_type = self.get_file_type()

        # An undetected type used to crash on ``None.name``; treat it like any
        # other type without an extractor.
        if report_type is None:
            return None

        if report_type == ReportType.QUIDOS_PRESITE_NOTE:
            return QuidosSiteNotesExtractor(self.text_list)
        if report_type == ReportType.CHARTED_SURVEYOR_REPORT:
            return CSR(self.text_list)
        if report_type == ReportType.OSMOSIS_CONDITION_PAS_2035_REPORT:
            return ConditionReport(self.text_list)
        return None
|
|
|