import logging

import pymupdf

from etl.pdfReader.reportType import ReportType
from etl.pdfReader.sitenotes import QuidosSiteNotesExtractor, CSR, ConditionReport
from etl.utils.logger import Logger


class pdfReaderToText():
    """Extract the text of a PDF file and classify which report it is.

    On construction the PDF at ``file_path`` is read, its text is stored in
    ``all_text`` (one string) and ``text_list`` (split on newlines), and
    ``type`` is set to the detected ``ReportType`` member, or ``None`` when
    no known marker text is found.
    """

    def __init__(self, file_path):
        """Read ``file_path`` and detect its report type.

        Args:
            file_path: Location of the PDF, passed as-is to ``pymupdf.open``.
        """
        self.source_path = file_path
        self.logger = Logger(name='pdfReader', level=logging.INFO).get_logger()
        self.all_text = ""
        self.text_list = []
        self.get_text_from_pdf_file()
        self.type = None
        self.get_file_type()

    def get_text_from_pdf_file(self):
        """Load the text of every page into ``all_text`` and ``text_list``.

        The document is opened in a ``with`` block so it is closed even if
        text extraction raises. The result is assigned rather than appended,
        so calling this method again does not duplicate the text.
        """
        self.logger.debug(f"Extrating text from {self.source_path}")
        with pymupdf.open(self.source_path) as pdf:
            self.all_text = "".join(page.get_text() for page in pdf)
        self.text_list = self.all_text.split('\n')

    def get_list_of_text(self):
        """Return the extracted text as a list of lines."""
        return self.text_list

    def get_file_type(self):
        """Detect the report type from marker text and store it in ``type``.

        Detection needs at least two lines of text; with fewer, ``type`` is
        left untouched. Checks run in a fixed order and the first match wins.

        Returns:
            The detected ``ReportType`` member, or the previous value of
            ``type`` (``None`` after construction) when nothing matches.
        """
        if len(self.text_list) <= 1:
            return self.type

        # The first two lines are matched case-insensitively, so both the
        # marker and the line are lowercased before the substring test.
        first_line = self.text_list[0].lower()
        second_line = self.text_list[1].lower()

        if "Quidos Ltd using Argyle software BRE approved calculator".lower() in first_line:
            self.type = ReportType.QUIDOS_PRESITE_NOTE
        # NOTE(review): the odd character in "unĮlled" is presumably how the
        # PDF's "fi" ligature comes out of text extraction; the marker is
        # kept byte-for-byte. TODO confirm against a sample file.
        elif "Wall pre - Masonry cavity wall-unĮlled".lower() in first_line:
            self.type = ReportType.U_VALUE_CALCULATOR_REPORT
        # The marker must be lowercased too: it is searched for in an
        # already-lowercased line, so a mixed-case marker can never match.
        elif "Overwriting U-Values for EPRs for ECO4 and GBIS:".lower() in second_line:
            self.type = ReportType.OVERWRITING_U_VALUE_DECLARATION_FORM
        # The next two checks require a whole line to equal the marker
        # exactly (list membership), including case and trailing space.
        elif "Energy Performance Report" in self.text_list:
            self.type = ReportType.ENERGY_PERFORMANCE_REPORT
        elif "Chartered Surveyor Report: Recommending Extraction of Defective Cavity Wall Insulation " in self.text_list:
            self.type = ReportType.CHARTED_SURVEYOR_REPORT
        elif "Osmosis ACD NEW PAS 2035 Condition Report".lower() in first_line:
            self.type = ReportType.OSMOSIS_CONDITION_PAS_2035_REPORT

        return self.type

    def get_reader(self):
        """Return the extractor matching the detected report type.

        Returns:
            A ``QuidosSiteNotesExtractor``, ``CSR`` or ``ConditionReport``
            built from ``text_list``, or ``None`` when the type is unknown
            or has no extractor.
        """
        self.get_file_type()

        # An unrecognised file leaves ``type`` as None; return None instead
        # of failing on an attribute lookup on None.
        if self.type is None:
            self.logger.warning(f"Unable to detect report type for {self.source_path}")
            return None

        if self.type == ReportType.QUIDOS_PRESITE_NOTE:
            return QuidosSiteNotesExtractor(self.text_list)
        if self.type == ReportType.CHARTED_SURVEYOR_REPORT:
            return CSR(self.text_list)
        if self.type == ReportType.OSMOSIS_CONDITION_PAS_2035_REPORT:
            return ConditionReport(self.text_list)

        # Recognised report types without a dedicated extractor.
        return None