import logging
from pprint import pprint  # not used in the code below; kept so the module's importable names are unchanged
from typing import List, Optional

import pymupdf

from etl.fileReader.reportType import ReportType
from etl.fileReader.sitenotes import (
    QuidosSiteNotesExtractor,
    CSR,
    WarmHomesConditionReport,
    ECOConditionReport,
    EnergyPerformanceReportWithData,
    EnergyPerformanceReportSummaryInformation,
)
from etl.utils.logger import Logger


class pdfReaderToText():
    """Extract the text of a PDF and classify which kind of report it is.

    On construction the PDF at ``file_path`` is opened, the text of every
    page is concatenated, split into lines, and the report type is detected
    from marker strings in those lines.

    Attributes:
        source_path: Path that was passed to the constructor.
        logger: Logger obtained from the project ``Logger`` wrapper.
        all_text: Text of all pages concatenated in page order.
        text_list: ``all_text`` split on ``'\\n'``.
        type: Detected ``ReportType`` member, or ``None`` if no marker matched.
    """

    def __init__(self, file_path):
        """Read ``file_path`` and detect its report type.

        Args:
            file_path: Location of the PDF, handed to ``pymupdf.open``.
        """
        self.source_path = file_path
        self.logger = Logger(name='pdfReader', level=logging.INFO).get_logger()
        self.all_text = ""
        self.text_list = []
        self.get_text_from_pdf_file()
        self.type = None
        self.get_file_type()

    def get_text_from_pdf_file(self) -> None:
        """Load the text of every page into ``all_text`` and ``text_list``.

        The document is opened as a context manager so the underlying file
        handle is released even if text extraction raises. ``all_text`` is
        rebuilt from scratch, so calling this method again does not append a
        second copy of the document.
        """
        self.logger.debug(f"Extrating text from {self.source_path}")
        with pymupdf.open(self.source_path) as pdf:
            self.all_text = "".join(page.get_text() for page in pdf)
        self.text_list = self.all_text.split('\n')

    def get_list_of_text(self) -> List[str]:
        """Return the extracted text as a list of lines."""
        return self.text_list

    def get_file_type(self) -> Optional[ReportType]:
        """Detect the report type from marker strings and store it in ``type``.

        The checks run in a fixed order and the first match wins. When there
        are fewer than two lines, or when nothing matches, ``self.type`` is
        left as it was.

        Returns:
            The current value of ``self.type`` (possibly ``None``).
        """
        lines = self.text_list
        # The second line is inspected below, so at least two lines are needed.
        if len(lines) <= 1:
            return self.type

        first = lines[0].lower()
        second = lines[1].lower()

        if "Quidos Ltd using Argyle software BRE approved calculator".lower() in first:
            self.type = ReportType.QUIDOS_PRESITE_NOTE
        # NOTE(review): the odd character in "unĮlled" presumably mirrors how
        # the PDF text extractor renders this heading - do not "correct" it
        # without checking against a real document.
        elif "Wall pre - Masonry cavity wall-unĮlled".lower() in first:
            self.type = ReportType.U_VALUE_CALCULATOR_REPORT
        # The needle must be lower-cased as well: it is searched for inside an
        # already lower-cased line, so a mixed-case needle could never match.
        elif "Overwriting U-Values for EPRs for ECO4 and GBIS:".lower() in second:
            self.type = ReportType.OVERWRITING_U_VALUE_DECLARATION_FORM
        # The next two checks are exact, case-sensitive whole-line matches.
        elif "Energy Performance Report" in lines:
            self.type = ReportType.ENERGY_PERFORMANCE_REPORT
        elif "Chartered Surveyor Report: Recommending Extraction of Defective Cavity Wall Insulation " in lines:
            self.type = ReportType.CHARTED_SURVEYOR_REPORT
        elif "Osmosis ACD NEW PAS 2035 Condition Report".lower() in first:
            self.type = ReportType.WARM_HOMES_CONDITION_REPORT
        elif "Domna NEW PAS 2035 ECO Condition Report".lower() in first:
            self.type = ReportType.ECO_CONDITION_REPORT
        elif "ENERGY REPORT".lower() == first and "Data inputs" in lines:
            self.type = ReportType.ENERGY_PERFORMANCE_REPORT_WITH_DATA
        elif "Summary Information".lower() == first:
            self.type = ReportType.ENERGY_PERFORMANCE_REPORT_SUMMARY_INFORMATION

        return self.type

    def get_reader(self):
        """Build the extractor that matches the detected report type.

        The type is re-detected first. If no type was detected, or the
        detected type has no extractor registered here, ``None`` is returned
        and a warning is logged, so both "no reader" situations look the same
        to the caller.

        Returns:
            An extractor constructed with ``self.text_list``, or ``None``.
        """
        report_type = self.get_file_type()
        if report_type is None:
            self.logger.warning(
                f"Could not determine report type for {self.source_path}"
            )
            return None

        # Built per call so ReportType members are only resolved when needed.
        readers = {
            ReportType.QUIDOS_PRESITE_NOTE: QuidosSiteNotesExtractor,
            ReportType.CHARTED_SURVEYOR_REPORT: CSR,
            ReportType.WARM_HOMES_CONDITION_REPORT: WarmHomesConditionReport,
            ReportType.ECO_CONDITION_REPORT: ECOConditionReport,
            ReportType.ENERGY_PERFORMANCE_REPORT_WITH_DATA: EnergyPerformanceReportWithData,
            ReportType.ENERGY_PERFORMANCE_REPORT_SUMMARY_INFORMATION: EnergyPerformanceReportSummaryInformation,
        }
        reader_cls = readers.get(report_type)
        if reader_cls is None:
            self.logger.warning(
                f"No reader available for report type {report_type}"
            )
            return None
        return reader_cls(self.text_list)