import logging

import pymupdf

from etl.utils.logger import Logger
from etl.pdfReader.sitenotes import QuidosSiteNotes
from etl.pdfReader.reportType import ReportType


class pdfReaderToText:
    """Extract the text of a PDF and pick the reader that can parse it.

    On construction the PDF at ``file_path`` is opened with PyMuPDF, the text
    of every page is concatenated into ``all_text`` and split on newlines
    into ``text_list``. ``get_file_type`` / ``get_reader`` then inspect the
    first extracted line to decide which report reader to build.

    Attributes:
        source_path: The path passed to the constructor.
        logger: Logger obtained from the project ``Logger`` wrapper.
        all_text: Text of all pages, concatenated in page order.
        text_list: ``all_text`` split on ``"\\n"``.
        type: The detected ``ReportType`` member, or ``None`` until
            ``get_file_type`` has identified the document.
    """

    # Text expected (case-insensitively) in the first extracted line of a
    # Quidos site-notes report.
    _QUIDOS_MARKER = "Quidos Ltd using Argyle software BRE approved calculator"
    _UNKNOWN_TYPE_MESSAGE = "New type of file - please contact Jun-te Kim"

    def __init__(self, file_path):
        """Read the PDF at ``file_path`` and cache its text.

        Args:
            file_path: Location of the PDF; handed unchanged to
                ``pymupdf.open``.
        """
        self.source_path = file_path
        self.logger = Logger(name='pdfReader', level=logging.DEBUG).get_logger()
        # Initialise every attribute before extraction so the instance is in
        # a consistent state even while get_text_from_pdf_file is running.
        self.all_text = ""
        self.text_list = []
        self.type = None
        self.get_text_from_pdf_file()

    def get_text_from_pdf_file(self) -> None:
        """Populate ``all_text`` and ``text_list`` from the source PDF.

        The document is opened in a ``with`` block so it is closed even if
        reading a page fails. Calling this method again replaces the cached
        text rather than appending a second copy of the document.
        """
        self.logger.debug(f"Extrating text from {self.source_path}")
        with pymupdf.open(self.source_path) as pdf:
            self.all_text = "".join(page.get_text() for page in pdf)
        self.text_list = self.all_text.split('\n')

    def get_list_of_text(self) -> list:
        """Return the extracted text as a list of lines."""
        return self.text_list

    def get_file_type(self) -> ReportType:
        """Identify the report type from the first extracted line.

        A document is recognised as ``ReportType.QUIDOS_SITE_NOTE`` when it
        yielded more than one line and its first line contains the Quidos
        marker text, compared case-insensitively. The result is stored on
        ``self.type`` and returned.

        Returns:
            The detected ``ReportType`` member.

        Raises:
            NotImplementedError: If the document cannot be identified. This
                includes a document that produced no usable text, which
                previously fell through and returned ``None``.
        """
        lines = self.text_list
        if len(lines) > 1 and self._QUIDOS_MARKER.lower() in lines[0].lower():
            self.type = ReportType.QUIDOS_SITE_NOTE
            return self.type
        raise NotImplementedError(self._UNKNOWN_TYPE_MESSAGE)

    def get_reader(self) -> QuidosSiteNotes:
        """Build the reader matching the detected report type.

        Returns:
            A ``QuidosSiteNotes`` constructed from ``text_list`` when the
            document is a Quidos site note.

        Raises:
            NotImplementedError: If the document type is unknown or no reader
                exists for it (instead of implicitly returning ``None``).
        """
        report_type = self.get_file_type()
        # Members are compared by name, as the original code did.
        if report_type.name == ReportType.QUIDOS_SITE_NOTE.name:
            return QuidosSiteNotes(self.text_list)
        raise NotImplementedError(self._UNKNOWN_TYPE_MESSAGE)