from typing import List

from domain.epc.epc_property_data import EpcPropertyData
from domain.epc.mapper import EpcPropertyDataMapper
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
from backend.documents_parser.pdf import pdf_to_pages, pdf_to_text_list


def parse_site_notes_pdf(file_path: str) -> EpcPropertyData:
    """Parse a site-notes PDF from disk into an ``EpcPropertyData``.

    The file is read as raw bytes and converted to pages with
    ``pdf_to_pages``. If the joined page text contains the marker
    ``"Elmhurst Energy Systems"`` the pages are handed to the Elmhurst
    pipeline; otherwise the raw bytes are handed to the PasHub pipeline.

    Args:
        file_path: Path of the PDF file to open in binary mode.

    Returns:
        The ``EpcPropertyData`` produced by whichever pipeline was selected.
    """
    with open(file_path, "rb") as pdf_file:
        raw_pdf = pdf_file.read()

    page_texts = pdf_to_pages(raw_pdf)
    combined_text = "\n".join(page_texts)

    # Anything without the Elmhurst marker is treated as a PasHub document.
    if "Elmhurst Energy Systems" not in combined_text:
        return _parse_pashub(raw_pdf)
    return _parse_elmhurst(page_texts)


def _parse_elmhurst(pages: List[str]) -> EpcPropertyData:
    """Run the Elmhurst extractor over ``pages`` and map the result.

    Args:
        pages: Page texts passed straight to ``ElmhurstSiteNotesExtractor``.

    Returns:
        The value of ``EpcPropertyDataMapper.from_elmhurst_site_notes`` for
        the extracted site notes.
    """
    extractor = ElmhurstSiteNotesExtractor(pages)
    extracted_notes = extractor.extract()
    return EpcPropertyDataMapper.from_elmhurst_site_notes(extracted_notes)


def _parse_pashub(pdf_bytes: bytes) -> EpcPropertyData:
    """Run the PasHub RdSAP extractor over a PDF's bytes and map the result.

    The bytes are first converted with ``pdf_to_text_list``; that output is
    what ``PasHubRdSapSiteNotesExtractor`` receives.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The value of ``EpcPropertyDataMapper.from_site_notes`` for the
        extracted site notes.
    """
    text_items = pdf_to_text_list(pdf_bytes)
    extractor = PasHubRdSapSiteNotesExtractor(text_items)
    extracted_notes = extractor.extract()
    return EpcPropertyDataMapper.from_site_notes(extracted_notes)