Model/backend/documents_parser/parser.py

13 lines
564 B
Python

from datatypes.epc.domain.epc_property_data import EpcPropertyData
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
from backend.documents_parser.pdf import pdf_to_text_list
def parse_pashub_site_notes(file_path: str) -> EpcPropertyData:
    """Build an ``EpcPropertyData`` from a site-notes file on disk.

    The file is read as raw bytes, handed to ``pdf_to_text_list``, and the
    resulting items are passed to ``PasHubRdSapSiteNotesExtractor``. Whatever
    the extractor's ``extract()`` returns is then converted with
    ``EpcPropertyDataMapper.from_site_notes``.

    Args:
        file_path: Location of the file to read; opened in binary mode.

    Returns:
        The object returned by ``EpcPropertyDataMapper.from_site_notes``.

    Raises:
        OSError: If the file cannot be opened or read. Errors raised by the
            extraction or mapping helpers propagate unchanged.
    """
    # Read everything up front so the file handle is released before parsing.
    with open(file_path, "rb") as pdf_file:
        raw_pdf = pdf_file.read()

    text_items = pdf_to_text_list(raw_pdf)
    extractor = PasHubRdSapSiteNotesExtractor(text_items)
    extracted_notes = extractor.extract()
    return EpcPropertyDataMapper.from_site_notes(extracted_notes)