mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
28 lines
1.1 KiB
Python
28 lines
1.1 KiB
Python
from typing import List
|
|
|
|
from datatypes.epc.domain.epc_property_data import EpcPropertyData
|
|
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
|
|
|
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
|
|
from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
|
|
from backend.documents_parser.pdf import pdf_to_pages, pdf_to_text_list
|
|
|
|
|
|
def parse_site_notes_pdf(file_path: str) -> EpcPropertyData:
    """Parse a site-notes PDF on disk into an ``EpcPropertyData``.

    The file is read as raw bytes and converted to per-page text with
    ``pdf_to_pages``. When the page text contains the marker
    "Elmhurst Energy Systems", the pages are handed to the Elmhurst path;
    otherwise the raw bytes are handed to the PasHub path.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The property data produced by whichever parsing path was selected.
    """
    with open(file_path, "rb") as pdf_file:
        raw_pdf = pdf_file.read()

    page_texts = pdf_to_pages(raw_pdf)
    full_text = "\n".join(page_texts)

    # Anything without the Elmhurst marker falls through to the PasHub
    # path, which works from the raw bytes rather than the page texts.
    if "Elmhurst Energy Systems" not in full_text:
        return _parse_pashub(raw_pdf)
    return _parse_elmhurst(page_texts)
|
|
|
|
|
|
def _parse_elmhurst(pages: List[str]) -> EpcPropertyData:
    """Turn Elmhurst page texts into an ``EpcPropertyData``.

    Args:
        pages: Page texts passed to ``ElmhurstSiteNotesExtractor``.

    Returns:
        The result of ``EpcPropertyDataMapper.from_elmhurst_site_notes``
        applied to the extracted site notes.
    """
    extractor = ElmhurstSiteNotesExtractor(pages)
    return EpcPropertyDataMapper.from_elmhurst_site_notes(extractor.extract())
|
|
|
|
|
|
def _parse_pashub(pdf_bytes: bytes) -> EpcPropertyData:
    """Turn raw PDF bytes into an ``EpcPropertyData`` via the PasHub path.

    Args:
        pdf_bytes: Raw PDF content passed to ``pdf_to_text_list``.

    Returns:
        The result of ``EpcPropertyDataMapper.from_site_notes`` applied to
        the site notes extracted by ``PasHubRdSapSiteNotesExtractor``.
    """
    text_tokens = pdf_to_text_list(pdf_bytes)
    extractor = PasHubRdSapSiteNotesExtractor(text_tokens)
    extracted_notes = extractor.extract()
    return EpcPropertyDataMapper.from_site_notes(extracted_notes)
|