Update local runner to work for Elmhurst

This commit is contained in:
Daniel Roth 2026-04-24 14:01:36 +00:00
parent 1105491141
commit 20ef8cd489
2 changed files with 20 additions and 6 deletions

View file

@ -18,19 +18,27 @@ from backend.app.db.models.epc_property import (
EpcPropertyModel,
EpcWindowModel,
)
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
from backend.documents_parser.pdf import pdf_to_text_list
from backend.documents_parser.pdf import pdf_to_pages, pdf_to_text_list
from datatypes.epc.domain.epc_property_data import EnergyElement, EpcPropertyData
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
from datatypes.epc.surveys.pashub_rdsap_site_notes import PasHubRdSapSiteNotes
def _parse_pdf(pdf_path: str) -> EpcPropertyData:
    """Parse a site-notes PDF from disk into an ``EpcPropertyData``.

    The PDF is read as bytes and its per-page text is inspected to decide
    which extractor to use:

    * If any page contains the marker ``"Elmhurst Energy Systems"``, the
      per-page text is handed to ``ElmhurstSiteNotesExtractor`` and the
      result is mapped with
      ``EpcPropertyDataMapper.from_elmhurst_site_notes``.
    * Otherwise the PDF is re-read as a flat list of text lines
      (``pdf_to_text_list``), passed to ``PasHubRdSapSiteNotesExtractor``
      and mapped with ``EpcPropertyDataMapper.from_site_notes``.

    Args:
        pdf_path: Filesystem path of the PDF to parse.

    Returns:
        The mapped ``EpcPropertyData`` for the document.
    """
    with open(pdf_path, "rb") as f:
        pdf_bytes: bytes = f.read()

    pages: List[str] = pdf_to_pages(pdf_bytes)

    # The marker contains no newline, so checking page by page is equivalent
    # to searching the pages joined with "\n", without building that string.
    if any("Elmhurst Energy Systems" in page for page in pages):
        site_notes = ElmhurstSiteNotesExtractor(pages).extract()
        return EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)

    # The PasHub extractor consumes line tokens rather than whole pages.
    tokens: List[str] = pdf_to_text_list(pdf_bytes)
    pashub_notes: PasHubRdSapSiteNotes = PasHubRdSapSiteNotesExtractor(tokens).extract()
    return EpcPropertyDataMapper.from_site_notes(pashub_notes)
def _insert_energy_elements(
@ -119,4 +127,5 @@ def run(pdf_path: str) -> None:
if __name__ == "__main__":
    import sys

    # Run exactly one document per invocation. A path may be supplied on the
    # command line; with no argument the Elmhurst fixture is used. The PasHub
    # fixture lives at
    # backend/documents_parser/tests/fixtures/PasHubSiteNotes_6.pdf.
    default_pdf = "backend/documents_parser/tests/fixtures/ElmhurstSiteNotes.pdf"
    run(sys.argv[1] if len(sys.argv) > 1 else default_pdf)

View file

@ -10,3 +10,8 @@ def pdf_to_text_list(pdf_bytes: bytes) -> List[str]:
for line in page.get_text().split("\n"):
tokens.append(line)
return tokens
def pdf_to_pages(pdf_bytes: bytes) -> List[str]:
    """Return the text of a PDF as one string per page, in document order.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list whose i-th element is ``page.get_text()`` for the i-th page.
    """
    page_texts: List[str] = []
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as document:
        for page in document:
            page_texts.append(page.get_text())
    return page_texts