mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
update local runner to work for elmhurst
This commit is contained in:
parent
1105491141
commit
20ef8cd489
2 changed files with 20 additions and 6 deletions
|
|
@ -18,19 +18,27 @@ from backend.app.db.models.epc_property import (
|
|||
EpcPropertyModel,
|
||||
EpcWindowModel,
|
||||
)
|
||||
from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
|
||||
from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
|
||||
from backend.documents_parser.pdf import pdf_to_text_list
|
||||
from backend.documents_parser.pdf import pdf_to_pages, pdf_to_text_list
|
||||
from datatypes.epc.domain.epc_property_data import EnergyElement, EpcPropertyData
|
||||
from datatypes.epc.domain.mapper import EpcPropertyDataMapper
|
||||
from datatypes.epc.surveys.pashub_rdsap_site_notes import PasHubRdSapSiteNotes
|
||||
|
||||
|
||||
def _parse_pdf(pdf_path: str) -> EpcPropertyData:
    """Parse a site-notes PDF into ``EpcPropertyData``.

    The PDF text is first extracted page by page. If that text contains the
    marker ``"Elmhurst Energy Systems"``, the document is handled by the
    Elmhurst extractor and mapper. Otherwise the PDF is re-read as a flat
    list of text lines and handled by the PasHub RdSAP extractor and mapper.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The property data produced by ``EpcPropertyDataMapper``.
    """
    with open(pdf_path, "rb") as f:
        pdf_bytes: bytes = f.read()

    pages: List[str] = pdf_to_pages(pdf_bytes)
    full_text: str = "\n".join(pages)

    # The vendor marker decides which extractor understands this layout.
    if "Elmhurst Energy Systems" in full_text:
        site_notes = ElmhurstSiteNotesExtractor(pages).extract()
        return EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)

    # The PasHub extractor consumes individual text lines, not whole pages.
    tokens: List[str] = pdf_to_text_list(pdf_bytes)
    pashub_notes = PasHubRdSapSiteNotesExtractor(tokens).extract()
    return EpcPropertyDataMapper.from_site_notes(pashub_notes)
|
||||
|
||||
|
||||
def _insert_energy_elements(
|
||||
|
|
@ -119,4 +127,5 @@ def run(pdf_path: str) -> None:
|
|||
|
||||
|
||||
if __name__ == "__main__":
    # Local runner entry point: exactly one fixture is processed per run.
    # Swap the active call to exercise the PasHub extraction path instead:
    # run("backend/documents_parser/tests/fixtures/PasHubSiteNotes_6.pdf")
    run("backend/documents_parser/tests/fixtures/ElmhurstSiteNotes.pdf")
|
||||
|
|
|
|||
|
|
@ -10,3 +10,8 @@ def pdf_to_text_list(pdf_bytes: bytes) -> List[str]:
|
|||
for line in page.get_text().split("\n"):
|
||||
tokens.append(line)
|
||||
return tokens
|
||||
|
||||
|
||||
def pdf_to_pages(pdf_bytes: bytes) -> List[str]:
    """Return the extracted text of a PDF as one string per page.

    Args:
        pdf_bytes: Raw content of the PDF document.

    Returns:
        A list holding the ``get_text()`` output of each page, in the order
        the document yields its pages.
    """
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as document:
        page_texts = [page.get_text() for page in document]
    return page_texts
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue