def _parse_pdf(pdf_path: str) -> EpcPropertyData:
    """Read a site-notes PDF from disk and map it to ``EpcPropertyData``.

    The file is read fully into memory and split into per-page text with
    ``pdf_to_pages``. If any page contains the text
    ``"Elmhurst Energy Systems"`` the pages are handed to
    ``ElmhurstSiteNotesExtractor`` and mapped with
    ``EpcPropertyDataMapper.from_elmhurst_site_notes``. Otherwise the
    token list from ``pdf_to_text_list`` is handed to
    ``PasHubRdSapSiteNotesExtractor`` and mapped with
    ``EpcPropertyDataMapper.from_site_notes``.

    Args:
        pdf_path: Filesystem path of the PDF to parse.

    Returns:
        The mapped property data produced by the selected extractor.
    """
    with open(pdf_path, "rb") as pdf_file:
        raw_pdf: bytes = pdf_file.read()

    page_texts: List[str] = pdf_to_pages(raw_pdf)

    # The marker contains no newline, so checking page by page finds exactly
    # the same matches as searching the newline-joined document text.
    is_elmhurst: bool = any(
        "Elmhurst Energy Systems" in page_text for page_text in page_texts
    )

    if not is_elmhurst:
        # The PasHub extractor consumes the token list, not whole pages.
        line_tokens: List[str] = pdf_to_text_list(raw_pdf)
        extractor = PasHubRdSapSiteNotesExtractor(line_tokens)
        return EpcPropertyDataMapper.from_site_notes(extractor.extract())

    elmhurst_notes = ElmhurstSiteNotesExtractor(page_texts).extract()
    return EpcPropertyDataMapper.from_elmhurst_site_notes(elmhurst_notes)


if __name__ == "__main__":
    # Manual entry point: runs against the Elmhurst fixture. The PasHub
    # fixture call is kept below for switching by hand.
    # run("backend/documents_parser/tests/fixtures/PasHubSiteNotes_6.pdf")
    run("backend/documents_parser/tests/fixtures/ElmhurstSiteNotes.pdf")
def pdf_to_pages(pdf_bytes: bytes) -> List[str]:
    """Return the extracted text of every page in a PDF, one string per page.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        A list with one entry per page, in iteration order of the document,
        each entry being that page's ``get_text()`` output.
    """
    page_texts: List[str] = []
    # The document is opened from memory and closed when the block exits.
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as document:
        for page in document:
            page_texts.append(page.get_text())
    return page_texts