diff --git a/backend/documents_parser/parser.py b/backend/documents_parser/parser.py
index 0f6760d7..cff21e0e 100644
--- a/backend/documents_parser/parser.py
+++ b/backend/documents_parser/parser.py
@@ -1,13 +1,40 @@
+from typing import List
+
 from datatypes.epc.domain.epc_property_data import EpcPropertyData
 from datatypes.epc.domain.mapper import EpcPropertyDataMapper
+from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor
 from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
-from backend.documents_parser.pdf import pdf_to_text_list
+from backend.documents_parser.pdf import pdf_to_pages, pdf_to_text_list
+
+# PDFs containing this text are parsed as Elmhurst site notes; all others as PasHub.
+_ELMHURST_MARKER = "Elmhurst Energy Systems"
 
 
-def parse_pashub_site_notes(file_path: str) -> EpcPropertyData:
+def parse_site_notes_pdf(file_path: str) -> EpcPropertyData:
+    """Parse a site-notes PDF file into ``EpcPropertyData``.
+
+    A document whose text contains the Elmhurst marker goes to the Elmhurst
+    extractor; any other document is handled by the PasHub extractor.
+    """
     with open(file_path, "rb") as f:
         pdf_bytes = f.read()
 
+    pages = pdf_to_pages(pdf_bytes)
+    # The marker contains no newline, so a per-page scan matches exactly what a
+    # scan of the newline-joined text would, without copying the whole document.
+    if any(_ELMHURST_MARKER in page for page in pages):
+        return _parse_elmhurst(pages)
+    return _parse_pashub(pdf_bytes)
+
+
+def _parse_elmhurst(pages: List[str]) -> EpcPropertyData:
+    """Extract Elmhurst site notes from page texts and map them to EPC data."""
+    site_notes = ElmhurstSiteNotesExtractor(pages).extract()
+    return EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes)
+
+
+def _parse_pashub(pdf_bytes: bytes) -> EpcPropertyData:
+    """Extract PasHub RdSAP site notes from raw PDF bytes and map them to EPC data."""
     tokens = pdf_to_text_list(pdf_bytes)
     site_notes = PasHubRdSapSiteNotesExtractor(tokens).extract()
     return EpcPropertyDataMapper.from_site_notes(site_notes)
diff --git a/backend/ecmk_fetcher/upload.py b/backend/ecmk_fetcher/upload.py
index cc2c908d..edfaf9f1 100644
--- a/backend/ecmk_fetcher/upload.py
+++ b/backend/ecmk_fetcher/upload.py
@@ -1,5 +1,6 @@
 from datetime import datetime, timezone
 import os
+from typing import cast
 
 from backend.app.db.connection import db_session
 from backend.app.db.models.uploaded_file import (
@@ -7,9 +8,14 @@ from backend.app.db.models.uploaded_file import (
     FileTypeEnum,
     UploadedFile,
 )
+from backend.documents_parser.db_writer import save_epc_property_data
+from backend.documents_parser.parser import parse_site_notes_pdf
+from utils.logger import setup_logger
 from utils.s3 import upload_file_to_s3
 from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
 
+logger = setup_logger()
+
 
 def upload_file_to_sharepoint(
     client: DomnaSharepointClient,
@@ -43,7 +49,13 @@ def upload_excel_to_sharepoint(
 # TODO: this should be moved to somewhere common and called by pashub fetcher
 def upload_file_to_s3_and_update_db(
     bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum
-) -> None:
+) -> int:
+    """Upload ``file_path`` to S3 and record it as an ``UploadedFile`` row.
+
+    Returns the id of the new ``UploadedFile`` row. Files of type
+    ``FileTypeEnum.ECMK_RD_SAP_SITE_NOTE`` are also parsed and their EPC data
+    saved; that step is best-effort and only logs a warning when it fails.
+    """
     filename: str = os.path.basename(file_path)
     key: str = f"documents/hubspot_listing_id/{hubspot_listing_id}/{filename}"
 
@@ -61,4 +73,20 @@ def upload_file_to_s3_and_update_db(
     with db_session() as session:
         # TODO: we should do multiple files at once to reduce db trips
         session.add(uploaded_file)
-        session.commit()
+        # Flush (not commit) so uploaded_file.id can be read below in the same transaction.
+        # NOTE(review): the explicit commit is gone - confirm db_session() commits on exit.
+        session.flush()
+        uploaded_file_id: int = int(cast(int, uploaded_file.id))
+
+        if file_type == FileTypeEnum.ECMK_RD_SAP_SITE_NOTE:
+            try:
+                epc_data = parse_site_notes_pdf(file_path)
+                # Savepoint: a failed save is rolled back on its own, so it cannot
+                # invalidate the transaction that holds the uploaded_file insert.
+                # NOTE(review): assumes a SQLAlchemy-style Session - confirm.
+                with session.begin_nested():
+                    save_epc_property_data(session, epc_data, uploaded_file_id=uploaded_file_id)
+            except Exception:
+                logger.warning(f"Failed to parse/save site notes {file_path}", exc_info=True)
+
+    return uploaded_file_id
diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py
index a74f9a2d..7874d686 100644
--- a/backend/pashub_fetcher/handler/handler.py
+++ b/backend/pashub_fetcher/handler/handler.py
@@ -10,7 +10,7 @@ from backend.app.db.models.uploaded_file import (
     UploadedFile,
 )
 from backend.documents_parser.db_writer import save_epc_property_data
-from backend.documents_parser.parser import parse_pashub_site_notes
+from backend.documents_parser.parser import parse_site_notes_pdf
 from backend.pashub_fetcher.core_files import infer_file_type
 from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError
 from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
@@ -106,7 +106,7 @@ def upload_job_to_s3_and_update_db(
                     ):
                         try:
                             site_notes_pairs.append(
-                                (uploaded_file, parse_pashub_site_notes(file_path))
+                                (uploaded_file, parse_site_notes_pdf(file_path))
                             )
                         except Exception:
                             logger.warning(f"Failed to parse site notes {file_path}", exc_info=True)