diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index 19c5c2f9..92554369 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -1,16 +1,21 @@ import os from datetime import datetime, timezone -from typing import List, Optional, Tuple, cast +from typing import List, NamedTuple, Optional, cast from backend.app.db.connection import db_session -from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum, UploadedFile +from backend.app.db.models.uploaded_file import ( + FileSourceEnum, + FileTypeEnum, + UploadedFile, +) from backend.documents_parser.db_writer import save_epc_property_data from backend.documents_parser.parser import parse_site_notes_pdf from backend.pashub_fetcher.core_files import infer_file_type from backend.pashub_fetcher.pashub_client import PashubClient -from backend.pashub_fetcher.pashub_to_ara_trigger_request import PashubToAraTriggerRequest +from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( + PashubToAraTriggerRequest, +) from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders -from datatypes.epc.domain.epc_property_data import EpcPropertyData from utils.logger import setup_logger from utils.s3 import upload_file_to_s3 from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -18,6 +23,12 @@ from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient logger = setup_logger() +class _FileUploadRecord(NamedTuple): + file_path: str + file_type: Optional[str] + uploaded_file_id: int + + class PashubService: def __init__( self, @@ -32,7 +43,9 @@ class PashubService: def run(self, request: PashubToAraTriggerRequest) -> List[str]: job_id = request.pashub_job_id - uprn: Optional[str] = request.uprn or self._pashub_client.get_uprn_by_job_id(job_id) + uprn: Optional[str] = request.uprn or self._pashub_client.get_uprn_by_job_id( + job_id + ) hubspot_deal_id: Optional[str] = request.hubspot_deal_id if uprn: @@ -40,11 +53,16 @@ class PashubService: else: logger.info(f"No UPRN found for job {job_id}") - job_files: List[str] = self._pashub_client.get_core_evidence_files_by_job_id(job_id) + job_files: List[str] = self._pashub_client.get_core_evidence_files_by_job_id( + job_id + ) if uprn or hubspot_deal_id: logger.info("Uploading files to s3") - self._upload_to_s3_and_update_db(job_files, uprn, hubspot_deal_id) + upload_records = self._upload_to_s3_and_update_db( + job_files, uprn, hubspot_deal_id + ) + self._save_site_notes(upload_records) # SharePoint upload disabled: pashub sharepoint_link is inconsistent # (points to property or project unpredictably) @@ -64,9 +82,9 @@ class PashubService: job_files: List[str], uprn: Optional[str], hubspot_deal_id: Optional[str], - ) -> None: + ) -> List[_FileUploadRecord]: if not uprn and not hubspot_deal_id: - return + return [] base_path = ( f"documents/uprn/{uprn}" @@ -74,8 +92,8 @@ class PashubService: else f"documents/hubspot_deal_id/{hubspot_deal_id}" ) + file_paths: List[str] = [] uploaded_files: List[UploadedFile] = [] - site_notes_pairs: List[Tuple[UploadedFile, EpcPropertyData]] = [] for file_path in job_files: filename = os.path.basename(file_path) @@ -92,27 +110,40 @@ class PashubService: file_source=FileSourceEnum.PAS_HUB.value, file_type=infer_file_type(filename), ) + file_paths.append(file_path) uploaded_files.append(uploaded_file) - file_type: Optional[str] = cast(Optional[str], uploaded_file.file_type) - if file_type is not None and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE: - try: - site_notes_pairs.append( - (uploaded_file, parse_site_notes_pdf(file_path)) - ) - except Exception: - logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) - with db_session() as session: session.add_all(uploaded_files) session.flush() - - for uploaded_file, epc_data in site_notes_pairs: - save_epc_property_data( - session, epc_data, uploaded_file_id=cast(int, uploaded_file.id) + upload_records = [ + _FileUploadRecord( + file_path=fp, + file_type=cast(Optional[str], uf.file_type), + uploaded_file_id=cast(int, uf.id), ) + for fp, uf in zip(file_paths, uploaded_files) + ] - session.commit() + return upload_records + + def _save_site_notes(self, upload_records: List[_FileUploadRecord]) -> None: + for record in upload_records: + if ( + record.file_type is None + or FileTypeEnum(record.file_type) != FileTypeEnum.RD_SAP_SITE_NOTE + ): + continue + try: + epc_data = parse_site_notes_pdf(record.file_path) + with db_session() as session: + save_epc_property_data( + session, epc_data, uploaded_file_id=record.uploaded_file_id + ) + except Exception: + logger.warning( + f"Failed to parse site notes {record.file_path}", exc_info=True + ) def _upload_to_sharepoint( self,