Model/backend/pashub_fetcher/pashub_service.py

190 lines
6.8 KiB
Python

import os
from datetime import datetime, timezone
from typing import Callable, List, NamedTuple, Optional, cast
from backend.app.db.connection import db_session
from backend.app.db.models.uploaded_file import (
FileSourceEnum,
FileTypeEnum,
UploadedFile,
)
from backend.documents_parser.db_writer import save_epc_property_data
from backend.documents_parser.parser import parse_site_notes_pdf
from backend.pashub_fetcher.core_files import get_file_type_string
from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError
from backend.pashub_fetcher.pashub_to_ara_trigger_request import (
PashubToAraTriggerRequest,
)
from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders
from datatypes.epc.domain.epc_property_data import EpcPropertyData
from utils.logger import setup_logger
from utils.s3 import upload_file_to_s3
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
# Module-level logger; PashubService uses it for progress (info) and
# best-effort failure (warning) messages.
logger = setup_logger()
class _FileUploadRecord(NamedTuple):
    """Link between a local file and the ``UploadedFile`` row created for it.

    Built after the rows have been flushed so that the row id is known, and
    consumed when deciding which files to parse as site notes.
    """

    # Path of the file on local disk (the same path that was uploaded to S3).
    file_path: str
    # File-type string stored on the UploadedFile row; may be None.
    file_type: Optional[str]
    # Id of the UploadedFile row created for this file.
    uploaded_file_id: int
class PashubService:
    """Copy a PasHub job's core evidence files into S3 and the database.

    The service talks to PasHub through ``pashub_client``. When that client is
    rejected with ``UnauthorizedError`` it falls back to a second client that
    is built lazily by ``coordination_client_factory`` (CoordinationHub
    credentials) and reused for the rest of the service's lifetime.
    """

    def __init__(
        self,
        pashub_client: PashubClient,
        sharepoint_client: DomnaSharepointClient,
        s3_bucket: str,
        coordination_client_factory: Optional[Callable[[], PashubClient]] = None,
    ) -> None:
        """Store the collaborators; only attribute assignments happen here.

        Args:
            pashub_client: Primary client used for every PasHub call.
            sharepoint_client: Client used by ``_upload_to_sharepoint`` (which
                ``run`` currently does not call).
            s3_bucket: Bucket that files are uploaded to and that is recorded
                on each ``UploadedFile`` row.
            coordination_client_factory: Optional zero-argument callable that
                builds the fallback client. It is invoked at most once, the
                first time a fallback is needed.
        """
        self._pashub_client = pashub_client
        self._sharepoint_client = sharepoint_client
        self._s3_bucket = s3_bucket
        self._coordination_client_factory = coordination_client_factory
        # Created on first use by _get_coordination_client().
        self._coordination_client: Optional[PashubClient] = None

    def _get_coordination_client(self) -> PashubClient:
        """Return the fallback client, building and caching it on first use.

        Raises:
            UnauthorizedError: If no factory was supplied, i.e. there is no
                alternative set of credentials to retry with.
        """
        if self._coordination_client_factory is None:
            raise UnauthorizedError("No coordination client factory configured")
        if self._coordination_client is None:
            self._coordination_client = self._coordination_client_factory()
        return self._coordination_client

    def run(self, request: PashubToAraTriggerRequest) -> List[str]:
        """Fetch the job's core evidence files, store them, then clean up.

        Steps:
            1. Resolve the UPRN: use ``request.uprn`` when set, otherwise look
               it up by job id (retrying with the fallback client when the
               primary client is unauthorized).
            2. Fetch the core evidence files as local file paths, with the
               same one-time fallback.
            3. When a UPRN or a HubSpot deal id is known, upload the files to
               S3, record them in the database and parse any site notes.
            4. Delete the local files. This happens even when step 3 raises,
               so a failed upload does not leave temp files behind.

        Args:
            request: Trigger request carrying ``pashub_job_id`` and the
                optional ``uprn`` / ``hubspot_deal_id``.

        Returns:
            The local paths that were fetched. By the time this returns the
            files have been removed from disk (unless a deletion failed, which
            is only logged).

        Raises:
            UnauthorizedError: If the fallback client is rejected as well, or
                if a fallback is needed but no factory was configured.
        """
        job_id = request.pashub_job_id
        active_client = self._pashub_client

        if request.uprn:
            uprn: Optional[str] = request.uprn
        else:
            try:
                uprn = active_client.get_uprn_by_job_id(job_id)
            except UnauthorizedError:
                logger.info(
                    f"PasHub credentials unauthorized for job {job_id}; retrying with CoordinationHub credentials"
                )
                active_client = self._get_coordination_client()
                uprn = active_client.get_uprn_by_job_id(job_id)

        hubspot_deal_id: Optional[str] = request.hubspot_deal_id

        if uprn:
            logger.info(f"Got UPRN {uprn} for job {job_id}")
        else:
            logger.info(f"No UPRN found for job {job_id}")

        try:
            job_files: List[str] = active_client.get_core_evidence_files_by_job_id(
                job_id
            )
        except UnauthorizedError:
            # Already running on the fallback client: nothing left to retry with.
            if active_client is not self._pashub_client:
                raise
            active_client = self._get_coordination_client()
            job_files = active_client.get_core_evidence_files_by_job_id(job_id)

        try:
            if uprn or hubspot_deal_id:
                logger.info("Uploading files to s3")
                # The source label reflects whichever client produced the files.
                file_source = (
                    FileSourceEnum.PAS_HUB
                    if active_client is self._pashub_client
                    else FileSourceEnum.COORDINATION_HUB
                )
                upload_records = self._upload_to_s3_and_update_db(
                    job_files, uprn, hubspot_deal_id, file_source
                )
                self._save_site_notes(upload_records)

            # SharePoint upload disabled: pashub sharepoint_link is inconsistent
            # (points to property or project unpredictably)
            # if request.sharepoint_link:
            #     self._upload_to_sharepoint(request.sharepoint_link, job_files)
        finally:
            # Always drop the local copies, including when the upload or the
            # database write above raised.
            self._remove_temp_files(job_files)

        return job_files

    @staticmethod
    def _remove_temp_files(file_paths: List[str]) -> None:
        """Delete each local file; a failed deletion is logged, never raised."""
        for file_path in file_paths:
            try:
                os.remove(file_path)
            except OSError:
                logger.warning(f"Failed to delete temp file {file_path}")

    def _upload_to_s3_and_update_db(
        self,
        job_files: List[str],
        uprn: Optional[str],
        hubspot_deal_id: Optional[str],
        file_source: FileSourceEnum,
    ) -> List[_FileUploadRecord]:
        """Upload every file to S3 and insert one ``UploadedFile`` row per file.

        Files are stored under ``documents/uprn/<uprn>/`` when a UPRN is known,
        otherwise under ``documents/hubspot_deal_id/<deal id>/``.

        Args:
            job_files: Local paths of the files to upload.
            uprn: Property UPRN; stored on the row as an ``int``.
            hubspot_deal_id: HubSpot deal id, stored on the row as given.
            file_source: Origin of the files; its ``value`` is stored.

        Returns:
            One record per uploaded file, in the order of ``job_files``, or an
            empty list when neither ``uprn`` nor ``hubspot_deal_id`` is set.

        Raises:
            ValueError: If ``uprn`` is set but is not a valid integer string.
        """
        if not uprn and not hubspot_deal_id:
            return []

        base_path = (
            f"documents/uprn/{uprn}"
            if uprn
            else f"documents/hubspot_deal_id/{hubspot_deal_id}"
        )

        uploaded_files: List[UploadedFile] = []
        for file_path in job_files:
            filename = os.path.basename(file_path)
            file_key = f"{base_path}/{filename}"
            upload_file_to_s3(file_path, self._s3_bucket, file_key)
            uploaded_files.append(
                UploadedFile(
                    s3_file_bucket=self._s3_bucket,
                    s3_file_key=file_key,
                    s3_upload_timestamp=datetime.now(timezone.utc),
                    uprn=int(uprn) if uprn else None,
                    hubspot_deal_id=hubspot_deal_id,
                    file_source=file_source.value,
                    file_type=get_file_type_string(filename),
                )
            )

        with db_session() as session:
            session.add_all(uploaded_files)
            # Flush before reading `id`; the records are built while the
            # session is still open (presumably so the generated ids are
            # populated and readable - confirm against db_session).
            session.flush()
            upload_records = [
                _FileUploadRecord(
                    file_path=fp,
                    file_type=cast(Optional[str], uf.file_type),
                    uploaded_file_id=cast(int, uf.id),
                )
                for fp, uf in zip(job_files, uploaded_files)
            ]
        return upload_records

    def _save_site_notes(self, upload_records: List[_FileUploadRecord]) -> None:
        """Parse every RdSAP site-note file and save the extracted EPC data.

        Records whose type is missing or is not ``RD_SAP_SITE_NOTE`` are
        skipped. Parsing and saving are best-effort: any failure for one file
        is logged with its traceback and the loop moves on to the next file.
        """
        for record in upload_records:
            # NOTE(review): FileTypeEnum(...) sits outside the try below, so a
            # file_type string that is not a member value would raise
            # ValueError here - confirm get_file_type_string only returns
            # FileTypeEnum values.
            if (
                record.file_type is None
                or FileTypeEnum(record.file_type) != FileTypeEnum.RD_SAP_SITE_NOTE
            ):
                continue
            try:
                epc_data: EpcPropertyData = parse_site_notes_pdf(record.file_path)
                with db_session() as session:
                    save_epc_property_data(
                        session, epc_data, uploaded_file_id=record.uploaded_file_id
                    )
            except Exception:
                logger.warning(
                    f"Failed to parse site notes {record.file_path}", exc_info=True
                )

    def _upload_to_sharepoint(
        self,
        sharepoint_link: str,
        job_files: List[str],
    ) -> None:
        """Upload every file into the assessment subfolder of ``sharepoint_link``.

        Args:
            sharepoint_link: Base SharePoint location for the job.
            job_files: Local paths of the files to upload.
        """
        assessment_path = f"{sharepoint_link}/{SharepointSubfolders.ASSESSMENT.value}"
        for file_path in job_files:
            # Same file-name extraction as the S3 upload path.
            filename = os.path.basename(file_path)
            self._sharepoint_client.upload_file(file_path, assessment_path, filename)