Downloaded files carry evidence_category as DownloadedFile 🟩

This commit is contained in:
Daniel Roth 2026-06-02 14:27:54 +00:00
parent d68cbb44f8
commit bdc35573b5
4 changed files with 83 additions and 21 deletions

View file

@ -25,8 +25,8 @@ class _EvidenceFileGroups(NamedTuple):
class DownloadedFiles(NamedTuple):
core: List[str]
other: List[str]
core: List[DownloadedFile]
other: List[DownloadedFile]
class UnauthorizedError(Exception):
@ -62,7 +62,7 @@ class PashubClient:
evidence_list
)
core_paths: List[str] = []
core_files: List[DownloadedFile] = []
for _, evidence in grouped.core.items():
if not evidence.file_id:
continue
@ -73,9 +73,15 @@ class PashubClient:
file_path: str = os.path.join("/tmp", evidence.file_name)
self._download_file(download_url, file_path)
logger.info("Successfully downloaded file")
core_paths.append(file_path)
core_files.append(
DownloadedFile(
file_path=file_path,
evidence_category=evidence.evidence_category,
created_utc=datetime.fromisoformat(evidence.created_utc),
)
)
other_paths: List[str] = []
other_files: List[DownloadedFile] = []
if include_other:
for evidence in grouped.other:
@ -88,9 +94,15 @@ class PashubClient:
self._download_file(download_url, file_path)
logger.info("Successfully downloaded other file")
other_paths.append(file_path)
other_files.append(
DownloadedFile(
file_path=file_path,
evidence_category=evidence.evidence_category,
created_utc=datetime.fromisoformat(evidence.created_utc),
)
)
return DownloadedFiles(core=core_paths, other=other_paths)
return DownloadedFiles(core=core_files, other=other_files)
def get_uprn_by_job_id(self, job_id: str) -> Optional[str]:
logger.info(f"Getting UPRN for job ID {job_id}")

View file

@ -12,6 +12,7 @@ from backend.documents_parser.db_writer import save_epc_property_data
from backend.documents_parser.parser import parse_site_notes_pdf
from backend.pashub_fetcher.core_files import get_file_type_string
from backend.pashub_fetcher.pashub_client import (
DownloadedFile,
DownloadedFiles,
PashubClient,
UnauthorizedError,
@ -116,17 +117,17 @@ class PashubService:
# if request.sharepoint_link:
# self._upload_to_sharepoint(request.sharepoint_link, downloaded.core)
for file_path in downloaded.core + downloaded.other:
for df in downloaded.core + downloaded.other:
try:
os.remove(file_path)
os.remove(df.file_path)
except OSError:
logger.warning(f"Failed to delete temp file {file_path}")
logger.warning(f"Failed to delete temp file {df.file_path}")
return downloaded.core + downloaded.other
return [df.file_path for df in downloaded.core + downloaded.other]
def _upload_to_s3_and_update_db(
self,
job_files: List[str],
job_files: List[DownloadedFile],
uprn: Optional[str],
hubspot_deal_id: Optional[str],
file_source: FileSourceEnum,
@ -144,11 +145,11 @@ class PashubService:
file_paths: List[str] = []
uploaded_files: List[UploadedFile] = []
for file_path in job_files:
filename = os.path.basename(file_path)
for df in job_files:
filename = os.path.basename(df.file_path)
file_key = f"{base_path}/{filename}"
upload_file_to_s3(file_path, self._s3_bucket, file_key)
upload_file_to_s3(df.file_path, self._s3_bucket, file_key)
uploaded_file = UploadedFile(
s3_file_bucket=self._s3_bucket,
@ -157,9 +158,9 @@ class PashubService:
uprn=int(uprn) if uprn else None,
hubspot_deal_id=hubspot_deal_id,
file_source=file_source.value,
file_type=get_file_type_string(filename) or default_file_type,
file_type=get_file_type_string(filename, df.evidence_category) or default_file_type,
)
file_paths.append(file_path)
file_paths.append(df.file_path)
uploaded_files.append(uploaded_file)
with db_session() as session:

View file

@ -119,6 +119,47 @@ def test_group_into_core_and_other_files_picks_latest_when_both_candidates_have_
assert result.core[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf"
def test_group_into_core_and_other_files_classifies_mcs_cert_as_core() -> None:
# Arrange
client = make_client()
files = [
make_file(
file_name="MCS_cert_job123.pdf",
evidence_category="MCS Compliance Certificate",
),
]
# Act
result = client._group_into_core_and_other_files(files)
# Assert
assert CoreFiles.MCS_CERTIFICATE in result.core
assert result.other == []
def test_group_into_core_and_other_files_picks_most_recent_mcs_cert() -> None:
# Arrange
client = make_client()
files = [
make_file(
file_name="mcs_cert_old.pdf",
evidence_category="MCS Compliance Certificate",
created_utc="2024-01-01T00:00:00",
),
make_file(
file_name="mcs_cert_new.pdf",
evidence_category="MCS Compliance Certificate",
created_utc="2024-06-01T00:00:00",
),
]
# Act
result = client._group_into_core_and_other_files(files)
# Assert
assert result.core[CoreFiles.MCS_CERTIFICATE].file_name == "mcs_cert_new.pdf"
def test_group_into_core_and_other_files_falls_back_to_latest_when_no_osm_candidates() -> None:
# Arrange
client = make_client()
@ -165,7 +206,7 @@ def test_get_evidence_files_by_job_id_returns_downloaded_files_with_empty_other_
# Assert
assert isinstance(result, DownloadedFiles)
assert result.core == ["/tmp/SiteNote_001.pdf"]
assert [df.file_path for df in result.core] == ["/tmp/SiteNote_001.pdf"]
assert result.other == []
@ -209,5 +250,5 @@ def test_get_evidence_files_by_job_id_downloads_other_files_when_include_other_t
result = client.get_evidence_files_by_job_id("job-1", include_other=True)
# Assert
assert result.core == ["/tmp/SiteNote_001.pdf"]
assert result.other == ["/tmp/unknown_doc.pdf"]
assert [df.file_path for df in result.core] == ["/tmp/SiteNote_001.pdf"]
assert [df.file_path for df in result.other] == ["/tmp/unknown_doc.pdf"]

View file

@ -1,10 +1,12 @@
import pytest
from datetime import datetime
from typing import Any, Callable, Optional
from unittest.mock import MagicMock, call, patch
from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum
from backend.pashub_fetcher.pashub_client import (
DownloadedFile,
DownloadedFiles,
PashubClient,
UnauthorizedError,
@ -49,8 +51,14 @@ def make_service(
)
_DEFAULT_UTC = datetime(2024, 1, 1)
def make_downloaded(core: list[str], other: list[str] = []) -> DownloadedFiles:
return DownloadedFiles(core=core, other=other)
return DownloadedFiles(
core=[DownloadedFile(fp, None, _DEFAULT_UTC) for fp in core],
other=[DownloadedFile(fp, None, _DEFAULT_UTC) for fp in other],
)
# ---------------------------------------------------------------------------