From bdc35573b57733465ad165d48224327f1ba9ecbc Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 2 Jun 2026 14:27:54 +0000 Subject: [PATCH] =?UTF-8?q?Downloaded=20files=20carry=20evidence=5Fcategor?= =?UTF-8?q?y=20as=20DownloadedFile=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/pashub_client.py | 26 +++++++--- backend/pashub_fetcher/pashub_service.py | 21 +++++---- .../tests/test_pashub_client.py | 47 +++++++++++++++++-- .../tests/test_pashub_service.py | 10 +++- 4 files changed, 83 insertions(+), 21 deletions(-) diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index a6284d27..e10fbec7 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -25,8 +25,8 @@ class _EvidenceFileGroups(NamedTuple): class DownloadedFiles(NamedTuple): - core: List[str] - other: List[str] + core: List[DownloadedFile] + other: List[DownloadedFile] class UnauthorizedError(Exception): @@ -62,7 +62,7 @@ class PashubClient: evidence_list ) - core_paths: List[str] = [] + core_files: List[DownloadedFile] = [] for _, evidence in grouped.core.items(): if not evidence.file_id: continue @@ -73,9 +73,15 @@ class PashubClient: file_path: str = os.path.join("/tmp", evidence.file_name) self._download_file(download_url, file_path) logger.info("Successfully downloaded file") - core_paths.append(file_path) + core_files.append( + DownloadedFile( + file_path=file_path, + evidence_category=evidence.evidence_category, + created_utc=datetime.fromisoformat(evidence.created_utc), + ) + ) - other_paths: List[str] = [] + other_files: List[DownloadedFile] = [] if include_other: for evidence in grouped.other: @@ -88,9 +94,15 @@ class PashubClient: self._download_file(download_url, file_path) logger.info("Successfully downloaded other file") - other_paths.append(file_path) + other_files.append( + DownloadedFile( + file_path=file_path, + evidence_category=evidence.evidence_category, + created_utc=datetime.fromisoformat(evidence.created_utc), + ) + ) - return DownloadedFiles(core=core_paths, other=other_paths) + return DownloadedFiles(core=core_files, other=other_files) def get_uprn_by_job_id(self, job_id: str) -> Optional[str]: logger.info(f"Getting UPRN for job ID {job_id}") diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index 8442adc3..881fc583 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -12,6 +12,7 @@ from backend.documents_parser.db_writer import save_epc_property_data from backend.documents_parser.parser import parse_site_notes_pdf from backend.pashub_fetcher.core_files import get_file_type_string from backend.pashub_fetcher.pashub_client import ( + DownloadedFile, DownloadedFiles, PashubClient, UnauthorizedError, @@ -116,17 +117,17 @@ class PashubService: # if request.sharepoint_link: # self._upload_to_sharepoint(request.sharepoint_link, downloaded.core) - for file_path in downloaded.core + downloaded.other: + for df in downloaded.core + downloaded.other: try: - os.remove(file_path) + os.remove(df.file_path) except OSError: - logger.warning(f"Failed to delete temp file {file_path}") + logger.warning(f"Failed to delete temp file {df.file_path}") - return downloaded.core + downloaded.other + return [df.file_path for df in downloaded.core + downloaded.other] def _upload_to_s3_and_update_db( self, - job_files: List[str], + job_files: List[DownloadedFile], uprn: Optional[str], hubspot_deal_id: Optional[str], file_source: FileSourceEnum, @@ -144,11 +145,11 @@ class PashubService: file_paths: List[str] = [] uploaded_files: List[UploadedFile] = [] - for file_path in job_files: - filename = os.path.basename(file_path) + for df in job_files: + filename = os.path.basename(df.file_path) file_key = f"{base_path}/{filename}" - upload_file_to_s3(file_path, self._s3_bucket, file_key) + upload_file_to_s3(df.file_path, self._s3_bucket, file_key) uploaded_file = UploadedFile( s3_file_bucket=self._s3_bucket, @@ -157,9 +158,9 @@ class PashubService: uprn=int(uprn) if uprn else None, hubspot_deal_id=hubspot_deal_id, file_source=file_source.value, - file_type=get_file_type_string(filename) or default_file_type, + file_type=get_file_type_string(filename, df.evidence_category) or default_file_type, ) - file_paths.append(file_path) + file_paths.append(df.file_path) uploaded_files.append(uploaded_file) with db_session() as session: diff --git a/backend/pashub_fetcher/tests/test_pashub_client.py b/backend/pashub_fetcher/tests/test_pashub_client.py index f072731b..50c91b85 100644 --- a/backend/pashub_fetcher/tests/test_pashub_client.py +++ b/backend/pashub_fetcher/tests/test_pashub_client.py @@ -119,6 +119,47 @@ def test_group_into_core_and_other_files_picks_latest_when_both_candidates_have_ assert result.core[CoreFiles.RETROFIT_DESIGN_DOC].file_name == "2603-OSM-B06M901-XX-DR-N-A_Alvaston Walk 022.pdf" +def test_group_into_core_and_other_files_classifies_mcs_cert_as_core() -> None: + # Arrange + client = make_client() + files = [ + make_file( + file_name="MCS_cert_job123.pdf", + evidence_category="MCS Compliance Certificate", + ), + ] + + # Act + result = client._group_into_core_and_other_files(files) + + # Assert + assert CoreFiles.MCS_CERTIFICATE in result.core + assert result.other == [] + + +def test_group_into_core_and_other_files_picks_most_recent_mcs_cert() -> None: + # Arrange + client = make_client() + files = [ + make_file( + file_name="mcs_cert_old.pdf", + evidence_category="MCS Compliance Certificate", + created_utc="2024-01-01T00:00:00", + ), + make_file( + file_name="mcs_cert_new.pdf", + evidence_category="MCS Compliance Certificate", + created_utc="2024-06-01T00:00:00", + ), + ] + + # Act + result = client._group_into_core_and_other_files(files) + + # Assert + assert result.core[CoreFiles.MCS_CERTIFICATE].file_name == "mcs_cert_new.pdf" + + def test_group_into_core_and_other_files_falls_back_to_latest_when_no_osm_candidates() -> None: # Arrange client = make_client() @@ -165,7 +206,7 @@ def test_get_evidence_files_by_job_id_returns_downloaded_files_with_empty_other_ # Assert assert isinstance(result, DownloadedFiles) - assert result.core == ["/tmp/SiteNote_001.pdf"] + assert [df.file_path for df in result.core] == ["/tmp/SiteNote_001.pdf"] assert result.other == [] @@ -209,5 +250,5 @@ def test_get_evidence_files_by_job_id_downloads_other_files_when_include_other_t result = client.get_evidence_files_by_job_id("job-1", include_other=True) # Assert - assert result.core == ["/tmp/SiteNote_001.pdf"] - assert result.other == ["/tmp/unknown_doc.pdf"] + assert [df.file_path for df in result.core] == ["/tmp/SiteNote_001.pdf"] + assert [df.file_path for df in result.other] == ["/tmp/unknown_doc.pdf"] diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py index df77287d..988dc854 100644 --- a/backend/pashub_fetcher/tests/test_pashub_service.py +++ b/backend/pashub_fetcher/tests/test_pashub_service.py @@ -1,10 +1,12 @@ import pytest +from datetime import datetime from typing import Any, Callable, Optional from unittest.mock import MagicMock, call, patch from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum from backend.pashub_fetcher.pashub_client import ( + DownloadedFile, DownloadedFiles, PashubClient, UnauthorizedError, @@ -49,8 +51,14 @@ def make_service( ) +_DEFAULT_UTC = datetime(2024, 1, 1) + + def make_downloaded(core: list[str], other: list[str] = []) -> DownloadedFiles: - return DownloadedFiles(core=core, other=other) + return DownloadedFiles( + core=[DownloadedFile(fp, None, _DEFAULT_UTC) for fp in core], + other=[DownloadedFile(fp, None, _DEFAULT_UTC) for fp in other], + ) # ---------------------------------------------------------------------------