From 609468cff961993a731668cebb8505e11a8f6fed Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 08:47:59 +0000 Subject: [PATCH] new methods for downloading all core files for pashub URL. Download currently not being authorised --- backend/pashub_fetcher/cotality_client.py | 77 ++++++++++++++--------- backend/pashub_fetcher/handler/Dockerfile | 2 +- backend/pashub_fetcher/handler/handler.py | 6 +- scripts/download_cotality_evidence.py | 2 +- 4 files changed, 53 insertions(+), 34 deletions(-) diff --git a/backend/pashub_fetcher/cotality_client.py b/backend/pashub_fetcher/cotality_client.py index 9deda776..0ae473b7 100644 --- a/backend/pashub_fetcher/cotality_client.py +++ b/backend/pashub_fetcher/cotality_client.py @@ -1,13 +1,23 @@ -from typing import List, Optional +from collections import defaultdict +from typing import Dict, List, Optional +from datetime import datetime, timezone import requests +from backend.pashub_fetcher.core_files import CoreFiles from backend.pashub_fetcher.evidence_file_data import EvidenceFileData from backend.pashub_fetcher.evidence_metadata import EvidenceMetadata +from utils.logger import setup_logger + + +logger = setup_logger() class CotalityClient: def __init__(self, token: str): + + logger.info(f"Container UTC time: {datetime.now(timezone.utc)}") + self.token = token self.company_id = "cb5249e2-8f31-4ef4-aefd-08ddaccb1fa2" self.base = "https://pashub.net/api" @@ -19,61 +29,68 @@ class CotalityClient: "Accept": "application/json", } ) + logger.info("Finished initialising CotalityClient") def get_core_envidence_files_by_job_id(self, job_id: str) -> List[str]: - # url = f"{self.base}/jobs/{job_id}/evidence" - - raise NotImplementedError - - def get_evidence_files_by_uprn(self, uprn: str) -> List[str]: - """ - Download evidence files for the most recent job for a UPRN. - Returns a list of saved filenames. - """ - - job_id: Optional[str] = self._get_latest_job_id(uprn) - if not job_id: - return [] - + logger.info(f"Getting Core Evidence Files for job ID {job_id}") evidence_list: List[EvidenceFileData] = self._get_evidence_list(job_id) + logger.info(f"Found {len(evidence_list)} Evidence files to get") if not evidence_list: return [] saved_files: List[str] = [] - for evidence in evidence_list: + core_files: Dict[CoreFiles, EvidenceFileData] = self._select_latest_core_files( + evidence_list + ) + + logger.info(f"Number of core files to download is {len(core_files)}") + + for _, evidence in core_files.items(): evidence_id = evidence.file_id if not evidence_id: continue + logger.info(f"Getting metadata for file {evidence.file_name}") metadata: EvidenceMetadata = self._get_evidence_metadata( job_id, evidence_id ) download_url: str = self._build_download_url(metadata, evidence.file_id) + logger.info(f"Download URL: {download_url}") file_name = evidence.file_name self._download_file(download_url, file_name) + logger.info("Successfully downloaded file") saved_files.append(file_name) return saved_files - def _get_latest_job_id(self, uprn: str) -> Optional[str]: - url = f"{self.base}/jobs" - params = { - "pageIndex": 0, - "pageSize": 20, - "orderBy": "createdUtc", - "orderDesc": "true", - "addressUprn": uprn, - "companyId": self.company_id, - } + def _get_core_file_type(self, file: EvidenceFileData) -> Optional[CoreFiles]: + for core_file in CoreFiles: + if file.file_name.startswith(core_file.value): + return core_file + return None - r = self.session.get(url, params=params) - r.raise_for_status() + def _select_latest_core_files( + self, + files: List[EvidenceFileData], + ) -> Dict[CoreFiles, EvidenceFileData]: + grouped: Dict[CoreFiles, List[EvidenceFileData]] = defaultdict(list) - jobs = r.json().get("results", []) - return jobs[0]["id"] if jobs else None + for file in files: + core_type = self._get_core_file_type(file) + if not core_type: + continue + grouped[core_type].append(file) + + latest_files: Dict[CoreFiles, EvidenceFileData] = {} + + for core_type, group in grouped.items(): + latest = max(group, key=lambda f: datetime.fromisoformat(f.created_utc)) + latest_files[core_type] = latest + + return latest_files def _get_evidence_list(self, job_id: str) -> List[EvidenceFileData]: url = f"{self.base}/jobs/{job_id}/evidence" diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile index 2128d12c..cbd3c228 100644 --- a/backend/pashub_fetcher/handler/Dockerfile +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -5,7 +5,7 @@ ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest RUN chmod +x /usr/local/bin/aws-lambda-rie # Install Lambda runtime client -RUN pip install awslambdaric playwright==1.58.0 +RUN pip install awslambdaric playwright==1.58.0 requests # Set working directory (Lambda task root) WORKDIR /var/task diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index b5ec4320..053ad2f4 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -20,7 +20,9 @@ def handler(event: Mapping[str, Any], context: Any) -> None: raise client = CotalityClient(token=token) - uprn = "100061885568" # TODO: get from request body + # uprn = "100061885568" # TODO: get from request body + job_id = "5abf6e27-e4c4-4ba8-b69d-9e34939e0002" # TODO: get from request body - saved_files: List[str] = client.get_evidence_files_by_uprn(uprn=uprn) + saved_files: List[str] = client.get_core_envidence_files_by_job_id(job_id) + # saved_files: List[str] = client.get_evidence_files_by_uprn(uprn=uprn) print(f"saved {len(saved_files)} files") diff --git a/scripts/download_cotality_evidence.py b/scripts/download_cotality_evidence.py index 93148a3a..76400a99 100644 --- a/scripts/download_cotality_evidence.py +++ b/scripts/download_cotality_evidence.py @@ -1,7 +1,7 @@ import requests import json -TOKEN = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6Ik1EUTRNRU5GUTBVNU9FUXpOelk1TVRFME0wUkdOMFpFUkRoR1JVVkJNVGMxT1RFNFJERXlPQSJ9.eyJodHRwOi8vZW1haWwiOiJzZWJhc3RpYW5Ab3Ntb3Npcy1hY2QuY29tIiwiaHR0cDovL2NsdWsudG9rZW4vbGFzdFBhc3N3b3JkQ2hhbmdlIjoiMjAyNS0wOC0yNlQwOTo1NDoyNi4zMjZaIiwiaHR0cDovL2NsdWsudG9rZW4vY29ubmVjdGlvbiI6ImVUZWNoSUQiLCJodHRwOi8vY2x1ay50b2tlbi9zdHJhdGVneSI6ImF1dGgwIiwiaHR0cDovL2NsdWsudG9rZW4vc3RyYXRlZ3lUeXBlIjoiZGF0YWJhc2UiLCJpc3MiOiJodHRwczovL2V0ZWNoaWQuZXUuYXV0aDAuY29tLyIsInN1YiI6ImF1dGgwfDY4YWQ4NDUyZDI2YzI1ZmMyMzkwZmYxYSIsImF1ZCI6WyJodHRwczovL3Bhc2h1Yi5hcGkuZXRlY2gubmV0IiwiaHR0cHM6Ly9ldGVjaGlkLmV1LmF1dGgwLmNvbS91c2VyaW5mbyJdLCJpYXQiOjE3NzQyNzg3NjIsImV4cCI6MTc3NDI4NTk2Miwic2NvcGUiOiJvcGVuaWQiLCJhenAiOiJEaVp6d3VVaTVkVmozOXR3NG00bWZ6emZvRm5MdmVLZyJ9.ESIbau52J7KXL22tM8GlO9eV0f0pCOFdoQGL2YcjsTEcSeucHBuI9lHXT2dNJn0E8qlgafjazaMkoMs2g0TiTUUZU6XsKqKpUAJy4kk-qKp53V5az7e2MG9uDSa5bB1vWsQQw37zaNVQ0FQkpYHSiFeGoBh1PjuKwCpLjbl94bx7S4bQKaJSZRUj5TS75k6HnSOhUtN9LYLMPRoLty7TwqFLDxgj8Ixl_ddEF3C3Y6Mcxa5UF57BNTnFXmLefqsryex0XV4b5Btu4W5wZ4bjhX2M7PSXbk4lTv1YZdQxWLpzvNpEVnFueawtqedGYipqH1v4bg99YUnXDbajd2SSVQ" +TOKEN = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6Ik1EUTRNRU5GUTBVNU9FUXpOelk1TVRFME0wUkdOMFpFUkRoR1JVVkJNVGMxT1RFNFJERXlPQSJ9.eyJodHRwOi8vZW1haWwiOiJzZWJhc3RpYW5Ab3Ntb3Npcy1hY2QuY29tIiwiaHR0cDovL2NsdWsudG9rZW4vbGFzdFBhc3N3b3JkQ2hhbmdlIjoiMjAyNS0wOC0yNlQwOTo1NDoyNi4zMjZaIiwiaHR0cDovL2NsdWsudG9rZW4vY29ubmVjdGlvbiI6ImVUZWNoSUQiLCJodHRwOi8vY2x1ay50b2tlbi9zdHJhdGVneSI6ImF1dGgwIiwiaHR0cDovL2NsdWsudG9rZW4vc3RyYXRlZ3lUeXBlIjoiZGF0YWJhc2UiLCJpc3MiOiJodHRwczovL2V0ZWNoaWQuZXUuYXV0aDAuY29tLyIsInN1YiI6ImF1dGgwfDY4YWQ4NDUyZDI2YzI1ZmMyMzkwZmYxYSIsImF1ZCI6WyJodHRwczovL3Bhc2h1Yi5hcGkuZXRlY2gubmV0IiwiaHR0cHM6Ly9ldGVjaGlkLmV1LmF1dGgwLmNvbS91c2VyaW5mbyJdLCJpYXQiOjE3NzQyODczOTMsImV4cCI6MTc3NDI5NDU5Mywic2NvcGUiOiJvcGVuaWQiLCJhenAiOiJEaVp6d3VVaTVkVmozOXR3NG00bWZ6emZvRm5MdmVLZyJ9.NHh21XfnRofsFkRkc-28Dz-vQAdY70lXkEmh-Mzz7Fg6gjDbZeMu7PnBwgbDP_U8r6R0mI_pDIUc1MzJe1Rf5SF2-RV36TcGzmVzb3ek9wPsy3lxST5WL-vn-qUJ7GsZiGOeQ-jDLLFn8b8tjFrD7BGv8uphrfYAbPDm0atznkdbUSQQy-rfRJWhisnDtHf99j96TuJz3dV4bfI6VGrin-jezbg6BCvUYWQtttUs7knQKEWO0sGGDxtS29sbn4MX8Jqz4-hf6N2XSlgv52aIDwTVX-lyMWzfoeuIGhvCKuDiJeVw2c0r2UZFpHqjnfhXcb0_aacukXe8z-srj8-Rdw" base = "https://pashub.net/api"