From 8a17ea7265aad5d542782688872525cf30058542 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 19 Mar 2026 15:46:31 +0000 Subject: [PATCH 01/28] set up docker and handler --- backend/pashub_fetcher/handler/Dockerfile | 16 ++++++++++++++++ backend/pashub_fetcher/handler/handler.py | 10 ++++++++++ backend/pashub_fetcher/processor.py | 0 3 files changed, 26 insertions(+) create mode 100644 backend/pashub_fetcher/handler/Dockerfile create mode 100644 backend/pashub_fetcher/handler/handler.py create mode 100644 backend/pashub_fetcher/processor.py diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile new file mode 100644 index 00000000..a67304ad --- /dev/null +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -0,0 +1,16 @@ +FROM mcr.microsoft.com/playwright/python:v1.42.0 + +# Set working directory (Lambda task root) +WORKDIR /var/task + +COPY backend/.env.test backend/.env + +COPY utils/ utils/ +COPY backend/pashub_fetcher/ backend/pashub_fetcher/ + +# ----------------------------- +# Lambda handler +# ----------------------------- +# CMD ["backend/pashub_fetcher/handler/handler.handler"] +# For local running +CMD ["python", "-m", "backend.pashub_fetcher.handler.handler"] \ No newline at end of file diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py new file mode 100644 index 00000000..8fd7e175 --- /dev/null +++ b/backend/pashub_fetcher/handler/handler.py @@ -0,0 +1,10 @@ +from typing import Any, Mapping + +from utils.logger import setup_logger + + +logger = setup_logger() + + +def handler(event: Mapping[str, Any], context: Any) -> None: + logger.info("Recevied message") diff --git a/backend/pashub_fetcher/processor.py b/backend/pashub_fetcher/processor.py new file mode 100644 index 00000000..e69de29b From 142024550e1d0fcea3632b48362702f6918ba9a4 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 19 Mar 2026 16:22:53 +0000 Subject: [PATCH 02/28] set up local lambda runner and invoker --- backend/pashub_fetcher/handler/Dockerfile | 13 +++++++++- backend/pashub_fetcher/handler/handler.py | 2 +- .../local_handler/docker-compose.yml} | 6 ++--- .../local_handler/invoke_local_lambda.py | 26 +++++++++++++++++++ 4 files changed, 42 insertions(+), 5 deletions(-) rename backend/{docker-compose-local-lambdas.yml => pashub_fetcher/local_handler/docker-compose.yml} (50%) create mode 100644 backend/pashub_fetcher/local_handler/invoke_local_lambda.py diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile index a67304ad..1534afdb 100644 --- a/backend/pashub_fetcher/handler/Dockerfile +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -1,5 +1,12 @@ FROM mcr.microsoft.com/playwright/python:v1.42.0 +# Install AWS Lambda RIE +ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/local/bin/aws-lambda-rie +RUN chmod +x /usr/local/bin/aws-lambda-rie + +# Install Lambda runtime client +RUN pip install awslambdaric + # Set working directory (Lambda task root) WORKDIR /var/task @@ -8,9 +15,13 @@ COPY backend/.env.test backend/.env COPY utils/ utils/ COPY backend/pashub_fetcher/ backend/pashub_fetcher/ + +# Lambda entrypoint +ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] + # ----------------------------- # Lambda handler # ----------------------------- # CMD ["backend/pashub_fetcher/handler/handler.handler"] # For local running -CMD ["python", "-m", "backend.pashub_fetcher.handler.handler"] \ No newline at end of file +CMD ["backend.pashub_fetcher.handler.handler.handler"] \ No newline at end of file diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 8fd7e175..00fbd6b5 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -7,4 +7,4 @@ logger = setup_logger() def handler(event: Mapping[str, Any], context: Any) -> None: - logger.info("Recevied message") + logger.info("Received message") diff --git a/backend/docker-compose-local-lambdas.yml b/backend/pashub_fetcher/local_handler/docker-compose.yml similarity index 50% rename from backend/docker-compose-local-lambdas.yml rename to backend/pashub_fetcher/local_handler/docker-compose.yml index 50e9193b..0ee53283 100644 --- a/backend/docker-compose-local-lambdas.yml +++ b/backend/pashub_fetcher/local_handler/docker-compose.yml @@ -3,9 +3,9 @@ version: "3.9" services: categorisation-lambda: build: - context: ../ - dockerfile: backend/categorisation/handler/Dockerfile + context: ../../../ + dockerfile: backend/pashub_fetcher/handler/Dockerfile ports: - "9000:8080" env_file: - - ../.env \ No newline at end of file + - ../../../.env \ No newline at end of file diff --git a/backend/pashub_fetcher/local_handler/invoke_local_lambda.py b/backend/pashub_fetcher/local_handler/invoke_local_lambda.py new file mode 100644 index 00000000..463ef9d8 --- /dev/null +++ b/backend/pashub_fetcher/local_handler/invoke_local_lambda.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +import json +import requests + +HOST = "localhost" +PORT = "9000" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" + +payload = { + "Records": [ + { + "body": json.dumps( + { + "uprn": 123456, + } + ) + } + ] +} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) From a146c7d82e25bee4a75e8fc770596bedc159ae09 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 19 Mar 2026 16:45:53 +0000 Subject: [PATCH 03/28] go to pas hub with playwright --- .devcontainer/backend/requirements.txt | 1 + backend/pashub_fetcher/handler/Dockerfile | 4 ++-- backend/pashub_fetcher/handler/handler.py | 12 +++++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.devcontainer/backend/requirements.txt b/.devcontainer/backend/requirements.txt index 5cd40ced..d7afa6a2 100644 --- a/.devcontainer/backend/requirements.txt +++ b/.devcontainer/backend/requirements.txt @@ -6,6 +6,7 @@ psycopg2-binary==2.9.10 python-jose==3.3.0 cryptography==43.0.3 mangum==0.19.0 +playwright==1.58.0 # AWS boto3==1.35.44 # Data diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile index 1534afdb..2128d12c 100644 --- a/backend/pashub_fetcher/handler/Dockerfile +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -1,11 +1,11 @@ -FROM mcr.microsoft.com/playwright/python:v1.42.0 +FROM mcr.microsoft.com/playwright/python:v1.58.0-jammy # Install AWS Lambda RIE ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/local/bin/aws-lambda-rie RUN chmod +x /usr/local/bin/aws-lambda-rie # Install Lambda runtime client -RUN pip install awslambdaric +RUN pip install awslambdaric playwright==1.58.0 # Set working directory (Lambda task root) WORKDIR /var/task diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 00fbd6b5..2fe2af1d 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,5 +1,7 @@ from typing import Any, Mapping +from playwright.sync_api import sync_playwright + from utils.logger import setup_logger @@ -7,4 +9,12 @@ logger = setup_logger() def handler(event: Mapping[str, Any], context: Any) -> None: - logger.info("Received message") + logger.info("Starting Playwright flow") + + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + page = browser.new_page() + + page.goto("https://pashub.net/") + + logger.info(f"Page title: {page.title()}") From b156617a2b1248bff533ad5acd0ae11beacba5c8 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 19 Mar 2026 17:04:49 +0000 Subject: [PATCH 04/28] try logging into pas hub --- backend/pashub_fetcher/handler/handler.py | 46 ++++++++++++++++--- .../local_handler/docker-compose.yml | 2 +- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 2fe2af1d..3d22ef44 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,6 +1,6 @@ -from typing import Any, Mapping +from typing import Any, Dict, Mapping -from playwright.sync_api import sync_playwright +from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError from utils.logger import setup_logger @@ -8,13 +8,47 @@ from utils.logger import setup_logger logger = setup_logger() -def handler(event: Mapping[str, Any], context: Any) -> None: +def handler(event: Mapping[str, Any], context: Any) -> Dict[str, str]: logger.info("Starting Playwright flow") + email = "random@test.com" + password = "my_fake_password" + with sync_playwright() as p: - browser = p.chromium.launch(headless=True) + browser = p.chromium.launch( + headless=True, + args=["--no-sandbox", "--disable-dev-shm-usage"], + ) page = browser.new_page() - page.goto("https://pashub.net/") + try: + logger.info("Navigating to site...") + page.goto("https://pashub.net/", timeout=30000) - logger.info(f"Page title: {page.title()}") + logger.info("Filling login form...") + page.fill("#email", email) + page.fill("#password", password) + + logger.info("Submitting login...") + page.click("#btn-login") + + page.wait_for_timeout(3000) + + if "login" in page.url.lower(): + logger.error("Login failed (still on login page)") + return {"status": "error", "message": "Login failed"} + + logger.info(f"Login likely successful. URL: {page.url}") + + return {"status": "ok"} + + except PlaywrightTimeoutError as e: + logger.error(f"Timeout during login flow: {str(e)}") + return {"status": "error", "message": "Timeout during login"} + + except Exception as e: + logger.error(f"Unexpected error: {str(e)}") + return {"status": "error", "message": str(e)} + + finally: + browser.close() diff --git a/backend/pashub_fetcher/local_handler/docker-compose.yml b/backend/pashub_fetcher/local_handler/docker-compose.yml index 0ee53283..34ba9277 100644 --- a/backend/pashub_fetcher/local_handler/docker-compose.yml +++ b/backend/pashub_fetcher/local_handler/docker-compose.yml @@ -1,7 +1,7 @@ version: "3.9" services: - categorisation-lambda: + pashub-fetcher-lambda: build: context: ../../../ dockerfile: backend/pashub_fetcher/handler/Dockerfile From 4d641af0c135b11973dc29c118fabd715f205ea9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 23 Mar 2026 09:36:57 +0000 Subject: [PATCH 05/28] extract token from localStorage after logging in --- backend/pashub_fetcher/handler/handler.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 3d22ef44..a5cbe55a 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -40,7 +40,19 @@ def handler(event: Mapping[str, Any], context: Any) -> Dict[str, str]: logger.info(f"Login likely successful. URL: {page.url}") - return {"status": "ok"} + token = page.evaluate( + """() => { + return localStorage.getItem('token'); + }""" + ) + + if not token: + logger.error("Login succeeded but no token found") + return {"status": "error", "message": "No token found"} + + logger.info(f"Token extracted successfully: {token}") + + return {"status": "ok", "token": token} except PlaywrightTimeoutError as e: logger.error(f"Timeout during login flow: {str(e)}") From b8c0c9ea653df2818aa75bf41af32e2783b39a5a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 23 Mar 2026 09:56:53 +0000 Subject: [PATCH 06/28] move playwright process to separate file --- backend/pashub_fetcher/handler/handler.py | 69 ++++------------------- backend/pashub_fetcher/processor.py | 0 backend/pashub_fetcher/token_getter.py | 54 ++++++++++++++++++ 3 files changed, 65 insertions(+), 58 deletions(-) delete mode 100644 backend/pashub_fetcher/processor.py create mode 100644 backend/pashub_fetcher/token_getter.py diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index a5cbe55a..11f457a6 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,66 +1,19 @@ -from typing import Any, Dict, Mapping - -from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError +from typing import Any, Mapping +from backend.pashub_fetcher.token_getter import get_token_from_local_storage from utils.logger import setup_logger logger = setup_logger() -def handler(event: Mapping[str, Any], context: Any) -> Dict[str, str]: - logger.info("Starting Playwright flow") +def handler(event: Mapping[str, Any], context: Any) -> None: + pas_hub_email = "random@test.com" + pas_hub_password = "my_fake_password" - email = "random@test.com" - password = "my_fake_password" - - with sync_playwright() as p: - browser = p.chromium.launch( - headless=True, - args=["--no-sandbox", "--disable-dev-shm-usage"], - ) - page = browser.new_page() - - try: - logger.info("Navigating to site...") - page.goto("https://pashub.net/", timeout=30000) - - logger.info("Filling login form...") - page.fill("#email", email) - page.fill("#password", password) - - logger.info("Submitting login...") - page.click("#btn-login") - - page.wait_for_timeout(3000) - - if "login" in page.url.lower(): - logger.error("Login failed (still on login page)") - return {"status": "error", "message": "Login failed"} - - logger.info(f"Login likely successful. URL: {page.url}") - - token = page.evaluate( - """() => { - return localStorage.getItem('token'); - }""" - ) - - if not token: - logger.error("Login succeeded but no token found") - return {"status": "error", "message": "No token found"} - - logger.info(f"Token extracted successfully: {token}") - - return {"status": "ok", "token": token} - - except PlaywrightTimeoutError as e: - logger.error(f"Timeout during login flow: {str(e)}") - return {"status": "error", "message": "Timeout during login"} - - except Exception as e: - logger.error(f"Unexpected error: {str(e)}") - return {"status": "error", "message": str(e)} - - finally: - browser.close() + try: + token: str = get_token_from_local_storage(pas_hub_email, pas_hub_password) + logger.info(f"Token extracted successfully: {token}") + except: + logger.error("Error getting auth token from Pas Hub") + raise diff --git a/backend/pashub_fetcher/processor.py b/backend/pashub_fetcher/processor.py deleted file mode 100644 index e69de29b..00000000 diff --git a/backend/pashub_fetcher/token_getter.py b/backend/pashub_fetcher/token_getter.py new file mode 100644 index 00000000..d5481dd5 --- /dev/null +++ b/backend/pashub_fetcher/token_getter.py @@ -0,0 +1,54 @@ +from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError + +from utils.logger import setup_logger + +logger = setup_logger() + + +def get_token_from_local_storage(email: str, password: str) -> str: + logger.info("Starting Playwright flow") + + with sync_playwright() as p: + browser = p.chromium.launch( + headless=True, + args=["--no-sandbox", "--disable-dev-shm-usage"], + ) + page = browser.new_page() + + try: + logger.info("Navigating to site...") + page.goto("https://pashub.net/", timeout=30000) + + logger.info("Filling login form...") + page.fill("#email", email) + page.fill("#password", password) + + logger.info("Submitting login...") + page.click("#btn-login") + + page.wait_for_timeout(3000) + + if "login" in page.url.lower(): + raise Exception("Login failed (still on login page)") + + logger.info(f"Login likely successful. URL: {page.url}") + + token = page.evaluate( + """() => { + return localStorage.getItem('token'); + }""" + ) + + if not token: + raise Exception("Login succeeded but no token found") + + return token + + except PlaywrightTimeoutError as e: + raise Exception(f"Timeout during login flow: {str(e)}") + + except Exception as e: + raise Exception(f"Unexpected error: {str(e)}") + + finally: + browser.close() From cd514b6e5d630e4a9a665eee54b8a3994067c061 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 23 Mar 2026 10:29:42 +0000 Subject: [PATCH 07/28] add and implement cotality_client --- backend/pashub_fetcher/handler/handler.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 11f457a6..a9288783 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,5 +1,6 @@ -from typing import Any, Mapping +from typing import Any, List, Mapping +from backend.pashub_fetcher.cotality_client import CotalityClient from backend.pashub_fetcher.token_getter import get_token_from_local_storage from utils.logger import setup_logger @@ -17,3 +18,9 @@ def handler(event: Mapping[str, Any], context: Any) -> None: except: logger.error("Error getting auth token from Pas Hub") raise + + client = CotalityClient(token=token) + uprn = "100061885568" # TODO: get from request body + + saved_files: List[str] = client.get_evidence_files(uprn=uprn) + print(f"saved {len(saved_files)} files") From 3dc14480e9dd7f41a175e5c9f511f979afc09703 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 23 Mar 2026 10:31:30 +0000 Subject: [PATCH 08/28] rename evidence file getter method --- backend/pashub_fetcher/handler/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index a9288783..b5ec4320 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -22,5 +22,5 @@ def handler(event: Mapping[str, Any], context: Any) -> None: client = CotalityClient(token=token) uprn = "100061885568" # TODO: get from request body - saved_files: List[str] = client.get_evidence_files(uprn=uprn) + saved_files: List[str] = client.get_evidence_files_by_uprn(uprn=uprn) print(f"saved {len(saved_files)} files") From 6617d9e6145e9d903dafdbc6bdb70fc30488113e Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 23 Mar 2026 16:16:20 +0000 Subject: [PATCH 09/28] improved typing --- backend/pashub_fetcher/core_files.py | 13 +++ backend/pashub_fetcher/cotality_client.py | 110 +++++++++++++++++++ backend/pashub_fetcher/evidence_file_data.py | 25 +++++ backend/pashub_fetcher/evidence_metadata.py | 16 +++ scripts/download_cotality_evidence.py | 2 +- 5 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 backend/pashub_fetcher/core_files.py create mode 100644 backend/pashub_fetcher/cotality_client.py create mode 100644 backend/pashub_fetcher/evidence_file_data.py create mode 100644 backend/pashub_fetcher/evidence_metadata.py diff --git a/backend/pashub_fetcher/core_files.py b/backend/pashub_fetcher/core_files.py new file mode 100644 index 00000000..82637f70 --- /dev/null +++ b/backend/pashub_fetcher/core_files.py @@ -0,0 +1,13 @@ +from enum import Enum + + +class CoreFiles(Enum): + PHOTOPACK = "Photopack" + SITENOTE = "SiteNote" + RDSAP_SITENOTE = "RdSAP_SiteNote" + PAS2023_VENTILATION = "PAS 2023 Ventilation Assessment Report" + PAS2023_CONDITION = "PAS 2023 Condition Report" + PAS_SIGNIFICANCE = "PAS Significance" + PAR_PHOTOPACK = "PAR Photo Pack" + PAS2023_PROPERTY = "PAS 2023 Property Assessment Report" + PAS2023_OCCUPANCY = "PAS 2023 Occupancy Assessment Report" diff --git a/backend/pashub_fetcher/cotality_client.py b/backend/pashub_fetcher/cotality_client.py new file mode 100644 index 00000000..9deda776 --- /dev/null +++ b/backend/pashub_fetcher/cotality_client.py @@ -0,0 +1,110 @@ +from typing import List, Optional + +import requests + +from backend.pashub_fetcher.evidence_file_data import EvidenceFileData +from backend.pashub_fetcher.evidence_metadata import EvidenceMetadata + + +class CotalityClient: + def __init__(self, token: str): + self.token = token + self.company_id = "cb5249e2-8f31-4ef4-aefd-08ddaccb1fa2" + self.base = "https://pashub.net/api" + + self.session = requests.Session() + self.session.headers.update( + { + "Authorization": f"Bearer {self.token}", + "Accept": "application/json", + } + ) + + def get_core_envidence_files_by_job_id(self, job_id: str) -> List[str]: + # url = f"{self.base}/jobs/{job_id}/evidence" + + raise NotImplementedError + + def get_evidence_files_by_uprn(self, uprn: str) -> List[str]: + """ + Download evidence files for the most recent job for a UPRN. + Returns a list of saved filenames. + """ + + job_id: Optional[str] = self._get_latest_job_id(uprn) + if not job_id: + return [] + + evidence_list: List[EvidenceFileData] = self._get_evidence_list(job_id) + if not evidence_list: + return [] + + saved_files: List[str] = [] + + for evidence in evidence_list: + evidence_id = evidence.file_id + if not evidence_id: + continue + + metadata: EvidenceMetadata = self._get_evidence_metadata( + job_id, evidence_id + ) + + download_url: str = self._build_download_url(metadata, evidence.file_id) + file_name = evidence.file_name + + self._download_file(download_url, file_name) + saved_files.append(file_name) + + return saved_files + + def _get_latest_job_id(self, uprn: str) -> Optional[str]: + url = f"{self.base}/jobs" + params = { + "pageIndex": 0, + "pageSize": 20, + "orderBy": "createdUtc", + "orderDesc": "true", + "addressUprn": uprn, + "companyId": self.company_id, + } + + r = self.session.get(url, params=params) + r.raise_for_status() + + jobs = r.json().get("results", []) + return jobs[0]["id"] if jobs else None + + def _get_evidence_list(self, job_id: str) -> List[EvidenceFileData]: + url = f"{self.base}/jobs/{job_id}/evidence" + + r = self.session.get(url) + r.raise_for_status() + + results = r.json().get("results", []) + + return [EvidenceFileData.from_api(item) for item in results] + + def _get_evidence_metadata(self, job_id: str, evidence_id: str) -> EvidenceMetadata: + url = f"{self.base}/jobs/{job_id}/evidenceMetadata" + params = {"evidenceIds": evidence_id} + + r = self.session.get(url, params=params) + r.raise_for_status() + + return EvidenceMetadata.from_api(r.json()) + + def _build_download_url(self, metadata: EvidenceMetadata, file_id: str) -> str: + container = metadata.container_name + blob_uri = metadata.blob_uri + + base, sas = blob_uri.split("?", 1) + + return f"{base}{container}/{file_id}?{sas}" + + def _download_file(self, url: str, file_name: str) -> None: + r = self.session.get(url) + r.raise_for_status() + + with open(file_name, "wb") as f: + f.write(r.content) diff --git a/backend/pashub_fetcher/evidence_file_data.py b/backend/pashub_fetcher/evidence_file_data.py new file mode 100644 index 00000000..8ecc2441 --- /dev/null +++ b/backend/pashub_fetcher/evidence_file_data.py @@ -0,0 +1,25 @@ +from __future__ import annotations +from dataclasses import dataclass +from typing import Any, Dict, Optional + + +@dataclass +class EvidenceFileData: + file_id: str + file_name: str + created_utc: str + file_size: int + file_extension: str + + evidence_category: Optional[str] = None + + @classmethod + def from_api(cls, data: Dict[str, Any]) -> EvidenceFileData: + return cls( + file_id=data["fileId"], + file_name=data["fileName"], + created_utc=data["createdUtc"], + file_size=data["fileSize"], + file_extension=data["fileExtension"], + evidence_category=data.get("evidenceCategory"), + ) diff --git a/backend/pashub_fetcher/evidence_metadata.py b/backend/pashub_fetcher/evidence_metadata.py new file mode 100644 index 00000000..e3a9536e --- /dev/null +++ b/backend/pashub_fetcher/evidence_metadata.py @@ -0,0 +1,16 @@ +from __future__ import annotations +from dataclasses import dataclass +from typing import Any, Dict + + +@dataclass +class EvidenceMetadata: + container_name: str + blob_uri: str + + @classmethod + def from_api(cls, data: Dict[str, Any]) -> EvidenceMetadata: + return cls( + container_name=data["containerName"], + blob_uri=data["blobUri"], + ) diff --git a/scripts/download_cotality_evidence.py b/scripts/download_cotality_evidence.py index 43f9afea..93148a3a 100644 --- a/scripts/download_cotality_evidence.py +++ b/scripts/download_cotality_evidence.py @@ -1,7 +1,7 @@ import requests import json -TOKEN = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6Ik1EUTRNRU5GUTBVNU9FUXpOelk1TVRFME0wUkdOMFpFUkRoR1JVVkJNVGMxT1RFNFJERXlPQSJ9.eyJodHRwOi8vZW1haWwiOiJzZWJhc3RpYW5Ab3Ntb3Npcy1hY2QuY29tIiwiaHR0cDovL2NsdWsudG9rZW4vbGFzdFBhc3N3b3JkQ2hhbmdlIjoiMjAyNS0wOC0yNlQwOTo1NDoyNi4zMjZaIiwiaHR0cDovL2NsdWsudG9rZW4vY29ubmVjdGlvbiI6ImVUZWNoSUQiLCJodHRwOi8vY2x1ay50b2tlbi9zdHJhdGVneSI6ImF1dGgwIiwiaHR0cDovL2NsdWsudG9rZW4vc3RyYXRlZ3lUeXBlIjoiZGF0YWJhc2UiLCJpc3MiOiJodHRwczovL2V0ZWNoaWQuZXUuYXV0aDAuY29tLyIsInN1YiI6ImF1dGgwfDY4YWQ4NDUyZDI2YzI1ZmMyMzkwZmYxYSIsImF1ZCI6WyJodHRwczovL3Bhc2h1Yi5hcGkuZXRlY2gubmV0IiwiaHR0cHM6Ly9ldGVjaGlkLmV1LmF1dGgwLmNvbS91c2VyaW5mbyJdLCJpYXQiOjE3NzMyMzc4MjQsImV4cCI6MTc3MzI0NTAyNCwic2NvcGUiOiJvcGVuaWQiLCJhenAiOiJEaVp6d3VVaTVkVmozOXR3NG00bWZ6emZvRm5MdmVLZyJ9.mkkxeZiD_ByHY4TJKpLQ-trmeGs15s0ekL6u1n-ek9j-EzNyf6qalEHCyHf8gzdNhU_vay96bIOMRHp4vXFaLqSANwKZayIS3EoA_b9-u2FAZpooxEvReAMNJGoZ6WLD01AQXWv-l7ww1ZqAnQzw0moL_Oma6hVmA5oa-RJKJ3MerS7e0Wei97Db48E140-EAbQf2iPcKYYtCNRA4il6n8DFiqGeoUMGo99jkR1ceZAvMpOAj8RhKX-4qSiDfX6yXUS2G96U5m7S_GWI-DEj5TazkN10Af3TyOY3EVjmZoJcRpiAR4cFmlfcTydjrShU03DWmPZm1QItf2McxfCpNA" +TOKEN = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6Ik1EUTRNRU5GUTBVNU9FUXpOelk1TVRFME0wUkdOMFpFUkRoR1JVVkJNVGMxT1RFNFJERXlPQSJ9.eyJodHRwOi8vZW1haWwiOiJzZWJhc3RpYW5Ab3Ntb3Npcy1hY2QuY29tIiwiaHR0cDovL2NsdWsudG9rZW4vbGFzdFBhc3N3b3JkQ2hhbmdlIjoiMjAyNS0wOC0yNlQwOTo1NDoyNi4zMjZaIiwiaHR0cDovL2NsdWsudG9rZW4vY29ubmVjdGlvbiI6ImVUZWNoSUQiLCJodHRwOi8vY2x1ay50b2tlbi9zdHJhdGVneSI6ImF1dGgwIiwiaHR0cDovL2NsdWsudG9rZW4vc3RyYXRlZ3lUeXBlIjoiZGF0YWJhc2UiLCJpc3MiOiJodHRwczovL2V0ZWNoaWQuZXUuYXV0aDAuY29tLyIsInN1YiI6ImF1dGgwfDY4YWQ4NDUyZDI2YzI1ZmMyMzkwZmYxYSIsImF1ZCI6WyJodHRwczovL3Bhc2h1Yi5hcGkuZXRlY2gubmV0IiwiaHR0cHM6Ly9ldGVjaGlkLmV1LmF1dGgwLmNvbS91c2VyaW5mbyJdLCJpYXQiOjE3NzQyNzg3NjIsImV4cCI6MTc3NDI4NTk2Miwic2NvcGUiOiJvcGVuaWQiLCJhenAiOiJEaVp6d3VVaTVkVmozOXR3NG00bWZ6emZvRm5MdmVLZyJ9.ESIbau52J7KXL22tM8GlO9eV0f0pCOFdoQGL2YcjsTEcSeucHBuI9lHXT2dNJn0E8qlgafjazaMkoMs2g0TiTUUZU6XsKqKpUAJy4kk-qKp53V5az7e2MG9uDSa5bB1vWsQQw37zaNVQ0FQkpYHSiFeGoBh1PjuKwCpLjbl94bx7S4bQKaJSZRUj5TS75k6HnSOhUtN9LYLMPRoLty7TwqFLDxgj8Ixl_ddEF3C3Y6Mcxa5UF57BNTnFXmLefqsryex0XV4b5Btu4W5wZ4bjhX2M7PSXbk4lTv1YZdQxWLpzvNpEVnFueawtqedGYipqH1v4bg99YUnXDbajd2SSVQ" base = "https://pashub.net/api" From 609468cff961993a731668cebb8505e11a8f6fed Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 08:47:59 +0000 Subject: [PATCH 10/28] new methods for downloading all core files for pashub URL. Download currently not being authorised --- backend/pashub_fetcher/cotality_client.py | 77 ++++++++++++++--------- backend/pashub_fetcher/handler/Dockerfile | 2 +- backend/pashub_fetcher/handler/handler.py | 6 +- scripts/download_cotality_evidence.py | 2 +- 4 files changed, 53 insertions(+), 34 deletions(-) diff --git a/backend/pashub_fetcher/cotality_client.py b/backend/pashub_fetcher/cotality_client.py index 9deda776..0ae473b7 100644 --- a/backend/pashub_fetcher/cotality_client.py +++ b/backend/pashub_fetcher/cotality_client.py @@ -1,13 +1,23 @@ -from typing import List, Optional +from collections import defaultdict +from typing import Dict, List, Optional +from datetime import datetime, timezone import requests +from backend.pashub_fetcher.core_files import CoreFiles from backend.pashub_fetcher.evidence_file_data import EvidenceFileData from backend.pashub_fetcher.evidence_metadata import EvidenceMetadata +from utils.logger import setup_logger + + +logger = setup_logger() class CotalityClient: def __init__(self, token: str): + + logger.info(f"Container UTC time: {datetime.now(timezone.utc)}") + self.token = token self.company_id = "cb5249e2-8f31-4ef4-aefd-08ddaccb1fa2" self.base = "https://pashub.net/api" @@ -19,61 +29,68 @@ class CotalityClient: "Accept": "application/json", } ) + logger.info("Finished initialising CotalityClient") def get_core_envidence_files_by_job_id(self, job_id: str) -> List[str]: - # url = f"{self.base}/jobs/{job_id}/evidence" - - raise NotImplementedError - - def get_evidence_files_by_uprn(self, uprn: str) -> List[str]: - """ - Download evidence files for the most recent job for a UPRN. - Returns a list of saved filenames. - """ - - job_id: Optional[str] = self._get_latest_job_id(uprn) - if not job_id: - return [] - + logger.info(f"Getting Core Evidence Files for job ID {job_id}") evidence_list: List[EvidenceFileData] = self._get_evidence_list(job_id) + logger.info(f"Found {len(evidence_list)} Evidence files to get") if not evidence_list: return [] saved_files: List[str] = [] - for evidence in evidence_list: + core_files: Dict[CoreFiles, EvidenceFileData] = self._select_latest_core_files( + evidence_list + ) + + logger.info(f"Number of core files to download is {len(core_files)}") + + for _, evidence in core_files.items(): evidence_id = evidence.file_id if not evidence_id: continue + logger.info(f"Getting metadata for file {evidence.file_name}") metadata: EvidenceMetadata = self._get_evidence_metadata( job_id, evidence_id ) download_url: str = self._build_download_url(metadata, evidence.file_id) + logger.info(f"Download URL: {download_url}") file_name = evidence.file_name self._download_file(download_url, file_name) + logger.info("Successfully downloaded file") saved_files.append(file_name) return saved_files - def _get_latest_job_id(self, uprn: str) -> Optional[str]: - url = f"{self.base}/jobs" - params = { - "pageIndex": 0, - "pageSize": 20, - "orderBy": "createdUtc", - "orderDesc": "true", - "addressUprn": uprn, - "companyId": self.company_id, - } + def _get_core_file_type(self, file: EvidenceFileData) -> Optional[CoreFiles]: + for core_file in CoreFiles: + if file.file_name.startswith(core_file.value): + return core_file + return None - r = self.session.get(url, params=params) - r.raise_for_status() + def _select_latest_core_files( + self, + files: List[EvidenceFileData], + ) -> Dict[CoreFiles, EvidenceFileData]: + grouped: Dict[CoreFiles, List[EvidenceFileData]] = defaultdict(list) - jobs = r.json().get("results", []) - return jobs[0]["id"] if jobs else None + for file in files: + core_type = self._get_core_file_type(file) + if not core_type: + continue + grouped[core_type].append(file) + + latest_files: Dict[CoreFiles, EvidenceFileData] = {} + + for core_type, group in grouped.items(): + latest = max(group, key=lambda f: datetime.fromisoformat(f.created_utc)) + latest_files[core_type] = latest + + return latest_files def _get_evidence_list(self, job_id: str) -> List[EvidenceFileData]: url = f"{self.base}/jobs/{job_id}/evidence" diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile index 2128d12c..cbd3c228 100644 --- a/backend/pashub_fetcher/handler/Dockerfile +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -5,7 +5,7 @@ ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest RUN chmod +x /usr/local/bin/aws-lambda-rie # Install Lambda runtime client -RUN pip install awslambdaric playwright==1.58.0 +RUN pip install awslambdaric playwright==1.58.0 requests # Set working directory (Lambda task root) WORKDIR /var/task diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index b5ec4320..053ad2f4 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -20,7 +20,9 @@ def handler(event: Mapping[str, Any], context: Any) -> None: raise client = CotalityClient(token=token) - uprn = "100061885568" # TODO: get from request body + # uprn = "100061885568" # TODO: get from request body + job_id = "5abf6e27-e4c4-4ba8-b69d-9e34939e0002" # TODO: get from request body - saved_files: List[str] = client.get_evidence_files_by_uprn(uprn=uprn) + saved_files: List[str] = client.get_core_envidence_files_by_job_id(job_id) + # saved_files: List[str] = client.get_evidence_files_by_uprn(uprn=uprn) print(f"saved {len(saved_files)} files") diff --git a/scripts/download_cotality_evidence.py b/scripts/download_cotality_evidence.py index 93148a3a..76400a99 100644 --- a/scripts/download_cotality_evidence.py +++ b/scripts/download_cotality_evidence.py @@ -1,7 +1,7 @@ import requests import json -TOKEN = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6Ik1EUTRNRU5GUTBVNU9FUXpOelk1TVRFME0wUkdOMFpFUkRoR1JVVkJNVGMxT1RFNFJERXlPQSJ9.eyJodHRwOi8vZW1haWwiOiJzZWJhc3RpYW5Ab3Ntb3Npcy1hY2QuY29tIiwiaHR0cDovL2NsdWsudG9rZW4vbGFzdFBhc3N3b3JkQ2hhbmdlIjoiMjAyNS0wOC0yNlQwOTo1NDoyNi4zMjZaIiwiaHR0cDovL2NsdWsudG9rZW4vY29ubmVjdGlvbiI6ImVUZWNoSUQiLCJodHRwOi8vY2x1ay50b2tlbi9zdHJhdGVneSI6ImF1dGgwIiwiaHR0cDovL2NsdWsudG9rZW4vc3RyYXRlZ3lUeXBlIjoiZGF0YWJhc2UiLCJpc3MiOiJodHRwczovL2V0ZWNoaWQuZXUuYXV0aDAuY29tLyIsInN1YiI6ImF1dGgwfDY4YWQ4NDUyZDI2YzI1ZmMyMzkwZmYxYSIsImF1ZCI6WyJodHRwczovL3Bhc2h1Yi5hcGkuZXRlY2gubmV0IiwiaHR0cHM6Ly9ldGVjaGlkLmV1LmF1dGgwLmNvbS91c2VyaW5mbyJdLCJpYXQiOjE3NzQyNzg3NjIsImV4cCI6MTc3NDI4NTk2Miwic2NvcGUiOiJvcGVuaWQiLCJhenAiOiJEaVp6d3VVaTVkVmozOXR3NG00bWZ6emZvRm5MdmVLZyJ9.ESIbau52J7KXL22tM8GlO9eV0f0pCOFdoQGL2YcjsTEcSeucHBuI9lHXT2dNJn0E8qlgafjazaMkoMs2g0TiTUUZU6XsKqKpUAJy4kk-qKp53V5az7e2MG9uDSa5bB1vWsQQw37zaNVQ0FQkpYHSiFeGoBh1PjuKwCpLjbl94bx7S4bQKaJSZRUj5TS75k6HnSOhUtN9LYLMPRoLty7TwqFLDxgj8Ixl_ddEF3C3Y6Mcxa5UF57BNTnFXmLefqsryex0XV4b5Btu4W5wZ4bjhX2M7PSXbk4lTv1YZdQxWLpzvNpEVnFueawtqedGYipqH1v4bg99YUnXDbajd2SSVQ" +TOKEN = "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6Ik1EUTRNRU5GUTBVNU9FUXpOelk1TVRFME0wUkdOMFpFUkRoR1JVVkJNVGMxT1RFNFJERXlPQSJ9.eyJodHRwOi8vZW1haWwiOiJzZWJhc3RpYW5Ab3Ntb3Npcy1hY2QuY29tIiwiaHR0cDovL2NsdWsudG9rZW4vbGFzdFBhc3N3b3JkQ2hhbmdlIjoiMjAyNS0wOC0yNlQwOTo1NDoyNi4zMjZaIiwiaHR0cDovL2NsdWsudG9rZW4vY29ubmVjdGlvbiI6ImVUZWNoSUQiLCJodHRwOi8vY2x1ay50b2tlbi9zdHJhdGVneSI6ImF1dGgwIiwiaHR0cDovL2NsdWsudG9rZW4vc3RyYXRlZ3lUeXBlIjoiZGF0YWJhc2UiLCJpc3MiOiJodHRwczovL2V0ZWNoaWQuZXUuYXV0aDAuY29tLyIsInN1YiI6ImF1dGgwfDY4YWQ4NDUyZDI2YzI1ZmMyMzkwZmYxYSIsImF1ZCI6WyJodHRwczovL3Bhc2h1Yi5hcGkuZXRlY2gubmV0IiwiaHR0cHM6Ly9ldGVjaGlkLmV1LmF1dGgwLmNvbS91c2VyaW5mbyJdLCJpYXQiOjE3NzQyODczOTMsImV4cCI6MTc3NDI5NDU5Mywic2NvcGUiOiJvcGVuaWQiLCJhenAiOiJEaVp6d3VVaTVkVmozOXR3NG00bWZ6emZvRm5MdmVLZyJ9.NHh21XfnRofsFkRkc-28Dz-vQAdY70lXkEmh-Mzz7Fg6gjDbZeMu7PnBwgbDP_U8r6R0mI_pDIUc1MzJe1Rf5SF2-RV36TcGzmVzb3ek9wPsy3lxST5WL-vn-qUJ7GsZiGOeQ-jDLLFn8b8tjFrD7BGv8uphrfYAbPDm0atznkdbUSQQy-rfRJWhisnDtHf99j96TuJz3dV4bfI6VGrin-jezbg6BCvUYWQtttUs7knQKEWO0sGGDxtS29sbn4MX8Jqz4-hf6N2XSlgv52aIDwTVX-lyMWzfoeuIGhvCKuDiJeVw2c0r2UZFpHqjnfhXcb0_aacukXe8z-srj8-Rdw" base = "https://pashub.net/api" From ab40bca32b216583476214d78c092b661de57da7 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 08:58:34 +0000 Subject: [PATCH 11/28] download using requests rather than self.session --- backend/pashub_fetcher/cotality_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/cotality_client.py b/backend/pashub_fetcher/cotality_client.py index 0ae473b7..a9e991bf 100644 --- a/backend/pashub_fetcher/cotality_client.py +++ b/backend/pashub_fetcher/cotality_client.py @@ -120,7 +120,7 @@ class CotalityClient: return f"{base}{container}/{file_id}?{sas}" def _download_file(self, url: str, file_name: str) -> None: - r = self.session.get(url) + r = requests.get(url) r.raise_for_status() with open(file_name, "wb") as f: From ff2df292a861e85ef92e3abfcc114d524a56534f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 09:05:48 +0000 Subject: [PATCH 12/28] remove unused logs and commented lines --- backend/pashub_fetcher/cotality_client.py | 3 --- backend/pashub_fetcher/handler/handler.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/backend/pashub_fetcher/cotality_client.py b/backend/pashub_fetcher/cotality_client.py index a9e991bf..933e2829 100644 --- a/backend/pashub_fetcher/cotality_client.py +++ b/backend/pashub_fetcher/cotality_client.py @@ -16,8 +16,6 @@ logger = setup_logger() class CotalityClient: def __init__(self, token: str): - logger.info(f"Container UTC time: {datetime.now(timezone.utc)}") - self.token = token self.company_id = "cb5249e2-8f31-4ef4-aefd-08ddaccb1fa2" self.base = "https://pashub.net/api" @@ -57,7 +55,6 @@ class CotalityClient: ) download_url: str = self._build_download_url(metadata, evidence.file_id) - logger.info(f"Download URL: {download_url}") file_name = evidence.file_name self._download_file(download_url, file_name) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 053ad2f4..971cd6cd 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -20,9 +20,7 @@ def handler(event: Mapping[str, Any], context: Any) -> None: raise client = CotalityClient(token=token) - # uprn = "100061885568" # TODO: get from request body job_id = "5abf6e27-e4c4-4ba8-b69d-9e34939e0002" # TODO: get from request body saved_files: List[str] = client.get_core_envidence_files_by_job_id(job_id) - # saved_files: List[str] = client.get_evidence_files_by_uprn(uprn=uprn) print(f"saved {len(saved_files)} files") From 54c05096e8102e9d8fbd6c281054aea293b058e2 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 09:08:51 +0000 Subject: [PATCH 13/28] delete unused import --- backend/pashub_fetcher/cotality_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/cotality_client.py b/backend/pashub_fetcher/cotality_client.py index 933e2829..e32ec7eb 100644 --- a/backend/pashub_fetcher/cotality_client.py +++ b/backend/pashub_fetcher/cotality_client.py @@ -1,6 +1,6 @@ from collections import defaultdict from typing import Dict, List, Optional -from datetime import datetime, timezone +from datetime import datetime import requests From 53502c28a80b748062e140dbbd0be7c0b6b1896b Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 09:17:05 +0000 Subject: [PATCH 14/28] handle multiple jobs in a single call --- backend/pashub_fetcher/handler/handler.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 971cd6cd..2a39db60 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,3 +1,4 @@ +import time from typing import Any, List, Mapping from backend.pashub_fetcher.cotality_client import CotalityClient @@ -20,7 +21,15 @@ def handler(event: Mapping[str, Any], context: Any) -> None: raise client = CotalityClient(token=token) - job_id = "5abf6e27-e4c4-4ba8-b69d-9e34939e0002" # TODO: get from request body - saved_files: List[str] = client.get_core_envidence_files_by_job_id(job_id) + jobs = [ + "5abf6e27-e4c4-4ba8-b69d-9e34939e0002", + "047f4455-85e2-4293-97b1-6b460137d33e", + ] # TODO: get these from request body + + saved_files: List[str] = [] + for job_id in jobs: + saved_files.extend(client.get_core_envidence_files_by_job_id(job_id)) + time.sleep(10) # Simulate manual download + print(f"saved {len(saved_files)} files") From 7c0a947bf4a6feee8c5d59600215418122c742e5 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 09:22:52 +0000 Subject: [PATCH 15/28] try regetting token once if auth error during download --- backend/pashub_fetcher/cotality_client.py | 12 ++++++++++++ backend/pashub_fetcher/handler/handler.py | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/backend/pashub_fetcher/cotality_client.py b/backend/pashub_fetcher/cotality_client.py index e32ec7eb..db7b8bd4 100644 --- a/backend/pashub_fetcher/cotality_client.py +++ b/backend/pashub_fetcher/cotality_client.py @@ -13,6 +13,10 @@ from utils.logger import setup_logger logger = setup_logger() +class UnauthorizedError(Exception): + pass + + class CotalityClient: def __init__(self, token: str): @@ -93,6 +97,10 @@ class CotalityClient: url = f"{self.base}/jobs/{job_id}/evidence" r = self.session.get(url) + + if r.status_code == 401: + raise UnauthorizedError("Token expired or invalid") + r.raise_for_status() results = r.json().get("results", []) @@ -118,6 +126,10 @@ class CotalityClient: def _download_file(self, url: str, file_name: str) -> None: r = requests.get(url) + + if r.status_code == 401: + raise UnauthorizedError() + r.raise_for_status() with open(file_name, "wb") as f: diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 2a39db60..47e45714 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,7 +1,7 @@ import time from typing import Any, List, Mapping -from backend.pashub_fetcher.cotality_client import CotalityClient +from backend.pashub_fetcher.cotality_client import CotalityClient, UnauthorizedError from backend.pashub_fetcher.token_getter import get_token_from_local_storage from utils.logger import setup_logger @@ -29,7 +29,19 @@ def handler(event: Mapping[str, Any], context: Any) -> None: saved_files: List[str] = [] for job_id in jobs: - saved_files.extend(client.get_core_envidence_files_by_job_id(job_id)) + try: + saved_files.extend(client.get_core_envidence_files_by_job_id(job_id)) + + except UnauthorizedError: + logger.warning("Token expired — refreshing") + + token = get_token_from_local_storage(pas_hub_email, pas_hub_password) + + client = CotalityClient(token=token) + + # retry once + saved_files.extend(client.get_core_envidence_files_by_job_id(job_id)) + time.sleep(10) # Simulate manual download print(f"saved {len(saved_files)} files") From 06e0775904eb2dc9249ed9dc5913f5747c8908ac Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 09:23:39 +0000 Subject: [PATCH 16/28] try regetting token once if auth error during download --- backend/pashub_fetcher/cotality_client.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/pashub_fetcher/cotality_client.py b/backend/pashub_fetcher/cotality_client.py index db7b8bd4..b4a30dc2 100644 --- a/backend/pashub_fetcher/cotality_client.py +++ b/backend/pashub_fetcher/cotality_client.py @@ -97,7 +97,6 @@ class CotalityClient: url = f"{self.base}/jobs/{job_id}/evidence" r = self.session.get(url) - if r.status_code == 401: raise UnauthorizedError("Token expired or invalid") @@ -112,6 +111,9 @@ class CotalityClient: params = {"evidenceIds": evidence_id} r = self.session.get(url, params=params) + if r.status_code == 401: + raise UnauthorizedError() + r.raise_for_status() return EvidenceMetadata.from_api(r.json()) @@ -126,7 +128,6 @@ class CotalityClient: def _download_file(self, url: str, file_name: str) -> None: r = requests.get(url) - if r.status_code == 401: raise UnauthorizedError() From 21f5cd40da8af0a648b68e0d39a06bb0238581ca Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 09:53:12 +0000 Subject: [PATCH 17/28] remove token from log --- backend/pashub_fetcher/handler/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 47e45714..df187f3e 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -15,7 +15,7 @@ def handler(event: Mapping[str, Any], context: Any) -> None: try: token: str = get_token_from_local_storage(pas_hub_email, pas_hub_password) - logger.info(f"Token extracted successfully: {token}") + logger.info(f"Token extracted successfully") except: logger.error("Error getting auth token from Pas Hub") raise From a13ab09ab98b2dcfbd4ae53fbad3a1014a490017 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 24 Mar 2026 13:36:47 +0000 Subject: [PATCH 18/28] added sharepoint functionality for mr roth - enjoy --- .devcontainer/backend/requirements.txt | 1 + utils/sharepoint/__init__.py | 0 utils/sharepoint/domna_sharepoint_client.py | 172 ++++++++++ utils/sharepoint/main.py | 25 ++ utils/sharepoint/sharepoint_client.py | 339 ++++++++++++++++++++ utils/sharepoint/temp | 0 6 files changed, 537 insertions(+) create mode 100644 utils/sharepoint/__init__.py create mode 100644 utils/sharepoint/domna_sharepoint_client.py create mode 100644 utils/sharepoint/main.py create mode 100644 utils/sharepoint/sharepoint_client.py create mode 100644 utils/sharepoint/temp diff --git a/.devcontainer/backend/requirements.txt b/.devcontainer/backend/requirements.txt index d7afa6a2..029e5efa 100644 --- a/.devcontainer/backend/requirements.txt +++ b/.devcontainer/backend/requirements.txt @@ -13,6 +13,7 @@ boto3==1.35.44 openpyxl==3.1.5 # Basic pytz +msal uvicorn[standard] sqlmodel # Testing diff --git a/utils/sharepoint/__init__.py b/utils/sharepoint/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/sharepoint/domna_sharepoint_client.py b/utils/sharepoint/domna_sharepoint_client.py new file mode 100644 index 00000000..55336f85 --- /dev/null +++ b/utils/sharepoint/domna_sharepoint_client.py @@ -0,0 +1,172 @@ +from pprint import pformat +from enum import Enum +import os +from utils.logger import setup_logger +from utils.sharepoint.sharepoint_client import SharePointClient +from functools import wraps +import re +from datetime import datetime, timedelta +from io import BytesIO + + +class DomnaSites(Enum): + # https//{tenant}.sharepoint.com/sites/{site}/_api/site/id + # TODO: Add these to github secrets!!! + DOMNA = os.getenv("DOMNA_SHAREPOINT_ID") + OSMOSIS_ACD = os.getenv("OSMOSIS_ACD_SHAREPOINT_ID") + PRIVATE_PAY = os.getenv("PRIVATE_PAY_SHAREPOINT_ID") + SOCIAL_HOUSING_WAVE_3 = os.getenv("SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID") + + +class DomnaSharepointClient: + """ + A simple scraper to get the contents of a sharepoint and validatate inputs so I can manually change + """ + + def __init__(self, sharepoint_location, development=False): + self.logger = setup_logger() + self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None) + self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None) + self.sharepoint_tenant_id = os.getenv("SHAREPOINT_TENANT_ID", None) + self.sharepoint_drive = sharepoint_location + + assert ( + self.sharepoint_client_id is not None + ), "Please assign SHAREPOINT_CLIENT_ID env variable" + assert ( + self.sharepoint_client_secret is not None + ), "Please assign SHAREPOINT_CLIENT_SECRET env variable" + assert ( + self.sharepoint_tenant_id is not None + ), "Please assign SHAREPOINT_TENANT_ID env variable" + assert ( + self.sharepoint_drive.value is not None + ), "Please set sharepoint driver id env variable. See SharePointInstaller for more information" + + def get_folders_in_path(self, path): + sharepoint_client = SharePointClient( + tenant_id=self.sharepoint_tenant_id, + client_id=self.sharepoint_client_id, + client_secret=self.sharepoint_client_secret, + site_id=self.sharepoint_drive.value, + ) + + return sharepoint_client.list_folder_contents(path) + + def get_file_content(self, url): + sharepoint_client = SharePointClient( + tenant_id=self.sharepoint_tenant_id, + client_id=self.sharepoint_client_id, + client_secret=self.sharepoint_client_secret, + site_id=self.sharepoint_drive.value, + ) + + return sharepoint_client.download_sharepoint_file(url) + + def does_folder_exists_at(self, file_name, file_path): + folders = self.get_folders_in_path(file_path) + if "value" in folders: + for folder in folders["value"]: + if file_name.upper() in folder["name"].upper(): + return True + return False + + def create_dir(self, file_name, at_path="/"): + sharepoint_client = SharePointClient( + tenant_id=self.sharepoint_tenant_id, + client_id=self.sharepoint_client_id, + client_secret=self.sharepoint_client_secret, + site_id=self.sharepoint_drive.value, + ) + + folders = self.get_folders_in_path(at_path) + + # Check if folder already exists (case-insensitive match) + if "value" in folders: + for folder in folders["value"]: + if "name" in folder and folder["name"].lower() == file_name.lower(): + self.logger.info(f"Folder already exists: {file_name} at {at_path}") + return folder["webUrl"] # ✅ return existing folder + + # Folder does NOT exist → create it + self.logger.info(f"Creating folder: {file_name} at {at_path}") + created = sharepoint_client.create_folder(file_name, at_path) + + return created["webUrl"] + + def makedir(self, dir_name, at_path="/"): + return self.create_dir(dir_name, at_path) + + def upload_file(self, file_path, sharepoint_path, file_name): + sharepoint_client = SharePointClient( + tenant_id=self.sharepoint_tenant_id, + client_id=self.sharepoint_client_id, + client_secret=self.sharepoint_client_secret, + site_id=self.sharepoint_drive.value, + ) + + def get_file_stream(file_path): + return open(file_path, "rb") + + sharepoint_client.upload_file( + file_name, get_file_stream(file_path), sharepoint_path + ) + + def download_files_from_path(self, path, avoid=None): + """ + Download all non-media files from a list of root paths. + + Args: + root_paths (List[str]): List of full folder paths to start from. + + Returns: + List[Dict[str, List[str]]]: A list of dictionaries mapping address folder names to downloaded file paths. + """ + if avoid is None: + avoid = [ + ".jpg", + ".mov", + ".JPG", + ".heic", + ".HEIC", + ".png", + ".PNG", + ".jpeg", + ".JPEG", + ".mp4", + ".MP4", + ] + + files_info = self.get_folders_in_path(path) + + if "value" not in files_info: + raise RuntimeError(f"Failed to get files from {path}") + + file_names_to_download = { + file["name"]: file["@microsoft.graph.downloadUrl"] + for file in files_info["value"] + if "file" in file and not any(file["name"].endswith(ext) for ext in avoid) + } + + downloaded_files = [] + for file_name, url in file_names_to_download.items(): + self.logger.info(f"Downloading {file_name} from {url}") + content = self.get_file_content(url) + file_path = self.create_temp_file(content, f"{path}/{file_name}") + downloaded_files.append(file_path) + + return downloaded_files + + def create_temp_file(self, content: BytesIO, path: str): + # Ensure the path is under /tmp/ + new_path = os.path.join("/tmp/sharepoint", path) + + # Ensure the parent directory exists + os.makedirs(os.path.dirname(new_path), exist_ok=True) + + # Write content to the specified file + with open(new_path, "wb+") as temp_file: + temp_file.write(content.getvalue()) + + self.logger.debug(f"Temporary file created at: {new_path}") + return new_path diff --git a/utils/sharepoint/main.py b/utils/sharepoint/main.py new file mode 100644 index 00000000..4cc4cfb0 --- /dev/null +++ b/utils/sharepoint/main.py @@ -0,0 +1,25 @@ +# This is small script to see if Domna Sharepoint Client works +# for basic functionality + +# Can we import it? +from io import BytesIO + +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient, DomnaSites + + +# can we initliase it +client = DomnaSharepointClient(sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3) + +# can we get an example of root path? + +client.get_folders_in_path("/") +client.get_folders_in_path("/JTK Test Folder") + +# can we make a folder appear in JTK Test Folder? +client.makedir("Dan is the best", "/JTK Test Folder") + +content = BytesIO(b"Hello, this is some file content!") +path = client.create_temp_file(content, "some/place/over/the/rainbow") +client.upload_file( + path, "/JTK Test Folder/Dan is the best", "junte_is_the_worst_at_python.txt" +) diff --git a/utils/sharepoint/sharepoint_client.py b/utils/sharepoint/sharepoint_client.py new file mode 100644 index 00000000..421b1535 --- /dev/null +++ b/utils/sharepoint/sharepoint_client.py @@ -0,0 +1,339 @@ +""" +This file contains the functions which enable interaction with SharePoint via the API. + +Documentation to get api_id: +https://answers.microsoft.com/en-us/msoffice/forum/all/what-is-the-best-way-to-findout-the-share-point/7b2d4183-4188-4cd5-8441-dd93207c5a01 +""" + +from msal import ConfidentialClientApplication +from datetime import datetime, timedelta +import requests +from functools import wraps +import time +import logging +from io import BytesIO +import tempfile +import os + +# Api Documentation: https://learn.microsoft.com/en-us/graph/api/drive-get?view=graph-rest-1.0&tabs=http + + +def handle_error(response): + """ + Handle errors based on HTTP status codes and log detailed information. + """ + try: + error_json = response.json().get("error", {}) + except ValueError: + error_json = {} + + error_code = error_json.get("code", "unknownError") + error_message = error_json.get("message", "No detailed error message provided.") + inner_error = error_json.get("innererror", {}) + details = error_json.get("details", []) + + logger.error(f"Error Code: {error_code}") + logger.error(f"Error Message: {error_message}") + if inner_error: + logger.error(f"Inner Error: {inner_error}") + if details: + logger.error(f"Error Details: {details}") + + if response.status_code == 401: + logger.error("Unauthorized. Token might be invalid.") + elif response.status_code == 403: + logger.error("Forbidden. Access denied to the requested resource.") + elif response.status_code == 404: + logger.error("Not Found. The requested resource doesn’t exist.") + elif response.status_code == 429: + retry_after = int( + response.headers.get("Retry-After", 5) + ) # Default to 5 seconds if not provided + logger.warning(f"Too Many Requests. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + return "retry" + elif response.status_code in (500, 503): + retry_after = int( + response.headers.get("Retry-After", 5) + ) # Default to 5 seconds if not provided + logger.error(f"Server error. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + return "retry" + else: + raise ValueError( + f"API request failed with status code {response.status_code} - {error_message}" + ) + + raise ValueError( + f"API request failed with status code {response.status_code} - {error_message}" + ) + + +def api_call_decorator(func): + """ + Handles various aspects of the API call, including refreshing the access token if needed and handling pagination. + :param func: The function to be decorated. + :return: The wrapped function. + """ + + @wraps(func) + def wrapper(self, *args, **kwargs): + try: + # Check and refresh the access token if needed + if self.is_access_token_expired(): + self.retrieve_access_token() + logger.debug("Access token refreshed.") + + # Get the HTTP method, URL, and optionally data from the function + http_method, url, data = func(self, *args, **kwargs) + + # Initialize the results list and handle pagination if page_size is provided + results = [] + page_size = kwargs.get("page_size", None) + response_data = {} + + while url: + response = requests.request( + http_method, url, headers=self.headers, json=data + ) + + # Handle the response + if response.status_code == 200 or response.status_code == 201: + response_json = response.json() # Store the response JSON + if page_size: + results.extend(response_json.get("value", [])) + url = response_json.get("@odata.nextLink", None) + else: + response_data = ( + response_json # Capture the full response for consistency + ) + break + else: + retry = handle_error(response) + if retry == "retry": + continue + + if page_size: + response_data = {"value": results} + + return response_data + + except Exception as e: + logger.exception("An error occurred during the API call.") + raise e + + return wrapper + + +class SharePointClient: + access_token = None + access_token_request_timestamp = None + access_token_expiry = None + headers = None + + TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" + + def __init__( + self, + tenant_id, + client_id, + client_secret, + site_id, + access_token=None, + access_token_expiration_details=None, + ): + """ + Initializes the SharePointClient with necessary credentials and site information. + :param tenant_id: The tenant ID. + :param client_id: The client ID. + :param client_secret: The client secret. + :param site_id: The site ID. + :param access_token: The access token (optional) + :param access_token_expiration_details: The access token expiration details (optional) + """ + self.tenant_id = tenant_id + self.client_id = client_id + self.client_secret = client_secret + + if access_token: + if not access_token_expiration_details: + raise ValueError("Access token expiration details must be provided.") + self.access_token = access_token + self.set_access_token_expiration_details(access_token_expiration_details) + self.headers = { + "Authorization": f"Bearer {self.access_token['access_token']}" + } + else: + self.retrieve_access_token() + + # Retrieve static identifiers + self.site_id = site_id + self.document_drive = self.get_documents_drive() + self.document_drive_id = self.document_drive["id"] + + def get_token_expiration_details(self): + """ + Returns the access token expiration details. Converts the datetime objects to strings for serialization. + :return: + """ + return { + "access_token_request_timestamp": datetime.strftime( + self.access_token_request_timestamp, self.TIMESTAMP_FORMAT + ), + "access_token_expiry": datetime.strftime( + self.access_token_expiry, self.TIMESTAMP_FORMAT + ), + } + + def set_access_token_expiration_details(self, access_token_expiration_details): + """ + Sets the access token expiration details from a serialized dictionary. + :param access_token_expiration_details: The serialized access token expiration details. + :return: + """ + self.access_token_request_timestamp = datetime.strptime( + access_token_expiration_details["access_token_request_timestamp"], + self.TIMESTAMP_FORMAT, + ) + self.access_token_expiry = datetime.strptime( + access_token_expiration_details["access_token_expiry"], + self.TIMESTAMP_FORMAT, + ) + + def is_access_token_expired(self): + """ + Checks if the access token has expired. If it has, a new access token is retrieved. + :return: True if expired, False otherwise. + """ + return datetime.now() >= self.access_token_expiry + + def retrieve_access_token(self, refresh=False): + """ + Implements authentication using MSAL. + :param refresh: If True, force a refresh of the access token. + :return: None + """ + app = ConfidentialClientApplication( + self.client_id, + authority=f"https://login.microsoftonline.com/{self.tenant_id}", + client_credential=self.client_secret, + ) + + scope = ["https://graph.microsoft.com/.default"] + + access_token_request_timestamp = datetime.now() + + if refresh: + logger.debug("Forcing refresh of access token.") + token = app.acquire_token_for_client(scopes=scope) + else: + # Check if a token is already cached + token = app.acquire_token_silent(scope, account=None) + + if not token: + token = app.acquire_token_for_client(scopes=scope) + + if "access_token" not in token: + logger.error("Authentication failed.") + raise ValueError("Authentication failed") + + access_token_expiry = access_token_request_timestamp + timedelta( + seconds=token["expires_in"] - 20 + ) + + self.access_token = token + self.access_token_request_timestamp = access_token_request_timestamp + self.access_token_expiry = access_token_expiry + self.headers = {"Authorization": f"Bearer {self.access_token['access_token']}"} + + # logger.debug("Access token retrieved successfully.") + + @api_call_decorator + def get_documents_drive(self): + """ + Get the document drive of the SharePoint site. + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive" + # logger.debug(f"Getting document drive from URL: {url}") + return "GET", url, None + + @api_call_decorator + def list_folder_contents(self, folder_path: str, page_size: int = 100): + """ + GET drive/root/children + + This function will list the contents of a folder in SharePoint. + :param drive_id: The ID of the drive. + :param folder_path: The path of the folder. + :param page_size: The number of items per page (default is 100). + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/root:/{folder_path}:/children?$top={page_size}" + # logger.debug(f"Listing folder contents from URL: {url}") + return "GET", url, None + + @api_call_decorator + def create_folder(self, file_name, folder_path): + """ + POST https://graph.microsoft.com/v1.0/me/drive/root/children + Content-Type: application/json + { + "name": "New Folder", + "folder": { }, + "@microsoft.graph.conflictBehavior": "rename" + } + + """ + data = { + "name": file_name, + "folder": {}, + "@microsoft.graph.conflictBehavior": "rename", + } + url = f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/root:/{folder_path}:/children" + + return "POST", url, data + + def upload_file(self, file_name, file_stream, sharepoint_parent_id): + """ + Uploads a file to SharePoint using the Graph API. + PUT /drives/{drive-id}/root:/{path-to-file}:/content + + :param file_name: Name of the file to upload + :param sharepoint_path: Path within the SharePoint site (folder path) + :param file_stream: File content as a binary stream (e.g., BytesIO or open(file, 'rb')) + :return: Response JSON from the API + """ + url = f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/root:/{sharepoint_parent_id}/{file_name}:/content" + # logger.debug(f"Uploading file to URL: {url}") + + response = requests.put(url, headers=self.headers, data=file_stream) + + if response.status_code in (200, 201): + # logger.info(f"File '{file_name}' uploaded successfully.") + return response.json() + else: + retry = handle_error(response) + if retry == "retry": + return self.upload_file(file_name, sharepoint_parent_id, file_stream) + + @staticmethod + def download_sharepoint_file(download_url): + """ + Downloads a file from the given URL and returns its content. + + :param download_url: The URL to download the file from. + :return: The content of the downloaded file. + """ + response = requests.get(download_url, stream=True) + response.raise_for_status() # Check if the request was successful + + file_content = BytesIO() + + # Read the file content into memory + for chunk in response.iter_content(chunk_size=8192): + file_content.write(chunk) + + file_content.seek(0) # Reset the file pointer to the beginning + + return file_content diff --git a/utils/sharepoint/temp b/utils/sharepoint/temp new file mode 100644 index 00000000..e69de29b From fea099afe34cbe5af7e8d613256a9707880ad46e Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 14:42:48 +0000 Subject: [PATCH 19/28] delete unused methods and add typehinting to DomnaSharepointClient --- utils/sharepoint/domna_sharepoint_client.py | 91 ++++----------------- utils/sharepoint/sharepoint_client.py | 17 ++-- 2 files changed, 26 insertions(+), 82 deletions(-) diff --git a/utils/sharepoint/domna_sharepoint_client.py b/utils/sharepoint/domna_sharepoint_client.py index 55336f85..374ee140 100644 --- a/utils/sharepoint/domna_sharepoint_client.py +++ b/utils/sharepoint/domna_sharepoint_client.py @@ -1,11 +1,8 @@ -from pprint import pformat from enum import Enum import os +from typing import Any, Dict, Optional from utils.logger import setup_logger from utils.sharepoint.sharepoint_client import SharePointClient -from functools import wraps -import re -from datetime import datetime, timedelta from io import BytesIO @@ -23,7 +20,7 @@ class DomnaSharepointClient: A simple scraper to get the contents of a sharepoint and validatate inputs so I can manually change """ - def __init__(self, sharepoint_location, development=False): + def __init__(self, sharepoint_location: DomnaSites): self.logger = setup_logger() self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None) self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None) @@ -39,11 +36,8 @@ class DomnaSharepointClient: assert ( self.sharepoint_tenant_id is not None ), "Please assign SHAREPOINT_TENANT_ID env variable" - assert ( - self.sharepoint_drive.value is not None - ), "Please set sharepoint driver id env variable. See SharePointInstaller for more information" - def get_folders_in_path(self, path): + def get_folders_in_path(self, path: str) -> Dict[str, Any]: sharepoint_client = SharePointClient( tenant_id=self.sharepoint_tenant_id, client_id=self.sharepoint_client_id, @@ -53,25 +47,15 @@ class DomnaSharepointClient: return sharepoint_client.list_folder_contents(path) - def get_file_content(self, url): - sharepoint_client = SharePointClient( - tenant_id=self.sharepoint_tenant_id, - client_id=self.sharepoint_client_id, - client_secret=self.sharepoint_client_secret, - site_id=self.sharepoint_drive.value, - ) - - return sharepoint_client.download_sharepoint_file(url) - - def does_folder_exists_at(self, file_name, file_path): - folders = self.get_folders_in_path(file_path) + def does_folder_exists_at(self, file_name: str, file_path: str): + folders: Dict[str, Any] = self.get_folders_in_path(file_path) if "value" in folders: for folder in folders["value"]: if file_name.upper() in folder["name"].upper(): return True return False - def create_dir(self, file_name, at_path="/"): + def create_dir(self, dir_name: str, at_path: str = "/") -> str: sharepoint_client = SharePointClient( tenant_id=self.sharepoint_tenant_id, client_id=self.sharepoint_client_id, @@ -79,25 +63,27 @@ class DomnaSharepointClient: site_id=self.sharepoint_drive.value, ) - folders = self.get_folders_in_path(at_path) + folders: Dict[str, Any] = self.get_folders_in_path(at_path) # Check if folder already exists (case-insensitive match) if "value" in folders: for folder in folders["value"]: - if "name" in folder and folder["name"].lower() == file_name.lower(): - self.logger.info(f"Folder already exists: {file_name} at {at_path}") + if "name" in folder and folder["name"].lower() == dir_name.lower(): + self.logger.info(f"Folder already exists: {dir_name} at {at_path}") return folder["webUrl"] # ✅ return existing folder # Folder does NOT exist → create it - self.logger.info(f"Creating folder: {file_name} at {at_path}") - created = sharepoint_client.create_folder(file_name, at_path) + self.logger.info(f"Creating folder: {dir_name} at {at_path}") + created: Dict[str, Any] = sharepoint_client.create_folder(dir_name, at_path) return created["webUrl"] - def makedir(self, dir_name, at_path="/"): + def makedir(self, dir_name: str, at_path: str = "/") -> str: return self.create_dir(dir_name, at_path) - def upload_file(self, file_path, sharepoint_path, file_name): + def upload_file( + self, file_path: str, sharepoint_path: str, file_name: str + ) -> Optional[Dict[str, Any]]: sharepoint_client = SharePointClient( tenant_id=self.sharepoint_tenant_id, client_id=self.sharepoint_client_id, @@ -105,58 +91,13 @@ class DomnaSharepointClient: site_id=self.sharepoint_drive.value, ) - def get_file_stream(file_path): + def get_file_stream(file_path: str): return open(file_path, "rb") sharepoint_client.upload_file( file_name, get_file_stream(file_path), sharepoint_path ) - def download_files_from_path(self, path, avoid=None): - """ - Download all non-media files from a list of root paths. - - Args: - root_paths (List[str]): List of full folder paths to start from. - - Returns: - List[Dict[str, List[str]]]: A list of dictionaries mapping address folder names to downloaded file paths. - """ - if avoid is None: - avoid = [ - ".jpg", - ".mov", - ".JPG", - ".heic", - ".HEIC", - ".png", - ".PNG", - ".jpeg", - ".JPEG", - ".mp4", - ".MP4", - ] - - files_info = self.get_folders_in_path(path) - - if "value" not in files_info: - raise RuntimeError(f"Failed to get files from {path}") - - file_names_to_download = { - file["name"]: file["@microsoft.graph.downloadUrl"] - for file in files_info["value"] - if "file" in file and not any(file["name"].endswith(ext) for ext in avoid) - } - - downloaded_files = [] - for file_name, url in file_names_to_download.items(): - self.logger.info(f"Downloading {file_name} from {url}") - content = self.get_file_content(url) - file_path = self.create_temp_file(content, f"{path}/{file_name}") - downloaded_files.append(file_path) - - return downloaded_files - def create_temp_file(self, content: BytesIO, path: str): # Ensure the path is under /tmp/ new_path = os.path.join("/tmp/sharepoint", path) diff --git a/utils/sharepoint/sharepoint_client.py b/utils/sharepoint/sharepoint_client.py index 421b1535..67c4315c 100644 --- a/utils/sharepoint/sharepoint_client.py +++ b/utils/sharepoint/sharepoint_client.py @@ -5,15 +5,14 @@ Documentation to get api_id: https://answers.microsoft.com/en-us/msoffice/forum/all/what-is-the-best-way-to-findout-the-share-point/7b2d4183-4188-4cd5-8441-dd93207c5a01 """ +from typing import Any, BinaryIO, Dict, Optional + from msal import ConfidentialClientApplication from datetime import datetime, timedelta import requests from functools import wraps import time -import logging from io import BytesIO -import tempfile -import os # Api Documentation: https://learn.microsoft.com/en-us/graph/api/drive-get?view=graph-rest-1.0&tabs=http @@ -259,7 +258,9 @@ class SharePointClient: return "GET", url, None @api_call_decorator - def list_folder_contents(self, folder_path: str, page_size: int = 100): + def list_folder_contents( + self, folder_path: str, page_size: int = 100 + ) -> Dict[str, Any]: """ GET drive/root/children @@ -274,7 +275,7 @@ class SharePointClient: return "GET", url, None @api_call_decorator - def create_folder(self, file_name, folder_path): + def create_folder(self, file_name: str, folder_path: str) -> Dict[str, Any]: """ POST https://graph.microsoft.com/v1.0/me/drive/root/children Content-Type: application/json @@ -285,7 +286,7 @@ class SharePointClient: } """ - data = { + data: Dict[str, Any] = { "name": file_name, "folder": {}, "@microsoft.graph.conflictBehavior": "rename", @@ -294,7 +295,9 @@ class SharePointClient: return "POST", url, data - def upload_file(self, file_name, file_stream, sharepoint_parent_id): + def upload_file( + self, file_name: str, file_stream: BinaryIO, sharepoint_parent_id: str + ) -> Optional[Dict[str, Any]]: """ Uploads a file to SharePoint using the Graph API. PUT /drives/{drive-id}/root:/{path-to-file}:/content From 441dce9726654646b071f6c22b89e6d894855339 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 14:43:46 +0000 Subject: [PATCH 20/28] Move DomnaSites to dedicated file --- utils/sharepoint/domna_sharepoint_client.py | 14 +++----------- utils/sharepoint/domna_sites.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 11 deletions(-) create mode 100644 utils/sharepoint/domna_sites.py diff --git a/utils/sharepoint/domna_sharepoint_client.py b/utils/sharepoint/domna_sharepoint_client.py index 374ee140..67e079ed 100644 --- a/utils/sharepoint/domna_sharepoint_client.py +++ b/utils/sharepoint/domna_sharepoint_client.py @@ -1,18 +1,10 @@ -from enum import Enum import os from typing import Any, Dict, Optional -from utils.logger import setup_logger -from utils.sharepoint.sharepoint_client import SharePointClient from io import BytesIO - -class DomnaSites(Enum): - # https//{tenant}.sharepoint.com/sites/{site}/_api/site/id - # TODO: Add these to github secrets!!! - DOMNA = os.getenv("DOMNA_SHAREPOINT_ID") - OSMOSIS_ACD = os.getenv("OSMOSIS_ACD_SHAREPOINT_ID") - PRIVATE_PAY = os.getenv("PRIVATE_PAY_SHAREPOINT_ID") - SOCIAL_HOUSING_WAVE_3 = os.getenv("SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID") +from utils.logger import setup_logger +from utils.sharepoint.domna_sites import DomnaSites +from utils.sharepoint.sharepoint_client import SharePointClient class DomnaSharepointClient: diff --git a/utils/sharepoint/domna_sites.py b/utils/sharepoint/domna_sites.py new file mode 100644 index 00000000..e5efb82c --- /dev/null +++ b/utils/sharepoint/domna_sites.py @@ -0,0 +1,11 @@ +from enum import Enum +import os + + +class DomnaSites(Enum): + # https//{tenant}.sharepoint.com/sites/{site}/_api/site/id + # TODO: Add these to github secrets!!! + DOMNA = os.getenv("DOMNA_SHAREPOINT_ID") + OSMOSIS_ACD = os.getenv("OSMOSIS_ACD_SHAREPOINT_ID") + PRIVATE_PAY = os.getenv("PRIVATE_PAY_SHAREPOINT_ID") + SOCIAL_HOUSING_WAVE_3 = os.getenv("SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID") From 8e1aacf846aca339356290c636f118d9ce79d69f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 14:57:08 +0000 Subject: [PATCH 21/28] rename cotality client to pashub client --- backend/pashub_fetcher/handler/handler.py | 12 +++++++----- .../{cotality_client.py => pashub_client.py} | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) rename backend/pashub_fetcher/{cotality_client.py => pashub_client.py} (99%) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index df187f3e..a4ee5339 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,7 +1,7 @@ import time from typing import Any, List, Mapping -from backend.pashub_fetcher.cotality_client import CotalityClient, UnauthorizedError +from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.token_getter import get_token_from_local_storage from utils.logger import setup_logger @@ -20,7 +20,7 @@ def handler(event: Mapping[str, Any], context: Any) -> None: logger.error("Error getting auth token from Pas Hub") raise - client = CotalityClient(token=token) + pashub_client = PashubClient(token=token) jobs = [ "5abf6e27-e4c4-4ba8-b69d-9e34939e0002", @@ -30,17 +30,19 @@ def handler(event: Mapping[str, Any], context: Any) -> None: saved_files: List[str] = [] for job_id in jobs: try: - saved_files.extend(client.get_core_envidence_files_by_job_id(job_id)) + saved_files.extend(pashub_client.get_core_envidence_files_by_job_id(job_id)) + + # Upload files to sharepoint except UnauthorizedError: logger.warning("Token expired — refreshing") token = get_token_from_local_storage(pas_hub_email, pas_hub_password) - client = CotalityClient(token=token) + pashub_client = PashubClient(token=token) # retry once - saved_files.extend(client.get_core_envidence_files_by_job_id(job_id)) + saved_files.extend(pashub_client.get_core_envidence_files_by_job_id(job_id)) time.sleep(10) # Simulate manual download diff --git a/backend/pashub_fetcher/cotality_client.py b/backend/pashub_fetcher/pashub_client.py similarity index 99% rename from backend/pashub_fetcher/cotality_client.py rename to backend/pashub_fetcher/pashub_client.py index b4a30dc2..bed7395f 100644 --- a/backend/pashub_fetcher/cotality_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -17,7 +17,7 @@ class UnauthorizedError(Exception): pass -class CotalityClient: +class PashubClient: def __init__(self, token: str): self.token = token From 9b274bbc157f331c19f52fc16cd303a2972abe64 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 16:03:35 +0000 Subject: [PATCH 22/28] upload files to sharepoint --- backend/pashub_fetcher/handler/Dockerfile | 4 +- backend/pashub_fetcher/handler/handler.py | 46 ++++++++++++++++++----- backend/pashub_fetcher/job.py | 6 +++ backend/pashub_fetcher/pashub_client.py | 19 +++++----- 4 files changed, 54 insertions(+), 21 deletions(-) create mode 100644 backend/pashub_fetcher/job.py diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile index cbd3c228..fd66f53e 100644 --- a/backend/pashub_fetcher/handler/Dockerfile +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -5,12 +5,12 @@ ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest RUN chmod +x /usr/local/bin/aws-lambda-rie # Install Lambda runtime client -RUN pip install awslambdaric playwright==1.58.0 requests +RUN pip install awslambdaric playwright==1.58.0 requests msal # Set working directory (Lambda task root) WORKDIR /var/task -COPY backend/.env.test backend/.env +COPY .env backend/.env COPY utils/ utils/ COPY backend/pashub_fetcher/ backend/pashub_fetcher/ diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index a4ee5339..f99dd688 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,9 +1,12 @@ import time from typing import Any, List, Mapping +from backend.pashub_fetcher.job import Job from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.token_getter import get_token_from_local_storage from utils.logger import setup_logger +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient +from utils.sharepoint.domna_sites import DomnaSites logger = setup_logger() @@ -21,29 +24,52 @@ def handler(event: Mapping[str, Any], context: Any) -> None: raise pashub_client = PashubClient(token=token) + sharepoint_client = DomnaSharepointClient( + sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3 + ) - jobs = [ - "5abf6e27-e4c4-4ba8-b69d-9e34939e0002", - "047f4455-85e2-4293-97b1-6b460137d33e", - ] # TODO: get these from request body + jobs: List[Job] = [ + { + "id": "5abf6e27-e4c4-4ba8-b69d-9e34939e0002", + "address": "FLAT 11 Abbey View, Garsmouth Way, Watford, WD25 9DY", + }, + { + "id": "047f4455-85e2-4293-97b1-6b460137d33e", + "address": "FLAT 14 Abbey View, Garsmouth Way, Watford, WD25 9DY", + }, + ] # TODO: get these from request body or spreadsheet - saved_files: List[str] = [] - for job_id in jobs: + sharepoint_client.makedir("Watford Test", "/JTK Test Folder") + + saved_file_paths: List[str] = [] + for job in jobs: try: - saved_files.extend(pashub_client.get_core_envidence_files_by_job_id(job_id)) + job_files: List[str] = pashub_client.get_core_evidence_files_by_job_id( + job["id"] + ) # Upload files to sharepoint + sharepoint_client.makedir(job["address"], "/JTK Test Folder/Watford Test") + for file_path in job_files: + sharepoint_client.upload_file( + file_path, + f"/JTK Test Folder/Watford Test/{job['address']}", + file_path.split("/")[-1], + ) + saved_file_paths.extend(job_files) except UnauthorizedError: - logger.warning("Token expired — refreshing") + logger.warning("Token expired - refreshing") token = get_token_from_local_storage(pas_hub_email, pas_hub_password) pashub_client = PashubClient(token=token) # retry once - saved_files.extend(pashub_client.get_core_envidence_files_by_job_id(job_id)) + saved_file_paths.extend( + pashub_client.get_core_evidence_files_by_job_id(job["id"]) + ) time.sleep(10) # Simulate manual download - print(f"saved {len(saved_files)} files") + print(f"saved {len(saved_file_paths)} files") diff --git a/backend/pashub_fetcher/job.py b/backend/pashub_fetcher/job.py new file mode 100644 index 00000000..959ca137 --- /dev/null +++ b/backend/pashub_fetcher/job.py @@ -0,0 +1,6 @@ +from typing import TypedDict + + +class Job(TypedDict): + id: str + address: str diff --git a/backend/pashub_fetcher/pashub_client.py b/backend/pashub_fetcher/pashub_client.py index bed7395f..efc21803 100644 --- a/backend/pashub_fetcher/pashub_client.py +++ b/backend/pashub_fetcher/pashub_client.py @@ -1,4 +1,5 @@ from collections import defaultdict +import os from typing import Dict, List, Optional from datetime import datetime @@ -33,7 +34,7 @@ class PashubClient: ) logger.info("Finished initialising CotalityClient") - def get_core_envidence_files_by_job_id(self, job_id: str) -> List[str]: + def get_core_evidence_files_by_job_id(self, job_id: str) -> List[str]: logger.info(f"Getting Core Evidence Files for job ID {job_id}") evidence_list: List[EvidenceFileData] = self._get_evidence_list(job_id) logger.info(f"Found {len(evidence_list)} Evidence files to get") @@ -59,11 +60,14 @@ class PashubClient: ) download_url: str = self._build_download_url(metadata, evidence.file_id) - file_name = evidence.file_name + output_dir: str = "/tmp" - self._download_file(download_url, file_name) + file_name: str = evidence.file_name + file_path: str = os.path.join(output_dir, file_name) + + self._download_file(download_url, file_path) logger.info("Successfully downloaded file") - saved_files.append(file_name) + saved_files.append(file_path) return saved_files @@ -126,12 +130,9 @@ class PashubClient: return f"{base}{container}/{file_id}?{sas}" - def _download_file(self, url: str, file_name: str) -> None: + def _download_file(self, url: str, file_path: str) -> None: r = requests.get(url) - if r.status_code == 401: - raise UnauthorizedError() - r.raise_for_status() - with open(file_name, "wb") as f: + with open(file_path, "wb") as f: f.write(r.content) From af02899b6f06c59371b720afa836b7183a144290 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 16:48:11 +0000 Subject: [PATCH 23/28] get addresses and jobs from watford file --- ...tford_Warm_Homes_Wave_3_RA Downloads .xlsx | Bin 0 -> 16741 bytes backend/pashub_fetcher/handler/Dockerfile | 2 +- backend/pashub_fetcher/handler/handler.py | 74 ++++++++++++++---- 3 files changed, 61 insertions(+), 15 deletions(-) create mode 100644 backend/pashub_fetcher/Watford_Warm_Homes_Wave_3_RA Downloads .xlsx diff --git a/backend/pashub_fetcher/Watford_Warm_Homes_Wave_3_RA Downloads .xlsx b/backend/pashub_fetcher/Watford_Warm_Homes_Wave_3_RA Downloads .xlsx new file mode 100644 index 0000000000000000000000000000000000000000..6e41291b3750db019ca5bdef1827d1e865761bb6 GIT binary patch literal 16741 zcmeHu1y`I~vUcO{?oM#`;O_3h3GObzA-KD{L$Kf;+}$NOA-KDI&6zphnK?7}7u@@@ zR==yMeyU5J+Pl}Tw-jZ-z|jGa0B8UJKmss2%`(>m0RUpb0RS`rH0UQ0J6mTHTW5V$ z4|@|QT?Tg>YvO!xP^w%2DDdOWDqM#^T9cc#(iEG?+N=w0p5OMyp_ep$nG)hkjY(Nv)}Qf_g4Op)nyqH1QH zH!ueM$rK#hfw(@g0om5cNhrsLTUQo-U9LS4Yen1TrQ_wRfD3k|<-1MhQdc1-MNd2CV}y3L}&Y*7Vn zPMQFplA+;du5N`BV|*{$;%TT|aIyP0^jnnQtykT`$iv+vXgU|H%rnpitAGQ7Y3Upf9SuEBqCy)s@- zp_d6h_)PLSWaw^wJr-G5#!X1FjYQSQPihURA*z5JZ@udSKC&uqAee-2htKoK>N;=K z;SkB~Hd|!`D*8vVMz?RlDUS{=P&AZ|$>I)`TYV_5^EdOiX%f;NRIVK{G!@OIc{0Nr zWa6_IqP57A3>vsFXhrzJn0#se+5>W0KaK9IK<0&2PQC@#G;!u0CQN4dE+!ZMMi2<) zmOY+H!x(ZhGFz_l8nPz2dB9auG3U0ZG0Jx2A@$JzY~6h&n$d;!;zci?IiyU=hI-F3 zDKSWqd*iEJ&vr4I<Eju}_WeFp}biiK@2LE>3 z<%SQP=qR)bZJh}-*?tv=!#77N@uiq8MJ!Tkus0MVOxB||?4mly$;>Z~qh8X20Sh|q`Pqi zs-dX2Nk*c4$F%D?j>eHNEML=uPZGX2=jWAmq1?n4w-$(|YU5UvAgKgu)bC`Ah=5KO zCmJC~f}y*I>?-$Is(uc$DZ{JzynvQ!s{@lwRkpX{3^*Ya+vro^O^K~QpKIc7j+t}9 zSatf;`<(`(T}92_6aowT;x$7&w;^3GjM(F>@Mdzwi?)^;}7 zA7}l?FzwH|U(R9Tib|4o0M6%6NwOVUZakS>SHuV09fJ>3IM>A@A2RO?W!i`7;7#pf zkuXHV3O^=fZP$!p-w9qv{l@$D3GO1FK!Nd9zhic9=_RMAp<|ir81HFJtB4V0m8@Lf zJ#K_%0P%t(z4xZpGLn&(JxwlcF;F7Lij4&zfs^)Ekgh|H(Ul{=}3kprjSZJzM}Z2#|OGSfBrueSfaiAi$qa zK=%IczABaFWcrwpJK$bJnA|d4F;JGB8A*@Tj*y^->#0{MNmzZZH}Gg$bu|}c7{Tm< zJHZ0fB~ke0WrG0*9FHgx10z zKSY;}&b5C}MS&AiP)sU%-7*u*W;wy+!eZ$(MKA(ft48a0{T_o+ zPyBPKaxynDadu+-s z8u6u^mcbwHYKG(G)Cu%zQeo?MaGnklT|_YzG!k@E1bn`D;(Pr@99(~Wury~F=oPwa zU*9;%7SKa@ySU5o)kK6EoZWx0y=wM7tqSgP8{CRn^nIP(TYTA?y~sO~2OB*1YRBcz zlxI8>FW9-w`?7-1c+0r{{b~5+^}L?D#g~vjI+`)W*6El2^)*-B&V|=?{W2dCSZ|vM z+~Di0Pt%F*?dqh$`W1C7e(T^k5H%RGK6){BTW|BOi{IPl!LZHO(*d2xHKuL$EZdpO zeWmkhZ|CZ@=HdL{=N%{Ox;4}W;_2JM3nzET#RgVfw(Dx|Yjc-?F4w-&eb(Npew2E_ z_-&F@=Nlz1D*-_(W7{j3S3#Zcu?EX`4TdLij)%a{R~mJCRlRMKZEY1>zJu3YL~9ze zZ_GYVJI~+W5ULr^Tu;$XH|95|*7Xs5SOZ1fPwL*?V83|VuXx>m`*Pe^ zG9kv?D1Zqrchdw_D@A_4PB(b%!I!GB|HTGrl9&w*#uR zoWK%Yfz>tQRWG=%_vF zv>gU@&6*RYvk#72KGVXb)EzV4d?D%jW0*T(4t7P?!gne}Gu}m53}au@qJQ1H6y%1b zJefMJ=;#yP>Oj~xJrT4D#)EgeEdJ6&ZjGyW2Ip5Lz)XM|z&(7ex$4UyL zc+#ITeQaWDl6`byYtnuAqnqM60q}ek8CG%daFil~&4zb0bK%3t$XE9XUU*V^qj*wA zKlyEtS|DPZ6v%O6PTiCg^UdUvn2xXXpd_Wj{ivi#q~P(`n*f&_)~M7vXiMf zx$p^Og3`+1qDrt*MoR~65U~{5L~t1ILI}LqBLKA&3bGhAd6!<8jbnT5#vOfivnp5ua}4lL5J zy$~+)s)H%mp_LQCrk0wlk&{t^JThvBmzbnGX|OBY<+5s8+Hdn{1HjYkh(XYfc(#sc zpxKA%Dt|uFVvif#pbFj39Exaw!?2kMfl{IbRaX(uk$-Q;B{=(SF-y(51xIHXu0rWP z2m~iU_bh56TgGdBj*5o6sh36W978Rpk_<$HU{mRwA(b^PJE?YHkGHJxyGTsC>|Y3} zl1+mYfHg1n>JUpE3$SvgD=U$c7)oSp38IdUh138_tX+SSj;+>9Yv2$VTNbir)iv15 zrREbO{6wFUT|RDyNYyING;lut0~h)2t5pv=goS;npXq?l0%E-#H~>VLus-g zViTc4m8>fhte|wWWZeqYghY9li4}1ml`c?8C;y|QITpsH5{o$A=3ZEEd#}^Jq8_Pal6wVER!!( z(GAC2s>2?!TSt}i2$IBALL$l5d@`XdlTSST1r>p#F_z3*_BdriBph2uTvtpZ9ZEdM zlAyXSPMCy%c)9QjR7=!K`bUr)MV&KP?j) zQ?&PV%oet8W3c{_U|=lPT14_#ILj%4!I)(cx{spy)5?}(P?gNGRt4ujVt!u!(J(xK zZB7*=YK){C-Ap?44_#@*?Lm_7$zWF#B-6k#2xq0}^Z8gb@XD;EqI-dwkEE+-0{sUe zRzZgwXN438(&b#*f$1F##c{F|&vlPYW!})2%&nuTtD&)-=r%}`WtqRI$WD&vQGxNN z$!9~?0EsIF>o#cVNa4wLq2a5*iFiqwu?dKqQ8XwruWdC#fW&5r^ggq8v(;MW>L2(-_hlx&){#5{Z6T9^2oNNAxJ(318d6S3Y6&c(PMpm zVh2?dwWz7_lMF|7k2jb;{kdCpraIW`sm%~BSwEu{kU3*uOs)MK+z-j-$4r@G?&5W< zs;-03)lv`CSjeWSB}kr$2W#pWram|-Z!$lR9cTUnQ#H*T@0d197Aja%JLiSZq>PFL6Q`+C}; zo^{ilUZEbmJalH$93-x}_iKRImpK(BXZ#bo`>AFkS1mCCJ{r=F$cl7)zyD z?=OHMHaI=uSFlIs`G@Jzn8?Gw!5VRQqfy43@xPVq%}S83U`oiy?)?S}z2Cyymg4t6 z&?e>^E!#2w)jciWQb3{TQHWeh1xt(?jR_GU&sms%z_r3{U+=YnZ^uHs6Xe|pM_5=)UARKBcO!(NTt^7#|CC6;Qb9{d2q?dMjnAdcKfd^nc%Z~_ zeL#fmX#t~h`iIXb9~7bg2rVcnw>PK)DqXp4$shUxePyj*BmWWL(rHjqhm*hnv0p}i zYP4nfr$!Q#F1v8S@3L{ORmK*4^f>~gv!E%|3_hV{)~e)jWA((Er=b%%gEK%%!8QT2 z0-qF2N>7TKhPm|ffJ%YI7n3J>VU|THKS6jaJXX*|30f7OIy5u~w)Jc77~;^^=#ray z9gI$hctH#`TwZ$8@QLl95N1I|$_hGXSzv(6&ntyCYi-LSN2o@EA43*jzHtcqJ?U$0{NW>6<Y%!^bf8^*mNnX&ekWO zOWF?7fr1!HxYqQa7by6L8;SuUIN3IxPgRo#g*5YC^gYQGRfN zq5EgxffqP8*CyNE(0~9j zGfH3zbWTuJEj=zj{XvlmpyIMfCSPKzJ1Twxjkd(Wt}$ueChqk(PcoRUMue*ko{>yit>JSO z<5D)zMN-vF&sLQO=ia?%1cr=76;~GMWih4^OGac0Pw~QRasCo>mx|&ZuNJw9o71Ji+>~o3^zi{ zczwy^nyJXM@5yRfu7C;^{vgOK>btkRKyTA94ISTM*+7_*Xfc9o1CTi-Mdrg)o194~ zly2F^MLR(HqlsnyB32Yjgj9eF-Mxn44YU7C8|iESs}V~Ho4a;+a{1`uXPmM9i9an2 z6@`;7Q3k?frj3+>rlGJiK2Fxo)Ybzx1X5HRnXKgZC8@5$s}LZ2${X^9*H3vAZ0brd z^8k8!_7~yM<0_6CI;2tefac5*xn^6|WU|DCXUYd1hSc0VxQ0j&9|UVN>lmU4C+gJ8QEE_{H3%e0vi=Qj z**1*>PZ<@B<~o@RG7}n&_pV-XgAXdQth*G%m5V1VPP$+oB+}EiKjYcP`96RW{cw!p zgQ^MZ5(p zOeiEp?kBAH;(kOh8e)W+-svTycjOfDn0N{E1RCs;3R9l;F_Sq6hmBUTiyPzfVjp7t73ZhSIQ|0uh+1JboJE_*jZL?)eXJA~yQX@y zopY_lV4cL(7#Pd!G(En8j`x4okO0)R>@%Il z*-IBdqKAn2rUF@I3AD$OmC~~hGSh2$YeCJ)a7MC(IusTro)rEV3aH($5jW zI8s@^u6wi)r6g{mbS22rp*Qy?wS5&-sAZ)_IfMmFJ5kGl@^>UK;mUZBazne@QLSlv z=|k8X2(krg-1fcsF|wh0lkD`isVnIeCQc4{06~sL`GNaR1Kb(Em6`)8wEV>)3y0ph zO*$MurO8x#Y$yn}mR}y&8!Iv*cm^S2PHD&;RdNvokbEPN?NU&R96Y>9>eh*#X*g43 zW0$OIN37-XHrhPtBs9}f=mQb08^DY|GnreN_A@iwY0CQk%A=HvAc9n_n+Vz7i+04> zc#ts^_lGo7$KQvYu3#-M8pSD$XN6_s`Cc)`3|26AIChm?6+F0gvzKDEb^BP_lGDl9 zd0TuxySk>Ztn<+Mbz}Nw@8EvQr6gnA!pZxg?qTW0#p%BJ3qK-Z+m}23K`h%ezGHs= z%r~EvuKTmrhf70Wo|Bx3H_rnOxAnZY6qN&?E`hAKEu-u9E|-!?#=sZDMZO%4lJ}F7 z`#OP4#Ef+dE_eHlx5=L`n-}{gzpRV!A}bzxZa?XCIfu;J()v!l9qQkOD7-Fqy$bx- zgC8cOP^%MQi{~f7dw=zhgCA#e6B`r8KVJVh^EuX%vBTp;?!-FqCBDtptfLs@Acab% zqp;cTt4TuVR&JYPFg|q@Qf_pijW_lO6`RdZ0qwX^tVf6ibQE?C*o1RHTw?K%Yg818 zJtTp9i$Q%M+JOHxj&nM16+$iYBq}-bU0tzD2M&2ym~fiT7Q*eRLxvV49@Y%Sl$Y$A zf$V2{o0pfns%`e9*hJ#o$z5Dqh%n`cm7W0jE#~AD4|x`M&Rx=eSX;4nWEd}imr7%e zW(7TCVX7&}h&`(|Zp|_ah>7;~-5er|U=4mr1 ze8CJXF&9rcc5K8Cr_3Ml${`E7jNL1u9mKCvpbkH_V}&o)Jk55 z-j9wTc<#2XuhjsrBK7VPs?cDT)@xLe>@ttq3jmug@h$ zM@icKzc)+YU#?ERK0g{?56;@@@xS)Oq^Lbt3wT|QPUbxM#|o*0>3KcuHkm2tdENXz z+u*zH<^(5H-SSRH4YN%EtM65r8pj?;%7pKGpn8NUMud>}YYK7Epg=so#bza{lGE8) zM^ohW>OLxfimq%6!=^sedN7UW>8791%7Ci|u{~^{!90zrTk2}YR&)4>efG3)x;f@x z4;-23y8mD?EuBR-}Owrb%vn&n61(o&2<|r z!SgTL>vkQVD;E##@H}aSFE+CtB~Ex}Bqr7xQ8+CWa%c30VBwzUQ}7U`EObz{y#Ay4 z<8Jn~-NBGBRs8uh%X0RJ-e1-u<7(+N2T}KQdfLqiGyIeBcWlH}VZXJdab$QR)+UJ< zHT07`xtHAVZwAl_3_8W!Tylqc z$ltM}BZMie=d#lyC|8Pcs~-!>5ks&z;!N{Lq;_;98^TPCyv_A6s*KXoMx&l1#)Ap@8_o1{Qoopoh`MNrGjztbv=)qVv*Pl}B}M`@FC_Jg!0cV%iOvM#>~ zD-8mJOPG3ulwYiG7^iJ&#B}-QS!`l|%Wdir5Mf!SU`B~!wne!ijWd>&v=9o_>S4k; zlBJK4bzvu+WQqMYsXDz1l1WS_wgN_ie+nDv+}Q}qpxToMy9a~t1$3fA_QcQJA5GgG zVIYLe^h!V0`(w%d{KpchA}z&E!gh&!pv~Rb$_l z!0Hq<^%uKMbOXT>1xqt5l@mb%KN9+$ddCR`YTi-3%?RsNmXC&AWKGI>H305M}H1 zGTZ&LVKT7aSrU)qm+(s;u)FVSg+KGHq$Rnqnl&tFpvlx8yU4miZfWharnybI zy5|=#7_hjw*A2tU_+OZUA$8ir!uOWF=s{#e)6-CXUVH=g%Ko!0qMutL7YPOcRKNiM z2>;j}ot!b1QRyTR1C8ki)M@mW>sUuB4!8UtbGBQ*xu88>>8-C2CXrq%; zSnBX=L?j9~vVKOwRXUkQYg9)cq>BD3v*?<6y{MyJiiZ^B@+fw#+qIo_1Mfzfd_(=( zKka&pYpiIBU3yRG02)egF38|UZOD$4StfQ%j~4Oh?7-hXws|SKSJkmBHh4~GS38WD z;)IPO=zErtIQ<8Tok96y9HER8z3R`%U&!haVxr~nPa_u$Mma~xE~AWs-n=BNii}Ib zb%PT69IEUVe{fF)ZJX?*tn;j1sTz3BNDEj@m1(jIXZW6AU4)s9|vnYCZ}CvWpFIVc-4IvlSSPCv}R@49gUrVxtK17 zb+tgRpa$2R6^`WQA}n@i-piwhWbWf4<{I_?e74g1aVBqxkiw71Dh^8x&H2H7X&MQ@ zQIxrnmVZYf&}-{Rj%cEM7^4-G=av3l0G=e^rKb0Y64Ez(!BD}V+W&IYP$?njobklf zUteLQ*IfFr9xtwZTSA3FmgK;!#w>H)UOhD471jZIJ})D`L-qKhHJeF)5N<|_*!Q_H zm8lY47xT-}Q;-UO;=aoi7`?)4Bhc@00Qk32=o=IdkH|5SFOQ#JV*56*lA1Cyv!gV= zoW^ALp8Lwle+k^0kcaKn(&Ey!|D4?h{bu+`jHuw~D>Z%E-E6uY77s|(V#UN6ROPkt zOmBS7tQ3@INlySu1Zn)nv<+0|oZ>)$=Gi-#tb2=Q#otv2=XZoDMow z&#V<*Z4<@ZS$-ch*?~@FJL-En{i3udpiSeh+M68i(H=ANPXe!M%6d9lC5E9~3)HyP zk3MPD{a`i(MLRA%l7{Wb{@I-7EBTM>5AsVQl*hHB(Qm*==dsZce$$;I=1VsP< z{hwuPZs=(8S;g7W!q)7M>aEZGWTzwH`gH?rSAgvvnY=qu>rRH$As1zDtXI}3a*#Q` zsa0EAR9U2v@}g@}V~$v`@0z=0otrU0EWKyH%3Ig!#OLPdt@X+M-pSYN@!bCHQQDiV z8u6u*zaHsMpri9~D`q3y-Nl_}|0!93f9-BjL*TBYLVx4=M!&<;lZWtr`Ke@4<1A)S z!PnROMRnt@?fH7Vx5Q`CZJ>8Ym;Z8cFsCYqpYL`p`{$ic`;+^(qU%Mshu)X@JbtfE zfzHlL1|0(a4d46OU$3{L8qTm9oYmRb25WLw139P=1^Y={*zJX!3cx;e4asxT)^C zUGJSmA!@&WB>IWpDX_ous{~8nd*4eqs>&=gWGA z0AV7J7xRTa|MlDR{1(yG&g^>cubnSGTRFb{M3Y^7Zw?jfBib&oU!yU;{C>cC-0vi+ zetXH|)os%WKH~|st-xs9r3zP&E3~y{DU+eTaXA`%-r9j%^uF-x=mX-WUzucJoP(@IE00=>adB+%v{Wa`xI7D_xLI?CzN2R*hd>t zYL=~eDzVD}SPEUd#=44%%awAoxH`xDGLBHi58ia63~YGE zo@u*26+86^t<))B;Y46pb+M)!saj!ITr;ffY3Rj_|n4deWxLVnnPm|z~V9ZBK%(oRD zs`|L?%=~qXQ6_I3<*+EiL-K=X==HEc7L_hLiPeJ~WS(~S&CYPqzLD>Jstzj4&mY|qBP@{Au!bdJNVjB+ znrdbj7CHHmQQW5({JQ{8TJ=<^9)C|#t0Z-;uIv-(^=_%V(-9|<^aNMsynJD}hz()l z6G@0q)TKT14AMgF{tbJx_aTMTLaO2nbfy3O_+(e8Npk#5{@!cxcyt80~pU67lY+IpkNfJ}=A?tC zbk8oeZ^ws*G4_3#Hah>OCkQaZ{ytxo&w9#_}23nG#-_8m#85X*(7TwN;6^ znmDKUoj7ClsbS7NO)ZlhI+cw3)AQ@-S=^Mv6PgMhZ+Sh>JuZ?zK+^nCMs-Fy;8n3=Vmo-ar{c4Eu`4(h?uVA%17#+dsPfnzhAqH2>5*XKVy!(T0vdMv{k2DQ4vXB$>)j)%sC?8ZR?8ib?CS)_$ zOTANW=v(!UoNbx-Xe$vfWf^22=a34KP%hxZ_X$_#scrr+obL_%n;FG4A_l$9I+5rj zDn=fiOBp(iRylo%^tfrCG0D(TXHm2xC#6#jGgT4Do9I87wxjE=wX<{YScuk!Z9+53 z{7yXpOU63+;xrP2F~*UV1bDl>@p{9Oed%9Rxbd`v`{uN%s6p1j|4HW)wp74pQAIsUGed@OC7C6P0vcHM>P?rlwCa|ba!EPq zPT*1#1Xu)xSt4zIa9C9oV^^zUxDfuXUuIf#r(=Cmh2*W)NHut~9g8nfE0!^~&-C5>y7I4X`K? zT5Y)pmA`54oKNi7H=3EqvbgBg25_^Knb3T7mNuX2VD!jKh7(xrXMkwbqDjiDJS;WU zvH|bP&o|72k(Yu4A{-Yi^!%0m22ax^FWsGd-L1c1hfh*3g>ro1~(n3%D2Mj8vRQutDa42pc=RH@?L#8oUNjggI_GVh~5` zGF|=xZ9-6Nn4z5nYvRNk2t71RN*yHDNE_7LC6?K&&p7jzOvutnV|eCI+NhgNH^cbq z=nju=oD2s;bBOOOmf3disGbegX2*y{d5@Su)|%(3MQL#WXr(0fl8m3mw6*T98Lhj_ zT8D@8py9WMJz})K%bbOcZ8F&uYyx~Hwqm6vO9uQ7TZ&Ak{49mFy|>^k$(+l91SJGL#u|0^{R@e(LOpTE|b`tgXq|0$y;79#ab#`k%HE9(L z^ClFB3;I1@wqjVLF`?k2jQ{wo=?|XV!nj-5)?VcZT?e^mLvvKJgj>_IR5oCs7Dz-_ zRgLYsb@a1!oIX7ZY!7MaJQqa1ALO?+PC&a+NW$dxt0Ot-#9DtC@tncju4r$!ly4rc z5`y@89%Y|=*=jiYRgWZZQrooh7NlQ(IJBfD3ryQYE<%RN$Yx`HAx7Ayyx}6H@kAA^ zPI`AZ$iBh?iHpcNM;RX9+9f(%Y4fNG3y;=`3Ai{uyejk)&cm`-9Ls>;=GWjx_(}lH zsgb7JgzJIYHM_q*Ze*Mm?s3R?G?sYs&oPz-J2zM#XIkpAiPf(#aI8KPDB7@P^v8z= z19ziZVel?F0~kf)1ZMN9!_qlIqsU`39S8WEQ0bkmbbIU6wCpLQl7?HLJF1PM>QTzF zAe2-P6N_wk$p~PoYlFs!nw6@tR)xslyYoTwIhn^BVAT>?(MIYV-KvG6VTqJ56g*H} z#>hq$K&aK^y!}eh{7m|5va%iRvxD(5i`rRFtZYW#x%l(G=Kuju`Z2WvOK-6y&kGw@;SC^cvS>fwX>CVwiU}Sy7G`hfEvg*BiJWS*=2EPKeW zT;O)OTJ0C@IgEs_nbg#6BwN3>VMl2u@j@G);EPMu!Exrkm-Pbx)TN%os9bU55 zahl1D(8f<3T)ejF#w5-jLCdNFxj!-%SCbcr!oFjS;Cn1HRBOdGa3++D02FFh@?K!N z%SP@pTV~4Ee5!KyKXw#56zQ`v9EoM`2M#imTT3lt(Dj>F&K5mC0rt}y z-e$0dfH}k`Va$1B_Bh0to|AW2aM>P#bt5tPOzoFI>>oInO_atCo4ru3F?Lz~UzpGc-BK5289`H9y>>tK2+NJB~>e=1d~ z!A>h5q^izkQm0$3uoI@UB=R@MxWQO4`-us%svabx7O+rU56?%#rrXv|*@O!l9_MeZ z>6HxW5PX=^>nJL8U(d$Ed1mGcm|!gvDlVh z4>NDo{c+D)y9*;f#KGvGW8UaWiDIiFK*3M3bly)M_q=^7_C@e+#63a7j&Mrg#3-%G zUWu=#z|B-BbXnFJvVaG*`3<2_K+l`;~>y}Vo&M$sCUAFzwQ?5j;i}LAi6|X>zBNs!V zsuC+WMD4IyyxEJ{+9=sz? zlL-)ka47lqnLn{{Aozk&9c*I8*`LClGS#dZYQU5=n$*4s|@=cU9jyf>}y)SA0f?T;G*cDQ_QDY9+N~ljaI5A7LB2^yc3r; zt%b|+#cnw-LlJl0xwRw;fp2nSFchD~dBFzOQgU)1|LZ9p7QT)}Hjr>Mzyo;XeOLa?lCRoQ{n+)6FHrhq=VR zJWg|@^Vle~DS07H=13hLd8Hph@C-Pi*;6KOIjjbjAXH`%O~N0-&VYLJ6>T{!XmSa~aZ z3QIGyVNj5pl7Z)OV~TVsn~~2Nksb`sd$zIRyWn%me#&>|LdE#|&L&MOb`18}Z`1q| z5>N>i?B0hBUvX3Lt))xREN@n(+@N2k)HjV-S_{NNl7-QSU-oLVWG_k=a)8eEv?ZIGkdu|CTl9(=&)E0j})hC5E^cv*C zOPZG{vYl)KF%;`lC_$gNYTg#@eO(}8f4|1G-xnzUu>h2xUd}k_n9u%Sjc;ZjN}M-r zd+33Wwm;ODEnp#9Gr{(1i| List[Job]: + wb = load_workbook(filepath, data_only=True) + ws = wb["watford warm homes (wave 3) mai"] + + HEADER_ROW = 3 + + headers: Dict[str, int] = {} + for col in range(1, ws.max_column + 1): + value = str(ws.cell(row=HEADER_ROW, column=col).value) + if value: + headers[value.strip()] = col + + name_col = headers["Name"] + link_col = headers["Pashub Link"] + + jobs: List[Job] = [] + + for row in range(HEADER_ROW + 1, ws.max_row + 1): + name = ws.cell(row=row, column=name_col).value + link = ws.cell(row=row, column=link_col).value + + if not name or not link: + continue + + link = str(link) + + match = re.search(r"/jobs/([0-9a-fA-F\-]+)/", link) + if not match: + continue + + job_id = match.group(1) + + jobs.append({"id": job_id, "address": str(name)}) + if not match: + continue + + job_id = match.group(1) + + jobs.append({"id": job_id, "address": str(name)}) + + return jobs + + def handler(event: Mapping[str, Any], context: Any) -> None: - pas_hub_email = "random@test.com" - pas_hub_password = "my_fake_password" + BASE_DIR = os.path.dirname(os.path.dirname(__file__)) + filepath = os.path.join(BASE_DIR, "Watford_Warm_Homes_Wave_3_RA Downloads .xlsx") + + jobs: List[Job] = extract_jobs(filepath) + + logger.info("Successfully loaded jobs from spreadsheet") + + # pas_hub_email = "random@test.com" + # pas_hub_password = "my_fake_password" + + pas_hub_email = "sebastian@osmosis-acd.com" + pas_hub_password = "Osmosis2025!" try: token: str = get_token_from_local_storage(pas_hub_email, pas_hub_password) @@ -28,17 +85,6 @@ def handler(event: Mapping[str, Any], context: Any) -> None: sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3 ) - jobs: List[Job] = [ - { - "id": "5abf6e27-e4c4-4ba8-b69d-9e34939e0002", - "address": "FLAT 11 Abbey View, Garsmouth Way, Watford, WD25 9DY", - }, - { - "id": "047f4455-85e2-4293-97b1-6b460137d33e", - "address": "FLAT 14 Abbey View, Garsmouth Way, Watford, WD25 9DY", - }, - ] # TODO: get these from request body or spreadsheet - sharepoint_client.makedir("Watford Test", "/JTK Test Folder") saved_file_paths: List[str] = [] From 156c41a5c561c7204a98f9a1d587fadab87695f9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 24 Mar 2026 17:01:40 +0000 Subject: [PATCH 24/28] test credentials --- backend/pashub_fetcher/handler/handler.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index d9941903..af5ed336 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -67,11 +67,8 @@ def handler(event: Mapping[str, Any], context: Any) -> None: logger.info("Successfully loaded jobs from spreadsheet") - # pas_hub_email = "random@test.com" - # pas_hub_password = "my_fake_password" - - pas_hub_email = "sebastian@osmosis-acd.com" - pas_hub_password = "Osmosis2025!" + pas_hub_email = "random@test.com" + pas_hub_password = "my_fake_password" try: token: str = get_token_from_local_storage(pas_hub_email, pas_hub_password) From 890bd313bd072aa3490cfd2fc965836f86bd99f9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 25 Mar 2026 13:53:42 +0000 Subject: [PATCH 25/28] minor tweaks --- backend/pashub_fetcher/handler/handler.py | 17 +++++++++-------- backend/pashub_fetcher/token_getter.py | 4 +++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index af5ed336..fb7d6b1d 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,6 +1,7 @@ import os import re -import time + +# import time from typing import Any, Dict, List, Mapping from openpyxl import load_workbook @@ -49,12 +50,6 @@ def extract_jobs(filepath: str) -> List[Job]: job_id = match.group(1) jobs.append({"id": job_id, "address": str(name)}) - if not match: - continue - - job_id = match.group(1) - - jobs.append({"id": job_id, "address": str(name)}) return jobs @@ -113,6 +108,12 @@ def handler(event: Mapping[str, Any], context: Any) -> None: pashub_client.get_core_evidence_files_by_job_id(job["id"]) ) - time.sleep(10) # Simulate manual download + # time.sleep(10) # Simulate manual download print(f"saved {len(saved_file_paths)} files") + + +if __name__ == "__main__": + event = {"Records": [{"body": "{}"}]} + + handler(event, None) diff --git a/backend/pashub_fetcher/token_getter.py b/backend/pashub_fetcher/token_getter.py index d5481dd5..5954feec 100644 --- a/backend/pashub_fetcher/token_getter.py +++ b/backend/pashub_fetcher/token_getter.py @@ -24,7 +24,9 @@ def get_token_from_local_storage(email: str, password: str) -> str: page.fill("#password", password) logger.info("Submitting login...") - page.click("#btn-login") + page.wait_for_selector("#btn-login", state="visible", timeout=10000) + with page.expect_navigation(timeout=15000): + page.click("#btn-login") page.wait_for_timeout(3000) From 64b46f851254c87951a2efabbdaacc251d6cfc15 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 25 Mar 2026 13:53:54 +0000 Subject: [PATCH 26/28] minor tweaks --- backend/pashub_fetcher/handler/handler.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index fb7d6b1d..fa48487e 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,7 +1,6 @@ import os import re -# import time from typing import Any, Dict, List, Mapping from openpyxl import load_workbook @@ -108,8 +107,6 @@ def handler(event: Mapping[str, Any], context: Any) -> None: pashub_client.get_core_evidence_files_by_job_id(job["id"]) ) - # time.sleep(10) # Simulate manual download - print(f"saved {len(saved_file_paths)} files") From c233d9117b36a4d6c042e7292c54570c17dfadb3 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 25 Mar 2026 14:12:38 +0000 Subject: [PATCH 27/28] deploy test lambda - create pashub_to_ara ecr --- backend/pashub_fetcher/handler/Dockerfile | 8 ++------ backend/pashub_fetcher/handler/requirements.txt | 5 +++++ backend/pashub_fetcher/handler/test_handler.py | 7 +++++++ infrastructure/terraform/shared/main.tf | 15 ++++++++++++--- 4 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 backend/pashub_fetcher/handler/requirements.txt create mode 100644 backend/pashub_fetcher/handler/test_handler.py diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile index a4d322ab..e06daa67 100644 --- a/backend/pashub_fetcher/handler/Dockerfile +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -4,9 +4,6 @@ FROM mcr.microsoft.com/playwright/python:v1.58.0-jammy ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/local/bin/aws-lambda-rie RUN chmod +x /usr/local/bin/aws-lambda-rie -# Install Lambda runtime client -RUN pip install awslambdaric playwright==1.58.0 requests msal openpyxl - # Set working directory (Lambda task root) WORKDIR /var/task @@ -15,6 +12,7 @@ COPY .env backend/.env COPY utils/ utils/ COPY backend/pashub_fetcher/ backend/pashub_fetcher/ +RUN pip install --no-cache-dir -r requirements.txt # Lambda entrypoint ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] @@ -22,6 +20,4 @@ ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] # ----------------------------- # Lambda handler # ----------------------------- -# CMD ["backend/pashub_fetcher/handler/handler.handler"] -# For local running -CMD ["backend.pashub_fetcher.handler.handler.handler"] \ No newline at end of file +CMD ["backend.pashub_fetcher.handler.test_handler.handler"] \ No newline at end of file diff --git a/backend/pashub_fetcher/handler/requirements.txt b/backend/pashub_fetcher/handler/requirements.txt new file mode 100644 index 00000000..c4e416a8 --- /dev/null +++ b/backend/pashub_fetcher/handler/requirements.txt @@ -0,0 +1,5 @@ +awslambdaric +playwright==1.58.0 +requests +msal +openpyxl \ No newline at end of file diff --git a/backend/pashub_fetcher/handler/test_handler.py b/backend/pashub_fetcher/handler/test_handler.py new file mode 100644 index 00000000..996835a2 --- /dev/null +++ b/backend/pashub_fetcher/handler/test_handler.py @@ -0,0 +1,7 @@ +from typing import Any, Mapping +import json + + +def handler(event: Mapping[str, Any], context: Any) -> None: + print("Received event:") + print(json.dumps(event, indent=2)) diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 4d3a1425..d88a0a43 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -525,14 +525,23 @@ output "ordnance_s3_read_and_write_arn" { } ################################################ -# Engine – Lambda ECR +# Pas Hub to Ara – Lambda ################################################ -module "engine_state_bucket" { +module "pashub_to_ara_state_bucket" { source = "../modules/tf_state_bucket" - bucket_name = "ara-engine-terraform-state" + bucket_name = "pashub-to-ara-terraform-state" } +module "pashub_to_ara_registry" { + source = "../modules/container_registry" + name = "pashub_to_ara" + stage = var.stage +} + +################################################ +# Engine – Lambda ECR +################################################ module "engine_registry" { source = "../modules/container_registry" name = "engine" From 59dd8c73bb153b6f676752a8119ee0e0ab7cf363 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 25 Mar 2026 14:24:13 +0000 Subject: [PATCH 28/28] deleted wrong state bucket --- infrastructure/terraform/shared/main.tf | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index d88a0a43..84c6748b 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -527,12 +527,6 @@ output "ordnance_s3_read_and_write_arn" { ################################################ # Pas Hub to Ara – Lambda ################################################ -module "pashub_to_ara_state_bucket" { - source = "../modules/tf_state_bucket" - bucket_name = "pashub-to-ara-terraform-state" - -} - module "pashub_to_ara_registry" { source = "../modules/container_registry" name = "pashub_to_ara" @@ -542,6 +536,12 @@ module "pashub_to_ara_registry" { ################################################ # Engine – Lambda ECR ################################################ +module "engine_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "ara-engine-terraform-state" + +} + module "engine_registry" { source = "../modules/container_registry" name = "engine"