From a886911de45d27eeb836aa313f727cbfdb8e7c50 Mon Sep 17 00:00:00 2001
From: Daniel Roth
Date: Thu, 2 Apr 2026 08:29:09 +0000
Subject: [PATCH] add debugging

---
Review notes (this section is ignored by git am):
* download_with_retry no longer logs "Retry 3" or sleeps 1.5 s after the
  final failed attempt, where no further attempt follows.
* The page-snippet fallback in download_report_by_selector now logs why the
  snippet could not be captured instead of using `except Exception: pass`.
* Docstrings were added to the three new or rewritten helpers.
* NOTE(review): this change comments out the UPRN read and the
  upload_job_to_s3_and_update_db call, so downloaded reports only go to
  SharePoint. Confirm that is intended before merging.
* NOTE(review): Optional is used in the new annotations; its import is not
  visible in this diff. Confirm it is imported from typing.
* Hunk offsets and the diffstat were recomputed for the added lines; the
  post-image blob id on the index line is carried over from the original
  patch. Leading whitespace and blank context lines were reconstructed from a
  whitespace-collapsed copy, so verify them against the tree (or apply with
  --ignore-whitespace).

 backend/ecmk_fetcher/handler/handler.py | 113 ++++++++++++++++++++++---
 1 file changed, 99 insertions(+), 14 deletions(-)

diff --git a/backend/ecmk_fetcher/handler/handler.py b/backend/ecmk_fetcher/handler/handler.py
index 932b8552..c4c1385c 100644
--- a/backend/ecmk_fetcher/handler/handler.py
+++ b/backend/ecmk_fetcher/handler/handler.py
@@ -7,6 +7,7 @@ from openpyxl import load_workbook
 from playwright.sync_api import (
     Locator,
     Page,
+    Response,
     sync_playwright,
     TimeoutError as PlaywrightTimeoutError,
 )
@@ -29,6 +30,22 @@ class file_download_button_types(Enum):
     SAP_WORK_SHEET = 15
 
 
+def attach_debug_listeners(page: Page) -> None:
+    """Log report/download responses and every HTTP error response on page."""
+
+    def handle_response(response: Response) -> None:
+        url: str = response.url
+        status: int = response.status
+
+        if "download" in url or "report" in url:
+            logger.info(f"[RESPONSE] {status} {url}")
+
+        if status >= 400:
+            logger.error(f"[ERROR RESPONSE] {status} {url}")
+
+    page.on("response", handle_response)
+
+
 def extract_addresses_from_spreadsheet(filepath: str) -> Dict[str, str]:
     wb = load_workbook(filepath, data_only=True)
     ws = wb["Southern RA-Lite Programme 3103"]
@@ -147,6 +164,12 @@ def go_to_assessment_details(page: Page, row: Locator) -> None:
 
     with page.expect_navigation():
         account_link.click()
+    page.wait_for_load_state("networkidle")
+
+    page.wait_for_selector("a.download-report-btn", timeout=10000)
+
+    logger.info("Assessment details page fully loaded")
+
 
 def go_to_next_page(page: Page) -> bool:
     next_button: Locator = page.locator("#assessmentDatatable_next a")
@@ -168,21 +191,74 @@ def build_report_selector(report_type: int) -> str:
     return f"a.download-report-btn[data-report-type='{report_type}']"
 
 
-def download_report_by_selector(page: Page, selector: str) -> str:
-    page.wait_for_selector(selector, timeout=10000)
+def download_report_by_selector(page: Page, selector: str) -> Optional[str]:
+    """Click the element matching selector and save the file it downloads.
+
+    Returns the saved path, or None if the element is not enabled or any step
+    raises a Playwright timeout.
+    """
+    try:
+        element: Locator = page.locator(selector)
 
-    with page.expect_download() as download_info:
-        page.click(selector)
+        element.wait_for(state="visible", timeout=10000)
 
-    download = download_info.value
-    filename: str = download.suggested_filename
+        if not element.is_enabled():
+            logger.warning(f"Element not enabled: {selector}")
+            return None
 
-    save_path: str = os.path.join(os.getcwd(), filename)
-    download.save_as(save_path)
+        element.scroll_into_view_if_needed()
 
-    logger.info(f"Downloaded: {filename}")
+        page.wait_for_timeout(300)
 
-    return save_path
+        logger.info(f"Attempting download via selector: {selector}")
+        logger.info(f"Current URL: {page.url}")
+
+        with page.expect_download(timeout=15000) as download_info:
+            element.click()
+
+        download = download_info.value
+        filename: str = download.suggested_filename
+
+        save_path: str = os.path.join(os.getcwd(), filename)
+        download.save_as(save_path)
+
+        logger.info(f"Downloaded: {filename}")
+
+        return save_path
+
+    except PlaywrightTimeoutError:
+        logger.error(f"Download NOT triggered for selector: {selector}")
+        logger.error(f"Current URL at failure: {page.url}")
+
+        try:
+            content_snippet = page.content()[:1000]
+            logger.error(f"Page snippet: {content_snippet}")
+        except Exception as snippet_error:
+            # Best-effort diagnostics only; do not let them mask the timeout.
+            logger.warning(f"Could not capture page snippet: {snippet_error}")
+
+        return None
+
+
+def download_with_retry(page: Page, selector: str) -> Optional[str]:
+    """Attempt the download up to three times, pausing between attempts.
+
+    Returns the path from the first successful attempt, or None if all fail.
+    """
+    max_attempts: int = 3
+
+    for attempt in range(1, max_attempts + 1):
+        file_path = download_report_by_selector(page, selector)
+
+        if file_path:
+            return file_path
+
+        # Only announce and wait for a retry when another attempt follows.
+        if attempt < max_attempts:
+            logger.warning(f"Retry {attempt} for {selector}")
+            page.wait_for_timeout(1500)
+
+    return None
 
 
 def upload_job_to_s3_and_update_db(job_files: List[str], uprn: str) -> None:
@@ -244,6 +320,7 @@ def download_report() -> None:
 
         context = browser.new_context()
         page = context.new_page()
+        attach_debug_listeners(page)
 
         try:
             login(page, username, password)
@@ -267,7 +344,7 @@ def download_report() -> None:
                     last_name: str = cells.nth(2).inner_text().strip()
                     address: str = cells.nth(5).inner_text().strip()
                     postcode: str = cells.nth(7).inner_text().strip()
-                    uprn: str = cells.nth(8).inner_text().strip()
+                    # uprn: str = cells.nth(8).inner_text().strip()
                     status: str = cells.nth(9).inner_text().strip()
 
                     if first_name == "Oliver" and last_name == "Stephens":
@@ -296,16 +373,24 @@ def download_report() -> None:
 
                         for report_type in report_types:
                             selector: str = build_report_selector(report_type)
-                            file_path: str = download_report_by_selector(page, selector)
+                            file_path: Optional[str] = download_with_retry(
+                                page, selector
+                            )
+
+                            if not file_path:
+                                continue
                             try:
                                 sharepoint_client.upload_file(
                                     file_path=file_path,
                                     sharepoint_path=f"{sharepoint_base_path}/{sharepoint_address}/1. Retrofit Assessment/A. Assessment",
                                     file_name=os.path.basename(file_path),
                                 )
+                                logger.info(
+                                    f"Successfully uploaded file {os.path.basename(file_path)} to sharepoint"
+                                )
                                 # TODO: could s3 load happen for all files at once to reduce db roundtrips?
-                                if uprn:
-                                    upload_job_to_s3_and_update_db([file_path], uprn)
+                                # if uprn:
+                                #     upload_job_to_s3_and_update_db([file_path], uprn)
                             finally:
                                 if os.path.exists(file_path):
                                     os.remove(file_path)