# File: backend/ecmk_fetcher/ecmk_service.py
"""ECMK fetcher service: scan the assessments table and publish matched reports."""

import os
from typing import Dict

from playwright.sync_api import Browser, BrowserContext, Locator, Page, sync_playwright

from backend.app.db.functions.uploaded_files_functions import (
    get_uploaded_file_by_listing_type_and_source,
)
from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum
from backend.ecmk_fetcher.address_list import PropertyRow, extract_addresses_from_spreadsheet
from backend.ecmk_fetcher.browser import (
    attach_debug_listeners,
    download_with_retry,
    go_to_assessment_details,
    go_to_assessments,
    go_to_next_page,
    login,
)
from backend.ecmk_fetcher.excel_writer import write_row
from backend.ecmk_fetcher.reports import (
    REPORT_TYPES,
    FileDownloadButtonType,
    build_property_id,
    map_report_type_to_db_file_type,
)
from backend.ecmk_fetcher.upload import (
    upload_excel_to_sharepoint,
    upload_file_to_s3_and_record,
    upload_file_to_sharepoint,
)
from backend.ecmk_fetcher.xml_processor import flatten_sap_property, parse_rdsap
from utils.logger import setup_logger
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient

logger = setup_logger()

DIMENSIONS_FILENAME: str = "Dimensions.xlsx"


class EcmkService:
    """Fetch ECMK assessment reports for known properties and publish them.

    Every row of the ECMK assessments table is compared against the property
    map loaded from the spreadsheet.  For a matching row, each entry of
    ``REPORT_TYPES`` is downloaded; raw-XML reports are flattened into a row of
    the local ``Dimensions.xlsx`` workbook (which is then re-uploaded), all
    other reports go through ``upload_file_to_sharepoint``.  In both cases the
    downloaded file is finally passed to ``upload_file_to_s3_and_record``.
    """

    # Selector for the rows of the assessments table.
    _ROWS_SELECTOR: str = "#assessmentDatatable tbody tr"
    # Only rows carrying exactly this status text are processed.
    _REQUIRED_STATUS: str = "Submitted (not Lodged)"
    # Rows whose first/last name cells equal this pair are always skipped.
    _SKIPPED_NAME: tuple[str, str] = ("Oliver", "Stephens")
    # How long to wait for the table to reappear after navigating back.
    _TABLE_TIMEOUT_MS: int = 15000

    def __init__(
        self,
        sharepoint_client: DomnaSharepointClient,
        s3_bucket: str,
        property_list_filepath: str,
        sharepoint_base_path: str,
        sharepoint_excel_path: str,
        local_dimensions_path: str,
        username: str = "",
        password: str = "",
    ) -> None:
        """Store the configuration and load the property map.

        Args:
            sharepoint_client: Client used to download and upload files.
            s3_bucket: Bucket name passed to ``upload_file_to_s3_and_record``.
            property_list_filepath: Spreadsheet handed to
                ``extract_addresses_from_spreadsheet`` to build the property map.
            sharepoint_base_path: Base path for per-property report uploads.
            sharepoint_excel_path: Folder that holds ``Dimensions.xlsx``.
            local_dimensions_path: Local path of the working copy of
                ``Dimensions.xlsx``.
            username: ECMK login name.  Defaults to an empty string, which is
                what the service used before this parameter existed.
            password: ECMK password, same default as ``username``.
        """
        self._sharepoint_client = sharepoint_client
        self._s3_bucket = s3_bucket
        self._sharepoint_base_path = sharepoint_base_path
        self._sharepoint_excel_path = sharepoint_excel_path
        self._local_dimensions_path = local_dimensions_path
        # TODO: get from github secrets (callers currently rely on the "" defaults)
        self._username = username
        self._password = password
        self._property_map: Dict[str, PropertyRow] = extract_addresses_from_spreadsheet(
            property_list_filepath
        )

    def run(self) -> None:
        """Download ``Dimensions.xlsx``, then process the table in a headless browser.

        The context and the browser are closed in separate ``finally`` clauses
        so that a failure while creating or closing the context can no longer
        leave the browser open.

        Raises:
            RuntimeError: If processing a table row fails (see
                ``_run_browser_session``).  Errors from the SharePoint download
                or from Playwright start-up propagate unchanged.
        """
        self._sharepoint_client.download_file(
            sharepoint_path=f"{self._sharepoint_excel_path}/{DIMENSIONS_FILENAME}",
            local_path=self._local_dimensions_path,
        )

        with sync_playwright() as p:
            browser: Browser = p.chromium.launch(headless=True)
            try:
                context: BrowserContext = browser.new_context()
                try:
                    page: Page = context.new_page()
                    self._run_browser_session(page)
                finally:
                    context.close()
            finally:
                browser.close()

    def _run_browser_session(self, page: Page) -> None:
        """Log in, open the assessments list and process every page of rows.

        Raises:
            RuntimeError: Wrapping (and chained from) any exception raised
                while handling a single row.
        """
        attach_debug_listeners(page)

        login(page, self._username, self._password)
        go_to_assessments(page)

        while True:
            rows: Locator = page.locator(self._ROWS_SELECTOR)
            row_count: int = rows.count()

            # Rows are addressed by index on purpose: processing a row
            # navigates away and back, and ``rows.nth(i)`` is resolved again
            # each time it is used.
            for i in range(row_count):
                try:
                    self._process_row(page, rows.nth(i))
                except Exception as e:
                    raise RuntimeError(f"Row processing failed: {str(e)}") from e

            if not go_to_next_page(page):
                break

    def _process_row(self, page: Page, row: Locator) -> None:
        """Handle one table row; return early when the row is not eligible."""
        cells: Locator = row.locator("td")

        first_name: str = cells.nth(1).inner_text().strip()
        last_name: str = cells.nth(2).inner_text().strip()
        address: str = cells.nth(5).inner_text().strip()
        postcode: str = cells.nth(7).inner_text().strip()
        status: str = cells.nth(9).inner_text().strip()

        if (first_name, last_name) == self._SKIPPED_NAME:
            return

        if status != self._REQUIRED_STATUS:
            return

        property_row: PropertyRow | None = self._property_map.get(
            build_property_id(address, postcode)
        )
        if not property_row:
            return

        logger.info(f"Match found for property {address}")

        # Convert once per row, and before navigating, so a malformed listing
        # id is detected without first opening the details page.
        listing_number: int = int(property_row.listing_id)

        go_to_assessment_details(page, row)

        for report_type in REPORT_TYPES:
            self._process_report(page, report_type, property_row, address, listing_number)

        # NOTE(review): assumes going back restores the same results page that
        # the outer loop is iterating - confirm pagination state survives.
        page.go_back()
        page.wait_for_selector(self._ROWS_SELECTOR, timeout=self._TABLE_TIMEOUT_MS)

    def _process_report(
        self,
        page: Page,
        report_type,
        property_row: PropertyRow,
        address: str,
        listing_number: int,
    ) -> None:
        """Download one report type for the open assessment and publish it.

        ``report_type`` is one element of ``REPORT_TYPES``.  The report is
        skipped when its type cannot be mapped to a ``FileTypeEnum``, when a
        matching uploaded-file record already exists, or when the download
        yields no file.  The downloaded file is always removed afterwards.
        """
        try:
            db_file_type: FileTypeEnum = map_report_type_to_db_file_type(report_type)
        except ValueError:
            logger.error(f"Unknown report type {report_type}, skipping file")
            return

        if get_uploaded_file_by_listing_type_and_source(
            hubspot_listing_id=listing_number,
            file_type=db_file_type,
            file_source=FileSourceEnum.ECMK,
        ):
            logger.debug("File already uploaded to s3, skipping")
            return

        file_path: str | None = download_with_retry(page, report_type)
        if not file_path:
            return

        logger.info(
            f"Successfully downloaded file {os.path.basename(file_path)} from ECMK"
        )

        try:
            if report_type == FileDownloadButtonType.RAW_XML.value:
                self._publish_dimensions(file_path, address)
            else:
                upload_file_to_sharepoint(
                    client=self._sharepoint_client,
                    file_path=file_path,
                    base_path=self._sharepoint_base_path,
                    subpath=property_row.address,
                )
                logger.info(
                    f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}"
                )

            upload_file_to_s3_and_record(
                bucket=self._s3_bucket,
                file_path=file_path,
                hubspot_listing_id=property_row.listing_id,
                file_type=db_file_type,
            )
        finally:
            # Best-effort cleanup of the temporary download; a file that is
            # already gone is not an error.
            try:
                os.remove(file_path)
            except FileNotFoundError:
                pass

    def _publish_dimensions(self, file_path: str, address: str) -> None:
        """Append the flattened XML report to ``Dimensions.xlsx`` and upload it."""
        with open(file_path, "r", encoding="utf-8") as f:
            xml_string = f.read()

        row_data = flatten_sap_property(parse_rdsap(xml_string))
        write_row(self._local_dimensions_path, row_data)
        upload_excel_to_sharepoint(
            client=self._sharepoint_client,
            file_path=self._local_dimensions_path,
            sharepoint_path=self._sharepoint_excel_path,
        )
        logger.info(
            f"Written dimensions row and uploaded Dimensions.xlsx for {address}"
        )
# File: backend/ecmk_fetcher/handler/handler.py
"""Entry point of the ECMK fetcher job.

Builds an ``EcmkService`` with the job's fixed configuration and runs it; this
replaces the earlier direct call to ``run_job()`` from ``processor.py``.
"""

import os
from typing import Any, Mapping

from backend.ecmk_fetcher.ecmk_service import EcmkService
from utils.logger import setup_logger
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
from utils.sharepoint.domna_sites import DomnaSites

logger = setup_logger()

_PROPERTY_LIST_FILE: str = (
    "hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx"
)
# Parent of the directory that contains this file.
_BASE_DIR: str = os.path.dirname(os.path.dirname(__file__))

# Fixed job configuration, passed to EcmkService unchanged.
_S3_BUCKET: str = "retrofit-energy-assessments-dev"
_SHAREPOINT_ASSESSMENTS_PATH: str = "/Projects/Southern Housing/SH-SURV-26-001/Assessments"
_SHAREPOINT_MODELLING_PATH: str = "/Projects/Southern Housing/SH-SURV-26-001/Modelling"
_LOCAL_DIMENSIONS_FILE: str = "Dimensions.xlsx"


def _build_service() -> EcmkService:
    """Create the service; the SharePoint client is built at call time."""
    client = DomnaSharepointClient(sharepoint_location=DomnaSites.PRIVATE_PAY)
    property_list_path: str = os.path.join(_BASE_DIR, _PROPERTY_LIST_FILE)
    dimensions_path: str = os.path.join(_BASE_DIR, _LOCAL_DIMENSIONS_FILE)
    return EcmkService(
        sharepoint_client=client,
        s3_bucket=_S3_BUCKET,
        property_list_filepath=property_list_path,
        sharepoint_base_path=_SHAREPOINT_ASSESSMENTS_PATH,
        sharepoint_excel_path=_SHAREPOINT_MODELLING_PATH,
        local_dimensions_path=dimensions_path,
    )


def handler(event: Mapping[str, Any], context: Any) -> None:
    """Run one ECMK fetch.

    ``event`` and ``context`` are accepted but not read.
    NOTE(review): the signature looks like an AWS Lambda handler - confirm.
    """
    logger.info("Entered handler")
    _build_service().run()


# The file's `if __name__ == "__main__":` guard follows here; its body lies
# outside the visible hunk and is therefore not reproduced.
backend.ecmk_fetcher.upload import ( - upload_excel_to_sharepoint, - upload_file_to_s3_and_record, - upload_file_to_sharepoint, -) -from backend.ecmk_fetcher.xml_processor import flatten_sap_property, parse_rdsap -from utils.logger import setup_logger -from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient -from utils.sharepoint.domna_sites import DomnaSites - -logger = setup_logger() - - -def run_job() -> None: - - username: str = "" # TODO: get from github secrets - password: str = "" - - property_list_file: str = ( - "hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx" - ) - - BASE_DIR: str = os.path.dirname(__file__) - filepath: str = os.path.join(BASE_DIR, property_list_file) - - property_map: Dict[str, PropertyRow] = extract_addresses_from_spreadsheet(filepath) - - sharepoint_client: DomnaSharepointClient = DomnaSharepointClient( - sharepoint_location=DomnaSites.PRIVATE_PAY - ) - - sharepoint_base_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Assessments" - sharepoint_excel_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Modelling" - - DIMENSIONS_FILENAME: str = "Dimensions.xlsx" - local_dimensions_path: str = os.path.join(BASE_DIR, DIMENSIONS_FILENAME) - - sharepoint_client.download_file( - sharepoint_path=f"{sharepoint_excel_path}/{DIMENSIONS_FILENAME}", - local_path=local_dimensions_path, - ) - - s3_bucket: str = "retrofit-energy-assessments-dev" - - with sync_playwright() as p: - browser: Browser = p.chromium.launch(headless=True) - context: BrowserContext = browser.new_context() - page: Page = context.new_page() - - attach_debug_listeners(page) - - try: - login(page, username, password) - go_to_assessments(page) - - while True: - rows: Locator = page.locator("#assessmentDatatable tbody tr") - row_count: int = rows.count() - - for i in range(row_count): - row: Locator = rows.nth(i) - - try: - cells: Locator = row.locator("td") - - first_name: str = cells.nth(1).inner_text().strip() - last_name: str = 
cells.nth(2).inner_text().strip() - address: str = cells.nth(5).inner_text().strip() - postcode: str = cells.nth(7).inner_text().strip() - status: str = cells.nth(9).inner_text().strip() - - if first_name == "Oliver" and last_name == "Stephens": - continue - - if status != "Submitted (not Lodged)": - continue - - property_id: str = build_property_id(address, postcode) - - property_row: PropertyRow | None = property_map.get(property_id) - - if not property_row: - continue - - logger.info(f"Match found for property {address}") - - sharepoint_address: str = property_row.address - - go_to_assessment_details(page, row) - - for report_type in REPORT_TYPES: - hubspot_listing_id: str = property_row.listing_id - try: - db_file_type: FileTypeEnum = ( - map_report_type_to_db_file_type(report_type) - ) - - except ValueError: - logger.error( - f"Unknown report type {report_type}, skipping file" - ) - continue - - if get_uploaded_file_by_listing_type_and_source( - hubspot_listing_id=int(hubspot_listing_id), - file_type=db_file_type, - file_source=FileSourceEnum.ECMK, - ): - logger.debug("File already uploaded to s3, skipping") - continue - - file_path: str | None = download_with_retry( - page, report_type - ) - - if not file_path: - continue - - logger.info( - f"Successfully downloaded file {os.path.basename(file_path)} from ECMK" - ) - - try: - if report_type == FileDownloadButtonType.RAW_XML.value: - with open(file_path, "r", encoding="utf-8") as f: - xml_string = f.read() - details = parse_rdsap(xml_string) - row_data = flatten_sap_property(details) - write_row(local_dimensions_path, row_data) - upload_excel_to_sharepoint( - client=sharepoint_client, - file_path=local_dimensions_path, - sharepoint_path=sharepoint_excel_path, - ) - logger.info( - f"Written dimensions row and uploaded Dimensions.xlsx for {address}" - ) - else: - upload_file_to_sharepoint( - client=sharepoint_client, - file_path=file_path, - base_path=sharepoint_base_path, - subpath=sharepoint_address, - ) - 
# File: backend/ecmk_fetcher/tests/test_ecmk_service.py
"""Unit tests for ``EcmkService`` construction and ``run()`` orchestration."""

from contextlib import contextmanager
from typing import Dict, Iterator
from unittest.mock import MagicMock, patch

from backend.ecmk_fetcher.address_list import PropertyRow
from backend.ecmk_fetcher.ecmk_service import EcmkService
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient


# The listing id is a numeric string because the service converts it with
# int(); a value such as "hs-001" could never pass through the row loop.
FAKE_PROPERTY_MAP: Dict[str, PropertyRow] = {
    "10 FAKE ST SW1A 1AA": PropertyRow(
        row_index=2, address="10 Fake St SW1A 1AA", listing_id="1001"
    )
}

_SERVICE_MODULE: str = "backend.ecmk_fetcher.ecmk_service"


def make_service(
    sharepoint_client: DomnaSharepointClient | None = None,
    s3_bucket: str = "test-bucket",
    property_list_filepath: str = "/fake/properties.xlsx",
    sharepoint_base_path: str = "/base",
    sharepoint_excel_path: str = "/excel",
    local_dimensions_path: str = "/fake/Dimensions.xlsx",
) -> EcmkService:
    """Build an ``EcmkService`` with test defaults and a mocked SharePoint client."""
    if sharepoint_client is None:
        sharepoint_client = MagicMock(spec=DomnaSharepointClient)
    return EcmkService(
        sharepoint_client=sharepoint_client,
        s3_bucket=s3_bucket,
        property_list_filepath=property_list_filepath,
        sharepoint_base_path=sharepoint_base_path,
        sharepoint_excel_path=sharepoint_excel_path,
        local_dimensions_path=local_dimensions_path,
    )


def _make_playwright_mocks() -> tuple[MagicMock, MagicMock, MagicMock, MagicMock]:
    """Return ``(page, context, browser, playwright)`` mocks wired together."""
    mock_page = MagicMock()
    mock_context = MagicMock()
    mock_context.new_page.return_value = mock_page
    mock_browser = MagicMock()
    mock_browser.new_context.return_value = mock_context
    mock_playwright = MagicMock()
    mock_playwright.chromium.launch.return_value = mock_browser
    return mock_page, mock_context, mock_browser, mock_playwright


@contextmanager
def _patched_service_module(mock_playwright: MagicMock) -> Iterator[None]:
    """Patch the spreadsheet loader and ``sync_playwright`` in the service module."""
    with (
        patch(
            f"{_SERVICE_MODULE}.extract_addresses_from_spreadsheet",
            return_value=FAKE_PROPERTY_MAP,
        ),
        patch(f"{_SERVICE_MODULE}.sync_playwright") as mock_sync_pw,
    ):
        mock_sync_pw.return_value.__enter__.return_value = mock_playwright
        yield


# ---------------------------------------------------------------------------
# __init__: loads property map from spreadsheet filepath
# ---------------------------------------------------------------------------


def test_init_loads_property_map_from_filepath() -> None:
    with patch(
        f"{_SERVICE_MODULE}.extract_addresses_from_spreadsheet",
        return_value=FAKE_PROPERTY_MAP,
    ) as mock_extract:
        _ = make_service(property_list_filepath="/some/props.xlsx")

    mock_extract.assert_called_once_with("/some/props.xlsx")


# ---------------------------------------------------------------------------
# run(): downloads Dimensions.xlsx before Playwright browser launches
# ---------------------------------------------------------------------------


def test_run_downloads_dimensions_before_browser_launch() -> None:
    call_order: list[str] = []

    mock_client = MagicMock(spec=DomnaSharepointClient)

    def _on_download(**_: object) -> None:
        call_order.append("download")

    mock_client.download_file.side_effect = _on_download

    _, _, mock_browser, mock_playwright = _make_playwright_mocks()

    def _on_launch(**_: object) -> MagicMock:
        call_order.append("browser")
        return mock_browser

    mock_playwright.chromium.launch.side_effect = _on_launch

    with _patched_service_module(mock_playwright):
        service = make_service(
            sharepoint_client=mock_client,
            sharepoint_excel_path="/excel",
            local_dimensions_path="/fake/Dimensions.xlsx",
        )
        with patch.object(service, "_run_browser_session"):
            service.run()

    assert call_order == ["download", "browser"]


def test_run_downloads_dimensions_with_correct_paths() -> None:
    mock_client = MagicMock(spec=DomnaSharepointClient)
    _, _, _, mock_playwright = _make_playwright_mocks()

    with _patched_service_module(mock_playwright):
        service = make_service(
            sharepoint_client=mock_client,
            sharepoint_excel_path="/excel",
            local_dimensions_path="/fake/Dimensions.xlsx",
        )
        with patch.object(service, "_run_browser_session"):
            service.run()

    mock_client.download_file.assert_called_once_with(
        sharepoint_path="/excel/Dimensions.xlsx",
        local_path="/fake/Dimensions.xlsx",
    )


# ---------------------------------------------------------------------------
# run(): passes the Playwright Page into _run_browser_session
# ---------------------------------------------------------------------------


def test_run_passes_page_to_run_browser_session() -> None:
    mock_page, _, _, mock_playwright = _make_playwright_mocks()

    with _patched_service_module(mock_playwright):
        service = make_service()
        with patch.object(service, "_run_browser_session") as mock_session:
            service.run()

    mock_session.assert_called_once_with(mock_page)


# ---------------------------------------------------------------------------
# run(): releases the context and the browser even when the session fails
# ---------------------------------------------------------------------------


def test_run_closes_context_and_browser_when_session_fails() -> None:
    _, mock_context, mock_browser, mock_playwright = _make_playwright_mocks()

    with _patched_service_module(mock_playwright):
        service = make_service()
        with patch.object(
            service, "_run_browser_session", side_effect=RuntimeError("boom")
        ):
            try:
                service.run()
            except RuntimeError:
                pass
            else:
                raise AssertionError("run() should propagate session errors")

    mock_context.close.assert_called_once_with()
    mock_browser.close.assert_called_once_with()