diff --git a/backend/ecmk_fetcher/ecmk_service.py b/backend/ecmk_fetcher/ecmk_service.py index 138d0b28..35b8f552 100644 --- a/backend/ecmk_fetcher/ecmk_service.py +++ b/backend/ecmk_fetcher/ecmk_service.py @@ -3,11 +3,17 @@ from typing import Dict from playwright.sync_api import Browser, BrowserContext, Locator, Page, sync_playwright +from backend.app.db.connection import db_session from backend.app.db.functions.uploaded_files_functions import ( get_uploaded_file_by_listing_type_and_source, ) from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum -from backend.ecmk_fetcher.address_list import PropertyRow, extract_addresses_from_spreadsheet +from backend.documents_parser.db_writer import save_epc_property_data +from backend.documents_parser.parser import parse_site_notes_pdf +from backend.ecmk_fetcher.address_list import ( + PropertyRow, + extract_addresses_from_spreadsheet, +) from backend.ecmk_fetcher.browser import ( attach_debug_listeners, download_with_retry, @@ -105,7 +111,9 @@ class EcmkService: property_id: str = build_property_id(address, postcode) - property_row: PropertyRow | None = self._property_map.get(property_id) + property_row: PropertyRow | None = self._property_map.get( + property_id + ) if not property_row: continue @@ -146,38 +154,13 @@ class EcmkService: ) try: - if report_type == FileDownloadButtonType.RAW_XML.value: - with open(file_path, "r", encoding="utf-8") as f: - xml_string = f.read() - details = parse_rdsap(xml_string) - row_data = flatten_sap_property(details) - write_row(self._local_dimensions_path, row_data) - upload_excel_to_sharepoint( - client=self._sharepoint_client, - file_path=self._local_dimensions_path, - sharepoint_path=self._sharepoint_excel_path, - ) - logger.info( - f"Written dimensions row and uploaded Dimensions.xlsx for {address}" - ) - else: - upload_file_to_sharepoint( - client=self._sharepoint_client, - file_path=file_path, - base_path=self._sharepoint_base_path, - subpath=sharepoint_address, - ) - logger.info( - f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" - ) - - upload_file_to_s3_and_record( - bucket=self._s3_bucket, + self._process_file( file_path=file_path, + report_type=report_type, + db_file_type=db_file_type, + sharepoint_address=sharepoint_address, hubspot_listing_id=hubspot_listing_id, - file_type=db_file_type, ) - except Exception: raise finally: @@ -194,3 +177,81 @@ class EcmkService: if not go_to_next_page(page): break + + def _process_file( + self, + file_path: str, + report_type: int, + db_file_type: FileTypeEnum, + sharepoint_address: str, + hubspot_listing_id: str, + ) -> None: + if report_type == FileDownloadButtonType.RAW_XML.value: + self._process_xml_file( + file_path=file_path, + db_file_type=db_file_type, + hubspot_listing_id=hubspot_listing_id, + ) + else: + self._process_pdf_file( + file_path=file_path, + file_type=db_file_type, + sharepoint_address=sharepoint_address, + hubspot_listing_id=hubspot_listing_id, + ) + + def _process_xml_file( + self, + file_path: str, + db_file_type: FileTypeEnum, + hubspot_listing_id: str, + ) -> None: + with open(file_path, "r", encoding="utf-8") as f: + xml_string: str = f.read() + details = parse_rdsap(xml_string) + row_data = flatten_sap_property(details) + write_row(self._local_dimensions_path, row_data) + upload_excel_to_sharepoint( + client=self._sharepoint_client, + file_path=self._local_dimensions_path, + sharepoint_path=self._sharepoint_excel_path, + ) + upload_file_to_s3_and_record( + bucket=self._s3_bucket, + file_path=file_path, + hubspot_listing_id=hubspot_listing_id, + file_type=db_file_type, + ) + + def _process_pdf_file( + self, + file_path: str, + file_type: FileTypeEnum, + sharepoint_address: str, + hubspot_listing_id: str, + ) -> None: + upload_file_to_sharepoint( + client=self._sharepoint_client, + file_path=file_path, + base_path=self._sharepoint_base_path, + subpath=sharepoint_address, + ) + uploaded_file_id: int = upload_file_to_s3_and_record( + bucket=self._s3_bucket, + file_path=file_path, + hubspot_listing_id=hubspot_listing_id, + file_type=file_type, + ) + if file_type == FileTypeEnum.ECMK_RD_SAP_SITE_NOTE: + try: + epc_data = parse_site_notes_pdf(file_path) + with db_session() as session: + save_epc_property_data( + session=session, + data=epc_data, + uploaded_file_id=uploaded_file_id, + ) + except Exception: + logger.warning( + f"EPC extraction failed for {os.path.basename(file_path)} — file record retained" + ) diff --git a/backend/ecmk_fetcher/tests/test_ecmk_service.py b/backend/ecmk_fetcher/tests/test_ecmk_service.py index c2694489..703bc4c5 100644 --- a/backend/ecmk_fetcher/tests/test_ecmk_service.py +++ b/backend/ecmk_fetcher/tests/test_ecmk_service.py @@ -1,8 +1,10 @@ from typing import Dict -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch +from backend.app.db.models.uploaded_file import FileTypeEnum from backend.ecmk_fetcher.address_list import PropertyRow from backend.ecmk_fetcher.ecmk_service import EcmkService +from backend.ecmk_fetcher.reports import FileDownloadButtonType from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -146,3 +148,447 @@ def test_run_passes_page_to_run_browser_session() -> None: service.run() mock_session.assert_called_once_with(mock_page) + + +# --------------------------------------------------------------------------- +# _process_file: dispatches based on report_type +# --------------------------------------------------------------------------- + + +def test_process_file_dispatches_to_xml_for_raw_xml() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch.object(service, "_process_xml_file") as mock_xml, + patch.object(service, "_process_pdf_file") as mock_pdf, + ): + service._process_file( + file_path="/tmp/file.xml", + report_type=FileDownloadButtonType.RAW_XML.value, + db_file_type=FileTypeEnum.ECMK_SURVEY_XML, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_xml.assert_called_once() + mock_pdf.assert_not_called() + + +def test_process_file_dispatches_to_pdf_for_non_xml() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch.object(service, "_process_xml_file") as mock_xml, + patch.object(service, "_process_pdf_file") as mock_pdf, + ): + service._process_file( + file_path="/tmp/file.pdf", + report_type=FileDownloadButtonType.SITENOTE_REPORT.value, + db_file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_pdf.assert_called_once() + mock_xml.assert_not_called() + + +# --------------------------------------------------------------------------- +# _process_xml_file: parse → flatten → write row → upload excel → S3 +# --------------------------------------------------------------------------- + + +def test_process_xml_file_full_chain() -> None: + fake_details = MagicMock() + fake_row_data = MagicMock() + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service( + s3_bucket="my-bucket", + sharepoint_excel_path="/excel", + local_dimensions_path="/dims/Dimensions.xlsx", + ) + + with ( + patch( + "backend.ecmk_fetcher.ecmk_service.parse_rdsap", return_value=fake_details + ) as mock_parse, + patch( + "backend.ecmk_fetcher.ecmk_service.flatten_sap_property", + return_value=fake_row_data, + ) as mock_flatten, + patch("backend.ecmk_fetcher.ecmk_service.write_row") as mock_write, + patch( + "backend.ecmk_fetcher.ecmk_service.upload_excel_to_sharepoint" + ) as mock_upload_excel, + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record" + ) as mock_s3, + patch( + "builtins.open", + MagicMock(return_value=MagicMock( + __enter__=lambda s: MagicMock(read=lambda: ""), + __exit__=MagicMock(return_value=False), + )), + ), + ): + service._process_xml_file( + file_path="/tmp/report.xml", + db_file_type=FileTypeEnum.ECMK_SURVEY_XML, + hubspot_listing_id="hs-001", + ) + + mock_parse.assert_called_once() + mock_flatten.assert_called_once_with(fake_details) + mock_write.assert_called_once_with("/dims/Dimensions.xlsx", fake_row_data) + mock_upload_excel.assert_called_once_with( + client=service._sharepoint_client, + file_path="/dims/Dimensions.xlsx", + sharepoint_path="/excel", + ) + mock_s3.assert_called_once_with( + bucket="my-bucket", + file_path="/tmp/report.xml", + hubspot_listing_id="hs-001", + file_type=FileTypeEnum.ECMK_SURVEY_XML, + ) + + +# --------------------------------------------------------------------------- +# _process_pdf_file: sharepoint upload → S3 upload +# --------------------------------------------------------------------------- + + +def test_process_pdf_file_uploads_to_sharepoint_then_s3() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service( + s3_bucket="my-bucket", + sharepoint_base_path="/base", + ) + + with ( + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint" + ) as mock_sp, + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + return_value=42, + ) as mock_s3, + patch("backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf"), + patch("backend.ecmk_fetcher.ecmk_service.db_session"), + ): + service._process_pdf_file( + file_path="/tmp/report.pdf", + file_type=FileTypeEnum.ECMK_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_sp.assert_called_once_with( + client=service._sharepoint_client, + file_path="/tmp/report.pdf", + base_path="/base", + subpath="10 Fake St", + ) + mock_s3.assert_called_once_with( + bucket="my-bucket", + file_path="/tmp/report.pdf", + hubspot_listing_id="hs-001", + file_type=FileTypeEnum.ECMK_SITE_NOTE, + ) + + +# --------------------------------------------------------------------------- +# _process_pdf_file: EPC extraction conditional on file_type +# --------------------------------------------------------------------------- + + +def test_process_pdf_file_runs_epc_extraction_for_rd_sap_site_note() -> None: + fake_epc_data = MagicMock() + fake_session = MagicMock() + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"), + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + return_value=99, + ), + patch( + "backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf", + return_value=fake_epc_data, + ) as mock_parse, + patch( + "backend.ecmk_fetcher.ecmk_service.save_epc_property_data" + ) as mock_save, + patch( + "backend.ecmk_fetcher.ecmk_service.db_session" + ) as mock_db_session, + ): + mock_db_session.return_value.__enter__.return_value = fake_session + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_parse.assert_called_once_with("/tmp/sitenote.pdf") + mock_save.assert_called_once_with( + session=fake_session, + data=fake_epc_data, + uploaded_file_id=99, + ) + + +def test_process_pdf_file_skips_epc_extraction_for_ecmk_site_note() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"), + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + return_value=42, + ), + patch( + "backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf" + ) as mock_parse, + patch("backend.ecmk_fetcher.ecmk_service.db_session") as mock_db_session, + ): + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_parse.assert_not_called() + mock_db_session.assert_not_called() + + +def test_process_pdf_file_epc_uses_separate_db_session_from_s3_upload() -> None: + """EPC db_session opens only after upload_file_to_s3_and_record returns.""" + call_order: list[str] = [] + + def _on_s3(**_: object) -> int: + call_order.append("s3") + return 77 + + def _on_db_session() -> MagicMock: + call_order.append("db_session") + ctx = MagicMock() + ctx.__enter__ = MagicMock(return_value=MagicMock()) + ctx.__exit__ = MagicMock(return_value=False) + return ctx + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"), + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + side_effect=_on_s3, + ), + patch("backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf"), + patch("backend.ecmk_fetcher.ecmk_service.save_epc_property_data"), + patch( + "backend.ecmk_fetcher.ecmk_service.db_session", + side_effect=_on_db_session, + ), + ): + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + assert call_order == ["s3", "db_session"] + + +# --------------------------------------------------------------------------- +# _process_pdf_file: EPC failures swallowed with warning +# --------------------------------------------------------------------------- + + +def _pdf_file_patches_for_failure() -> tuple: # type: ignore[type-arg] + return ( + patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"), + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + return_value=1, + ), + ) + + +def test_process_pdf_file_parse_failure_logged_as_warning_not_raised() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + sp_patch, s3_patch = _pdf_file_patches_for_failure() + with ( + sp_patch, + s3_patch, + patch( + "backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf", + side_effect=ValueError("bad pdf"), + ), + patch("backend.ecmk_fetcher.ecmk_service.save_epc_property_data") as mock_save, + patch("backend.ecmk_fetcher.ecmk_service.db_session"), + patch("backend.ecmk_fetcher.ecmk_service.logger") as mock_logger, + ): + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_logger.warning.assert_called_once() + mock_save.assert_not_called() + + +def test_process_pdf_file_save_failure_logged_as_warning_not_raised() -> None: + fake_session = MagicMock() + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + sp_patch, s3_patch = _pdf_file_patches_for_failure() + with ( + sp_patch, + s3_patch, + patch( + "backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf", + return_value=MagicMock(), + ), + patch( + "backend.ecmk_fetcher.ecmk_service.save_epc_property_data", + side_effect=RuntimeError("db exploded"), + ), + patch("backend.ecmk_fetcher.ecmk_service.db_session") as mock_db_session, + patch("backend.ecmk_fetcher.ecmk_service.logger") as mock_logger, + ): + mock_db_session.return_value.__enter__.return_value = fake_session + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_logger.warning.assert_called_once() + + +# --------------------------------------------------------------------------- +# _run_browser_session: delegates file processing to _process_file +# --------------------------------------------------------------------------- + + +def _make_page_mock_with_one_matching_row() -> MagicMock: + cells_nth: dict[int, MagicMock] = {n: MagicMock() for n in (1, 2, 5, 7, 9)} + cells_nth[1].inner_text.return_value = "John" + cells_nth[2].inner_text.return_value = "Doe" + cells_nth[5].inner_text.return_value = "10 FAKE ST" + cells_nth[7].inner_text.return_value = "SW1A 1AA" + cells_nth[9].inner_text.return_value = "Submitted (not Lodged)" + + cells_mock = MagicMock() + cells_mock.nth.side_effect = lambda n: cells_nth[n] + + row_mock = MagicMock() + row_mock.locator.return_value = cells_mock + + rows_mock = MagicMock() + rows_mock.count.return_value = 1 + rows_mock.nth.return_value = row_mock + + page = MagicMock() + page.locator.return_value = rows_mock + return page + + +# address "10 FAKE ST" + postcode "SW1A 1AA" → build_property_id → "10SW1A1AA" +_BROWSER_SESSION_PROPERTY_MAP: Dict[str, PropertyRow] = { + "10SW1A1AA": PropertyRow( + row_index=2, address="10 Fake St SW1A 1AA", listing_id="12345" + ) +} + + +def test_run_browser_session_calls_process_file_for_downloaded_file() -> None: + mock_page = _make_page_mock_with_one_matching_row() + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=_BROWSER_SESSION_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch("backend.ecmk_fetcher.ecmk_service.attach_debug_listeners"), + patch("backend.ecmk_fetcher.ecmk_service.login"), + patch("backend.ecmk_fetcher.ecmk_service.go_to_assessments"), + patch("backend.ecmk_fetcher.ecmk_service.go_to_assessment_details"), + patch("backend.ecmk_fetcher.ecmk_service.go_to_next_page", return_value=False), + patch( + "backend.ecmk_fetcher.ecmk_service.get_uploaded_file_by_listing_type_and_source", + return_value=None, + ), + patch( + "backend.ecmk_fetcher.ecmk_service.download_with_retry", + return_value="/tmp/fake.pdf", + ), + patch( + "backend.ecmk_fetcher.ecmk_service.map_report_type_to_db_file_type", + return_value=FileTypeEnum.ECMK_SITE_NOTE, + ), + patch( + "backend.ecmk_fetcher.ecmk_service.REPORT_TYPES", + [FileDownloadButtonType.SITENOTE_REPORT.value], + ), + patch.object(service, "_process_file") as mock_process_file, + patch("os.path.exists", return_value=False), + ): + service._run_browser_session(mock_page) + + mock_process_file.assert_called_once_with( + file_path="/tmp/fake.pdf", + report_type=FileDownloadButtonType.SITENOTE_REPORT.value, + db_file_type=FileTypeEnum.ECMK_SITE_NOTE, + sharepoint_address="10 Fake St SW1A 1AA", + hubspot_listing_id="12345", + ) diff --git a/backend/ecmk_fetcher/tests/test_handler.py b/backend/ecmk_fetcher/tests/test_handler.py new file mode 100644 index 00000000..9de97253 --- /dev/null +++ b/backend/ecmk_fetcher/tests/test_handler.py @@ -0,0 +1,59 @@ +from unittest.mock import MagicMock, patch + +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient + + +def test_handler_constructs_ecmk_service_and_calls_run() -> None: + mock_service = MagicMock() + mock_service_cls = MagicMock(return_value=mock_service) + + with ( + patch( + "backend.ecmk_fetcher.handler.handler.EcmkService", + mock_service_cls, + ), + patch( + "backend.ecmk_fetcher.handler.handler.DomnaSharepointClient", + return_value=MagicMock(spec=DomnaSharepointClient), + ), + ): + from backend.ecmk_fetcher.handler.handler import handler + + handler({}, None) + + mock_service_cls.assert_called_once() + mock_service.run.assert_called_once() + + +def test_handler_passes_correct_config_to_ecmk_service() -> None: + mock_service = MagicMock() + mock_service_cls = MagicMock(return_value=mock_service) + + with ( + patch( + "backend.ecmk_fetcher.handler.handler.EcmkService", + mock_service_cls, + ), + patch( + "backend.ecmk_fetcher.handler.handler.DomnaSharepointClient", + return_value=MagicMock(spec=DomnaSharepointClient), + ), + ): + from backend.ecmk_fetcher.handler.handler import handler + + handler({}, None) + + _, kwargs = mock_service_cls.call_args + assert kwargs["s3_bucket"] == "retrofit-energy-assessments-dev" + assert ( + kwargs["sharepoint_base_path"] + == "/Projects/Southern Housing/SH-SURV-26-001/Assessments" + ) + assert ( + kwargs["sharepoint_excel_path"] + == "/Projects/Southern Housing/SH-SURV-26-001/Modelling" + ) + assert kwargs["property_list_filepath"].endswith( + "hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx" + ) + assert kwargs["local_dimensions_path"].endswith("Dimensions.xlsx")