ECMK : refactor EcmkService class 🟪

This commit is contained in:
Daniel Roth 2026-04-29 14:10:55 +00:00
parent 7cfa800c42
commit 6851a84ede
3 changed files with 598 additions and 32 deletions

View file

@ -3,11 +3,17 @@ from typing import Dict
from playwright.sync_api import Browser, BrowserContext, Locator, Page, sync_playwright from playwright.sync_api import Browser, BrowserContext, Locator, Page, sync_playwright
from backend.app.db.connection import db_session
from backend.app.db.functions.uploaded_files_functions import ( from backend.app.db.functions.uploaded_files_functions import (
get_uploaded_file_by_listing_type_and_source, get_uploaded_file_by_listing_type_and_source,
) )
from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum
from backend.ecmk_fetcher.address_list import PropertyRow, extract_addresses_from_spreadsheet from backend.documents_parser.db_writer import save_epc_property_data
from backend.documents_parser.parser import parse_site_notes_pdf
from backend.ecmk_fetcher.address_list import (
PropertyRow,
extract_addresses_from_spreadsheet,
)
from backend.ecmk_fetcher.browser import ( from backend.ecmk_fetcher.browser import (
attach_debug_listeners, attach_debug_listeners,
download_with_retry, download_with_retry,
@ -105,7 +111,9 @@ class EcmkService:
property_id: str = build_property_id(address, postcode) property_id: str = build_property_id(address, postcode)
property_row: PropertyRow | None = self._property_map.get(property_id) property_row: PropertyRow | None = self._property_map.get(
property_id
)
if not property_row: if not property_row:
continue continue
@ -146,38 +154,13 @@ class EcmkService:
) )
try: try:
if report_type == FileDownloadButtonType.RAW_XML.value: self._process_file(
with open(file_path, "r", encoding="utf-8") as f:
xml_string = f.read()
details = parse_rdsap(xml_string)
row_data = flatten_sap_property(details)
write_row(self._local_dimensions_path, row_data)
upload_excel_to_sharepoint(
client=self._sharepoint_client,
file_path=self._local_dimensions_path,
sharepoint_path=self._sharepoint_excel_path,
)
logger.info(
f"Written dimensions row and uploaded Dimensions.xlsx for {address}"
)
else:
upload_file_to_sharepoint(
client=self._sharepoint_client,
file_path=file_path,
base_path=self._sharepoint_base_path,
subpath=sharepoint_address,
)
logger.info(
f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}"
)
upload_file_to_s3_and_record(
bucket=self._s3_bucket,
file_path=file_path, file_path=file_path,
report_type=report_type,
db_file_type=db_file_type,
sharepoint_address=sharepoint_address,
hubspot_listing_id=hubspot_listing_id, hubspot_listing_id=hubspot_listing_id,
file_type=db_file_type,
) )
except Exception: except Exception:
raise raise
finally: finally:
@ -194,3 +177,81 @@ class EcmkService:
if not go_to_next_page(page): if not go_to_next_page(page):
break break
def _process_file(
self,
file_path: str,
report_type: int,
db_file_type: FileTypeEnum,
sharepoint_address: str,
hubspot_listing_id: str,
) -> None:
if report_type == FileDownloadButtonType.RAW_XML.value:
self._process_xml_file(
file_path=file_path,
db_file_type=db_file_type,
hubspot_listing_id=hubspot_listing_id,
)
else:
self._process_pdf_file(
file_path=file_path,
file_type=db_file_type,
sharepoint_address=sharepoint_address,
hubspot_listing_id=hubspot_listing_id,
)
def _process_xml_file(
self,
file_path: str,
db_file_type: FileTypeEnum,
hubspot_listing_id: str,
) -> None:
with open(file_path, "r", encoding="utf-8") as f:
xml_string: str = f.read()
details = parse_rdsap(xml_string)
row_data = flatten_sap_property(details)
write_row(self._local_dimensions_path, row_data)
upload_excel_to_sharepoint(
client=self._sharepoint_client,
file_path=self._local_dimensions_path,
sharepoint_path=self._sharepoint_excel_path,
)
upload_file_to_s3_and_record(
bucket=self._s3_bucket,
file_path=file_path,
hubspot_listing_id=hubspot_listing_id,
file_type=db_file_type,
)
def _process_pdf_file(
self,
file_path: str,
file_type: FileTypeEnum,
sharepoint_address: str,
hubspot_listing_id: str,
) -> None:
upload_file_to_sharepoint(
client=self._sharepoint_client,
file_path=file_path,
base_path=self._sharepoint_base_path,
subpath=sharepoint_address,
)
uploaded_file_id: int = upload_file_to_s3_and_record(
bucket=self._s3_bucket,
file_path=file_path,
hubspot_listing_id=hubspot_listing_id,
file_type=file_type,
)
if file_type == FileTypeEnum.ECMK_RD_SAP_SITE_NOTE:
try:
epc_data = parse_site_notes_pdf(file_path)
with db_session() as session:
save_epc_property_data(
session=session,
data=epc_data,
uploaded_file_id=uploaded_file_id,
)
except Exception:
logger.warning(
f"EPC extraction failed for {os.path.basename(file_path)} — file record retained"
)

View file

@ -1,8 +1,10 @@
from typing import Dict from typing import Dict
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, call, patch
from backend.app.db.models.uploaded_file import FileTypeEnum
from backend.ecmk_fetcher.address_list import PropertyRow from backend.ecmk_fetcher.address_list import PropertyRow
from backend.ecmk_fetcher.ecmk_service import EcmkService from backend.ecmk_fetcher.ecmk_service import EcmkService
from backend.ecmk_fetcher.reports import FileDownloadButtonType
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
@ -146,3 +148,447 @@ def test_run_passes_page_to_run_browser_session() -> None:
service.run() service.run()
mock_session.assert_called_once_with(mock_page) mock_session.assert_called_once_with(mock_page)
# ---------------------------------------------------------------------------
# _process_file: dispatches based on report_type
# ---------------------------------------------------------------------------
def test_process_file_dispatches_to_xml_for_raw_xml() -> None:
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=FAKE_PROPERTY_MAP,
):
service = make_service()
with (
patch.object(service, "_process_xml_file") as mock_xml,
patch.object(service, "_process_pdf_file") as mock_pdf,
):
service._process_file(
file_path="/tmp/file.xml",
report_type=FileDownloadButtonType.RAW_XML.value,
db_file_type=FileTypeEnum.ECMK_SURVEY_XML,
sharepoint_address="10 Fake St",
hubspot_listing_id="hs-001",
)
mock_xml.assert_called_once()
mock_pdf.assert_not_called()
def test_process_file_dispatches_to_pdf_for_non_xml() -> None:
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=FAKE_PROPERTY_MAP,
):
service = make_service()
with (
patch.object(service, "_process_xml_file") as mock_xml,
patch.object(service, "_process_pdf_file") as mock_pdf,
):
service._process_file(
file_path="/tmp/file.pdf",
report_type=FileDownloadButtonType.SITENOTE_REPORT.value,
db_file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE,
sharepoint_address="10 Fake St",
hubspot_listing_id="hs-001",
)
mock_pdf.assert_called_once()
mock_xml.assert_not_called()
# ---------------------------------------------------------------------------
# _process_xml_file: parse → flatten → write row → upload excel → S3
# ---------------------------------------------------------------------------
def test_process_xml_file_full_chain() -> None:
fake_details = MagicMock()
fake_row_data = MagicMock()
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=FAKE_PROPERTY_MAP,
):
service = make_service(
s3_bucket="my-bucket",
sharepoint_excel_path="/excel",
local_dimensions_path="/dims/Dimensions.xlsx",
)
with (
patch(
"backend.ecmk_fetcher.ecmk_service.parse_rdsap", return_value=fake_details
) as mock_parse,
patch(
"backend.ecmk_fetcher.ecmk_service.flatten_sap_property",
return_value=fake_row_data,
) as mock_flatten,
patch("backend.ecmk_fetcher.ecmk_service.write_row") as mock_write,
patch(
"backend.ecmk_fetcher.ecmk_service.upload_excel_to_sharepoint"
) as mock_upload_excel,
patch(
"backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record"
) as mock_s3,
patch(
"builtins.open",
MagicMock(return_value=MagicMock(
__enter__=lambda s: MagicMock(read=lambda: "<xml/>"),
__exit__=MagicMock(return_value=False),
)),
),
):
service._process_xml_file(
file_path="/tmp/report.xml",
db_file_type=FileTypeEnum.ECMK_SURVEY_XML,
hubspot_listing_id="hs-001",
)
mock_parse.assert_called_once()
mock_flatten.assert_called_once_with(fake_details)
mock_write.assert_called_once_with("/dims/Dimensions.xlsx", fake_row_data)
mock_upload_excel.assert_called_once_with(
client=service._sharepoint_client,
file_path="/dims/Dimensions.xlsx",
sharepoint_path="/excel",
)
mock_s3.assert_called_once_with(
bucket="my-bucket",
file_path="/tmp/report.xml",
hubspot_listing_id="hs-001",
file_type=FileTypeEnum.ECMK_SURVEY_XML,
)
# ---------------------------------------------------------------------------
# _process_pdf_file: sharepoint upload → S3 upload
# ---------------------------------------------------------------------------
def test_process_pdf_file_uploads_to_sharepoint_then_s3() -> None:
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=FAKE_PROPERTY_MAP,
):
service = make_service(
s3_bucket="my-bucket",
sharepoint_base_path="/base",
)
with (
patch(
"backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"
) as mock_sp,
patch(
"backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record",
return_value=42,
) as mock_s3,
patch("backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf"),
patch("backend.ecmk_fetcher.ecmk_service.db_session"),
):
service._process_pdf_file(
file_path="/tmp/report.pdf",
file_type=FileTypeEnum.ECMK_SITE_NOTE,
sharepoint_address="10 Fake St",
hubspot_listing_id="hs-001",
)
mock_sp.assert_called_once_with(
client=service._sharepoint_client,
file_path="/tmp/report.pdf",
base_path="/base",
subpath="10 Fake St",
)
mock_s3.assert_called_once_with(
bucket="my-bucket",
file_path="/tmp/report.pdf",
hubspot_listing_id="hs-001",
file_type=FileTypeEnum.ECMK_SITE_NOTE,
)
# ---------------------------------------------------------------------------
# _process_pdf_file: EPC extraction conditional on file_type
# ---------------------------------------------------------------------------
def test_process_pdf_file_runs_epc_extraction_for_rd_sap_site_note() -> None:
fake_epc_data = MagicMock()
fake_session = MagicMock()
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=FAKE_PROPERTY_MAP,
):
service = make_service()
with (
patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"),
patch(
"backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record",
return_value=99,
),
patch(
"backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf",
return_value=fake_epc_data,
) as mock_parse,
patch(
"backend.ecmk_fetcher.ecmk_service.save_epc_property_data"
) as mock_save,
patch(
"backend.ecmk_fetcher.ecmk_service.db_session"
) as mock_db_session,
):
mock_db_session.return_value.__enter__.return_value = fake_session
service._process_pdf_file(
file_path="/tmp/sitenote.pdf",
file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE,
sharepoint_address="10 Fake St",
hubspot_listing_id="hs-001",
)
mock_parse.assert_called_once_with("/tmp/sitenote.pdf")
mock_save.assert_called_once_with(
session=fake_session,
data=fake_epc_data,
uploaded_file_id=99,
)
def test_process_pdf_file_skips_epc_extraction_for_ecmk_site_note() -> None:
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=FAKE_PROPERTY_MAP,
):
service = make_service()
with (
patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"),
patch(
"backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record",
return_value=42,
),
patch(
"backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf"
) as mock_parse,
patch("backend.ecmk_fetcher.ecmk_service.db_session") as mock_db_session,
):
service._process_pdf_file(
file_path="/tmp/sitenote.pdf",
file_type=FileTypeEnum.ECMK_SITE_NOTE,
sharepoint_address="10 Fake St",
hubspot_listing_id="hs-001",
)
mock_parse.assert_not_called()
mock_db_session.assert_not_called()
def test_process_pdf_file_epc_uses_separate_db_session_from_s3_upload() -> None:
"""EPC db_session opens only after upload_file_to_s3_and_record returns."""
call_order: list[str] = []
def _on_s3(**_: object) -> int:
call_order.append("s3")
return 77
def _on_db_session() -> MagicMock:
call_order.append("db_session")
ctx = MagicMock()
ctx.__enter__ = MagicMock(return_value=MagicMock())
ctx.__exit__ = MagicMock(return_value=False)
return ctx
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=FAKE_PROPERTY_MAP,
):
service = make_service()
with (
patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"),
patch(
"backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record",
side_effect=_on_s3,
),
patch("backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf"),
patch("backend.ecmk_fetcher.ecmk_service.save_epc_property_data"),
patch(
"backend.ecmk_fetcher.ecmk_service.db_session",
side_effect=_on_db_session,
),
):
service._process_pdf_file(
file_path="/tmp/sitenote.pdf",
file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE,
sharepoint_address="10 Fake St",
hubspot_listing_id="hs-001",
)
assert call_order == ["s3", "db_session"]
# ---------------------------------------------------------------------------
# _process_pdf_file: EPC failures swallowed with warning
# ---------------------------------------------------------------------------
def _pdf_file_patches_for_failure() -> tuple: # type: ignore[type-arg]
return (
patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"),
patch(
"backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record",
return_value=1,
),
)
def test_process_pdf_file_parse_failure_logged_as_warning_not_raised() -> None:
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=FAKE_PROPERTY_MAP,
):
service = make_service()
sp_patch, s3_patch = _pdf_file_patches_for_failure()
with (
sp_patch,
s3_patch,
patch(
"backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf",
side_effect=ValueError("bad pdf"),
),
patch("backend.ecmk_fetcher.ecmk_service.save_epc_property_data") as mock_save,
patch("backend.ecmk_fetcher.ecmk_service.db_session"),
patch("backend.ecmk_fetcher.ecmk_service.logger") as mock_logger,
):
service._process_pdf_file(
file_path="/tmp/sitenote.pdf",
file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE,
sharepoint_address="10 Fake St",
hubspot_listing_id="hs-001",
)
mock_logger.warning.assert_called_once()
mock_save.assert_not_called()
def test_process_pdf_file_save_failure_logged_as_warning_not_raised() -> None:
fake_session = MagicMock()
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=FAKE_PROPERTY_MAP,
):
service = make_service()
sp_patch, s3_patch = _pdf_file_patches_for_failure()
with (
sp_patch,
s3_patch,
patch(
"backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf",
return_value=MagicMock(),
),
patch(
"backend.ecmk_fetcher.ecmk_service.save_epc_property_data",
side_effect=RuntimeError("db exploded"),
),
patch("backend.ecmk_fetcher.ecmk_service.db_session") as mock_db_session,
patch("backend.ecmk_fetcher.ecmk_service.logger") as mock_logger,
):
mock_db_session.return_value.__enter__.return_value = fake_session
service._process_pdf_file(
file_path="/tmp/sitenote.pdf",
file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE,
sharepoint_address="10 Fake St",
hubspot_listing_id="hs-001",
)
mock_logger.warning.assert_called_once()
# ---------------------------------------------------------------------------
# _run_browser_session: delegates file processing to _process_file
# ---------------------------------------------------------------------------
def _make_page_mock_with_one_matching_row() -> MagicMock:
cells_nth: dict[int, MagicMock] = {n: MagicMock() for n in (1, 2, 5, 7, 9)}
cells_nth[1].inner_text.return_value = "John"
cells_nth[2].inner_text.return_value = "Doe"
cells_nth[5].inner_text.return_value = "10 FAKE ST"
cells_nth[7].inner_text.return_value = "SW1A 1AA"
cells_nth[9].inner_text.return_value = "Submitted (not Lodged)"
cells_mock = MagicMock()
cells_mock.nth.side_effect = lambda n: cells_nth[n]
row_mock = MagicMock()
row_mock.locator.return_value = cells_mock
rows_mock = MagicMock()
rows_mock.count.return_value = 1
rows_mock.nth.return_value = row_mock
page = MagicMock()
page.locator.return_value = rows_mock
return page
# address "10 FAKE ST" + postcode "SW1A 1AA" → build_property_id → "10SW1A1AA"
_BROWSER_SESSION_PROPERTY_MAP: Dict[str, PropertyRow] = {
"10SW1A1AA": PropertyRow(
row_index=2, address="10 Fake St SW1A 1AA", listing_id="12345"
)
}
def test_run_browser_session_calls_process_file_for_downloaded_file() -> None:
mock_page = _make_page_mock_with_one_matching_row()
with patch(
"backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet",
return_value=_BROWSER_SESSION_PROPERTY_MAP,
):
service = make_service()
with (
patch("backend.ecmk_fetcher.ecmk_service.attach_debug_listeners"),
patch("backend.ecmk_fetcher.ecmk_service.login"),
patch("backend.ecmk_fetcher.ecmk_service.go_to_assessments"),
patch("backend.ecmk_fetcher.ecmk_service.go_to_assessment_details"),
patch("backend.ecmk_fetcher.ecmk_service.go_to_next_page", return_value=False),
patch(
"backend.ecmk_fetcher.ecmk_service.get_uploaded_file_by_listing_type_and_source",
return_value=None,
),
patch(
"backend.ecmk_fetcher.ecmk_service.download_with_retry",
return_value="/tmp/fake.pdf",
),
patch(
"backend.ecmk_fetcher.ecmk_service.map_report_type_to_db_file_type",
return_value=FileTypeEnum.ECMK_SITE_NOTE,
),
patch(
"backend.ecmk_fetcher.ecmk_service.REPORT_TYPES",
[FileDownloadButtonType.SITENOTE_REPORT.value],
),
patch.object(service, "_process_file") as mock_process_file,
patch("os.path.exists", return_value=False),
):
service._run_browser_session(mock_page)
mock_process_file.assert_called_once_with(
file_path="/tmp/fake.pdf",
report_type=FileDownloadButtonType.SITENOTE_REPORT.value,
db_file_type=FileTypeEnum.ECMK_SITE_NOTE,
sharepoint_address="10 Fake St SW1A 1AA",
hubspot_listing_id="12345",
)

View file

@ -0,0 +1,59 @@
from unittest.mock import MagicMock, patch
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
def test_handler_constructs_ecmk_service_and_calls_run() -> None:
mock_service = MagicMock()
mock_service_cls = MagicMock(return_value=mock_service)
with (
patch(
"backend.ecmk_fetcher.handler.handler.EcmkService",
mock_service_cls,
),
patch(
"backend.ecmk_fetcher.handler.handler.DomnaSharepointClient",
return_value=MagicMock(spec=DomnaSharepointClient),
),
):
from backend.ecmk_fetcher.handler.handler import handler
handler({}, None)
mock_service_cls.assert_called_once()
mock_service.run.assert_called_once()
def test_handler_passes_correct_config_to_ecmk_service() -> None:
mock_service = MagicMock()
mock_service_cls = MagicMock(return_value=mock_service)
with (
patch(
"backend.ecmk_fetcher.handler.handler.EcmkService",
mock_service_cls,
),
patch(
"backend.ecmk_fetcher.handler.handler.DomnaSharepointClient",
return_value=MagicMock(spec=DomnaSharepointClient),
),
):
from backend.ecmk_fetcher.handler.handler import handler
handler({}, None)
_, kwargs = mock_service_cls.call_args
assert kwargs["s3_bucket"] == "retrofit-energy-assessments-dev"
assert (
kwargs["sharepoint_base_path"]
== "/Projects/Southern Housing/SH-SURV-26-001/Assessments"
)
assert (
kwargs["sharepoint_excel_path"]
== "/Projects/Southern Housing/SH-SURV-26-001/Modelling"
)
assert kwargs["property_list_filepath"].endswith(
"hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx"
)
assert kwargs["local_dimensions_path"].endswith("Dimensions.xlsx")