From 1bc8343738a39bc2ab0d471cb537b770b4e82e85 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 28 Apr 2026 16:33:53 +0000 Subject: [PATCH 01/10] identify site notes files after file upload so we can save them to epc_property table --- backend/app/db/models/epc_property.py | 1 + backend/pashub_fetcher/handler/handler.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/backend/app/db/models/epc_property.py b/backend/app/db/models/epc_property.py index 2a47d57d..50523fbb 100644 --- a/backend/app/db/models/epc_property.py +++ b/backend/app/db/models/epc_property.py @@ -20,6 +20,7 @@ class EpcPropertyModel(SQLModel, table=True): id: Optional[int] = Field(default=None, primary_key=True) property_id: Optional[int] = Field(default=None) portfolio_id: Optional[int] = Field(default=None) + uploaded_file_id: Optional[int] = Field(default=None) # Identity / admin uprn: Optional[int] = Field(default=None) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 01f1a881..fc1f4f80 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -8,6 +8,7 @@ from backend.app.config import get_settings from backend.app.db.connection import db_session from backend.app.db.models.uploaded_file import ( FileSourceEnum, + FileTypeEnum, UploadedFile, ) from backend.pashub_fetcher.core_files import infer_file_type @@ -141,8 +142,22 @@ def upload_job_to_s3_and_update_db( with db_session() as session: session.add_all(uploaded_files) + + # Ensure IDs are generated + session.flush() + + results = [ + {"file": file.s3_file_key, "type": file.file_type, "id": file.id} + for file in uploaded_files + ] + session.commit() + for result in results: + if FileTypeEnum(result["type"]) == FileTypeEnum.RD_SAP_SITE_NOTE: + # upload site notes to epc_property table + continue + pass From 252657a37423543f6d630f0a08206a729c64eaca Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 29 Apr 2026 09:55:14 +0000 Subject: [PATCH 02/10] include updating epc_property_data to pashub to ara workflow --- backend/documents_parser/db_writer.py | 76 +++++++++++++++++++++++ backend/documents_parser/parser.py | 13 ++++ backend/pashub_fetcher/handler/handler.py | 50 +++++++-------- 3 files changed, 114 insertions(+), 25 deletions(-) create mode 100644 backend/documents_parser/db_writer.py create mode 100644 backend/documents_parser/parser.py diff --git a/backend/documents_parser/db_writer.py b/backend/documents_parser/db_writer.py new file mode 100644 index 00000000..2039aabe --- /dev/null +++ b/backend/documents_parser/db_writer.py @@ -0,0 +1,76 @@ +from typing import Optional + +from sqlmodel import Session + +from backend.app.db.models.epc_property import ( + EpcBuildingPartModel, + EpcEnergyElementModel, + EpcFlatDetailsModel, + EpcFloorDimensionModel, + EpcMainHeatingDetailModel, + EpcPropertyEnergyPerformanceModel, + EpcPropertyModel, + EpcWindowModel, +) +from datatypes.epc.domain.epc_property_data import EpcPropertyData + + +def save_epc_property_data( + session: Session, + data: EpcPropertyData, + uploaded_file_id: Optional[int] = None, + property_id: Optional[int] = None, + portfolio_id: Optional[int] = None, +) -> EpcPropertyModel: + epc_prop = EpcPropertyModel.from_epc_property_data( + data, property_id=property_id, portfolio_id=portfolio_id + ) + epc_prop.uploaded_file_id = uploaded_file_id + session.add(epc_prop) + session.flush() + assert epc_prop.id is not None + epc_property_id: int = epc_prop.id + + session.add( + EpcPropertyEnergyPerformanceModel.from_epc_property_data( + data, epc_property_id=epc_property_id + ) + ) + + for detail in data.sap_heating.main_heating_details: + session.add(EpcMainHeatingDetailModel.from_domain(detail, epc_property_id)) + + for part in data.sap_building_parts: + bp = EpcBuildingPartModel.from_domain(part, epc_property_id) + session.add(bp) + session.flush() + assert bp.id is not None + for dim in part.sap_floor_dimensions: + session.add(EpcFloorDimensionModel.from_domain(dim, bp.id)) + + for window in data.sap_windows: + session.add(EpcWindowModel.from_domain(window, epc_property_id)) + + for el in data.roofs: + session.add(EpcEnergyElementModel.from_domain(el, "roof", epc_property_id)) + for el in data.walls: + session.add(EpcEnergyElementModel.from_domain(el, "wall", epc_property_id)) + for el in data.floors: + session.add(EpcEnergyElementModel.from_domain(el, "floor", epc_property_id)) + for el in data.main_heating: + session.add(EpcEnergyElementModel.from_domain(el, "main_heating", epc_property_id)) + + for el, etype in [ + (data.window, "window"), + (data.lighting, "lighting"), + (data.hot_water, "hot_water"), + (data.secondary_heating, "secondary_heating"), + (data.main_heating_controls, "main_heating_controls"), + ]: + if el is not None: + session.add(EpcEnergyElementModel.from_domain(el, etype, epc_property_id)) + + if data.sap_flat_details is not None: + session.add(EpcFlatDetailsModel.from_domain(data.sap_flat_details, epc_property_id)) + + return epc_prop diff --git a/backend/documents_parser/parser.py b/backend/documents_parser/parser.py new file mode 100644 index 00000000..0f6760d7 --- /dev/null +++ b/backend/documents_parser/parser.py @@ -0,0 +1,13 @@ +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.domain.mapper import EpcPropertyDataMapper + +from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor +from backend.documents_parser.pdf import pdf_to_text_list + + +def parse_pashub_site_notes(file_path: str) -> EpcPropertyData: + with open(file_path, "rb") as f: + pdf_bytes = f.read() + tokens = pdf_to_text_list(pdf_bytes) + site_notes = PasHubRdSapSiteNotesExtractor(tokens).extract() + return EpcPropertyDataMapper.from_site_notes(site_notes) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index fc1f4f80..b9df216e 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,7 +1,7 @@ from datetime import datetime, timezone import os import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple, cast from openpyxl import load_workbook from backend.app.config import get_settings @@ -11,7 +11,10 @@ from backend.app.db.models.uploaded_file import ( FileTypeEnum, UploadedFile, ) +from backend.documents_parser.db_writer import save_epc_property_data +from backend.documents_parser.parser import parse_pashub_site_notes from backend.pashub_fetcher.core_files import infer_file_type +from datatypes.epc.domain.epc_property_data import EpcPropertyData from backend.pashub_fetcher.job import Job from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError @@ -119,6 +122,7 @@ def upload_job_to_s3_and_update_db( ) uploaded_files: List[UploadedFile] = [] + site_notes_pairs: List[Tuple[UploadedFile, EpcPropertyData]] = [] for file_path in job_files: filename = os.path.basename(file_path) @@ -126,40 +130,36 @@ def upload_job_to_s3_and_update_db( upload_file_to_s3(file_path, bucket, file_key) - # load row to db # TODO: use same upload_file_to_s3_and_update_db method as ecmk fetcher does - uploaded_files.append( - UploadedFile( - s3_file_bucket=bucket, - s3_file_key=file_key, - s3_upload_timestamp=datetime.now(timezone.utc), - uprn=int(uprn) if uprn else None, - hubspot_deal_id=hubspot_deal_id, - file_source=FileSourceEnum.PAS_HUB.value, - file_type=infer_file_type(filename), - ) + uploaded_file = UploadedFile( + s3_file_bucket=bucket, + s3_file_key=file_key, + s3_upload_timestamp=datetime.now(timezone.utc), + uprn=int(uprn) if uprn else None, + hubspot_deal_id=hubspot_deal_id, + file_source=FileSourceEnum.PAS_HUB.value, + file_type=infer_file_type(filename), ) + uploaded_files.append(uploaded_file) + + file_type: Optional[str] = cast(Optional[str], uploaded_file.file_type) + if file_type is not None and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE: + try: + site_notes_pairs.append((uploaded_file, parse_pashub_site_notes(file_path))) + except Exception: + logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) with db_session() as session: session.add_all(uploaded_files) - - # Ensure IDs are generated session.flush() - results = [ - {"file": file.s3_file_key, "type": file.file_type, "id": file.id} - for file in uploaded_files - ] + for uploaded_file, epc_data in site_notes_pairs: + save_epc_property_data( + session, epc_data, uploaded_file_id=cast(int, uploaded_file.id) + ) session.commit() - for result in results: - if FileTypeEnum(result["type"]) == FileTypeEnum.RD_SAP_SITE_NOTE: - # upload site notes to epc_property table - continue - - pass - def process_job( job: PashubToAraTriggerRequest, From 272837d6ef42866e1fad432a1618792b8fede7db Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 29 Apr 2026 10:04:55 +0000 Subject: [PATCH 03/10] move extraction from spreadsheet to dedicated file --- backend/pashub_fetcher/handler/handler.py | 54 ++++------------------- backend/pashub_fetcher/spreadsheet.py | 43 ++++++++++++++++++ 2 files changed, 51 insertions(+), 46 deletions(-) create mode 100644 backend/pashub_fetcher/spreadsheet.py diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index b9df216e..a74f9a2d 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,8 +1,6 @@ from datetime import datetime, timezone import os -import re from typing import Any, Dict, List, Optional, Tuple, cast -from openpyxl import load_workbook from backend.app.config import get_settings from backend.app.db.connection import db_session @@ -14,9 +12,6 @@ from backend.app.db.models.uploaded_file import ( from backend.documents_parser.db_writer import save_epc_property_data from backend.documents_parser.parser import parse_pashub_site_notes from backend.pashub_fetcher.core_files import infer_file_type -from datatypes.epc.domain.epc_property_data import EpcPropertyData - -from backend.pashub_fetcher.job import Job from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( PashubToAraTriggerRequest, @@ -24,6 +19,7 @@ from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders from backend.pashub_fetcher.token_getter import get_token_from_local_storage from backend.utils.subtasks import task_handler +from datatypes.epc.domain.epc_property_data import EpcPropertyData from utils.logger import setup_logger from utils.s3 import upload_file_to_s3 from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -33,45 +29,6 @@ from utils.sharepoint.domna_sites import DomnaSites logger = setup_logger() -def extract_jobs(filepath: str) -> List[Job]: - wb = load_workbook(filepath, data_only=True) - # ws = wb["watford warm homes (wave 3) mai"] - ws = wb["filtered"] - - HEADER_ROW = 3 - - headers: Dict[str, int] = {} - for col in range(1, ws.max_column + 1): - value = str(ws.cell(row=HEADER_ROW, column=col).value) - if value: - headers[value.strip()] = col - - name_col = headers["Name"] - # link_col = headers["Pashub Link"] - link_col = headers["PasHub Link"] - - jobs: List[Job] = [] - - for row in range(HEADER_ROW + 1, ws.max_row + 1): - name = ws.cell(row=row, column=name_col).value - link = ws.cell(row=row, column=link_col).value - - if not name or not link: - continue - - match = re.search(r"/jobs/([0-9a-fA-F\-]+)/", str(link)) - if not match: - continue - - jobs.append( - { - "id": match.group(1), - "address": str(name), - } - ) - - return jobs - def get_pashub_client(email: str, password: str) -> PashubClient: token = get_token_from_local_storage(email, password) @@ -143,9 +100,14 @@ def upload_job_to_s3_and_update_db( uploaded_files.append(uploaded_file) file_type: Optional[str] = cast(Optional[str], uploaded_file.file_type) - if file_type is not None and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE: + if ( + file_type is not None + and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE + ): try: - site_notes_pairs.append((uploaded_file, parse_pashub_site_notes(file_path))) + site_notes_pairs.append( + (uploaded_file, parse_pashub_site_notes(file_path)) + ) except Exception: logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) diff --git a/backend/pashub_fetcher/spreadsheet.py b/backend/pashub_fetcher/spreadsheet.py new file mode 100644 index 00000000..5f8f74f6 --- /dev/null +++ b/backend/pashub_fetcher/spreadsheet.py @@ -0,0 +1,43 @@ +import re +from typing import Dict, List +from openpyxl import load_workbook + +from backend.pashub_fetcher.job import Job + + +def extract_jobs(filepath: str) -> List[Job]: + wb = load_workbook(filepath, data_only=True) + ws = wb["filtered"] + + HEADER_ROW = 3 + + headers: Dict[str, int] = {} + for col in range(1, ws.max_column + 1): + value = str(ws.cell(row=HEADER_ROW, column=col).value) + if value: + headers[value.strip()] = col + + name_col = headers["Name"] + link_col = headers["PasHub Link"] + + jobs: List[Job] = [] + + for row in range(HEADER_ROW + 1, ws.max_row + 1): + name = ws.cell(row=row, column=name_col).value + link = ws.cell(row=row, column=link_col).value + + if not name or not link: + continue + + match = re.search(r"/jobs/([0-9a-fA-F\-]+)/", str(link)) + if not match: + continue + + jobs.append( + { + "id": match.group(1), + "address": str(name), + } + ) + + return jobs From b347039b80de9880902b8dccf6ad08648319ea24 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 29 Apr 2026 11:20:47 +0000 Subject: [PATCH 04/10] load ecmk site notes to db --- backend/documents_parser/parser.py | 19 +++++++++++++++++-- backend/ecmk_fetcher/upload.py | 20 ++++++++++++++++++-- backend/pashub_fetcher/handler/handler.py | 4 ++-- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/backend/documents_parser/parser.py b/backend/documents_parser/parser.py index 0f6760d7..cff21e0e 100644 --- a/backend/documents_parser/parser.py +++ b/backend/documents_parser/parser.py @@ -1,13 +1,28 @@ +from typing import List + from datatypes.epc.domain.epc_property_data import EpcPropertyData from datatypes.epc.domain.mapper import EpcPropertyDataMapper +from backend.documents_parser.elmhurst_extractor import ElmhurstSiteNotesExtractor from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor -from backend.documents_parser.pdf import pdf_to_text_list +from backend.documents_parser.pdf import pdf_to_pages, pdf_to_text_list -def parse_pashub_site_notes(file_path: str) -> EpcPropertyData: +def parse_site_notes_pdf(file_path: str) -> EpcPropertyData: with open(file_path, "rb") as f: pdf_bytes = f.read() + pages = pdf_to_pages(pdf_bytes) + if "Elmhurst Energy Systems" in "\n".join(pages): + return _parse_elmhurst(pages) + return _parse_pashub(pdf_bytes) + + +def _parse_elmhurst(pages: List[str]) -> EpcPropertyData: + site_notes = ElmhurstSiteNotesExtractor(pages).extract() + return EpcPropertyDataMapper.from_elmhurst_site_notes(site_notes) + + +def _parse_pashub(pdf_bytes: bytes) -> EpcPropertyData: tokens = pdf_to_text_list(pdf_bytes) site_notes = PasHubRdSapSiteNotesExtractor(tokens).extract() return EpcPropertyDataMapper.from_site_notes(site_notes) diff --git a/backend/ecmk_fetcher/upload.py b/backend/ecmk_fetcher/upload.py index cc2c908d..edfaf9f1 100644 --- a/backend/ecmk_fetcher/upload.py +++ b/backend/ecmk_fetcher/upload.py @@ -1,5 +1,6 @@ from datetime import datetime, timezone import os +from typing import cast from backend.app.db.connection import db_session from backend.app.db.models.uploaded_file import ( @@ -7,9 +8,14 @@ from backend.app.db.models.uploaded_file import ( FileTypeEnum, UploadedFile, ) +from backend.documents_parser.db_writer import save_epc_property_data +from backend.documents_parser.parser import parse_site_notes_pdf +from utils.logger import setup_logger from utils.s3 import upload_file_to_s3 from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient +logger = setup_logger() + def upload_file_to_sharepoint( client: DomnaSharepointClient, @@ -43,7 +49,7 @@ def upload_excel_to_sharepoint( # TODO: this should be moved to somewhere common and called by pashub fetcher def upload_file_to_s3_and_update_db( bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum -) -> None: +) -> int: filename: str = os.path.basename(file_path) key: str = f"documents/hubspot_listing_id/{hubspot_listing_id}/{filename}" @@ -61,4 +67,14 @@ def upload_file_to_s3_and_update_db( with db_session() as session: # TODO: we should do multiple files at once to reduce db trips session.add(uploaded_file) - session.commit() + session.flush() + uploaded_file_id: int = int(cast(int, uploaded_file.id)) + + if file_type == FileTypeEnum.ECMK_RD_SAP_SITE_NOTE: + try: + epc_data = parse_site_notes_pdf(file_path) + save_epc_property_data(session, epc_data, uploaded_file_id=uploaded_file_id) + except Exception: + logger.warning(f"Failed to parse/save site notes {file_path}", exc_info=True) + + return uploaded_file_id diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index a74f9a2d..7874d686 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -10,7 +10,7 @@ from backend.app.db.models.uploaded_file import ( UploadedFile, ) from backend.documents_parser.db_writer import save_epc_property_data -from backend.documents_parser.parser import parse_pashub_site_notes +from backend.documents_parser.parser import parse_site_notes_pdf from backend.pashub_fetcher.core_files import infer_file_type from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( @@ -106,7 +106,7 @@ def upload_job_to_s3_and_update_db( ): try: site_notes_pairs.append( - (uploaded_file, parse_pashub_site_notes(file_path)) + (uploaded_file, parse_site_notes_pdf(file_path)) ) except Exception: logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) From 9e8911701184277b1e7eaee31a992773eba68eca Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 29 Apr 2026 13:23:53 +0000 Subject: [PATCH 05/10] =?UTF-8?q?ECMK=20:=20slim=20upload=5Ffile=5Fto=5Fs3?= =?UTF-8?q?=5Fand=5Frecord=20to=20single=20responsibility=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/ecmk_fetcher/processor.py | 4 +- backend/ecmk_fetcher/tests/test_upload.py | 108 ++++++++++++++++++++++ backend/ecmk_fetcher/upload.py | 11 +-- 3 files changed, 111 insertions(+), 12 deletions(-) create mode 100644 backend/ecmk_fetcher/tests/test_upload.py diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py index 4f8c24ea..03e40f1c 100644 --- a/backend/ecmk_fetcher/processor.py +++ b/backend/ecmk_fetcher/processor.py @@ -33,7 +33,7 @@ from backend.ecmk_fetcher.reports import ( from backend.ecmk_fetcher.excel_writer import write_row from backend.ecmk_fetcher.upload import ( upload_excel_to_sharepoint, - upload_file_to_s3_and_update_db, + upload_file_to_s3_and_record, upload_file_to_sharepoint, ) from backend.ecmk_fetcher.xml_processor import flatten_sap_property, parse_rdsap @@ -180,7 +180,7 @@ def run_job() -> None: ) # Upload to s3 and update db - upload_file_to_s3_and_update_db( + upload_file_to_s3_and_record( bucket=s3_bucket, file_path=file_path, hubspot_listing_id=hubspot_listing_id, diff --git a/backend/ecmk_fetcher/tests/test_upload.py b/backend/ecmk_fetcher/tests/test_upload.py new file mode 100644 index 00000000..79823e8e --- /dev/null +++ b/backend/ecmk_fetcher/tests/test_upload.py @@ -0,0 +1,108 @@ +from typing import Generator +from unittest.mock import MagicMock, call, patch + +import pytest + +from backend.app.db.models.uploaded_file import FileTypeEnum +from backend.ecmk_fetcher.upload import upload_file_to_s3_and_record + + +@pytest.fixture +def mock_uploaded_file() -> MagicMock: + obj = MagicMock() + obj.id = 42 + return obj + + +@pytest.fixture +def mock_session() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def patched_deps( + mock_uploaded_file: MagicMock, mock_session: MagicMock +) -> Generator[dict[str, MagicMock], None, None]: + with ( + patch( + "backend.ecmk_fetcher.upload.upload_file_to_s3" + ) as mock_s3, + patch( + "backend.ecmk_fetcher.upload.db_session" + ) as mock_db_ctx, + patch( + "backend.ecmk_fetcher.upload.UploadedFile", + return_value=mock_uploaded_file, + ) as mock_model, + ): + mock_db_ctx.return_value.__enter__.return_value = mock_session + mock_db_ctx.return_value.__exit__.return_value = False + yield { + "s3": mock_s3, + "db_ctx": mock_db_ctx, + "model": mock_model, + "session": mock_session, + "uploaded_file": mock_uploaded_file, + } + + +def test_returns_uploaded_file_id_as_int( + patched_deps: dict[str, MagicMock], +) -> None: + result = upload_file_to_s3_and_record( + bucket="test-bucket", + file_path="/tmp/report.pdf", + hubspot_listing_id="hs-001", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + ) + + assert isinstance(result, int) + assert result == 42 + + +def test_uploads_to_s3_with_key_derived_from_listing_id_and_filename( + patched_deps: dict[str, MagicMock], +) -> None: + upload_file_to_s3_and_record( + bucket="my-bucket", + file_path="/some/path/site_note.pdf", + hubspot_listing_id="hs-999", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + ) + + patched_deps["s3"].assert_called_once_with( + "/some/path/site_note.pdf", + "my-bucket", + "documents/hubspot_listing_id/hs-999/site_note.pdf", + ) + + +def test_adds_uploaded_file_record_to_session( + patched_deps: dict[str, MagicMock], +) -> None: + upload_file_to_s3_and_record( + bucket="test-bucket", + file_path="/tmp/report.pdf", + hubspot_listing_id="hs-001", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + ) + + patched_deps["session"].add.assert_called_once_with( + patched_deps["uploaded_file"] + ) + patched_deps["session"].flush.assert_called_once() + + +def test_site_note_type_does_not_trigger_pdf_parsing( + patched_deps: dict[str, MagicMock], +) -> None: + # If parsing branch still existed, this would blow up without a + # parse_site_notes_pdf mock — test passes only when branch is absent. + result = upload_file_to_s3_and_record( + bucket="test-bucket", + file_path="/tmp/site_note.pdf", + hubspot_listing_id="hs-002", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + ) + + assert result == 42 diff --git a/backend/ecmk_fetcher/upload.py b/backend/ecmk_fetcher/upload.py index edfaf9f1..fc05363c 100644 --- a/backend/ecmk_fetcher/upload.py +++ b/backend/ecmk_fetcher/upload.py @@ -8,8 +8,6 @@ from backend.app.db.models.uploaded_file import ( FileTypeEnum, UploadedFile, ) -from backend.documents_parser.db_writer import save_epc_property_data -from backend.documents_parser.parser import parse_site_notes_pdf from utils.logger import setup_logger from utils.s3 import upload_file_to_s3 from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -47,7 +45,7 @@ def upload_excel_to_sharepoint( # TODO: this should be moved to somewhere common and called by pashub fetcher -def upload_file_to_s3_and_update_db( +def upload_file_to_s3_and_record( bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum ) -> int: filename: str = os.path.basename(file_path) @@ -70,11 +68,4 @@ def upload_file_to_s3_and_update_db( session.flush() uploaded_file_id: int = int(cast(int, uploaded_file.id)) - if file_type == FileTypeEnum.ECMK_RD_SAP_SITE_NOTE: - try: - epc_data = parse_site_notes_pdf(file_path) - save_epc_property_data(session, epc_data, uploaded_file_id=uploaded_file_id) - except Exception: - logger.warning(f"Failed to parse/save site notes {file_path}", exc_info=True) - return uploaded_file_id From 7cfa800c42ab1a14b7343ae0bd10cd7f95ae1543 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 29 Apr 2026 13:49:18 +0000 Subject: [PATCH 06/10] =?UTF-8?q?ECMK=20:=20replace=20processor.run=5Fjob?= =?UTF-8?q?=20with=20EcmkService=20class=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/ecmk_fetcher/ecmk_service.py | 196 ++++++++++++++++ backend/ecmk_fetcher/handler/handler.py | 22 +- backend/ecmk_fetcher/processor.py | 209 ------------------ .../ecmk_fetcher/tests/test_ecmk_service.py | 148 +++++++++++++ 4 files changed, 364 insertions(+), 211 deletions(-) create mode 100644 backend/ecmk_fetcher/ecmk_service.py delete mode 100644 backend/ecmk_fetcher/processor.py create mode 100644 backend/ecmk_fetcher/tests/test_ecmk_service.py diff --git a/backend/ecmk_fetcher/ecmk_service.py b/backend/ecmk_fetcher/ecmk_service.py new file mode 100644 index 00000000..138d0b28 --- /dev/null +++ b/backend/ecmk_fetcher/ecmk_service.py @@ -0,0 +1,196 @@ +import os +from typing import Dict + +from playwright.sync_api import Browser, BrowserContext, Locator, Page, sync_playwright + +from backend.app.db.functions.uploaded_files_functions import ( + get_uploaded_file_by_listing_type_and_source, +) +from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum +from backend.ecmk_fetcher.address_list import PropertyRow, extract_addresses_from_spreadsheet +from backend.ecmk_fetcher.browser import ( + attach_debug_listeners, + download_with_retry, + go_to_assessment_details, + go_to_assessments, + go_to_next_page, + login, +) +from backend.ecmk_fetcher.excel_writer import write_row +from backend.ecmk_fetcher.reports import ( + REPORT_TYPES, + FileDownloadButtonType, + build_property_id, + map_report_type_to_db_file_type, +) +from backend.ecmk_fetcher.upload import ( + upload_excel_to_sharepoint, + upload_file_to_s3_and_record, + upload_file_to_sharepoint, +) +from backend.ecmk_fetcher.xml_processor import flatten_sap_property, parse_rdsap +from utils.logger import setup_logger +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient + +logger = setup_logger() + +DIMENSIONS_FILENAME: str = "Dimensions.xlsx" + + +class EcmkService: + def __init__( + self, + sharepoint_client: DomnaSharepointClient, + s3_bucket: str, + property_list_filepath: str, + sharepoint_base_path: str, + sharepoint_excel_path: str, + local_dimensions_path: str, + ) -> None: + self._sharepoint_client = sharepoint_client + self._s3_bucket = s3_bucket + self._sharepoint_base_path = sharepoint_base_path + self._sharepoint_excel_path = sharepoint_excel_path + self._local_dimensions_path = local_dimensions_path + self._property_map: Dict[str, PropertyRow] = extract_addresses_from_spreadsheet( + property_list_filepath + ) + + def run(self) -> None: + self._sharepoint_client.download_file( + sharepoint_path=f"{self._sharepoint_excel_path}/{DIMENSIONS_FILENAME}", + local_path=self._local_dimensions_path, + ) + + with sync_playwright() as p: + browser: Browser = p.chromium.launch(headless=True) + context: BrowserContext = browser.new_context() + page: Page = context.new_page() + try: + self._run_browser_session(page) + finally: + context.close() + browser.close() + + def _run_browser_session(self, page: Page) -> None: + username: str = "" # TODO: get from github secrets + password: str = "" + + attach_debug_listeners(page) + + login(page, username, password) + go_to_assessments(page) + + while True: + rows: Locator = page.locator("#assessmentDatatable tbody tr") + row_count: int = rows.count() + + for i in range(row_count): + row: Locator = rows.nth(i) + + try: + cells: Locator = row.locator("td") + + first_name: str = cells.nth(1).inner_text().strip() + last_name: str = cells.nth(2).inner_text().strip() + address: str = cells.nth(5).inner_text().strip() + postcode: str = cells.nth(7).inner_text().strip() + status: str = cells.nth(9).inner_text().strip() + + if first_name == "Oliver" and last_name == "Stephens": + continue + + if status != "Submitted (not Lodged)": + continue + + property_id: str = build_property_id(address, postcode) + + property_row: PropertyRow | None = self._property_map.get(property_id) + + if not property_row: + continue + + logger.info(f"Match found for property {address}") + + sharepoint_address: str = property_row.address + + go_to_assessment_details(page, row) + + for report_type in REPORT_TYPES: + hubspot_listing_id: str = property_row.listing_id + try: + db_file_type: FileTypeEnum = ( + map_report_type_to_db_file_type(report_type) + ) + except ValueError: + logger.error( + f"Unknown report type {report_type}, skipping file" + ) + continue + + if get_uploaded_file_by_listing_type_and_source( + hubspot_listing_id=int(hubspot_listing_id), + file_type=db_file_type, + file_source=FileSourceEnum.ECMK, + ): + logger.debug("File already uploaded to s3, skipping") + continue + + file_path: str | None = download_with_retry(page, report_type) + + if not file_path: + continue + + logger.info( + f"Successfully downloaded file {os.path.basename(file_path)} from ECMK" + ) + + try: + if report_type == FileDownloadButtonType.RAW_XML.value: + with open(file_path, "r", encoding="utf-8") as f: + xml_string = f.read() + details = parse_rdsap(xml_string) + row_data = flatten_sap_property(details) + write_row(self._local_dimensions_path, row_data) + upload_excel_to_sharepoint( + client=self._sharepoint_client, + file_path=self._local_dimensions_path, + sharepoint_path=self._sharepoint_excel_path, + ) + logger.info( + f"Written dimensions row and uploaded Dimensions.xlsx for {address}" + ) + else: + upload_file_to_sharepoint( + client=self._sharepoint_client, + file_path=file_path, + base_path=self._sharepoint_base_path, + subpath=sharepoint_address, + ) + logger.info( + f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" + ) + + upload_file_to_s3_and_record( + bucket=self._s3_bucket, + file_path=file_path, + hubspot_listing_id=hubspot_listing_id, + file_type=db_file_type, + ) + + except Exception: + raise + finally: + if os.path.exists(file_path): + os.remove(file_path) + + page.go_back() + page.wait_for_selector( + "#assessmentDatatable tbody tr", timeout=15000 + ) + + except Exception as e: + raise Exception(f"Row processing failed: {str(e)}") from e + + if not go_to_next_page(page): + break diff --git a/backend/ecmk_fetcher/handler/handler.py b/backend/ecmk_fetcher/handler/handler.py index b777cc9f..b49518bf 100644 --- a/backend/ecmk_fetcher/handler/handler.py +++ b/backend/ecmk_fetcher/handler/handler.py @@ -1,14 +1,32 @@ +import os from typing import Any, Mapping -from backend.ecmk_fetcher.processor import run_job +from backend.ecmk_fetcher.ecmk_service import EcmkService from utils.logger import setup_logger +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient +from utils.sharepoint.domna_sites import DomnaSites logger = setup_logger() +_PROPERTY_LIST_FILE: str = ( + "hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx" +) +_BASE_DIR: str = os.path.dirname(os.path.dirname(__file__)) + def handler(event: Mapping[str, Any], context: Any) -> None: logger.info("Entered handler") - run_job() + service = EcmkService( + sharepoint_client=DomnaSharepointClient( + sharepoint_location=DomnaSites.PRIVATE_PAY + ), + s3_bucket="retrofit-energy-assessments-dev", + property_list_filepath=os.path.join(_BASE_DIR, _PROPERTY_LIST_FILE), + sharepoint_base_path="/Projects/Southern Housing/SH-SURV-26-001/Assessments", + sharepoint_excel_path="/Projects/Southern Housing/SH-SURV-26-001/Modelling", + local_dimensions_path=os.path.join(_BASE_DIR, "Dimensions.xlsx"), + ) + service.run() if __name__ == "__main__": diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py deleted file mode 100644 index 03e40f1c..00000000 --- a/backend/ecmk_fetcher/processor.py +++ /dev/null @@ -1,209 +0,0 @@ -import os -from typing import Dict -from playwright.sync_api import ( - sync_playwright, - Locator, - Page, - Browser, - BrowserContext, -) - -from backend.app.db.functions.uploaded_files_functions import ( - get_uploaded_file_by_listing_type_and_source, -) -from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum -from backend.ecmk_fetcher.address_list import ( - PropertyRow, - extract_addresses_from_spreadsheet, -) -from backend.ecmk_fetcher.browser import ( - attach_debug_listeners, - download_with_retry, - go_to_assessment_details, - go_to_assessments, - go_to_next_page, - login, -) -from backend.ecmk_fetcher.reports import ( - REPORT_TYPES, - FileDownloadButtonType, - build_property_id, - map_report_type_to_db_file_type, -) -from backend.ecmk_fetcher.excel_writer import write_row -from backend.ecmk_fetcher.upload import ( - upload_excel_to_sharepoint, - upload_file_to_s3_and_record, - upload_file_to_sharepoint, -) -from backend.ecmk_fetcher.xml_processor import flatten_sap_property, parse_rdsap -from utils.logger import setup_logger -from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient -from utils.sharepoint.domna_sites import DomnaSites - -logger = setup_logger() - - -def run_job() -> None: - - username: str = "" # TODO: get from github secrets - password: str = "" - - property_list_file: str = ( - "hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx" - ) - - BASE_DIR: str = os.path.dirname(__file__) - filepath: str = os.path.join(BASE_DIR, property_list_file) - - property_map: Dict[str, PropertyRow] = extract_addresses_from_spreadsheet(filepath) - - sharepoint_client: DomnaSharepointClient = DomnaSharepointClient( - sharepoint_location=DomnaSites.PRIVATE_PAY - ) - - sharepoint_base_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Assessments" - sharepoint_excel_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Modelling" - - DIMENSIONS_FILENAME: str = "Dimensions.xlsx" - local_dimensions_path: str = os.path.join(BASE_DIR, DIMENSIONS_FILENAME) - - sharepoint_client.download_file( - sharepoint_path=f"{sharepoint_excel_path}/{DIMENSIONS_FILENAME}", - local_path=local_dimensions_path, - ) - - s3_bucket: str = "retrofit-energy-assessments-dev" - - with sync_playwright() as p: - browser: Browser = p.chromium.launch(headless=True) - context: BrowserContext = browser.new_context() - page: Page = context.new_page() - - attach_debug_listeners(page) - - try: - login(page, username, password) - go_to_assessments(page) - - while True: - rows: Locator = page.locator("#assessmentDatatable tbody tr") - row_count: int = rows.count() - - for i in range(row_count): - row: Locator = rows.nth(i) - - try: - cells: Locator = row.locator("td") - - first_name: str = cells.nth(1).inner_text().strip() - last_name: str = cells.nth(2).inner_text().strip() - address: str = cells.nth(5).inner_text().strip() - postcode: str = cells.nth(7).inner_text().strip() - status: str = cells.nth(9).inner_text().strip() - - if first_name == "Oliver" and last_name == "Stephens": - continue - - if status != "Submitted (not Lodged)": - continue - - property_id: str = build_property_id(address, postcode) - - property_row: PropertyRow | None = property_map.get(property_id) - - if not property_row: - continue - - logger.info(f"Match found for property {address}") - - sharepoint_address: str = property_row.address - - go_to_assessment_details(page, row) - - for report_type in REPORT_TYPES: - hubspot_listing_id: str = property_row.listing_id - try: - db_file_type: FileTypeEnum = ( - map_report_type_to_db_file_type(report_type) - ) - - except ValueError: - logger.error( - f"Unknown report type {report_type}, skipping file" - ) - continue - - if get_uploaded_file_by_listing_type_and_source( - hubspot_listing_id=int(hubspot_listing_id), - file_type=db_file_type, - file_source=FileSourceEnum.ECMK, - ): - logger.debug("File already uploaded to s3, skipping") - continue - - file_path: str | None = download_with_retry( - page, report_type - ) - - if not file_path: - continue - - logger.info( - f"Successfully downloaded file {os.path.basename(file_path)} from ECMK" - ) - - try: - if report_type == FileDownloadButtonType.RAW_XML.value: - with open(file_path, "r", encoding="utf-8") as f: - xml_string = f.read() - details = parse_rdsap(xml_string) - row_data = flatten_sap_property(details) - write_row(local_dimensions_path, row_data) - upload_excel_to_sharepoint( - client=sharepoint_client, - file_path=local_dimensions_path, - sharepoint_path=sharepoint_excel_path, - ) - logger.info( - f"Written dimensions row and uploaded Dimensions.xlsx for {address}" - ) - else: - upload_file_to_sharepoint( - client=sharepoint_client, - file_path=file_path, - base_path=sharepoint_base_path, - subpath=sharepoint_address, - ) - logger.info( - f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" - ) - - # Upload to s3 and update db - upload_file_to_s3_and_record( - bucket=s3_bucket, - file_path=file_path, - hubspot_listing_id=hubspot_listing_id, - file_type=db_file_type, - ) - - except Exception: - raise - finally: - if os.path.exists(file_path): - os.remove(file_path) - - page.go_back() - page.wait_for_selector( - "#assessmentDatatable tbody tr", timeout=15000 - ) - - except Exception as e: - raise Exception(f"Row processing failed: {str(e)}") from e - - if not go_to_next_page(page): - break - - finally: - context.close() - browser.close() diff --git a/backend/ecmk_fetcher/tests/test_ecmk_service.py b/backend/ecmk_fetcher/tests/test_ecmk_service.py new file mode 100644 index 00000000..c2694489 --- /dev/null +++ b/backend/ecmk_fetcher/tests/test_ecmk_service.py @@ -0,0 +1,148 @@ +from typing import Dict +from unittest.mock import MagicMock, patch + +from backend.ecmk_fetcher.address_list import PropertyRow +from backend.ecmk_fetcher.ecmk_service import EcmkService +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient + + +FAKE_PROPERTY_MAP: Dict[str, PropertyRow] = { + "10 FAKE ST SW1A 1AA": PropertyRow( + row_index=2, address="10 Fake St SW1A 1AA", listing_id="hs-001" + ) +} + + +def make_service( + sharepoint_client: DomnaSharepointClient | None = None, + s3_bucket: str = "test-bucket", + property_list_filepath: str = "/fake/properties.xlsx", + sharepoint_base_path: str = "/base", + sharepoint_excel_path: str = "/excel", + local_dimensions_path: str = "/fake/Dimensions.xlsx", +) -> EcmkService: + return EcmkService( + sharepoint_client=sharepoint_client or MagicMock(spec=DomnaSharepointClient), + s3_bucket=s3_bucket, + property_list_filepath=property_list_filepath, + sharepoint_base_path=sharepoint_base_path, + sharepoint_excel_path=sharepoint_excel_path, + local_dimensions_path=local_dimensions_path, + ) + + +# --------------------------------------------------------------------------- +# __init__: loads property map from spreadsheet filepath +# --------------------------------------------------------------------------- + + +def test_init_loads_property_map_from_filepath() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ) as mock_extract: + _ = make_service(property_list_filepath="/some/props.xlsx") + + mock_extract.assert_called_once_with("/some/props.xlsx") + + +# --------------------------------------------------------------------------- +# run(): downloads Dimensions.xlsx before Playwright browser launches +# --------------------------------------------------------------------------- + + +def _make_playwright_mocks() -> tuple[MagicMock, MagicMock, MagicMock, MagicMock]: + mock_page = MagicMock() + mock_context = MagicMock() + mock_context.new_page.return_value = mock_page + mock_browser = MagicMock() + mock_browser.new_context.return_value = mock_context + mock_playwright = MagicMock() + mock_playwright.chromium.launch.return_value = mock_browser + return mock_page, mock_context, mock_browser, mock_playwright + + +def test_run_downloads_dimensions_before_browser_launch() -> None: + call_order: list[str] = [] + + mock_client = MagicMock(spec=DomnaSharepointClient) + + def _on_download(**_: object) -> None: + call_order.append("download") + + mock_client.download_file.side_effect = _on_download + + _, _, mock_browser, mock_playwright = _make_playwright_mocks() + + def _on_launch(**_: object) -> MagicMock: + call_order.append("browser") + return mock_browser + + mock_playwright.chromium.launch.side_effect = _on_launch + + with ( + patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ), + patch("backend.ecmk_fetcher.ecmk_service.sync_playwright") as mock_sync_pw, + ): + mock_sync_pw.return_value.__enter__.return_value = mock_playwright + service = make_service( + sharepoint_client=mock_client, + sharepoint_excel_path="/excel", + local_dimensions_path="/fake/Dimensions.xlsx", + ) + with patch.object(service, "_run_browser_session"): + service.run() + + assert call_order == ["download", "browser"] + + +def test_run_downloads_dimensions_with_correct_paths() -> None: + mock_client = MagicMock(spec=DomnaSharepointClient) + _, _, _, mock_playwright = _make_playwright_mocks() + + with ( + patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ), + patch("backend.ecmk_fetcher.ecmk_service.sync_playwright") as mock_sync_pw, + ): + mock_sync_pw.return_value.__enter__.return_value = mock_playwright + service = make_service( + sharepoint_client=mock_client, + sharepoint_excel_path="/excel", + local_dimensions_path="/fake/Dimensions.xlsx", + ) + with patch.object(service, "_run_browser_session"): + service.run() + + mock_client.download_file.assert_called_once_with( + sharepoint_path="/excel/Dimensions.xlsx", + local_path="/fake/Dimensions.xlsx", + ) + + +# --------------------------------------------------------------------------- +# run(): passes the Playwright Page into _run_browser_session +# --------------------------------------------------------------------------- + + +def test_run_passes_page_to_run_browser_session() -> None: + mock_page, _, _, mock_playwright = _make_playwright_mocks() + + with ( + patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ), + patch("backend.ecmk_fetcher.ecmk_service.sync_playwright") as mock_sync_pw, + ): + mock_sync_pw.return_value.__enter__.return_value = mock_playwright + service = make_service() + with patch.object(service, "_run_browser_session") as mock_session: + service.run() + + mock_session.assert_called_once_with(mock_page) From 6851a84edec0a4382214ec87b1e329433d887d24 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 29 Apr 2026 14:10:55 +0000 Subject: [PATCH 07/10] =?UTF-8?q?ECMK=20:=20refactor=20EcmkService=20class?= =?UTF-8?q?=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/ecmk_fetcher/ecmk_service.py | 123 +++-- .../ecmk_fetcher/tests/test_ecmk_service.py | 448 +++++++++++++++++- backend/ecmk_fetcher/tests/test_handler.py | 59 +++ 3 files changed, 598 insertions(+), 32 deletions(-) create mode 100644 backend/ecmk_fetcher/tests/test_handler.py diff --git a/backend/ecmk_fetcher/ecmk_service.py b/backend/ecmk_fetcher/ecmk_service.py index 138d0b28..35b8f552 100644 --- a/backend/ecmk_fetcher/ecmk_service.py +++ b/backend/ecmk_fetcher/ecmk_service.py @@ -3,11 +3,17 @@ from typing import Dict from playwright.sync_api import Browser, BrowserContext, Locator, Page, sync_playwright +from backend.app.db.connection import db_session from backend.app.db.functions.uploaded_files_functions import ( get_uploaded_file_by_listing_type_and_source, ) from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum -from backend.ecmk_fetcher.address_list import PropertyRow, extract_addresses_from_spreadsheet +from backend.documents_parser.db_writer import save_epc_property_data +from backend.documents_parser.parser import parse_site_notes_pdf +from backend.ecmk_fetcher.address_list import ( + PropertyRow, + extract_addresses_from_spreadsheet, +) from backend.ecmk_fetcher.browser import ( attach_debug_listeners, download_with_retry, @@ -105,7 +111,9 @@ class EcmkService: property_id: str = build_property_id(address, postcode) - property_row: PropertyRow | None = self._property_map.get(property_id) + property_row: PropertyRow | None = self._property_map.get( + property_id + ) if not property_row: continue @@ -146,38 +154,13 @@ class EcmkService: ) try: - if report_type == FileDownloadButtonType.RAW_XML.value: - with open(file_path, "r", encoding="utf-8") as f: - xml_string = f.read() - details = parse_rdsap(xml_string) - row_data = flatten_sap_property(details) - write_row(self._local_dimensions_path, row_data) - upload_excel_to_sharepoint( - client=self._sharepoint_client, - file_path=self._local_dimensions_path, - sharepoint_path=self._sharepoint_excel_path, - ) - logger.info( - f"Written dimensions row and uploaded Dimensions.xlsx for {address}" - ) - else: - upload_file_to_sharepoint( - client=self._sharepoint_client, - file_path=file_path, - base_path=self._sharepoint_base_path, - subpath=sharepoint_address, - ) - logger.info( - f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" - ) - - upload_file_to_s3_and_record( - bucket=self._s3_bucket, + self._process_file( file_path=file_path, + report_type=report_type, + db_file_type=db_file_type, + sharepoint_address=sharepoint_address, hubspot_listing_id=hubspot_listing_id, - file_type=db_file_type, ) - except Exception: raise finally: @@ -194,3 +177,81 @@ class EcmkService: if not go_to_next_page(page): break + + def _process_file( + self, + file_path: str, + report_type: int, + db_file_type: FileTypeEnum, + sharepoint_address: str, + hubspot_listing_id: str, + ) -> None: + if report_type == FileDownloadButtonType.RAW_XML.value: + self._process_xml_file( + file_path=file_path, + db_file_type=db_file_type, + hubspot_listing_id=hubspot_listing_id, + ) + else: + self._process_pdf_file( + file_path=file_path, + file_type=db_file_type, + sharepoint_address=sharepoint_address, + hubspot_listing_id=hubspot_listing_id, + ) + + def _process_xml_file( + self, + file_path: str, + db_file_type: FileTypeEnum, + hubspot_listing_id: str, + ) -> None: + with open(file_path, "r", encoding="utf-8") as f: + xml_string: str = f.read() + details = parse_rdsap(xml_string) + row_data = flatten_sap_property(details) + write_row(self._local_dimensions_path, row_data) + upload_excel_to_sharepoint( + client=self._sharepoint_client, + file_path=self._local_dimensions_path, + sharepoint_path=self._sharepoint_excel_path, + ) + upload_file_to_s3_and_record( + bucket=self._s3_bucket, + file_path=file_path, + hubspot_listing_id=hubspot_listing_id, + file_type=db_file_type, + ) + + def _process_pdf_file( + self, + file_path: str, + file_type: FileTypeEnum, + sharepoint_address: str, + hubspot_listing_id: str, + ) -> None: + upload_file_to_sharepoint( + client=self._sharepoint_client, + file_path=file_path, + base_path=self._sharepoint_base_path, + subpath=sharepoint_address, + ) + uploaded_file_id: int = upload_file_to_s3_and_record( + bucket=self._s3_bucket, + file_path=file_path, + hubspot_listing_id=hubspot_listing_id, + file_type=file_type, + ) + if file_type == FileTypeEnum.ECMK_RD_SAP_SITE_NOTE: + try: + epc_data = parse_site_notes_pdf(file_path) + with db_session() as session: + save_epc_property_data( + session=session, + data=epc_data, + uploaded_file_id=uploaded_file_id, + ) + except Exception: + logger.warning( + f"EPC extraction failed for {os.path.basename(file_path)} — file record retained" + ) diff --git a/backend/ecmk_fetcher/tests/test_ecmk_service.py b/backend/ecmk_fetcher/tests/test_ecmk_service.py index c2694489..703bc4c5 100644 --- a/backend/ecmk_fetcher/tests/test_ecmk_service.py +++ b/backend/ecmk_fetcher/tests/test_ecmk_service.py @@ -1,8 +1,10 @@ from typing import Dict -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, call, patch +from backend.app.db.models.uploaded_file import FileTypeEnum from backend.ecmk_fetcher.address_list import PropertyRow from backend.ecmk_fetcher.ecmk_service import EcmkService +from backend.ecmk_fetcher.reports import FileDownloadButtonType from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -146,3 +148,447 @@ def test_run_passes_page_to_run_browser_session() -> None: service.run() mock_session.assert_called_once_with(mock_page) + + +# --------------------------------------------------------------------------- +# _process_file: dispatches based on report_type +# --------------------------------------------------------------------------- + + +def test_process_file_dispatches_to_xml_for_raw_xml() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch.object(service, "_process_xml_file") as mock_xml, + patch.object(service, "_process_pdf_file") as mock_pdf, + ): + service._process_file( + file_path="/tmp/file.xml", + report_type=FileDownloadButtonType.RAW_XML.value, + db_file_type=FileTypeEnum.ECMK_SURVEY_XML, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_xml.assert_called_once() + mock_pdf.assert_not_called() + + +def test_process_file_dispatches_to_pdf_for_non_xml() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch.object(service, "_process_xml_file") as mock_xml, + patch.object(service, "_process_pdf_file") as mock_pdf, + ): + service._process_file( + file_path="/tmp/file.pdf", + report_type=FileDownloadButtonType.SITENOTE_REPORT.value, + db_file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_pdf.assert_called_once() + mock_xml.assert_not_called() + + +# --------------------------------------------------------------------------- +# _process_xml_file: parse → flatten → write row → upload excel → S3 +# --------------------------------------------------------------------------- + + +def test_process_xml_file_full_chain() -> None: + fake_details = MagicMock() + fake_row_data = MagicMock() + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service( + s3_bucket="my-bucket", + sharepoint_excel_path="/excel", + local_dimensions_path="/dims/Dimensions.xlsx", + ) + + with ( + patch( + "backend.ecmk_fetcher.ecmk_service.parse_rdsap", return_value=fake_details + ) as mock_parse, + patch( + "backend.ecmk_fetcher.ecmk_service.flatten_sap_property", + return_value=fake_row_data, + ) as mock_flatten, + patch("backend.ecmk_fetcher.ecmk_service.write_row") as mock_write, + patch( + "backend.ecmk_fetcher.ecmk_service.upload_excel_to_sharepoint" + ) as mock_upload_excel, + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record" + ) as mock_s3, + patch( + "builtins.open", + MagicMock(return_value=MagicMock( + __enter__=lambda s: MagicMock(read=lambda: ""), + __exit__=MagicMock(return_value=False), + )), + ), + ): + service._process_xml_file( + file_path="/tmp/report.xml", + db_file_type=FileTypeEnum.ECMK_SURVEY_XML, + hubspot_listing_id="hs-001", + ) + + mock_parse.assert_called_once() + mock_flatten.assert_called_once_with(fake_details) + mock_write.assert_called_once_with("/dims/Dimensions.xlsx", fake_row_data) + mock_upload_excel.assert_called_once_with( + client=service._sharepoint_client, + file_path="/dims/Dimensions.xlsx", + sharepoint_path="/excel", + ) + mock_s3.assert_called_once_with( + bucket="my-bucket", + file_path="/tmp/report.xml", + hubspot_listing_id="hs-001", + file_type=FileTypeEnum.ECMK_SURVEY_XML, + ) + + +# --------------------------------------------------------------------------- +# _process_pdf_file: sharepoint upload → S3 upload +# --------------------------------------------------------------------------- + + +def test_process_pdf_file_uploads_to_sharepoint_then_s3() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service( + s3_bucket="my-bucket", + sharepoint_base_path="/base", + ) + + with ( + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint" + ) as mock_sp, + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + return_value=42, + ) as mock_s3, + patch("backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf"), + patch("backend.ecmk_fetcher.ecmk_service.db_session"), + ): + service._process_pdf_file( + file_path="/tmp/report.pdf", + file_type=FileTypeEnum.ECMK_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_sp.assert_called_once_with( + client=service._sharepoint_client, + file_path="/tmp/report.pdf", + base_path="/base", + subpath="10 Fake St", + ) + mock_s3.assert_called_once_with( + bucket="my-bucket", + file_path="/tmp/report.pdf", + hubspot_listing_id="hs-001", + file_type=FileTypeEnum.ECMK_SITE_NOTE, + ) + + +# --------------------------------------------------------------------------- +# _process_pdf_file: EPC extraction conditional on file_type +# --------------------------------------------------------------------------- + + +def test_process_pdf_file_runs_epc_extraction_for_rd_sap_site_note() -> None: + fake_epc_data = MagicMock() + fake_session = MagicMock() + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"), + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + return_value=99, + ), + patch( + "backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf", + return_value=fake_epc_data, + ) as mock_parse, + patch( + "backend.ecmk_fetcher.ecmk_service.save_epc_property_data" + ) as mock_save, + patch( + "backend.ecmk_fetcher.ecmk_service.db_session" + ) as mock_db_session, + ): + mock_db_session.return_value.__enter__.return_value = fake_session + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_parse.assert_called_once_with("/tmp/sitenote.pdf") + mock_save.assert_called_once_with( + session=fake_session, + data=fake_epc_data, + uploaded_file_id=99, + ) + + +def test_process_pdf_file_skips_epc_extraction_for_ecmk_site_note() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"), + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + return_value=42, + ), + patch( + "backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf" + ) as mock_parse, + patch("backend.ecmk_fetcher.ecmk_service.db_session") as mock_db_session, + ): + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_parse.assert_not_called() + mock_db_session.assert_not_called() + + +def test_process_pdf_file_epc_uses_separate_db_session_from_s3_upload() -> None: + """EPC db_session opens only after upload_file_to_s3_and_record returns.""" + call_order: list[str] = [] + + def _on_s3(**_: object) -> int: + call_order.append("s3") + return 77 + + def _on_db_session() -> MagicMock: + call_order.append("db_session") + ctx = MagicMock() + ctx.__enter__ = MagicMock(return_value=MagicMock()) + ctx.__exit__ = MagicMock(return_value=False) + return ctx + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"), + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + side_effect=_on_s3, + ), + patch("backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf"), + patch("backend.ecmk_fetcher.ecmk_service.save_epc_property_data"), + patch( + "backend.ecmk_fetcher.ecmk_service.db_session", + side_effect=_on_db_session, + ), + ): + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + assert call_order == ["s3", "db_session"] + + +# --------------------------------------------------------------------------- +# _process_pdf_file: EPC failures swallowed with warning +# --------------------------------------------------------------------------- + + +def _pdf_file_patches_for_failure() -> tuple: # type: ignore[type-arg] + return ( + patch("backend.ecmk_fetcher.ecmk_service.upload_file_to_sharepoint"), + patch( + "backend.ecmk_fetcher.ecmk_service.upload_file_to_s3_and_record", + return_value=1, + ), + ) + + +def test_process_pdf_file_parse_failure_logged_as_warning_not_raised() -> None: + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + sp_patch, s3_patch = _pdf_file_patches_for_failure() + with ( + sp_patch, + s3_patch, + patch( + "backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf", + side_effect=ValueError("bad pdf"), + ), + patch("backend.ecmk_fetcher.ecmk_service.save_epc_property_data") as mock_save, + patch("backend.ecmk_fetcher.ecmk_service.db_session"), + patch("backend.ecmk_fetcher.ecmk_service.logger") as mock_logger, + ): + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_logger.warning.assert_called_once() + mock_save.assert_not_called() + + +def test_process_pdf_file_save_failure_logged_as_warning_not_raised() -> None: + fake_session = MagicMock() + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=FAKE_PROPERTY_MAP, + ): + service = make_service() + + sp_patch, s3_patch = _pdf_file_patches_for_failure() + with ( + sp_patch, + s3_patch, + patch( + "backend.ecmk_fetcher.ecmk_service.parse_site_notes_pdf", + return_value=MagicMock(), + ), + patch( + "backend.ecmk_fetcher.ecmk_service.save_epc_property_data", + side_effect=RuntimeError("db exploded"), + ), + patch("backend.ecmk_fetcher.ecmk_service.db_session") as mock_db_session, + patch("backend.ecmk_fetcher.ecmk_service.logger") as mock_logger, + ): + mock_db_session.return_value.__enter__.return_value = fake_session + service._process_pdf_file( + file_path="/tmp/sitenote.pdf", + file_type=FileTypeEnum.ECMK_RD_SAP_SITE_NOTE, + sharepoint_address="10 Fake St", + hubspot_listing_id="hs-001", + ) + + mock_logger.warning.assert_called_once() + + +# --------------------------------------------------------------------------- +# _run_browser_session: delegates file processing to _process_file +# --------------------------------------------------------------------------- + + +def _make_page_mock_with_one_matching_row() -> MagicMock: + cells_nth: dict[int, MagicMock] = {n: MagicMock() for n in (1, 2, 5, 7, 9)} + cells_nth[1].inner_text.return_value = "John" + cells_nth[2].inner_text.return_value = "Doe" + cells_nth[5].inner_text.return_value = "10 FAKE ST" + cells_nth[7].inner_text.return_value = "SW1A 1AA" + cells_nth[9].inner_text.return_value = "Submitted (not Lodged)" + + cells_mock = MagicMock() + cells_mock.nth.side_effect = lambda n: cells_nth[n] + + row_mock = MagicMock() + row_mock.locator.return_value = cells_mock + + rows_mock = MagicMock() + rows_mock.count.return_value = 1 + rows_mock.nth.return_value = row_mock + + page = MagicMock() + page.locator.return_value = rows_mock + return page + + +# address "10 FAKE ST" + postcode "SW1A 1AA" → build_property_id → "10SW1A1AA" +_BROWSER_SESSION_PROPERTY_MAP: Dict[str, PropertyRow] = { + "10SW1A1AA": PropertyRow( + row_index=2, address="10 Fake St SW1A 1AA", listing_id="12345" + ) +} + + +def test_run_browser_session_calls_process_file_for_downloaded_file() -> None: + mock_page = _make_page_mock_with_one_matching_row() + + with patch( + "backend.ecmk_fetcher.ecmk_service.extract_addresses_from_spreadsheet", + return_value=_BROWSER_SESSION_PROPERTY_MAP, + ): + service = make_service() + + with ( + patch("backend.ecmk_fetcher.ecmk_service.attach_debug_listeners"), + patch("backend.ecmk_fetcher.ecmk_service.login"), + patch("backend.ecmk_fetcher.ecmk_service.go_to_assessments"), + patch("backend.ecmk_fetcher.ecmk_service.go_to_assessment_details"), + patch("backend.ecmk_fetcher.ecmk_service.go_to_next_page", return_value=False), + patch( + "backend.ecmk_fetcher.ecmk_service.get_uploaded_file_by_listing_type_and_source", + return_value=None, + ), + patch( + "backend.ecmk_fetcher.ecmk_service.download_with_retry", + return_value="/tmp/fake.pdf", + ), + patch( + "backend.ecmk_fetcher.ecmk_service.map_report_type_to_db_file_type", + return_value=FileTypeEnum.ECMK_SITE_NOTE, + ), + patch( + "backend.ecmk_fetcher.ecmk_service.REPORT_TYPES", + [FileDownloadButtonType.SITENOTE_REPORT.value], + ), + patch.object(service, "_process_file") as mock_process_file, + patch("os.path.exists", return_value=False), + ): + service._run_browser_session(mock_page) + + mock_process_file.assert_called_once_with( + file_path="/tmp/fake.pdf", + report_type=FileDownloadButtonType.SITENOTE_REPORT.value, + db_file_type=FileTypeEnum.ECMK_SITE_NOTE, + sharepoint_address="10 Fake St SW1A 1AA", + hubspot_listing_id="12345", + ) diff --git a/backend/ecmk_fetcher/tests/test_handler.py b/backend/ecmk_fetcher/tests/test_handler.py new file mode 100644 index 00000000..9de97253 --- /dev/null +++ b/backend/ecmk_fetcher/tests/test_handler.py @@ -0,0 +1,59 @@ +from unittest.mock import MagicMock, patch + +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient + + +def test_handler_constructs_ecmk_service_and_calls_run() -> None: + mock_service = MagicMock() + mock_service_cls = MagicMock(return_value=mock_service) + + with ( + patch( + "backend.ecmk_fetcher.handler.handler.EcmkService", + mock_service_cls, + ), + patch( + "backend.ecmk_fetcher.handler.handler.DomnaSharepointClient", + return_value=MagicMock(spec=DomnaSharepointClient), + ), + ): + from backend.ecmk_fetcher.handler.handler import handler + + handler({}, None) + + mock_service_cls.assert_called_once() + mock_service.run.assert_called_once() + + +def test_handler_passes_correct_config_to_ecmk_service() -> None: + mock_service = MagicMock() + mock_service_cls = MagicMock(return_value=mock_service) + + with ( + patch( + "backend.ecmk_fetcher.handler.handler.EcmkService", + mock_service_cls, + ), + patch( + "backend.ecmk_fetcher.handler.handler.DomnaSharepointClient", + return_value=MagicMock(spec=DomnaSharepointClient), + ), + ): + from backend.ecmk_fetcher.handler.handler import handler + + handler({}, None) + + _, kwargs = mock_service_cls.call_args + assert kwargs["s3_bucket"] == "retrofit-energy-assessments-dev" + assert ( + kwargs["sharepoint_base_path"] + == "/Projects/Southern Housing/SH-SURV-26-001/Assessments" + ) + assert ( + kwargs["sharepoint_excel_path"] + == "/Projects/Southern Housing/SH-SURV-26-001/Modelling" + ) + assert kwargs["property_list_filepath"].endswith( + "hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx" + ) + assert kwargs["local_dimensions_path"].endswith("Dimensions.xlsx") From bbd653f83cee6d5f1fd3d671bc2f13b2460e19c4 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 29 Apr 2026 14:50:59 +0000 Subject: [PATCH 08/10] =?UTF-8?q?PasHub:=20PashubService=20class=20?= =?UTF-8?q?=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/pashub_fetcher/handler/handler.py | 175 ++---------- backend/pashub_fetcher/pashub_service.py | 126 +++++++++ .../tests/test_pashub_service.py | 254 ++++++++++++++++++ pytest.ini | 2 +- 4 files changed, 397 insertions(+), 160 deletions(-) create mode 100644 backend/pashub_fetcher/pashub_service.py create mode 100644 backend/pashub_fetcher/tests/test_pashub_service.py diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 7874d686..0d12b6bf 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -1,33 +1,18 @@ -from datetime import datetime, timezone -import os -from typing import Any, Dict, List, Optional, Tuple, cast +from typing import Any, Dict, List from backend.app.config import get_settings -from backend.app.db.connection import db_session -from backend.app.db.models.uploaded_file import ( - FileSourceEnum, - FileTypeEnum, - UploadedFile, -) -from backend.documents_parser.db_writer import save_epc_property_data -from backend.documents_parser.parser import parse_site_notes_pdf -from backend.pashub_fetcher.core_files import infer_file_type from backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError -from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( - PashubToAraTriggerRequest, -) -from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders +from backend.pashub_fetcher.pashub_service import PashubService +from backend.pashub_fetcher.pashub_to_ara_trigger_request import PashubToAraTriggerRequest from backend.pashub_fetcher.token_getter import get_token_from_local_storage from backend.utils.subtasks import task_handler -from datatypes.epc.domain.epc_property_data import EpcPropertyData from utils.logger import setup_logger -from utils.s3 import upload_file_to_s3 from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient from utils.sharepoint.domna_sites import DomnaSites - logger = setup_logger() +S3_BUCKET = "retrofit-energy-assessments-dev" def get_pashub_client(email: str, password: str) -> PashubClient: @@ -36,130 +21,6 @@ def get_pashub_client(email: str, password: str) -> PashubClient: return PashubClient(token=token) -def upload_job_to_sharepoint( - sharepoint_client: DomnaSharepointClient, - # base_path: str, - sharepoint_link: str, - job_files: List[str], -) -> None: - # job_path = f"{base_path}/{job['address']}" - - # Create main job folder - # sharepoint_client.makedir(job["address"], base_path) - - # Create subfolders - # for folder in SharepointSubfolders: - # sharepoint_client.makedir(folder.value, job_path) - - # Upload into assessment folder - assessment_path = f"{sharepoint_link}/{SharepointSubfolders.ASSESSMENT.value}" - - for file_path in job_files: - filename = file_path.split("/")[-1] - - sharepoint_client.upload_file( - file_path, - assessment_path, - filename, - ) - - -def upload_job_to_s3_and_update_db( - job_files: List[str], uprn: Optional[str], hubspot_deal_id: Optional[str] -) -> None: - bucket = "retrofit-energy-assessments-dev" - - if not uprn and not hubspot_deal_id: - return - - base_path = ( - f"documents/uprn/{uprn}" - if uprn - else f"documents/hubspot_deal_id/{hubspot_deal_id}" - ) - - uploaded_files: List[UploadedFile] = [] - site_notes_pairs: List[Tuple[UploadedFile, EpcPropertyData]] = [] - - for file_path in job_files: - filename = os.path.basename(file_path) - file_key = f"{base_path}/{filename}" - - upload_file_to_s3(file_path, bucket, file_key) - - # TODO: use same upload_file_to_s3_and_update_db method as ecmk fetcher does - uploaded_file = UploadedFile( - s3_file_bucket=bucket, - s3_file_key=file_key, - s3_upload_timestamp=datetime.now(timezone.utc), - uprn=int(uprn) if uprn else None, - hubspot_deal_id=hubspot_deal_id, - file_source=FileSourceEnum.PAS_HUB.value, - file_type=infer_file_type(filename), - ) - uploaded_files.append(uploaded_file) - - file_type: Optional[str] = cast(Optional[str], uploaded_file.file_type) - if ( - file_type is not None - and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE - ): - try: - site_notes_pairs.append( - (uploaded_file, parse_site_notes_pdf(file_path)) - ) - except Exception: - logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) - - with db_session() as session: - session.add_all(uploaded_files) - session.flush() - - for uploaded_file, epc_data in site_notes_pairs: - save_epc_property_data( - session, epc_data, uploaded_file_id=cast(int, uploaded_file.id) - ) - - session.commit() - - -def process_job( - job: PashubToAraTriggerRequest, - pashub_client: PashubClient, - sharepoint_client: DomnaSharepointClient, -) -> List[str]: - job_id = job.pashub_job_id - - uprn: Optional[str] = job.uprn or pashub_client.get_uprn_by_job_id(job_id) - hubspot_deal_id: Optional[str] = job.hubspot_deal_id - - if uprn: - logger.info(f"Got UPRN {uprn} for job {job_id}") - else: - logger.info(f"No UPRN found for job {job_id}") - - job_files: List[str] = pashub_client.get_core_evidence_files_by_job_id(job_id) - - if uprn or hubspot_deal_id: - logger.info("Uploading files to s3") - upload_job_to_s3_and_update_db(job_files, uprn, hubspot_deal_id) - - # # Comment out sharepoint loading for now: - # Seems like the sharepoint link in pas hub is inconsistent in terms - # of whether it points to a property or a project - - # if job.sharepoint_link: - # upload_job_to_sharepoint(sharepoint_client, job.sharepoint_link, job_files) - - for file_path in job_files: - try: - os.remove(file_path) - except OSError: - logger.warning(f"Failed to delete temp file {file_path}") - - return job_files - - @task_handler() def handler(body: Dict[str, Any], context: Any) -> List[str]: logger.info("Received message") @@ -172,8 +33,6 @@ def handler(body: Dict[str, Any], context: Any) -> List[str]: if (not pas_hub_email) or (not pas_hub_password): raise ValueError("Pas Hub credentials not provided") - pashub_client = get_pashub_client(pas_hub_email, pas_hub_password) - sharepoint_client = DomnaSharepointClient( sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3 ) @@ -182,26 +41,24 @@ def handler(body: Dict[str, Any], context: Any) -> List[str]: payload = PashubToAraTriggerRequest.model_validate(body) logger.debug("Successfully validated request body") + service = PashubService( + pashub_client=get_pashub_client(pas_hub_email, pas_hub_password), + sharepoint_client=sharepoint_client, + s3_bucket=S3_BUCKET, + ) + try: - files: List[str] = process_job( - payload, - pashub_client, - sharepoint_client, - ) + files: List[str] = service.run(payload) except UnauthorizedError: logger.warning("Token expired - refreshing") - pashub_client = get_pashub_client( - pas_hub_email, - pas_hub_password, + service = PashubService( + pashub_client=get_pashub_client(pas_hub_email, pas_hub_password), + sharepoint_client=sharepoint_client, + s3_bucket=S3_BUCKET, ) - # retry once - files = process_job( - payload, - pashub_client, - sharepoint_client, - ) + files = service.run(payload) logger.info(f"Saved {len(files)} files") diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py new file mode 100644 index 00000000..19c5c2f9 --- /dev/null +++ b/backend/pashub_fetcher/pashub_service.py @@ -0,0 +1,126 @@ +import os +from datetime import datetime, timezone +from typing import List, Optional, Tuple, cast + +from backend.app.db.connection import db_session +from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum, UploadedFile +from backend.documents_parser.db_writer import save_epc_property_data +from backend.documents_parser.parser import parse_site_notes_pdf +from backend.pashub_fetcher.core_files import infer_file_type +from backend.pashub_fetcher.pashub_client import PashubClient +from backend.pashub_fetcher.pashub_to_ara_trigger_request import PashubToAraTriggerRequest +from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from utils.logger import setup_logger +from utils.s3 import upload_file_to_s3 +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient + +logger = setup_logger() + + +class PashubService: + def __init__( + self, + pashub_client: PashubClient, + sharepoint_client: DomnaSharepointClient, + s3_bucket: str, + ) -> None: + self._pashub_client = pashub_client + self._sharepoint_client = sharepoint_client + self._s3_bucket = s3_bucket + + def run(self, request: PashubToAraTriggerRequest) -> List[str]: + job_id = request.pashub_job_id + + uprn: Optional[str] = request.uprn or self._pashub_client.get_uprn_by_job_id(job_id) + hubspot_deal_id: Optional[str] = request.hubspot_deal_id + + if uprn: + logger.info(f"Got UPRN {uprn} for job {job_id}") + else: + logger.info(f"No UPRN found for job {job_id}") + + job_files: List[str] = self._pashub_client.get_core_evidence_files_by_job_id(job_id) + + if uprn or hubspot_deal_id: + logger.info("Uploading files to s3") + self._upload_to_s3_and_update_db(job_files, uprn, hubspot_deal_id) + + # SharePoint upload disabled: pashub sharepoint_link is inconsistent + # (points to property or project unpredictably) + # if request.sharepoint_link: + # self._upload_to_sharepoint(request.sharepoint_link, job_files) + + for file_path in job_files: + try: + os.remove(file_path) + except OSError: + logger.warning(f"Failed to delete temp file {file_path}") + + return job_files + + def _upload_to_s3_and_update_db( + self, + job_files: List[str], + uprn: Optional[str], + hubspot_deal_id: Optional[str], + ) -> None: + if not uprn and not hubspot_deal_id: + return + + base_path = ( + f"documents/uprn/{uprn}" + if uprn + else f"documents/hubspot_deal_id/{hubspot_deal_id}" + ) + + uploaded_files: List[UploadedFile] = [] + site_notes_pairs: List[Tuple[UploadedFile, EpcPropertyData]] = [] + + for file_path in job_files: + filename = os.path.basename(file_path) + file_key = f"{base_path}/{filename}" + + upload_file_to_s3(file_path, self._s3_bucket, file_key) + + uploaded_file = UploadedFile( + s3_file_bucket=self._s3_bucket, + s3_file_key=file_key, + s3_upload_timestamp=datetime.now(timezone.utc), + uprn=int(uprn) if uprn else None, + hubspot_deal_id=hubspot_deal_id, + file_source=FileSourceEnum.PAS_HUB.value, + file_type=infer_file_type(filename), + ) + uploaded_files.append(uploaded_file) + + file_type: Optional[str] = cast(Optional[str], uploaded_file.file_type) + if file_type is not None and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE: + try: + site_notes_pairs.append( + (uploaded_file, parse_site_notes_pdf(file_path)) + ) + except Exception: + logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) + + with db_session() as session: + session.add_all(uploaded_files) + session.flush() + + for uploaded_file, epc_data in site_notes_pairs: + save_epc_property_data( + session, epc_data, uploaded_file_id=cast(int, uploaded_file.id) + ) + + session.commit() + + def _upload_to_sharepoint( + self, + sharepoint_link: str, + job_files: List[str], + ) -> None: + assessment_path = f"{sharepoint_link}/{SharepointSubfolders.ASSESSMENT.value}" + + for file_path in job_files: + filename = file_path.split("/")[-1] + self._sharepoint_client.upload_file(file_path, assessment_path, filename) diff --git a/backend/pashub_fetcher/tests/test_pashub_service.py b/backend/pashub_fetcher/tests/test_pashub_service.py new file mode 100644 index 00000000..2aff416b --- /dev/null +++ b/backend/pashub_fetcher/tests/test_pashub_service.py @@ -0,0 +1,254 @@ +from typing import Optional +from unittest.mock import MagicMock, call, patch + + +from backend.pashub_fetcher.pashub_client import PashubClient +from backend.pashub_fetcher.pashub_service import PashubService +from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( + PashubToAraTriggerRequest, +) +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient + + +FAKE_JOB_LINK = "https://pashub.net/jobs/job-id-123/details" + + +def make_request( + pashub_link: str = FAKE_JOB_LINK, + uprn: Optional[str] = None, + hubspot_deal_id: Optional[str] = None, + sharepoint_link: Optional[str] = None, +) -> PashubToAraTriggerRequest: + return PashubToAraTriggerRequest( + pashub_link=pashub_link, + uprn=uprn, + hubspot_deal_id=hubspot_deal_id, + sharepoint_link=sharepoint_link, + ) + + +def make_service( + pashub_client: Optional[PashubClient] = None, + sharepoint_client: Optional[DomnaSharepointClient] = None, + s3_bucket: str = "test-bucket", +) -> PashubService: + return PashubService( + pashub_client=pashub_client or MagicMock(spec=PashubClient), + sharepoint_client=sharepoint_client or MagicMock(spec=DomnaSharepointClient), + s3_bucket=s3_bucket, + ) + + +# --------------------------------------------------------------------------- +# run(): returns file paths +# --------------------------------------------------------------------------- + + +def test_run_returns_file_paths() -> None: + mock_client = MagicMock(spec=PashubClient) + mock_client.get_uprn_by_job_id.return_value = None + mock_client.get_core_evidence_files_by_job_id.return_value = [ + "/tmp/a.pdf", + "/tmp/b.pdf", + ] + + service = make_service(pashub_client=mock_client) + + with patch("backend.pashub_fetcher.pashub_service.os.remove"): + result = service.run(make_request()) + + assert result == ["/tmp/a.pdf", "/tmp/b.pdf"] + + +# --------------------------------------------------------------------------- +# run(): skips upload when neither uprn nor hubspot_deal_id +# --------------------------------------------------------------------------- + + +def test_run_skips_upload_when_no_uprn_and_no_deal_id() -> None: + mock_client = MagicMock(spec=PashubClient) + mock_client.get_uprn_by_job_id.return_value = None + mock_client.get_core_evidence_files_by_job_id.return_value = ["/tmp/a.pdf"] + + service = make_service(pashub_client=mock_client) + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3") as mock_s3, + patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + service.run(make_request(uprn=None, hubspot_deal_id=None)) + + mock_s3.assert_not_called() + + +# --------------------------------------------------------------------------- +# run(): UPRN present → uploads each file to S3 with correct bucket/key +# --------------------------------------------------------------------------- + + +def test_run_uploads_files_to_s3_using_uprn_path() -> None: + mock_client = MagicMock(spec=PashubClient) + mock_client.get_uprn_by_job_id.return_value = None + mock_client.get_core_evidence_files_by_job_id.return_value = [ + "/tmp/SiteNote_001.pdf", + "/tmp/Photopack_002.pdf", + ] + + service = make_service(pashub_client=mock_client, s3_bucket="my-bucket") + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3") as mock_s3, + patch("backend.pashub_fetcher.pashub_service.db_session"), + patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + service.run(make_request(uprn="12345")) + + mock_s3.assert_has_calls( + [ + call( + "/tmp/SiteNote_001.pdf", + "my-bucket", + "documents/uprn/12345/SiteNote_001.pdf", + ), + call( + "/tmp/Photopack_002.pdf", + "my-bucket", + "documents/uprn/12345/Photopack_002.pdf", + ), + ], + any_order=False, + ) + + +# --------------------------------------------------------------------------- +# run(): UPRN present → UploadedFile records added to DB session +# --------------------------------------------------------------------------- + + +def test_run_persists_uploaded_file_records_to_db() -> None: + mock_client = MagicMock(spec=PashubClient) + mock_client.get_uprn_by_job_id.return_value = None + mock_client.get_core_evidence_files_by_job_id.return_value = [ + "/tmp/SiteNote_001.pdf" + ] + + fake_session = MagicMock() + service = make_service(pashub_client=mock_client) + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"), + patch("backend.pashub_fetcher.pashub_service.db_session") as mock_db, + patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + mock_db.return_value.__enter__.return_value = fake_session + service.run(make_request(uprn="12345")) + + fake_session.add_all.assert_called_once() + added: list = fake_session.add_all.call_args[0][0] + assert len(added) == 1 + assert added[0].s3_file_bucket == "test-bucket" + assert added[0].uprn == 12345 + + +# --------------------------------------------------------------------------- +# run(): hubspot_deal_id only → uses deal_id S3 path prefix +# --------------------------------------------------------------------------- + + +def test_run_uses_hubspot_deal_id_path_when_no_uprn() -> None: + mock_client = MagicMock(spec=PashubClient) + mock_client.get_uprn_by_job_id.return_value = None + mock_client.get_core_evidence_files_by_job_id.return_value = [ + "/tmp/SiteNote_001.pdf" + ] + + service = make_service(pashub_client=mock_client, s3_bucket="my-bucket") + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3") as mock_s3, + patch("backend.pashub_fetcher.pashub_service.db_session"), + patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + service.run(make_request(uprn=None, hubspot_deal_id="deal-abc")) + + mock_s3.assert_called_once_with( + "/tmp/SiteNote_001.pdf", + "my-bucket", + "documents/hubspot_deal_id/deal-abc/SiteNote_001.pdf", + ) + + +# --------------------------------------------------------------------------- +# run(): RD_SAP_SITE_NOTE file → site notes parsed and saved to DB +# --------------------------------------------------------------------------- + + +def test_run_parses_and_saves_site_notes_for_rd_sap_site_note_file() -> None: + mock_client = MagicMock(spec=PashubClient) + mock_client.get_uprn_by_job_id.return_value = None + mock_client.get_core_evidence_files_by_job_id.return_value = [ + "/tmp/RdSAP_SiteNote_001.pdf" + ] + + fake_epc_data = MagicMock() + fake_session = MagicMock() + fake_uploaded_file_id = 99 + + service = make_service(pashub_client=mock_client) + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"), + patch( + "backend.pashub_fetcher.pashub_service.parse_site_notes_pdf", + return_value=fake_epc_data, + ) as mock_parse, + patch( + "backend.pashub_fetcher.pashub_service.save_epc_property_data" + ) as mock_save, + patch("backend.pashub_fetcher.pashub_service.db_session") as mock_db, + patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + fake_session.add_all = MagicMock( + side_effect=lambda files: setattr(files[0], "id", fake_uploaded_file_id) + ) + mock_db.return_value.__enter__.return_value = fake_session + service.run(make_request(uprn="12345")) + + mock_parse.assert_called_once_with("/tmp/RdSAP_SiteNote_001.pdf") + mock_save.assert_called_once_with( + fake_session, fake_epc_data, uploaded_file_id=fake_uploaded_file_id + ) + + +# --------------------------------------------------------------------------- +# run(): site notes parse failure → warning logged, run returns normally +# --------------------------------------------------------------------------- + + +def test_run_warns_and_continues_when_site_notes_parsing_fails() -> None: + mock_client = MagicMock(spec=PashubClient) + mock_client.get_uprn_by_job_id.return_value = None + mock_client.get_core_evidence_files_by_job_id.return_value = [ + "/tmp/RdSAP_SiteNote_001.pdf" + ] + + service = make_service(pashub_client=mock_client) + + with ( + patch("backend.pashub_fetcher.pashub_service.upload_file_to_s3"), + patch( + "backend.pashub_fetcher.pashub_service.parse_site_notes_pdf", + side_effect=ValueError("corrupt pdf"), + ), + patch( + "backend.pashub_fetcher.pashub_service.save_epc_property_data" + ) as mock_save, + patch("backend.pashub_fetcher.pashub_service.db_session"), + patch("backend.pashub_fetcher.pashub_service.logger") as mock_logger, + patch("backend.pashub_fetcher.pashub_service.os.remove"), + ): + result = service.run(make_request(uprn="12345")) + + assert result == ["/tmp/RdSAP_SiteNote_001.pdf"] + mock_logger.warning.assert_called() + mock_save.assert_not_called() diff --git a/pytest.ini b/pytest.ini index 33231c61..db46c287 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,6 @@ pythonpath = . log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/documents_parser/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/pashub_fetcher/tests backend/documents_parser/tests markers = integration: mark a test as an integration test From d522fcdb94fa78e6ae0f605cc34f9021f3354acb Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 30 Apr 2026 12:59:26 +0000 Subject: [PATCH 09/10] Refactor Pashub service so that site notes parsing and upload is separate from saving of files --- backend/pashub_fetcher/pashub_service.py | 79 +++++++++++++++++------- 1 file changed, 55 insertions(+), 24 deletions(-) diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index 19c5c2f9..92554369 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -1,16 +1,21 @@ import os from datetime import datetime, timezone -from typing import List, Optional, Tuple, cast +from typing import List, NamedTuple, Optional, cast from backend.app.db.connection import db_session -from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum, UploadedFile +from backend.app.db.models.uploaded_file import ( + FileSourceEnum, + FileTypeEnum, + UploadedFile, +) from backend.documents_parser.db_writer import save_epc_property_data from backend.documents_parser.parser import parse_site_notes_pdf from backend.pashub_fetcher.core_files import infer_file_type from backend.pashub_fetcher.pashub_client import PashubClient -from backend.pashub_fetcher.pashub_to_ara_trigger_request import PashubToAraTriggerRequest +from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( + PashubToAraTriggerRequest, +) from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders -from datatypes.epc.domain.epc_property_data import EpcPropertyData from utils.logger import setup_logger from utils.s3 import upload_file_to_s3 from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -18,6 +23,12 @@ from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient logger = setup_logger() +class _FileUploadRecord(NamedTuple): + file_path: str + file_type: Optional[str] + uploaded_file_id: int + + class PashubService: def __init__( self, @@ -32,7 +43,9 @@ class PashubService: def run(self, request: PashubToAraTriggerRequest) -> List[str]: job_id = request.pashub_job_id - uprn: Optional[str] = request.uprn or self._pashub_client.get_uprn_by_job_id(job_id) + uprn: Optional[str] = request.uprn or self._pashub_client.get_uprn_by_job_id( + job_id + ) hubspot_deal_id: Optional[str] = request.hubspot_deal_id if uprn: @@ -40,11 +53,16 @@ class PashubService: else: logger.info(f"No UPRN found for job {job_id}") - job_files: List[str] = self._pashub_client.get_core_evidence_files_by_job_id(job_id) + job_files: List[str] = self._pashub_client.get_core_evidence_files_by_job_id( + job_id + ) if uprn or hubspot_deal_id: logger.info("Uploading files to s3") - self._upload_to_s3_and_update_db(job_files, uprn, hubspot_deal_id) + upload_records = self._upload_to_s3_and_update_db( + job_files, uprn, hubspot_deal_id + ) + self._save_site_notes(upload_records) # SharePoint upload disabled: pashub sharepoint_link is inconsistent # (points to property or project unpredictably) @@ -64,9 +82,9 @@ class PashubService: job_files: List[str], uprn: Optional[str], hubspot_deal_id: Optional[str], - ) -> None: + ) -> List[_FileUploadRecord]: if not uprn and not hubspot_deal_id: - return + return [] base_path = ( f"documents/uprn/{uprn}" @@ -74,8 +92,8 @@ class PashubService: else f"documents/hubspot_deal_id/{hubspot_deal_id}" ) + file_paths: List[str] = [] uploaded_files: List[UploadedFile] = [] - site_notes_pairs: List[Tuple[UploadedFile, EpcPropertyData]] = [] for file_path in job_files: filename = os.path.basename(file_path) @@ -92,27 +110,40 @@ class PashubService: file_source=FileSourceEnum.PAS_HUB.value, file_type=infer_file_type(filename), ) + file_paths.append(file_path) uploaded_files.append(uploaded_file) - file_type: Optional[str] = cast(Optional[str], uploaded_file.file_type) - if file_type is not None and FileTypeEnum(file_type) == FileTypeEnum.RD_SAP_SITE_NOTE: - try: - site_notes_pairs.append( - (uploaded_file, parse_site_notes_pdf(file_path)) - ) - except Exception: - logger.warning(f"Failed to parse site notes {file_path}", exc_info=True) - with db_session() as session: session.add_all(uploaded_files) session.flush() - - for uploaded_file, epc_data in site_notes_pairs: - save_epc_property_data( - session, epc_data, uploaded_file_id=cast(int, uploaded_file.id) + upload_records = [ + _FileUploadRecord( + file_path=fp, + file_type=cast(Optional[str], uf.file_type), + uploaded_file_id=cast(int, uf.id), ) + for fp, uf in zip(file_paths, uploaded_files) + ] - session.commit() + return upload_records + + def _save_site_notes(self, upload_records: List[_FileUploadRecord]) -> None: + for record in upload_records: + if ( + record.file_type is None + or FileTypeEnum(record.file_type) != FileTypeEnum.RD_SAP_SITE_NOTE + ): + continue + try: + epc_data = parse_site_notes_pdf(record.file_path) + with db_session() as session: + save_epc_property_data( + session, epc_data, uploaded_file_id=record.uploaded_file_id + ) + except Exception: + logger.warning( + f"Failed to parse site notes {record.file_path}", exc_info=True + ) def _upload_to_sharepoint( self, From 0f6d70b67ac4736a0ec07b754003aad11605b210 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 30 Apr 2026 13:32:13 +0000 Subject: [PATCH 10/10] typehint output of parse_site_notes_pdf --- backend/pashub_fetcher/pashub_service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/pashub_fetcher/pashub_service.py b/backend/pashub_fetcher/pashub_service.py index 92554369..316902f4 100644 --- a/backend/pashub_fetcher/pashub_service.py +++ b/backend/pashub_fetcher/pashub_service.py @@ -16,6 +16,7 @@ from backend.pashub_fetcher.pashub_to_ara_trigger_request import ( PashubToAraTriggerRequest, ) from backend.pashub_fetcher.sharepoint_subfolders import SharepointSubfolders +from datatypes.epc.domain.epc_property_data import EpcPropertyData from utils.logger import setup_logger from utils.s3 import upload_file_to_s3 from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -135,7 +136,7 @@ class PashubService: ): continue try: - epc_data = parse_site_notes_pdf(record.file_path) + epc_data: EpcPropertyData = parse_site_notes_pdf(record.file_path) with db_session() as session: save_epc_property_data( session, epc_data, uploaded_file_id=record.uploaded_file_id