From cd7b59a62f4fea2043073db93668270a32c5a8a5 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 2 Apr 2026 15:42:56 +0000 Subject: [PATCH 01/43] update spreadsheet with properties that have already been processed --- backend/ecmk_fetcher/address_list.py | 104 +++++++++++++++++++++------ backend/ecmk_fetcher/processor.py | 32 +++++++-- 2 files changed, 108 insertions(+), 28 deletions(-) diff --git a/backend/ecmk_fetcher/address_list.py b/backend/ecmk_fetcher/address_list.py index d273c45d..54c675d1 100644 --- a/backend/ecmk_fetcher/address_list.py +++ b/backend/ecmk_fetcher/address_list.py @@ -1,45 +1,107 @@ -from typing import Dict, Optional -from openpyxl import load_workbook import re +from dataclasses import dataclass +from typing import Any, Dict, Optional, cast +from openpyxl import Workbook, load_workbook +from openpyxl.worksheet.worksheet import Worksheet +from openpyxl.cell.cell import Cell -def extract_addresses_from_spreadsheet(filepath: str) -> Dict[str, str]: - wb = load_workbook(filepath, data_only=True) - ws = wb["Southern RA-Lite Programme 3103"] +@dataclass +class PropertyRow: + row_index: int + address: str + processed: bool - properties: Dict[str, str] = {} - header_row = 1 - id_col_index = None - deal_name_col_index = None +def extract_addresses_from_spreadsheet( + filepath: str, +) -> Dict[str, PropertyRow]: + wb: Workbook = load_workbook(filepath, data_only=True) + ws: Worksheet = wb["Southern RA-Lite Programme 3103"] + header_row: int = 1 + id_col: Optional[int] = None + deal_name_col: Optional[int] = None + processed_col: Optional[int] = None + + # find columns for col in range(1, ws.max_column + 1): - value = ws.cell(row=header_row, column=col).value + raw_value: Any = ws.cell(row=header_row, column=col).value + value: str = str(raw_value).strip().lower() if raw_value else "" - if value and str(value).strip().lower() == "id": - id_col_index = col + if value == "id": + id_col = col + elif value == "deal name": + deal_name_col = col + elif value == "processed": + processed_col = col - if value and str(value).strip().lower() == "deal name": - deal_name_col_index = col - break + if id_col is None or deal_name_col is None: + raise Exception("Missing required columns") - if id_col_index is None or deal_name_col_index is None: - raise Exception("Required columns not found") + # create processed column if missing + if processed_col is None: + processed_col = ws.max_column + 1 + cast(Cell, ws.cell(row=header_row, column=processed_col)).value = "processed" + + properties: Dict[str, PropertyRow] = {} for row in range(2, ws.max_row + 1): - id_val = ws.cell(row=row, column=id_col_index).value - deal_name = ws.cell(row=row, column=deal_name_col_index).value + id_val: Any = ws.cell(row=row, column=id_col).value + deal_name: Any = ws.cell(row=row, column=deal_name_col).value if not id_val or not deal_name: continue - properties[str(id_val).strip()] = extract_succinct_address( - str(deal_name).strip() + processed_val: Any = ws.cell(row=row, column=processed_col).value + processed: bool = str(processed_val).lower() == "true" + + property_id: str = str(id_val).strip() + + properties[property_id] = PropertyRow( + row_index=row, + address=extract_succinct_address(str(deal_name)), + processed=processed, ) return properties +def mark_properties_as_processed( + filepath: str, + property_map: Dict[str, PropertyRow], +) -> None: + wb: Workbook = load_workbook(filepath) + ws: Worksheet = wb["Southern RA-Lite Programme 3103"] + + header_row: int = 1 + + # find processed column + processed_col: int | None = None + + for col in range(1, ws.max_column + 1): + value = ws.cell(row=header_row, column=col).value + if value and str(value).strip().lower() == "processed": + processed_col = col + break + + if processed_col is None: + raise Exception("Processed column not found") + + # update rows + for property_row in property_map.values(): + if property_row.processed: + cast( + Cell, + ws.cell( + row=property_row.row_index, + column=processed_col, + ), + ).value = True + + wb.save(filepath) + + def extract_succinct_address(deal_name: str) -> str: left_part = deal_name.split("|")[0].strip() diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py index 1852b867..dce6c7ef 100644 --- a/backend/ecmk_fetcher/processor.py +++ b/backend/ecmk_fetcher/processor.py @@ -1,6 +1,5 @@ import os -from typing import Dict, List - +from typing import Dict from playwright.sync_api import ( sync_playwright, Locator, @@ -9,7 +8,11 @@ from playwright.sync_api import ( BrowserContext, ) -from backend.ecmk_fetcher.address_list import extract_addresses_from_spreadsheet +from backend.ecmk_fetcher.address_list import ( + PropertyRow, + extract_addresses_from_spreadsheet, + mark_properties_as_processed, +) from backend.ecmk_fetcher.browser import ( attach_debug_listeners, download_with_retry, @@ -35,8 +38,7 @@ def run_job() -> None: BASE_DIR: str = os.path.dirname(__file__) filepath: str = os.path.join(BASE_DIR, property_list_file) - property_map: Dict[str, str] = extract_addresses_from_spreadsheet(filepath) - property_ids: List[str] = list(property_map.keys()) + property_map: Dict[str, PropertyRow] = extract_addresses_from_spreadsheet(filepath) sharepoint_client: DomnaSharepointClient = DomnaSharepointClient( sharepoint_location=DomnaSites.PRIVATE_PAY @@ -79,19 +81,27 @@ def run_job() -> None: property_id: str = build_property_id(address, postcode) - if property_id not in property_ids: + property_row: PropertyRow | None = property_map.get(property_id) + + if not property_row: continue - sharepoint_address: str = property_map[property_id] + if property_row.processed: + continue + + sharepoint_address: str = property_row.address go_to_assessment_details(page, row) + all_uploaded: bool = True + for report_type in REPORT_TYPES: file_path: str | None = download_with_retry( page, report_type ) if not file_path: + all_uploaded = False continue try: @@ -101,10 +111,16 @@ def run_job() -> None: base_path=sharepoint_base_path, subpath=sharepoint_address, ) + except Exception: + all_uploaded = False + raise finally: if os.path.exists(file_path): os.remove(file_path) + if all_uploaded: + property_row.processed = True + page.go_back() page.wait_for_selector( "#assessmentDatatable tbody tr", timeout=15000 @@ -119,3 +135,5 @@ def run_job() -> None: finally: context.close() browser.close() + + mark_properties_as_processed(filepath, property_map) From ba30bccb07b7f14199a1955a1f60a8f95eed0f12 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 7 Apr 2026 11:29:10 +0000 Subject: [PATCH 02/43] revert spreadsheet update changes. add better logging --- backend/ecmk_fetcher/address_list.py | 51 +--------------------------- backend/ecmk_fetcher/browser.py | 1 + backend/ecmk_fetcher/processor.py | 23 ++++++------- 3 files changed, 13 insertions(+), 62 deletions(-) diff --git a/backend/ecmk_fetcher/address_list.py b/backend/ecmk_fetcher/address_list.py index 54c675d1..a2834366 100644 --- a/backend/ecmk_fetcher/address_list.py +++ b/backend/ecmk_fetcher/address_list.py @@ -1,16 +1,14 @@ import re from dataclasses import dataclass -from typing import Any, Dict, Optional, cast +from typing import Any, Dict, Optional from openpyxl import Workbook, load_workbook from openpyxl.worksheet.worksheet import Worksheet -from openpyxl.cell.cell import Cell @dataclass class PropertyRow: row_index: int address: str - processed: bool def extract_addresses_from_spreadsheet( @@ -22,7 +20,6 @@ def extract_addresses_from_spreadsheet( header_row: int = 1 id_col: Optional[int] = None deal_name_col: Optional[int] = None - processed_col: Optional[int] = None # find columns for col in range(1, ws.max_column + 1): @@ -33,17 +30,10 @@ def extract_addresses_from_spreadsheet( id_col = col elif value == "deal name": deal_name_col = col - elif value == "processed": - processed_col = col if id_col is None or deal_name_col is None: raise Exception("Missing required columns") - # create processed column if missing - if processed_col is None: - processed_col = ws.max_column + 1 - cast(Cell, ws.cell(row=header_row, column=processed_col)).value = "processed" - properties: Dict[str, PropertyRow] = {} for row in range(2, ws.max_row + 1): @@ -53,55 +43,16 @@ def extract_addresses_from_spreadsheet( if not id_val or not deal_name: continue - processed_val: Any = ws.cell(row=row, column=processed_col).value - processed: bool = str(processed_val).lower() == "true" - property_id: str = str(id_val).strip() properties[property_id] = PropertyRow( row_index=row, address=extract_succinct_address(str(deal_name)), - processed=processed, ) return properties -def mark_properties_as_processed( - filepath: str, - property_map: Dict[str, PropertyRow], -) -> None: - wb: Workbook = load_workbook(filepath) - ws: Worksheet = wb["Southern RA-Lite Programme 3103"] - - header_row: int = 1 - - # find processed column - processed_col: int | None = None - - for col in range(1, ws.max_column + 1): - value = ws.cell(row=header_row, column=col).value - if value and str(value).strip().lower() == "processed": - processed_col = col - break - - if processed_col is None: - raise Exception("Processed column not found") - - # update rows - for property_row in property_map.values(): - if property_row.processed: - cast( - Cell, - ws.cell( - row=property_row.row_index, - column=processed_col, - ), - ).value = True - - wb.save(filepath) - - def extract_succinct_address(deal_name: str) -> str: left_part = deal_name.split("|")[0].strip() diff --git a/backend/ecmk_fetcher/browser.py b/backend/ecmk_fetcher/browser.py index 6d018537..de349b92 100644 --- a/backend/ecmk_fetcher/browser.py +++ b/backend/ecmk_fetcher/browser.py @@ -50,6 +50,7 @@ def get_first_row_signature(page: Page) -> str: def go_to_next_page(page: Page) -> bool: + logger.info("Going to next page") before = get_first_row_signature(page) page.locator("#assessmentDatatable_next a").click() diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py index dce6c7ef..e774fc9a 100644 --- a/backend/ecmk_fetcher/processor.py +++ b/backend/ecmk_fetcher/processor.py @@ -11,7 +11,6 @@ from playwright.sync_api import ( from backend.ecmk_fetcher.address_list import ( PropertyRow, extract_addresses_from_spreadsheet, - mark_properties_as_processed, ) from backend.ecmk_fetcher.browser import ( attach_debug_listeners, @@ -23,9 +22,12 @@ from backend.ecmk_fetcher.browser import ( ) from backend.ecmk_fetcher.reports import REPORT_TYPES, build_property_id from backend.ecmk_fetcher.sharepoint import upload_file_to_sharepoint +from utils.logger import setup_logger from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient from utils.sharepoint.domna_sites import DomnaSites +logger = setup_logger() + def run_job() -> None: username: str = "" @@ -86,24 +88,24 @@ def run_job() -> None: if not property_row: continue - if property_row.processed: - continue + logger.info(f"Match found for property {address}") sharepoint_address: str = property_row.address go_to_assessment_details(page, row) - all_uploaded: bool = True - for report_type in REPORT_TYPES: file_path: str | None = download_with_retry( page, report_type ) if not file_path: - all_uploaded = False continue + logger.info( + f"Successfully downloaded file {os.path.basename(file_path)} from ECMK" + ) + try: upload_file_to_sharepoint( client=sharepoint_client, @@ -111,16 +113,15 @@ def run_job() -> None: base_path=sharepoint_base_path, subpath=sharepoint_address, ) + logger.info( + f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" + ) except Exception: - all_uploaded = False raise finally: if os.path.exists(file_path): os.remove(file_path) - if all_uploaded: - property_row.processed = True - page.go_back() page.wait_for_selector( "#assessmentDatatable tbody tr", timeout=15000 @@ -135,5 +136,3 @@ def run_job() -> None: finally: context.close() browser.close() - - mark_properties_as_processed(filepath, property_map) From 849a272974c15c16cb743441a7dad5192af07f4c Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 7 Apr 2026 11:47:47 +0000 Subject: [PATCH 03/43] get hubspot listing id from spreadsheet --- backend/ecmk_fetcher/address_list.py | 10 ++++++++-- backend/ecmk_fetcher/processor.py | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/backend/ecmk_fetcher/address_list.py b/backend/ecmk_fetcher/address_list.py index a2834366..ba636a70 100644 --- a/backend/ecmk_fetcher/address_list.py +++ b/backend/ecmk_fetcher/address_list.py @@ -9,6 +9,7 @@ from openpyxl.worksheet.worksheet import Worksheet class PropertyRow: row_index: int address: str + listing_id: str def extract_addresses_from_spreadsheet( @@ -20,6 +21,7 @@ def extract_addresses_from_spreadsheet( header_row: int = 1 id_col: Optional[int] = None deal_name_col: Optional[int] = None + listing_id_col: Optional[int] = None # find columns for col in range(1, ws.max_column + 1): @@ -30,8 +32,10 @@ def extract_addresses_from_spreadsheet( id_col = col elif value == "deal name": deal_name_col = col + elif value == "associated listing ids": + listing_id_col = col - if id_col is None or deal_name_col is None: + if id_col is None or deal_name_col is None or listing_id_col is None: raise Exception("Missing required columns") properties: Dict[str, PropertyRow] = {} @@ -39,8 +43,9 @@ def extract_addresses_from_spreadsheet( for row in range(2, ws.max_row + 1): id_val: Any = ws.cell(row=row, column=id_col).value deal_name: Any = ws.cell(row=row, column=deal_name_col).value + listing_id: Any = ws.cell(row=row, column=listing_id_col).value - if not id_val or not deal_name: + if not id_val or not deal_name or not listing_id: continue property_id: str = str(id_val).strip() @@ -48,6 +53,7 @@ def extract_addresses_from_spreadsheet( properties[property_id] = PropertyRow( row_index=row, address=extract_succinct_address(str(deal_name)), + listing_id=listing_id, ) return properties diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py index e774fc9a..4c841a19 100644 --- a/backend/ecmk_fetcher/processor.py +++ b/backend/ecmk_fetcher/processor.py @@ -92,6 +92,9 @@ def run_job() -> None: sharepoint_address: str = property_row.address + # Check whether files have already been processed before continuing with this property + # hubspot_listing_id: str = property_row.listing_id + go_to_assessment_details(page, row) for report_type in REPORT_TYPES: From 15f1fde16a0e2828d56a65d9a8cb374c94b6d7b0 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 7 Apr 2026 14:34:33 +0000 Subject: [PATCH 04/43] skip file if already processed according to db --- .../db/functions/uploaded_files_functions.py | 25 +++++++++++++ backend/app/db/models/uploaded_file.py | 2 ++ ...-ra-lite-programme-3103-2026-03-31-2.xlsx# | 1 + backend/ecmk_fetcher/processor.py | 35 ++++++++++++++++--- backend/ecmk_fetcher/reports.py | 12 +++++++ 5 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 backend/app/db/functions/uploaded_files_functions.py create mode 100644 backend/ecmk_fetcher/.~lock.hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx# diff --git a/backend/app/db/functions/uploaded_files_functions.py b/backend/app/db/functions/uploaded_files_functions.py new file mode 100644 index 00000000..3708813a --- /dev/null +++ b/backend/app/db/functions/uploaded_files_functions.py @@ -0,0 +1,25 @@ +from typing import Optional + +from sqlalchemy import select + +from backend.app.db.connection import db_read_session +from backend.app.db.models.uploaded_file import ( + FileSourceEnum, + FileTypeEnum, + UploadedFile, +) + + +def get_uploaded_file_by_listing_type_and_source( + hubspot_listing_id: int, + file_type: FileTypeEnum, + file_source: FileSourceEnum, +) -> Optional[UploadedFile]: + with db_read_session() as session: + statement = select(UploadedFile).where( + UploadedFile.hubspot_listing_id == hubspot_listing_id, + UploadedFile.file_type == file_type, + UploadedFile.file_source == file_source, + ) + + return session.exec(statement).one_or_none() diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py index 9b751d34..8decfd1b 100644 --- a/backend/app/db/models/uploaded_file.py +++ b/backend/app/db/models/uploaded_file.py @@ -14,6 +14,8 @@ class FileTypeEnum(enum.Enum): PAR_PHOTO_PACK = "par_photo_pack" PAS_2023_PROPERTY = "pas_2023_property" PAS_2023_OCCUPANCY = "pas_2023_occupancy" + ECMK_SITE_NOTE = "ecmk_site_note" + ECMK_RD_SAP_SITE_NOTE = "ecmk_rd_sap_site_note" class FileSourceEnum(enum.Enum): diff --git a/backend/ecmk_fetcher/.~lock.hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx# b/backend/ecmk_fetcher/.~lock.hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx# new file mode 100644 index 00000000..4b57053e --- /dev/null +++ b/backend/ecmk_fetcher/.~lock.hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx# @@ -0,0 +1 @@ +,daniel,daniel-Dell-15-DC15250,07.04.2026 11:47,/home/daniel/snap/onlyoffice-desktopeditors/1067/.local/share/onlyoffice; \ No newline at end of file diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py index 4c841a19..dc52c342 100644 --- a/backend/ecmk_fetcher/processor.py +++ b/backend/ecmk_fetcher/processor.py @@ -8,6 +8,10 @@ from playwright.sync_api import ( BrowserContext, ) +from backend.app.db.functions.uploaded_files_functions import ( + get_uploaded_file_by_listing_type_and_source, +) +from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum from backend.ecmk_fetcher.address_list import ( PropertyRow, extract_addresses_from_spreadsheet, @@ -20,7 +24,11 @@ from backend.ecmk_fetcher.browser import ( go_to_next_page, login, ) -from backend.ecmk_fetcher.reports import REPORT_TYPES, build_property_id +from backend.ecmk_fetcher.reports import ( + REPORT_TYPES, + build_property_id, + map_report_type_to_db_file_type, +) from backend.ecmk_fetcher.sharepoint import upload_file_to_sharepoint from utils.logger import setup_logger from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -48,6 +56,8 @@ def run_job() -> None: sharepoint_base_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Assessments" + # s3_bucket: str = "retrofit-energy-assessments-dev" + with sync_playwright() as p: browser: Browser = p.chromium.launch(headless=True) context: BrowserContext = browser.new_context() @@ -92,12 +102,29 @@ def run_job() -> None: sharepoint_address: str = property_row.address - # Check whether files have already been processed before continuing with this property - # hubspot_listing_id: str = property_row.listing_id - go_to_assessment_details(page, row) for report_type in REPORT_TYPES: + hubspot_listing_id: str = property_row.listing_id + try: + db_file_type: FileTypeEnum = ( + map_report_type_to_db_file_type(report_type) + ) + + except ValueError: + logger.error( + f"Unknown report type {report_type}, skipping file" + ) + continue + + if get_uploaded_file_by_listing_type_and_source( + hubspot_listing_id=int(hubspot_listing_id), + file_type=db_file_type, + file_source=FileSourceEnum.ECMK, + ): + logger.debug("File already uploaded to s3, skipping") + continue + file_path: str | None = download_with_retry( page, report_type ) diff --git a/backend/ecmk_fetcher/reports.py b/backend/ecmk_fetcher/reports.py index a8f12792..d8d11d50 100644 --- a/backend/ecmk_fetcher/reports.py +++ b/backend/ecmk_fetcher/reports.py @@ -1,5 +1,7 @@ from enum import Enum +from backend.app.db.models.uploaded_file import FileTypeEnum + class FileDownloadButtonType(Enum): ASSESSOR_HUB_SITENOTE_REPORT = 11 @@ -15,6 +17,16 @@ REPORT_TYPES = [ ] +def map_report_type_to_db_file_type(report_type: int) -> FileTypeEnum: + match report_type: + case FileDownloadButtonType.ASSESSOR_HUB_SITENOTE_REPORT.value: + return FileTypeEnum.ECMK_SITE_NOTE + case FileDownloadButtonType.SITENOTE_REPORT.value: + return FileTypeEnum.ECMK_RD_SAP_SITE_NOTE + case _: + raise ValueError("Unknown report type") + + def build_report_selector(report_type: int) -> str: return f"a.download-report-btn[data-report-type='{report_type}']" From d229e2faf8a675d149fdb8bab0e16ef69f617a7e Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 7 Apr 2026 14:55:43 +0000 Subject: [PATCH 05/43] upload file to s3 and update db after doing so --- backend/ecmk_fetcher/processor.py | 16 ++++++++-- backend/ecmk_fetcher/sharepoint.py | 20 ------------ backend/ecmk_fetcher/upload.py | 49 ++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 22 deletions(-) delete mode 100644 backend/ecmk_fetcher/sharepoint.py create mode 100644 backend/ecmk_fetcher/upload.py diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py index dc52c342..0ca53c4c 100644 --- a/backend/ecmk_fetcher/processor.py +++ b/backend/ecmk_fetcher/processor.py @@ -29,7 +29,10 @@ from backend.ecmk_fetcher.reports import ( build_property_id, map_report_type_to_db_file_type, ) -from backend.ecmk_fetcher.sharepoint import upload_file_to_sharepoint +from backend.ecmk_fetcher.upload import ( + upload_file_to_s3_and_update_db, + upload_file_to_sharepoint, +) from utils.logger import setup_logger from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient from utils.sharepoint.domna_sites import DomnaSites @@ -56,7 +59,7 @@ def run_job() -> None: sharepoint_base_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Assessments" - # s3_bucket: str = "retrofit-energy-assessments-dev" + s3_bucket: str = "retrofit-energy-assessments-dev" with sync_playwright() as p: browser: Browser = p.chromium.launch(headless=True) @@ -146,6 +149,15 @@ def run_job() -> None: logger.info( f"Successfully loaded {os.path.basename(file_path)} to sharepoint for {address}" ) + + # Upload to s3 and update db + upload_file_to_s3_and_update_db( + bucket=s3_bucket, + file_path=file_path, + hubspot_listing_id=hubspot_listing_id, + file_type=db_file_type, + ) + except Exception: raise finally: diff --git a/backend/ecmk_fetcher/sharepoint.py b/backend/ecmk_fetcher/sharepoint.py deleted file mode 100644 index 79db1294..00000000 --- a/backend/ecmk_fetcher/sharepoint.py +++ /dev/null @@ -1,20 +0,0 @@ -import os - -from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient - - -def upload_file_to_sharepoint( - client: DomnaSharepointClient, - file_path: str, - base_path: str, - subpath: str, -) -> None: - filename = os.path.basename(file_path) - - full_path = f"{base_path}/{subpath}/1. Retrofit Assessment/A. Assessment" - - client.upload_file( - file_path=file_path, - sharepoint_path=full_path, - file_name=filename, - ) diff --git a/backend/ecmk_fetcher/upload.py b/backend/ecmk_fetcher/upload.py new file mode 100644 index 00000000..00e2ec32 --- /dev/null +++ b/backend/ecmk_fetcher/upload.py @@ -0,0 +1,49 @@ +from datetime import datetime, timezone +import os + +from backend.app.db.connection import db_session +from backend.app.db.models.uploaded_file import ( + FileSourceEnum, + FileTypeEnum, + UploadedFile, +) +from utils.s3 import upload_file_to_s3 +from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient + + +def upload_file_to_sharepoint( + client: DomnaSharepointClient, + file_path: str, + base_path: str, + subpath: str, +) -> None: + filename = os.path.basename(file_path) + + full_path = f"{base_path}/{subpath}/1. Retrofit Assessment/A. Assessment" + + client.upload_file( + file_path=file_path, + sharepoint_path=full_path, + file_name=filename, + ) + + +def upload_file_to_s3_and_update_db( + bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum +) -> None: + key: str = f"documents/hubspot_listing_id/{hubspot_listing_id}" + upload_file_to_s3(file_path, bucket, key) + + uploaded_file = UploadedFile( + s3_file_bucket=bucket, + s3_file_key=key, + s3_upload_timestamp=datetime.now(timezone.utc), + hubspot_listing_id=hubspot_listing_id, + file_source=FileSourceEnum.ECMK.value, + file_type=file_type, + ) + + with db_session() as session: + # TODO: we should do multiple files at once to reduce db trips + session.add(uploaded_file) + session.commit() From 7cd4d4c5b3f55a97c84fdc6e3a5733ee7c76aa00 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 07:42:00 +0000 Subject: [PATCH 06/43] bug fixes to get runner working --- backend/app/db/models/uploaded_file.py | 16 ++++++++++++++-- backend/ecmk_fetcher/upload.py | 6 ++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py index 8decfd1b..71763790 100644 --- a/backend/app/db/models/uploaded_file.py +++ b/backend/app/db/models/uploaded_file.py @@ -39,9 +39,21 @@ class UploadedFile(Base): hubspot_listing_id = Column(BigInteger, nullable=True) file_type = Column( - SqlEnum(FileTypeEnum, name="file_type", create_type=False), nullable=True + SqlEnum( + FileTypeEnum, + name="file_type", + create_type=False, + values_callable=lambda enum_cls: [e.value for e in enum_cls], + ), + nullable=True, ) file_source = Column( - SqlEnum(FileSourceEnum, name="file_source", create_type=False), nullable=True + SqlEnum( + FileSourceEnum, + name="file_source", + create_type=False, + values_callable=lambda enum_cls: [e.value for e in enum_cls], + ), + nullable=True, ) diff --git a/backend/ecmk_fetcher/upload.py b/backend/ecmk_fetcher/upload.py index 00e2ec32..0a744e53 100644 --- a/backend/ecmk_fetcher/upload.py +++ b/backend/ecmk_fetcher/upload.py @@ -31,7 +31,9 @@ def upload_file_to_sharepoint( def upload_file_to_s3_and_update_db( bucket: str, file_path: str, hubspot_listing_id: str, file_type: FileTypeEnum ) -> None: - key: str = f"documents/hubspot_listing_id/{hubspot_listing_id}" + filename: str = os.path.basename(file_path) + key: str = f"documents/hubspot_listing_id/{hubspot_listing_id}/{filename}" + upload_file_to_s3(file_path, bucket, key) uploaded_file = UploadedFile( @@ -40,7 +42,7 @@ def upload_file_to_s3_and_update_db( s3_upload_timestamp=datetime.now(timezone.utc), hubspot_listing_id=hubspot_listing_id, file_source=FileSourceEnum.ECMK.value, - file_type=file_type, + file_type=file_type.value, ) with db_session() as session: From 09ee2699b6eeeccee8c62b8505edea919a8927c5 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 07:43:09 +0000 Subject: [PATCH 07/43] remove accidentally committed lock file --- ...rm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx# | 1 - 1 file changed, 1 deletion(-) delete mode 100644 backend/ecmk_fetcher/.~lock.hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx# diff --git a/backend/ecmk_fetcher/.~lock.hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx# b/backend/ecmk_fetcher/.~lock.hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx# deleted file mode 100644 index 4b57053e..00000000 --- a/backend/ecmk_fetcher/.~lock.hubspot-crm-exports-southern-ra-lite-programme-3103-2026-03-31-2.xlsx# +++ /dev/null @@ -1 +0,0 @@ -,daniel,daniel-Dell-15-DC15250,07.04.2026 11:47,/home/daniel/snap/onlyoffice-desktopeditors/1067/.local/share/onlyoffice; \ No newline at end of file From 9471854dfba062be4b371ad8670a04a7bcf40d53 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 08:27:12 +0000 Subject: [PATCH 08/43] dockerfile, requirements, and local handler --- backend/ecmk_fetcher/handler/Dockerfile | 26 +++++++++++++++++++ backend/ecmk_fetcher/handler/handler.py | 4 +++ backend/ecmk_fetcher/handler/requirements.txt | 12 +++++++++ .../local_handler/docker-compose.yml | 11 ++++++++ .../local_handler/invoke_local_lambda.py | 26 +++++++++++++++++++ backend/ecmk_fetcher/processor.py | 3 ++- 6 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 backend/ecmk_fetcher/handler/Dockerfile create mode 100644 backend/ecmk_fetcher/handler/requirements.txt create mode 100644 backend/ecmk_fetcher/local_handler/docker-compose.yml create mode 100644 backend/ecmk_fetcher/local_handler/invoke_local_lambda.py diff --git a/backend/ecmk_fetcher/handler/Dockerfile b/backend/ecmk_fetcher/handler/Dockerfile new file mode 100644 index 00000000..2b6007d9 --- /dev/null +++ b/backend/ecmk_fetcher/handler/Dockerfile @@ -0,0 +1,26 @@ +FROM mcr.microsoft.com/playwright/python:v1.58.0-jammy + +# Install AWS Lambda RIE +ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/local/bin/aws-lambda-rie +RUN chmod +x /usr/local/bin/aws-lambda-rie + +# Set working directory (Lambda task root) +WORKDIR /var/task + +COPY backend/ecmk_fetcher/handler/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY utils/ utils/ +COPY backend/ backend/ +COPY datatypes/ datatypes/ + +# Local lambda entrypoint +ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] + +#AWS lambda entrypoint +# ENTRYPOINT ["python", "-m", "awslambdaric"] + +# ----------------------------- +# Lambda handler +# ----------------------------- +CMD ["backend.ecmk_fetcher.handler.handler.handler"] \ No newline at end of file diff --git a/backend/ecmk_fetcher/handler/handler.py b/backend/ecmk_fetcher/handler/handler.py index 4ce3a949..b777cc9f 100644 --- a/backend/ecmk_fetcher/handler/handler.py +++ b/backend/ecmk_fetcher/handler/handler.py @@ -1,9 +1,13 @@ from typing import Any, Mapping from backend.ecmk_fetcher.processor import run_job +from utils.logger import setup_logger + +logger = setup_logger() def handler(event: Mapping[str, Any], context: Any) -> None: + logger.info("Entered handler") run_job() diff --git a/backend/ecmk_fetcher/handler/requirements.txt b/backend/ecmk_fetcher/handler/requirements.txt new file mode 100644 index 00000000..2692484e --- /dev/null +++ b/backend/ecmk_fetcher/handler/requirements.txt @@ -0,0 +1,12 @@ +awslambdaric +playwright==1.58.0 +msal +openpyxl +sqlalchemy==2.0.36 +sqlmodel +pytz==2024.2 +psycopg2-binary==2.9.10 +pydantic-settings==2.6.0 +boto3==1.35.44 +pandas==2.2.2 +numpy<2.0 \ No newline at end of file diff --git a/backend/ecmk_fetcher/local_handler/docker-compose.yml b/backend/ecmk_fetcher/local_handler/docker-compose.yml new file mode 100644 index 00000000..fd642499 --- /dev/null +++ b/backend/ecmk_fetcher/local_handler/docker-compose.yml @@ -0,0 +1,11 @@ +version: "3.9" + +services: + ecmk-fetcher-lambda: + build: + context: ../../../ + dockerfile: backend/ecmk_fetcher/handler/Dockerfile + ports: + - "9000:8080" + env_file: + - ../../../.env \ No newline at end of file diff --git a/backend/ecmk_fetcher/local_handler/invoke_local_lambda.py b/backend/ecmk_fetcher/local_handler/invoke_local_lambda.py new file mode 100644 index 00000000..ba76301e --- /dev/null +++ b/backend/ecmk_fetcher/local_handler/invoke_local_lambda.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +import json +import requests + +HOST = "localhost" +PORT = "9000" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" + +payload = { + "Records": [ + { + "body": json.dumps( + { + "test": 123456, + } + ) + } + ] +} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) diff --git a/backend/ecmk_fetcher/processor.py b/backend/ecmk_fetcher/processor.py index 0ca53c4c..2f122080 100644 --- a/backend/ecmk_fetcher/processor.py +++ b/backend/ecmk_fetcher/processor.py @@ -41,7 +41,8 @@ logger = setup_logger() def run_job() -> None: - username: str = "" + + username: str = "" # TODO: get from github secrets password: str = "" property_list_file: str = ( From 5df2318bb5dfa5812835da530416b91932983fd9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 08:44:45 +0000 Subject: [PATCH 09/43] start defining infrastructure including ecr --- backend/ecmk_fetcher/handler/Dockerfile | 4 +- .../terraform/lambda/ecmk_to_ara/main.tf | 27 ++++++++++++++ .../terraform/lambda/ecmk_to_ara/provider.tf | 16 ++++++++ .../terraform/lambda/ecmk_to_ara/variables.tf | 37 +++++++++++++++++++ infrastructure/terraform/shared/main.tf | 14 +++++++ 5 files changed, 96 insertions(+), 2 deletions(-) create mode 100644 infrastructure/terraform/lambda/ecmk_to_ara/main.tf create mode 100644 infrastructure/terraform/lambda/ecmk_to_ara/provider.tf create mode 100644 infrastructure/terraform/lambda/ecmk_to_ara/variables.tf diff --git a/backend/ecmk_fetcher/handler/Dockerfile b/backend/ecmk_fetcher/handler/Dockerfile index 2b6007d9..fa2126fd 100644 --- a/backend/ecmk_fetcher/handler/Dockerfile +++ b/backend/ecmk_fetcher/handler/Dockerfile @@ -15,10 +15,10 @@ COPY backend/ backend/ COPY datatypes/ datatypes/ # Local lambda entrypoint -ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] +# ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] #AWS lambda entrypoint -# ENTRYPOINT ["python", "-m", "awslambdaric"] +ENTRYPOINT ["python", "-m", "awslambdaric"] # ----------------------------- # Lambda handler diff --git a/infrastructure/terraform/lambda/ecmk_to_ara/main.tf b/infrastructure/terraform/lambda/ecmk_to_ara/main.tf new file mode 100644 index 00000000..357c2f87 --- /dev/null +++ b/infrastructure/terraform/lambda/ecmk_to_ara/main.tf @@ -0,0 +1,27 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +module "lambda" { + source = "../../modules/lambda_with_sqs" + + name = "ecmk_to_ara" #"address2uprn" for example + stage = var.stage + + image_uri = local.image_uri + + # Optional: Set maximum_concurrency to limit concurrent SQS-triggered invocations (2-1000) + maximum_concurrency = var.maximum_concurrency + + batch_size = var.batch_size + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + } +} diff --git a/infrastructure/terraform/lambda/ecmk_to_ara/provider.tf b/infrastructure/terraform/lambda/ecmk_to_ara/provider.tf new file mode 100644 index 00000000..87a94150 --- /dev/null +++ b/infrastructure/terraform/lambda/ecmk_to_ara/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "ecmk-to-ara-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/ecmk_to_ara/variables.tf b/infrastructure/terraform/lambda/ecmk_to_ara/variables.tf new file mode 100644 index 00000000..984e3908 --- /dev/null +++ b/infrastructure/terraform/lambda/ecmk_to_ara/variables.tf @@ -0,0 +1,37 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "maximum_concurrency" { + type = number + default = 2 + description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." +} + +variable "batch_size" { + type = number + default = 1 +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 9d272eb6..47866c92 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -538,6 +538,20 @@ module "pashub_to_ara_registry" { stage = var.stage } +################################################ +# ECMK to Ara – Lambda +################################################ +module "ecmk_to_ara_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "ecmk-to-ara-terraform-state" +} + +module "ecmk_to_ara_registry" { + source = "../modules/container_registry" + name = "ecmk_to_ara" + stage = var.stage +} + ################################################ # Engine – Lambda ECR ################################################ From 942c2923daf5a18d528ef47847bc2b2b2b8d512e Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 10:55:48 +0000 Subject: [PATCH 10/43] initial setup --- .../hubspot_trigger_orchestrator/handler.py | 61 +++++++++++++++++++ .../hubspot_deal_differ.py | 21 +++++++ ...ot_trigger_orchestrator_trigger_request.py | 5 ++ etl/hubspot/hubspotDataTodB.py | 6 +- etl/hubspot/scripts/scraper/main.py | 10 +-- 5 files changed, 96 insertions(+), 7 deletions(-) create mode 100644 backend/hubspot_trigger_orchestrator/handler.py create mode 100644 backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py create mode 100644 backend/hubspot_trigger_orchestrator/hubspot_trigger_orchestrator_trigger_request.py diff --git a/backend/hubspot_trigger_orchestrator/handler.py b/backend/hubspot_trigger_orchestrator/handler.py new file mode 100644 index 00000000..1f83ed80 --- /dev/null +++ b/backend/hubspot_trigger_orchestrator/handler.py @@ -0,0 +1,61 @@ +import json +from typing import Any, Dict, Mapping, Optional + +from backend.app.db.models.organisation import HubspotDealData +from backend.hubspot_trigger_orchestrator.hubspot_deal_differ import HubspotDealDiffer +from backend.hubspot_trigger_orchestrator.hubspot_trigger_orchestrator_trigger_request import ( + HubspotTriggerOrchestratorTriggerRequest, +) +from backend.utils.subtasks import task_handler +from etl.hubspot.hubspotClient import HubspotClient +from etl.hubspot.hubspotDataTodB import HubspotDataToDb +from utils.logger import setup_logger + +logger = setup_logger() + + +@task_handler() +def handler(event: Mapping[str, Any], context: Any) -> None: + + db_client = HubspotDataToDb() + hubspot_client = HubspotClient() + + for record in event.get("Records", []): + body_dict = json.loads(record["body"]) + + logger.debug("Validating request body") + payload = HubspotTriggerOrchestratorTriggerRequest.model_validate(body_dict) + logger.debug("Successfully validated request body") + + hubspot_deal_id: str = payload.hubspot_deal_id + + db_deal: Optional[HubspotDealData] = db_client.find_deal_with_deal_id( + hubspot_deal_id + ) + if not db_deal: + # new hubspot deal, no diffing to do + # TODO: trigger hubspot to db ETL + return + + hubspot_deal: Dict[str, str] + company: Optional[str] + listing: Optional[dict[str, str]] + + hubspot_deal, company, listing = hubspot_client.get_deal_info_for_db( + hubspot_deal_id + ) + + if HubspotDealDiffer.check_for_pashub_trigger( + new_deal=hubspot_deal, old_deal=db_deal + ): + # TODO: trigger pashub file fetcher + return + + if HubspotDealDiffer.check_for_db_update_trigger( + new_deal=hubspot_deal, + new_company=company, + new_listing=listing, + old_deal=db_deal, + ): + # TODO: trigger db upsert + return diff --git a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py new file mode 100644 index 00000000..9d66c637 --- /dev/null +++ b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py @@ -0,0 +1,21 @@ +from typing import Dict, Optional + +from backend.app.db.models.organisation import HubspotDealData + + +class HubspotDealDiffer: + + @staticmethod + def check_for_pashub_trigger( + new_deal: Dict[str, str], old_deal: HubspotDealData + ) -> bool: + raise NotImplementedError + + @staticmethod + def check_for_db_update_trigger( + new_deal: Dict[str, str], + new_company: Optional[str], + new_listing: Optional[Dict[str, str]], + old_deal: HubspotDealData, + ) -> bool: + raise NotImplementedError diff --git a/backend/hubspot_trigger_orchestrator/hubspot_trigger_orchestrator_trigger_request.py b/backend/hubspot_trigger_orchestrator/hubspot_trigger_orchestrator_trigger_request.py new file mode 100644 index 00000000..1adfa07c --- /dev/null +++ b/backend/hubspot_trigger_orchestrator/hubspot_trigger_orchestrator_trigger_request.py @@ -0,0 +1,5 @@ +from pydantic import BaseModel + + +class HubspotTriggerOrchestratorTriggerRequest(BaseModel): + hubspot_deal_id: str diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 6325efc2..36167bf0 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -74,7 +74,7 @@ class HubspotDataToDb: .all() ) - def find_deal_with_deal_id(self, deal_id): + def find_deal_with_deal_id(self, deal_id: str) -> Optional[HubspotDealData]: with db_read_session() as session: return ( session.query(HubspotDealData) @@ -477,7 +477,9 @@ class HubspotDataToDb: dealname=deal_data.get("dealname"), dealstage=deal_data.get("dealstage"), listing_id=listing.get("listing_id", None) if listing else None, - landlord_property_id=listing.get("owner_property_id") if listing else None, + landlord_property_id=( + listing.get("owner_property_id") if listing else None + ), uprn=listing.get("national_uprn") if listing else None, outcome=deal_data.get("outcome"), outcome_notes=deal_data.get("outcome_notes"), diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 4f71c6d0..a003ad28 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -16,9 +16,9 @@ def handler(body: dict[str, Any], context: Any) -> None: hubspot: HubspotClient = HubspotClient() dbloader: HubspotDataToDb = HubspotDataToDb() - deal = dbloader.find_deal_with_deal_id(hubspot_deal_id) - if deal: - dbloader.update_deal_with_checks(deal, hubspot) + db_deal = dbloader.find_deal_with_deal_id(hubspot_deal_id) + if db_deal: + dbloader.update_deal_with_checks(db_deal, hubspot) else: - deal, company, listing = hubspot.get_deal_info_for_db(hubspot_deal_id) - dbloader.upsert_deal(deal, company, listing, hubspot) + hubspot_deal, company, listing = hubspot.get_deal_info_for_db(hubspot_deal_id) + dbloader.upsert_deal(hubspot_deal, company, listing, hubspot) From d6bfef59aff595c9aa3baa9a2aaccd4581d69b4d Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 14:48:17 +0000 Subject: [PATCH 11/43] remove db update from hubspot client get method --- .../hubspot_trigger_orchestrator/handler.py | 2 +- etl/hubspot/hubspotClient.py | 10 ++---- etl/hubspot/hubspotDataTodB.py | 23 ++++++++------ etl/hubspot/scripts/scraper/main.py | 31 ++++++++++++++----- .../tests/test_hubspot_client_integration.py | 2 +- 5 files changed, 41 insertions(+), 27 deletions(-) diff --git a/backend/hubspot_trigger_orchestrator/handler.py b/backend/hubspot_trigger_orchestrator/handler.py index 1f83ed80..c79fe2b9 100644 --- a/backend/hubspot_trigger_orchestrator/handler.py +++ b/backend/hubspot_trigger_orchestrator/handler.py @@ -41,7 +41,7 @@ def handler(event: Mapping[str, Any], context: Any) -> None: company: Optional[str] listing: Optional[dict[str, str]] - hubspot_deal, company, listing = hubspot_client.get_deal_info_for_db( + hubspot_deal, company, listing = hubspot_client.get_deal_company_listing( hubspot_deal_id ) diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index a9ea535d..777ad482 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -26,10 +26,10 @@ from hubspot.crm.associations.v4.models import ( # type: ignore[reportMissingTy ForwardPaging as AssociationsPaging, NextPage as AssociationsPagingNext, ) -from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb from backend.app.config import get_settings +from etl.hubspot.company_data import CompanyData from utils.logger import setup_logger import mimetypes @@ -279,18 +279,12 @@ class HubspotClient: deal_info: dict[str, str] = cast(dict[str, str], deal.properties) # type: ignore[reportUnknownMemberType] return deal_info - def get_deal_info_for_db( + def get_deal_company_listing( self, deal_id: str ) -> tuple[dict[str, str], Optional[str], Optional[dict[str, str]]]: deal: dict[str, str] = self.from_deal_id_get_info(deal_id) company: Optional[str] = self.from_deal_id_get_associated_company_id(deal_id) - - if company: - company_data: CompanyData = self.get_company_information(company) - dbloader: HubspotDataToDb = HubspotDataToDb() - dbloader.upsert_company(company_data) - listing: Optional[dict[str, str]] = self.from_deal_id_get_associated_listing( deal_id ) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 36167bf0..49dd1685 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -2,17 +2,14 @@ from backend.app.db.connection import db_read_session from backend.app.db.models.organisation import Organisation, HubspotDealData from sqlmodel import select from datetime import datetime, timezone -from typing import TypedDict, Optional +from typing import Dict, Optional +from etl.hubspot.company_data import CompanyData +from etl.hubspot.hubspotClient import HubspotClient from etl.hubspot.s3_uploader import S3Uploader import hashlib import os -class CompanyData(TypedDict): - hs_object_id: str - name: str - - class HubspotDataToDb: def __init__(self): self.s3 = S3Uploader( @@ -98,7 +95,9 @@ class HubspotDataToDb: sha256.update(chunk) return sha256.hexdigest() - def update_deal_with_checks(self, deal_in_db, hubspot_client) -> bool: + def update_deal_with_checks( + self, deal_in_db: HubspotDealData, hubspot_client: HubspotClient + ) -> bool: """ Checks if a deal needs updating and syncs it with HubSpot. Also handles major_condition_issue_photos file upload to S3 with integrity check. @@ -112,7 +111,7 @@ class HubspotDataToDb: print(f"🔍 Checking if deal needs updating (deal_id={deal_in_db.deal_id})") - hs_deal, hs_company_id, hs_listing = hubspot_client.get_deal_info_for_db( + hs_deal, hs_company_id, hs_listing = hubspot_client.get_deal_company_listing( deal_in_db.deal_id ) @@ -346,7 +345,13 @@ class HubspotDataToDb: return True - def upsert_deal(self, deal_data, company, listing, hubspot_client): + def upsert_deal( + self, + deal_data: Dict[str, str], + company: Optional[str], + listing: Optional[dict[str, str]], + hubspot_client: HubspotClient, + ): """ Inserts or updates a deal record. Also uploads photos if present and adds S3 URL. diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index a003ad28..e5658a20 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -1,7 +1,8 @@ +from backend.app.db.models.organisation import HubspotDealData from etl.hubspot.hubspotClient import HubspotClient -from etl.hubspot.hubspotDataTodB import HubspotDataToDb +from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb from backend.utils.subtasks import task_handler -from typing import Any +from typing import Any, Dict, Optional @task_handler() @@ -14,11 +15,25 @@ def handler(body: dict[str, Any], context: Any) -> None: ) hubspot_deal_id = "327170793707" - hubspot: HubspotClient = HubspotClient() - dbloader: HubspotDataToDb = HubspotDataToDb() - db_deal = dbloader.find_deal_with_deal_id(hubspot_deal_id) + hubspot_client = HubspotClient() + db_client = HubspotDataToDb() + db_deal: Optional[HubspotDealData] = db_client.find_deal_with_deal_id( + hubspot_deal_id + ) if db_deal: - dbloader.update_deal_with_checks(db_deal, hubspot) + db_client.update_deal_with_checks(db_deal, hubspot_client) else: - hubspot_deal, company, listing = hubspot.get_deal_info_for_db(hubspot_deal_id) - dbloader.upsert_deal(hubspot_deal, company, listing, hubspot) + hubspot_deal: Dict[str, str] + company: Optional[str] + listing: Optional[dict[str, str]] + + hubspot_deal, company, listing = hubspot_client.get_deal_company_listing( + hubspot_deal_id + ) + + if company: + company_data: CompanyData = hubspot_client.get_company_information(company) + db_client: HubspotDataToDb = HubspotDataToDb() + db_client.upsert_company(company_data) + + db_client.upsert_deal(hubspot_deal, company, listing, hubspot_client) diff --git a/etl/hubspot/tests/test_hubspot_client_integration.py b/etl/hubspot/tests/test_hubspot_client_integration.py index a3d8ae54..d0dd818a 100644 --- a/etl/hubspot/tests/test_hubspot_client_integration.py +++ b/etl/hubspot/tests/test_hubspot_client_integration.py @@ -71,7 +71,7 @@ class TestHubspotClientIntegration: def test_get_deal_info_for_db(self, client: HubspotClient): deal_id: str = "263490768079" - deal, company, listing = client.get_deal_info_for_db(deal_id) + deal, company, listing = client.get_deal_company_listing(deal_id) assert "dealname" in deal assert "dealstage" in deal From b968fbab448c39aaabd21251c406f5ca7c0a8f83 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 14:48:29 +0000 Subject: [PATCH 12/43] include missing file --- etl/hubspot/company_data.py | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 etl/hubspot/company_data.py diff --git a/etl/hubspot/company_data.py b/etl/hubspot/company_data.py new file mode 100644 index 00000000..13b2ee88 --- /dev/null +++ b/etl/hubspot/company_data.py @@ -0,0 +1,6 @@ +from typing import TypedDict + + +class CompanyData(TypedDict): + hs_object_id: str + name: str From 540054e12f83514aa5baf2861235514e4450bae1 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 14:53:11 +0000 Subject: [PATCH 13/43] rename method --- backend/hubspot_trigger_orchestrator/handler.py | 4 ++-- etl/hubspot/hubspotClient.py | 2 +- etl/hubspot/hubspotDataTodB.py | 4 ++-- etl/hubspot/scripts/scraper/main.py | 4 ++-- etl/hubspot/tests/test_hubspot_client_integration.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/backend/hubspot_trigger_orchestrator/handler.py b/backend/hubspot_trigger_orchestrator/handler.py index c79fe2b9..38724812 100644 --- a/backend/hubspot_trigger_orchestrator/handler.py +++ b/backend/hubspot_trigger_orchestrator/handler.py @@ -41,8 +41,8 @@ def handler(event: Mapping[str, Any], context: Any) -> None: company: Optional[str] listing: Optional[dict[str, str]] - hubspot_deal, company, listing = hubspot_client.get_deal_company_listing( - hubspot_deal_id + hubspot_deal, company, listing = ( + hubspot_client.get_deal_and_company_and_listing(hubspot_deal_id) ) if HubspotDealDiffer.check_for_pashub_trigger( diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index 777ad482..cedaa7f3 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -279,7 +279,7 @@ class HubspotClient: deal_info: dict[str, str] = cast(dict[str, str], deal.properties) # type: ignore[reportUnknownMemberType] return deal_info - def get_deal_company_listing( + def get_deal_and_company_and_listing( self, deal_id: str ) -> tuple[dict[str, str], Optional[str], Optional[dict[str, str]]]: diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 49dd1685..e7008618 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -111,8 +111,8 @@ class HubspotDataToDb: print(f"🔍 Checking if deal needs updating (deal_id={deal_in_db.deal_id})") - hs_deal, hs_company_id, hs_listing = hubspot_client.get_deal_company_listing( - deal_in_db.deal_id + hs_deal, hs_company_id, hs_listing = ( + hubspot_client.get_deal_and_company_and_listing(deal_in_db.deal_id) ) # Soft compare key fields diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index e5658a20..d8d4a357 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -27,8 +27,8 @@ def handler(body: dict[str, Any], context: Any) -> None: company: Optional[str] listing: Optional[dict[str, str]] - hubspot_deal, company, listing = hubspot_client.get_deal_company_listing( - hubspot_deal_id + hubspot_deal, company, listing = ( + hubspot_client.get_deal_and_company_and_listing(hubspot_deal_id) ) if company: diff --git a/etl/hubspot/tests/test_hubspot_client_integration.py b/etl/hubspot/tests/test_hubspot_client_integration.py index d0dd818a..0f4b425c 100644 --- a/etl/hubspot/tests/test_hubspot_client_integration.py +++ b/etl/hubspot/tests/test_hubspot_client_integration.py @@ -71,7 +71,7 @@ class TestHubspotClientIntegration: def test_get_deal_info_for_db(self, client: HubspotClient): deal_id: str = "263490768079" - deal, company, listing = client.get_deal_company_listing(deal_id) + deal, company, listing = client.get_deal_and_company_and_listing(deal_id) assert "dealname" in deal assert "dealstage" in deal From 21ca0d7649bc2ff9be36dd236d80b3abed391894 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 15:25:50 +0000 Subject: [PATCH 14/43] =?UTF-8?q?diff=20checker=20for=20pashub=20trigger?= =?UTF-8?q?=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pytest.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytest.ini b/pytest.ini index db7afaf5..792b27e0 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,6 @@ pythonpath = . log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests markers = integration: mark a test as an integration test From 39f37f1668907db733ffa602e1a2b84c9e766fd0 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 15:26:32 +0000 Subject: [PATCH 15/43] =?UTF-8?q?diff=20checker=20for=20pashub=20trigger?= =?UTF-8?q?=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_hubspot_deal_differ.py | 424 ++++++++++++++++++ 1 file changed, 424 insertions(+) create mode 100644 backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py diff --git a/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py new file mode 100644 index 00000000..ddca766a --- /dev/null +++ b/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py @@ -0,0 +1,424 @@ +from datetime import datetime +from typing import Dict +import uuid + +from backend.app.db.models.organisation import HubspotDealData +from backend.hubspot_trigger_orchestrator.hubspot_deal_differ import HubspotDealDiffer + + +def test_pashub_trigger__outcome_note_added__returns_false() -> None: + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "outcome_notes": "test note", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = False + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__pashub_link_changed__returns_true() -> None: + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + pashub_link="www.google.co.uk", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "pashub_link": "www.bbc.co.uk", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = True + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__coordination_completed_and_pashub_link_set__returns_true() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + pashub_link="www.google.co.uk", + coordination_status="random", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "coordination_status": "v1 ioe/mtp complete", + "pashub_link": "www.google.co.uk", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = True + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__coordination_completed_and_pashub_link_set__returns_true_2() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + pashub_link="www.google.co.uk", + coordination_status="random", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "coordination_status": "v2 ioe/mtp complete", + "pashub_link": "www.google.co.uk", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = True + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__coordination_completed_and_pashub_link_not_set__returns_false() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + coordination_status="random", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "coordination_status": "v2 ioe/mtp complete", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = False + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__coordination_status_not_completed_and_pashub_link_set__returns_false() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + pashub_link="www.google.co.uk", + coordination_status="random", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "coordination_status": "not complete", + "pashub_link": "www.google.co.uk", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = False + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__design_completed_and_pashub_link_set__returns_true() -> None: + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + pashub_link="www.google.co.uk", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "pashub_link": "www.google.co.uk", + "design_status": "uploaded", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = True + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__design_completed_and_pashub_link_not_set__returns_false() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "design_status": "uploaded", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = False + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__design_not_completed_and_pashub_link_set__returns_false() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + pashub_link="www.google.co.uk", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "pashub_link": "www.google.co.uk", + "design_status": "not uploaded", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = False + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__lodgement_completed_and_pashub_link_set__returns_true() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + pashub_link="www.google.co.uk", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "pashub_link": "www.google.co.uk", + "lodgement_status": "lodgement complete", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = True + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__lodgement_completed_and_pashub_link_set__returns_true_2() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + pashub_link="www.google.co.uk", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "pashub_link": "www.google.co.uk", + "lodgement_status": "measures lodged", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = True + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__lodgement_completed_and_pashub_link_not_set__returns_false() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "design_status": "lodgement complete", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = False + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output + + +def test_pashub_trigger__lodgement_not_completed_and_pashub_link_set__returns_false() -> ( + None +): + # arrange + deal_id = uuid.uuid4() + + old_deal = HubspotDealData( + id=deal_id, + deal_id="1", + pashub_link="www.google.co.uk", + created_at=datetime(2025, 12, 1, 12, 0, 0), + updated_at=datetime(2025, 12, 1, 12, 0, 0), + ) + new_deal: Dict[str, str] = { + "id": str(deal_id), + "deal_id": "1", + "pashub_link": "www.google.co.uk", + "lodgement_status": "lodgement not complete", + "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + } + + expected_output = False + + # act + actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, old_deal=old_deal + ) + + # assert + assert actual_output == expected_output From 9f7448ac438cbfd6ece4f91d556fa58f2798abce Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 15:35:42 +0000 Subject: [PATCH 16/43] =?UTF-8?q?pashub=20trigger=20true=20if=20pashub=20l?= =?UTF-8?q?ink=20is=20changed=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hubspot_deal_differ.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py index 9d66c637..50f3af04 100644 --- a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py +++ b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py @@ -9,6 +9,23 @@ class HubspotDealDiffer: def check_for_pashub_trigger( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: + """ + Case 1: PasHub Link is updated + Case 2: Coordination is completed (and PasHub Link is populated) + Case 3: Design is completed (and PasHub Link is populated) + Case 4: Lodgement is completed (and PasHub Link is populated) + """ + new_pashub_link: Optional[str] = new_deal["pashub_link"] + # Case 1 + if not new_pashub_link: + return False + + if not old_deal.pashub_link: + return True + + if old_deal.pashub_link != new_pashub_link: + return True + raise NotImplementedError @staticmethod From ad2c979b155840f36a14b239aa1cccaa20e361ca Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 15:35:52 +0000 Subject: [PATCH 17/43] =?UTF-8?q?pashub=20trigger=20false=20if=20pashub=20?= =?UTF-8?q?link=20not=20set=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py index 50f3af04..ab2b667e 100644 --- a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py +++ b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py @@ -15,7 +15,7 @@ class HubspotDealDiffer: Case 3: Design is completed (and PasHub Link is populated) Case 4: Lodgement is completed (and PasHub Link is populated) """ - new_pashub_link: Optional[str] = new_deal["pashub_link"] + new_pashub_link: Optional[str] = new_deal.get("pashub_link", "") # Case 1 if not new_pashub_link: return False From 832bcd96e457a71453e1d4d2aa97a73dcba1b243 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 15:42:29 +0000 Subject: [PATCH 18/43] =?UTF-8?q?pashub=20trigger=20true=20if=20coordinati?= =?UTF-8?q?on=20complete=20and=20pashub=20link=20set=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hubspot_deal_differ.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py index ab2b667e..77208432 100644 --- a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py +++ b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional +from typing import Dict, List, Optional from backend.app.db.models.organisation import HubspotDealData @@ -15,7 +15,11 @@ class HubspotDealDiffer: Case 3: Design is completed (and PasHub Link is populated) Case 4: Lodgement is completed (and PasHub Link is populated) """ - new_pashub_link: Optional[str] = new_deal.get("pashub_link", "") + new_pashub_link: str = new_deal.get("pashub_link", "") + COORDINATION_COMPLETE: List[str] = [ + "v1 ioe/mtp complete", + "v2 ioe/mtp complete", + ] # Case 1 if not new_pashub_link: return False @@ -26,6 +30,16 @@ class HubspotDealDiffer: if old_deal.pashub_link != new_pashub_link: return True + # Case 2 + new_coordination_status: str = new_deal.get("coordination_status", "") + + if ( + new_coordination_status + and new_coordination_status in COORDINATION_COMPLETE + and new_coordination_status != old_deal.coordination_status + ): + return True + raise NotImplementedError @staticmethod From 0dfd3f5238e969d4d233135c857f2084461b84c5 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 15:45:41 +0000 Subject: [PATCH 19/43] =?UTF-8?q?pashub=20trigger=20true=20if=20design=20c?= =?UTF-8?q?omplete=20and=20pashub=20link=20set=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hubspot_deal_differ.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py index 77208432..ad20aca7 100644 --- a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py +++ b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py @@ -15,11 +15,14 @@ class HubspotDealDiffer: Case 3: Design is completed (and PasHub Link is populated) Case 4: Lodgement is completed (and PasHub Link is populated) """ - new_pashub_link: str = new_deal.get("pashub_link", "") COORDINATION_COMPLETE: List[str] = [ "v1 ioe/mtp complete", "v2 ioe/mtp complete", ] + RETROFIT_DESIGN_COMPLETE = "uploaded" + + new_pashub_link: str = new_deal.get("pashub_link", "") + # Case 1 if not new_pashub_link: return False @@ -40,6 +43,16 @@ class HubspotDealDiffer: ): return True + # Case 3 + new_design_status: str = new_deal.get("design_status", "") + + if ( + new_design_status + and new_design_status == RETROFIT_DESIGN_COMPLETE + and new_design_status != old_deal.design_status + ): + return True + raise NotImplementedError @staticmethod From 9da0cabb0ffcbb7338d5dd0c9796234202acbb0a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 15:52:41 +0000 Subject: [PATCH 20/43] =?UTF-8?q?pashub=20trigger=20true=20if=20lodgement?= =?UTF-8?q?=20complete=20and=20pashub=20link=20set=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hubspot_deal_differ.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py index ad20aca7..8f96ce73 100644 --- a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py +++ b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py @@ -20,6 +20,7 @@ class HubspotDealDiffer: "v2 ioe/mtp complete", ] RETROFIT_DESIGN_COMPLETE = "uploaded" + LODGEMENT_COMPLETE: List[str] = ["lodgement complete", "measures lodged"] new_pashub_link: str = new_deal.get("pashub_link", "") @@ -53,7 +54,17 @@ class HubspotDealDiffer: ): return True - raise NotImplementedError + # Case 4 + new_lodgement_status: str = new_deal.get("lodgement_status", "") + + if ( + new_lodgement_status + and new_lodgement_status in LODGEMENT_COMPLETE + and new_lodgement_status != old_deal.lodgement_status + ): + return True + + return False @staticmethod def check_for_db_update_trigger( From 2d0bc67731239d045c6f7106ba87e6a7b640b2ff Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 15:59:29 +0000 Subject: [PATCH 21/43] =?UTF-8?q?diff=20checker=20for=20pashub=20trigger?= =?UTF-8?q?=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../hubspot_deal_differ.py | 133 ++++++++++-------- 1 file changed, 72 insertions(+), 61 deletions(-) diff --git a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py index 8f96ce73..1dd4ed51 100644 --- a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py +++ b/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py @@ -4,67 +4,12 @@ from backend.app.db.models.organisation import HubspotDealData class HubspotDealDiffer: - - @staticmethod - def check_for_pashub_trigger( - new_deal: Dict[str, str], old_deal: HubspotDealData - ) -> bool: - """ - Case 1: PasHub Link is updated - Case 2: Coordination is completed (and PasHub Link is populated) - Case 3: Design is completed (and PasHub Link is populated) - Case 4: Lodgement is completed (and PasHub Link is populated) - """ - COORDINATION_COMPLETE: List[str] = [ - "v1 ioe/mtp complete", - "v2 ioe/mtp complete", - ] - RETROFIT_DESIGN_COMPLETE = "uploaded" - LODGEMENT_COMPLETE: List[str] = ["lodgement complete", "measures lodged"] - - new_pashub_link: str = new_deal.get("pashub_link", "") - - # Case 1 - if not new_pashub_link: - return False - - if not old_deal.pashub_link: - return True - - if old_deal.pashub_link != new_pashub_link: - return True - - # Case 2 - new_coordination_status: str = new_deal.get("coordination_status", "") - - if ( - new_coordination_status - and new_coordination_status in COORDINATION_COMPLETE - and new_coordination_status != old_deal.coordination_status - ): - return True - - # Case 3 - new_design_status: str = new_deal.get("design_status", "") - - if ( - new_design_status - and new_design_status == RETROFIT_DESIGN_COMPLETE - and new_design_status != old_deal.design_status - ): - return True - - # Case 4 - new_lodgement_status: str = new_deal.get("lodgement_status", "") - - if ( - new_lodgement_status - and new_lodgement_status in LODGEMENT_COMPLETE - and new_lodgement_status != old_deal.lodgement_status - ): - return True - - return False + COORDINATION_COMPLETE: List[str] = [ + "v1 ioe/mtp complete", + "v2 ioe/mtp complete", + ] + RETROFIT_DESIGN_COMPLETE = "uploaded" + LODGEMENT_COMPLETE: List[str] = ["lodgement complete", "measures lodged"] @staticmethod def check_for_db_update_trigger( @@ -74,3 +19,69 @@ class HubspotDealDiffer: old_deal: HubspotDealData, ) -> bool: raise NotImplementedError + + @staticmethod + def check_for_pashub_trigger( + new_deal: Dict[str, str], old_deal: HubspotDealData + ) -> bool: + new_pashub_link: str = new_deal.get("pashub_link", "") + + if not HubspotDealDiffer._has_valid_pashub_link(new_pashub_link): + return False + + if HubspotDealDiffer._new_or_updated_pashub_link(new_pashub_link, old_deal): + return True + + if HubspotDealDiffer._coordination_completed(new_deal, old_deal): + return True + + if HubspotDealDiffer._design_completed(new_deal, old_deal): + return True + + if HubspotDealDiffer._lodgement_completed(new_deal, old_deal): + return True + + return False + + @staticmethod + def _has_valid_pashub_link(new_pashub_link: str) -> bool: + return bool(new_pashub_link) + + @staticmethod + def _new_or_updated_pashub_link( + new_pashub_link: str, old_deal: HubspotDealData + ) -> bool: + if not old_deal.pashub_link: + return True + return old_deal.pashub_link != new_pashub_link + + @staticmethod + def _coordination_completed( + new_deal: Dict[str, str], old_deal: HubspotDealData + ) -> bool: + new_status: str = new_deal.get("coordination_status", "") + return ( + new_status != "" + and new_status in HubspotDealDiffer.COORDINATION_COMPLETE + and new_status != old_deal.coordination_status + ) + + @staticmethod + def _design_completed(new_deal: Dict[str, str], old_deal: HubspotDealData) -> bool: + new_status: str = new_deal.get("design_status", "") + return ( + new_status != "" + and new_status == HubspotDealDiffer.RETROFIT_DESIGN_COMPLETE + and new_status != old_deal.design_status + ) + + @staticmethod + def _lodgement_completed( + new_deal: Dict[str, str], old_deal: HubspotDealData + ) -> bool: + new_status: str = new_deal.get("lodgement_status", "") + return ( + new_status != "" + and new_status in HubspotDealDiffer.LODGEMENT_COMPLETE + and new_status != old_deal.lodgement_status + ) From f719149c03fe1827499fbc66df1facc0ab325676 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Wed, 8 Apr 2026 16:04:30 +0000 Subject: [PATCH 22/43] =?UTF-8?q?replace=20incorrect=20tests=20?= =?UTF-8?q?=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_hubspot_deal_differ.py | 73 +------------------ 1 file changed, 4 insertions(+), 69 deletions(-) diff --git a/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py index ddca766a..ba6b80e4 100644 --- a/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py +++ b/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py @@ -165,40 +165,6 @@ def test_pashub_trigger__coordination_completed_and_pashub_link_not_set__returns assert actual_output == expected_output -def test_pashub_trigger__coordination_status_not_completed_and_pashub_link_set__returns_false() -> ( - None -): - # arrange - deal_id = uuid.uuid4() - - old_deal = HubspotDealData( - id=deal_id, - deal_id="1", - pashub_link="www.google.co.uk", - coordination_status="random", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "coordination_status": "not complete", - "pashub_link": "www.google.co.uk", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = False - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal - ) - - # assert - assert actual_output == expected_output - - def test_pashub_trigger__design_completed_and_pashub_link_set__returns_true() -> None: # arrange deal_id = uuid.uuid4() @@ -261,39 +227,6 @@ def test_pashub_trigger__design_completed_and_pashub_link_not_set__returns_false assert actual_output == expected_output -def test_pashub_trigger__design_not_completed_and_pashub_link_set__returns_false() -> ( - None -): - # arrange - deal_id = uuid.uuid4() - - old_deal = HubspotDealData( - id=deal_id, - deal_id="1", - pashub_link="www.google.co.uk", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "pashub_link": "www.google.co.uk", - "design_status": "not uploaded", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = False - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal - ) - - # assert - assert actual_output == expected_output - - def test_pashub_trigger__lodgement_completed_and_pashub_link_set__returns_true() -> ( None ): @@ -391,7 +324,7 @@ def test_pashub_trigger__lodgement_completed_and_pashub_link_not_set__returns_fa assert actual_output == expected_output -def test_pashub_trigger__lodgement_not_completed_and_pashub_link_set__returns_false() -> ( +def test_pashub_trigger__coordination_design_lodgement_not_completed_and_pashub_link_set__returns_false() -> ( None ): # arrange @@ -408,7 +341,9 @@ def test_pashub_trigger__lodgement_not_completed_and_pashub_link_set__returns_fa "id": str(deal_id), "deal_id": "1", "pashub_link": "www.google.co.uk", - "lodgement_status": "lodgement not complete", + "coordination_status": "not uploaded", + "design_status": "not uploaded", + "lodgement_status": "not uploaded", "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), } From dd0522713e85456b0b93fbb7a1d114795b1cb56d Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 08:30:28 +0000 Subject: [PATCH 23/43] refactor upsert_deal by introducing helper methods --- etl/hubspot/hubspotDataTodB.py | 383 +++++++++++++++++---------------- 1 file changed, 198 insertions(+), 185 deletions(-) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index e7008618..f0beeee8 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -366,109 +366,9 @@ class HubspotDataToDb: if existing: print(f"🔄 Updating existing deal (deal_id={deal_id})") + self._update_existing_deal(existing, deal_data, listing, company) - for attr, value in { - "dealname": deal_data.get("dealname"), - "dealstage": deal_data.get("dealstage"), - "listing_id": listing.get("listing_id", None) if listing else None, - "landlord_property_id": ( - listing.get("owner_property_id", None) if listing else None - ), - "uprn": listing.get("national_uprn", None) if listing else None, - "outcome": deal_data.get("outcome"), - "outcome_notes": deal_data.get("outcome_notes"), - "project_code": deal_data.get("project_code"), - "company_id": company, - "major_condition_issue_description": deal_data.get( - "major_condition_issue_description" - ), - "major_condition_issue_photos": deal_data.get( - "major_condition_issue_photos" - ), - "coordination_status": deal_data.get( - "coordination_status__stage_1_" - ), - "design_status": deal_data.get("retrofit_design_status"), - "pashub_link": deal_data.get("pashub_link"), - "sharepoint_link": deal_data.get("sharepoint_link"), - "dampmould_growth": deal_data.get("dampmould_growth"), - "damp_mould_and_repairs_comments": deal_data.get( - "damp_mould_and_repairs_comments" - ), - "pre_sap": deal_data.get("pre_sap"), - "coordinator": deal_data.get("coordinator"), - "mtp_completion_date": self._parse_hs_date( - deal_data.get("mtp_completion_date") - ), - "mtp_re_model_completion_date": self._parse_hs_date( - deal_data.get("mtp_re_model_completion_date") - ), - "ioe_v3_completion_date": self._parse_hs_date( - deal_data.get("ioe_v3_completion_date") - ), - "proposed_measures": deal_data.get("proposed_measures"), - "approved_package": deal_data.get("approved_package"), - "designer": deal_data.get("designer"), - "design_completion_date": self._parse_hs_date( - deal_data.get("design_completion_date") - ), - "actual_measures_installed": deal_data.get( - "actual_measures_installed" - ), - "installer": deal_data.get("installer"), - "installer_handover": deal_data.get("installer_handover"), - "lodgement_status": deal_data.get("lodgement_status"), - "measures_lodgement_date": self._parse_hs_date( - deal_data.get("measures_lodgement_date") - ), - "lodgement_date": self._parse_hs_date( - deal_data.get("lodgement_date") - ), - "expected_commencement_date": self._parse_hs_date( - deal_data.get("expected_commencement_date") - ), - "surveyor": deal_data.get("surveyor"), - "confirmed_survey_date": self._parse_hs_date( - deal_data.get("confirmed_survey_date") - ), - "confirmed_survey_time": deal_data.get("confirmed_survey_time"), - "surveyed_date": self._parse_hs_date( - deal_data.get("surveyed_date") - ), - "design_type": deal_data.get("design_type"), - }.items(): - setattr(existing, attr, value or getattr(existing, attr)) - - # Upload if photo exists but S3 link missing - if ( - existing.major_condition_issue_photos - and not existing.major_condition_issue_evidence_s3_url - ): - # Fetch fresh URL from HubSpot instead of using potentially expired stored URL - fresh_deal = hubspot_client.from_deal_id_get_info(existing.deal_id) - photo_url = fresh_deal.get("major_condition_issue_photos") - - if photo_url: - try: - local_file = hubspot_client.download_file_from_url( - photo_url - ) - s3_url = self.s3.upload_file( - local_file, - "retrofit-data-dev", - prefix="hubspot/awaabs_law_evidence/", - ) - existing.major_condition_issue_evidence_s3_url = s3_url - except Exception as e: - print( - f"⚠️ Failed to download photo for deal_id {existing.deal_id}: {e}" - ) - # Continue without the file — don't crash the update - finally: - if "local_file" in locals() and os.path.exists(local_file): - os.remove(local_file) - else: - print(f"⚠️ Photo URL missing for deal_id {existing.deal_id}") + self._handle_existing_photo_upload(existing, hubspot_client) session.add(existing) session.commit() @@ -477,94 +377,207 @@ class HubspotDataToDb: else: print(f"🆕 Inserting new deal (deal_id={deal_id})") - new_record = HubspotDealData( - deal_id=deal_id, - dealname=deal_data.get("dealname"), - dealstage=deal_data.get("dealstage"), - listing_id=listing.get("listing_id", None) if listing else None, - landlord_property_id=( - listing.get("owner_property_id") if listing else None - ), - uprn=listing.get("national_uprn") if listing else None, - outcome=deal_data.get("outcome"), - outcome_notes=deal_data.get("outcome_notes"), - project_code=deal_data.get("project_code"), - company_id=company, - major_condition_issue_description=deal_data.get( - "major_condition_issue_description" - ), - major_condition_issue_photos=deal_data.get( - "major_condition_issue_photos" - ), - coordination_status=deal_data.get("coordination_status__stage_1_"), - design_status=deal_data.get("retrofit_design_status"), - pashub_link=deal_data.get("pashub_link"), - sharepoint_link=deal_data.get("sharepoint_link"), - dampmould_growth=deal_data.get("dampmould_growth"), - damp_mould_and_repairs_comments=deal_data.get( - "damp_mould_and_repairs_comments" - ), - pre_sap=deal_data.get("pre_sap"), - coordinator=deal_data.get("coordinator"), - mtp_completion_date=self._parse_hs_date( - deal_data.get("mtp_completion_date") - ), - mtp_re_model_completion_date=self._parse_hs_date( - deal_data.get("mtp_re_model_completion_date") - ), - ioe_v3_completion_date=self._parse_hs_date( - deal_data.get("ioe_v3_completion_date") - ), - proposed_measures=deal_data.get("proposed_measures"), - approved_package=deal_data.get("approved_package"), - designer=deal_data.get("designer"), - design_completion_date=self._parse_hs_date( - deal_data.get("design_completion_date") - ), - actual_measures_installed=deal_data.get( - "actual_measures_installed" - ), - installer=deal_data.get("installer"), - installer_handover=deal_data.get("installer_handover"), - lodgement_status=deal_data.get("lodgement_status"), - measures_lodgement_date=self._parse_hs_date( - deal_data.get("measures_lodgement_date") - ), - lodgement_date=self._parse_hs_date(deal_data.get("lodgement_date")), - expected_commencement_date=self._parse_hs_date( - deal_data.get("expected_commencement_date") - ), - surveyor=deal_data.get("surveyor"), - confirmed_survey_date=self._parse_hs_date( - deal_data.get("confirmed_survey_date") - ), - confirmed_survey_time=deal_data.get("confirmed_survey_time"), - surveyed_date=self._parse_hs_date(deal_data.get("surveyed_date")), - design_type=deal_data.get("design_type"), + new_record: HubspotDealData = self._build_new_deal( + deal_id, deal_data, listing, company ) # Handle upload at insert time - if new_record.major_condition_issue_photos: - try: - local_file = hubspot_client.download_file_from_url( - new_record.major_condition_issue_photos - ) - s3_url = self.s3.upload_file( - local_file, - "retrofit-data-dev", - prefix="hubspot/awaabs_law_evidence/", - ) - new_record.major_condition_issue_evidence_s3_url = s3_url - except Exception as e: - print( - f"⚠️ Failed to download photo for deal_id {new_record.deal_id}: {e}" - ) - # Continue without the file — don't crash the insert - finally: - if "local_file" in locals() and os.path.exists(local_file): - os.remove(local_file) + self._handle_new_photo_upload(new_record, hubspot_client) session.add(new_record) session.commit() session.refresh(new_record) return new_record + + def _update_existing_deal( + self, + existing: HubspotDealData, + deal_data: Dict[str, str], + listing: Optional[dict[str, str]], + company: Optional[str], + ): + for attr, value in { + "dealname": deal_data.get("dealname"), + "dealstage": deal_data.get("dealstage"), + "listing_id": listing.get("listing_id", None) if listing else None, + "landlord_property_id": ( + listing.get("owner_property_id", None) if listing else None + ), + "uprn": listing.get("national_uprn", None) if listing else None, + "outcome": deal_data.get("outcome"), + "outcome_notes": deal_data.get("outcome_notes"), + "project_code": deal_data.get("project_code"), + "company_id": company, + "major_condition_issue_description": deal_data.get( + "major_condition_issue_description" + ), + "major_condition_issue_photos": deal_data.get( + "major_condition_issue_photos" + ), + "coordination_status": deal_data.get("coordination_status__stage_1_"), + "design_status": deal_data.get("retrofit_design_status"), + "pashub_link": deal_data.get("pashub_link"), + "sharepoint_link": deal_data.get("sharepoint_link"), + "dampmould_growth": deal_data.get("dampmould_growth"), + "damp_mould_and_repairs_comments": deal_data.get( + "damp_mould_and_repairs_comments" + ), + "pre_sap": deal_data.get("pre_sap"), + "coordinator": deal_data.get("coordinator"), + "mtp_completion_date": self._parse_hs_date( + deal_data.get("mtp_completion_date") + ), + "mtp_re_model_completion_date": self._parse_hs_date( + deal_data.get("mtp_re_model_completion_date") + ), + "ioe_v3_completion_date": self._parse_hs_date( + deal_data.get("ioe_v3_completion_date") + ), + "proposed_measures": deal_data.get("proposed_measures"), + "approved_package": deal_data.get("approved_package"), + "designer": deal_data.get("designer"), + "design_completion_date": self._parse_hs_date( + deal_data.get("design_completion_date") + ), + "actual_measures_installed": deal_data.get("actual_measures_installed"), + "installer": deal_data.get("installer"), + "installer_handover": deal_data.get("installer_handover"), + "lodgement_status": deal_data.get("lodgement_status"), + "measures_lodgement_date": self._parse_hs_date( + deal_data.get("measures_lodgement_date") + ), + "lodgement_date": self._parse_hs_date(deal_data.get("lodgement_date")), + "expected_commencement_date": self._parse_hs_date( + deal_data.get("expected_commencement_date") + ), + "surveyor": deal_data.get("surveyor"), + "confirmed_survey_date": self._parse_hs_date( + deal_data.get("confirmed_survey_date") + ), + "confirmed_survey_time": deal_data.get("confirmed_survey_time"), + "surveyed_date": self._parse_hs_date(deal_data.get("surveyed_date")), + "design_type": deal_data.get("design_type"), + }.items(): + setattr(existing, attr, value or getattr(existing, attr)) + + def _build_new_deal( + self, + deal_id: str, + deal_data: Dict[str, str], + listing: Optional[dict[str, str]], + company: Optional[str], + ) -> HubspotDealData: + return HubspotDealData( + deal_id=deal_id, + dealname=deal_data.get("dealname"), + dealstage=deal_data.get("dealstage"), + listing_id=listing.get("listing_id") if listing else None, + landlord_property_id=( + listing.get("owner_property_id") if listing else None + ), + uprn=listing.get("national_uprn") if listing else None, + outcome=deal_data.get("outcome"), + outcome_notes=deal_data.get("outcome_notes"), + project_code=deal_data.get("project_code"), + company_id=company, + major_condition_issue_description=deal_data.get( + "major_condition_issue_description" + ), + major_condition_issue_photos=deal_data.get("major_condition_issue_photos"), + coordination_status=deal_data.get("coordination_status__stage_1_"), + design_status=deal_data.get("retrofit_design_status"), + pashub_link=deal_data.get("pashub_link"), + sharepoint_link=deal_data.get("sharepoint_link"), + dampmould_growth=deal_data.get("dampmould_growth"), + damp_mould_and_repairs_comments=deal_data.get( + "damp_mould_and_repairs_comments" + ), + pre_sap=deal_data.get("pre_sap"), + coordinator=deal_data.get("coordinator"), + mtp_completion_date=self._parse_hs_date( + deal_data.get("mtp_completion_date") + ), + mtp_re_model_completion_date=self._parse_hs_date( + deal_data.get("mtp_re_model_completion_date") + ), + ioe_v3_completion_date=self._parse_hs_date( + deal_data.get("ioe_v3_completion_date") + ), + proposed_measures=deal_data.get("proposed_measures"), + approved_package=deal_data.get("approved_package"), + designer=deal_data.get("designer"), + design_completion_date=self._parse_hs_date( + deal_data.get("design_completion_date") + ), + actual_measures_installed=deal_data.get("actual_measures_installed"), + installer=deal_data.get("installer"), + installer_handover=deal_data.get("installer_handover"), + lodgement_status=deal_data.get("lodgement_status"), + measures_lodgement_date=self._parse_hs_date( + deal_data.get("measures_lodgement_date") + ), + lodgement_date=self._parse_hs_date(deal_data.get("lodgement_date")), + expected_commencement_date=self._parse_hs_date( + deal_data.get("expected_commencement_date") + ), + surveyor=deal_data.get("surveyor"), + confirmed_survey_date=self._parse_hs_date( + deal_data.get("confirmed_survey_date") + ), + confirmed_survey_time=deal_data.get("confirmed_survey_time"), + surveyed_date=self._parse_hs_date(deal_data.get("surveyed_date")), + design_type=deal_data.get("design_type"), + ) + + def _handle_existing_photo_upload( + self, + existing: HubspotDealData, + hubspot_client: HubspotClient, + ): + if ( + existing.major_condition_issue_photos + and not existing.major_condition_issue_evidence_s3_url + ): + fresh_deal = hubspot_client.from_deal_id_get_info(existing.deal_id) + photo_url = fresh_deal.get("major_condition_issue_photos") + + if not photo_url: + print(f"⚠️ Photo URL missing for deal_id {existing.deal_id}") + return + + self._upload_photo_to_s3(existing, photo_url, hubspot_client) + + def _handle_new_photo_upload( + self, + record: HubspotDealData, + hubspot_client: HubspotClient, + ): + if record.major_condition_issue_photos: + self._upload_photo_to_s3( + record, + record.major_condition_issue_photos, + hubspot_client, + ) + + def _upload_photo_to_s3( + self, + record: HubspotDealData, + photo_url: str, + hubspot_client: HubspotClient, + ): + try: + local_file = hubspot_client.download_file_from_url(photo_url) + + s3_url = self.s3.upload_file( + local_file, + "retrofit-data-dev", + prefix="hubspot/awaabs_law_evidence/", + ) + + record.major_condition_issue_evidence_s3_url = s3_url + + except Exception as e: + print(f"⚠️ Failed to upload photo for deal_id {record.deal_id}: {e}") + finally: + if "local_file" in locals() and os.path.exists(local_file): + os.remove(local_file) From 8ce76190442e5a7f8a9c1af038651521b7edb105 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 09:14:20 +0000 Subject: [PATCH 24/43] refactor pashub trigger tests --- .../tests/test_hubspot_deal_differ.py | 470 ++++++++---------- etl/hubspot/hubspotDataTodB.py | 72 ++- 2 files changed, 233 insertions(+), 309 deletions(-) diff --git a/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py b/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py index ba6b80e4..75fa7927 100644 --- a/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py +++ b/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py @@ -1,359 +1,295 @@ from datetime import datetime -from typing import Dict +from typing import Any, Dict import uuid +import pytest + from backend.app.db.models.organisation import HubspotDealData from backend.hubspot_trigger_orchestrator.hubspot_deal_differ import HubspotDealDiffer -def test_pashub_trigger__outcome_note_added__returns_false() -> None: - # arrange - deal_id = uuid.uuid4() +BASE_TIME = datetime(2025, 12, 1, 12, 0, 0) - old_deal = HubspotDealData( - id=deal_id, + +def make_old_deal(**overrides: Any) -> HubspotDealData: + return HubspotDealData( + id=overrides.get("id", uuid.uuid4()), deal_id="1", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), + created_at=BASE_TIME, + updated_at=BASE_TIME, + **{k: v for k, v in overrides.items() if k != "id"}, ) - new_deal: Dict[str, str] = { + + +def make_new_deal(deal_id: uuid.UUID, **overrides: Any) -> Dict[str, str]: + return { "id": str(deal_id), "deal_id": "1", - "outcome_notes": "test note", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), + "created_at": BASE_TIME.isoformat(), "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), + **overrides, } - expected_output = False - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal +# ------------------------------------- +# Random change we aren't interested in +# ------------------------------------- + + +@pytest.mark.parametrize( + "new_overrides,expected", + [ + ({"outcome_notes": "test note"}, False), + ], +) +def test_pashub_trigger__outcome_note_added__returns_false( + new_overrides: Dict[str, str], + expected: bool, +) -> None: + deal_id = uuid.uuid4() + old_deal = make_old_deal(id=deal_id) + new_deal = make_new_deal(deal_id, **new_overrides) + + assert ( + HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + == expected ) - # assert - assert actual_output == expected_output + +# ------------------------- +# Pashub link changes +# ------------------------- -def test_pashub_trigger__pashub_link_changed__returns_true() -> None: - # arrange +@pytest.mark.parametrize( + "old_overrides,new_overrides,expected", + [ + ( + {"pashub_link": "www.google.co.uk"}, + {"pashub_link": "www.bbc.co.uk"}, + True, + ), + ], +) +def test_pashub_trigger__pashub_link_changed__returns_true( + old_overrides: Dict[str, str], + new_overrides: Dict[str, str], + expected: bool, +) -> None: + deal_id = uuid.uuid4() + old_deal = make_old_deal(id=deal_id, **old_overrides) + new_deal = make_new_deal(deal_id, **new_overrides) + + assert ( + HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + == expected + ) + + +# ------------------------- +# Coordination +# ------------------------- + + +@pytest.mark.parametrize( + "coordination_status,expected", + [ + ("v1 ioe/mtp complete", True), + ("v2 ioe/mtp complete", True), + ], +) +def test_pashub_trigger__coordination_completed_and_pashub_link_set__returns_true( + coordination_status: str, + expected: bool, +) -> None: deal_id = uuid.uuid4() - old_deal = HubspotDealData( + old_deal = make_old_deal( id=deal_id, - deal_id="1", - pashub_link="www.google.co.uk", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "pashub_link": "www.bbc.co.uk", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = True - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal - ) - - # assert - assert actual_output == expected_output - - -def test_pashub_trigger__coordination_completed_and_pashub_link_set__returns_true() -> ( - None -): - # arrange - deal_id = uuid.uuid4() - - old_deal = HubspotDealData( - id=deal_id, - deal_id="1", pashub_link="www.google.co.uk", coordination_status="random", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "coordination_status": "v1 ioe/mtp complete", - "pashub_link": "www.google.co.uk", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = True - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal ) - # assert - assert actual_output == expected_output - - -def test_pashub_trigger__coordination_completed_and_pashub_link_set__returns_true_2() -> ( - None -): - # arrange - deal_id = uuid.uuid4() - - old_deal = HubspotDealData( - id=deal_id, - deal_id="1", + new_deal = make_new_deal( + deal_id, pashub_link="www.google.co.uk", - coordination_status="random", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "coordination_status": "v2 ioe/mtp complete", - "pashub_link": "www.google.co.uk", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = True - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal + coordination_status=coordination_status, ) - # assert - assert actual_output == expected_output + assert ( + HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + == expected + ) def test_pashub_trigger__coordination_completed_and_pashub_link_not_set__returns_false() -> ( None ): - # arrange deal_id = uuid.uuid4() - old_deal = HubspotDealData( + old_deal = make_old_deal( id=deal_id, - deal_id="1", coordination_status="random", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "coordination_status": "v2 ioe/mtp complete", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = False - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal ) - # assert - assert actual_output == expected_output + new_deal = make_new_deal( + deal_id, + coordination_status="v2 ioe/mtp complete", + ) + + assert ( + HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + is False + ) + + +# ------------------------- +# Design +# ------------------------- def test_pashub_trigger__design_completed_and_pashub_link_set__returns_true() -> None: - # arrange deal_id = uuid.uuid4() - old_deal = HubspotDealData( + old_deal = make_old_deal( id=deal_id, - deal_id="1", pashub_link="www.google.co.uk", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "pashub_link": "www.google.co.uk", - "design_status": "uploaded", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = True - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal ) - # assert - assert actual_output == expected_output + new_deal = make_new_deal( + deal_id, + pashub_link="www.google.co.uk", + design_status="uploaded", + ) + + assert ( + HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + is True + ) def test_pashub_trigger__design_completed_and_pashub_link_not_set__returns_false() -> ( None ): - # arrange deal_id = uuid.uuid4() - old_deal = HubspotDealData( - id=deal_id, - deal_id="1", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "design_status": "uploaded", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } + old_deal = make_old_deal(id=deal_id) - expected_output = False - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal + new_deal = make_new_deal( + deal_id, + design_status="uploaded", ) - # assert - assert actual_output == expected_output + assert ( + HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + is False + ) -def test_pashub_trigger__lodgement_completed_and_pashub_link_set__returns_true() -> ( - None -): - # arrange +# ------------------------- +# Lodgement +# ------------------------- + + +@pytest.mark.parametrize( + "lodgement_status,expected", + [ + ("lodgement complete", True), + ("measures lodged", True), + ], +) +def test_pashub_trigger__lodgement_completed_and_pashub_link_set__returns_true( + lodgement_status: str, + expected: bool, +) -> None: deal_id = uuid.uuid4() - old_deal = HubspotDealData( + old_deal = make_old_deal( id=deal_id, - deal_id="1", pashub_link="www.google.co.uk", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "pashub_link": "www.google.co.uk", - "lodgement_status": "lodgement complete", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = True - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal ) - # assert - assert actual_output == expected_output - - -def test_pashub_trigger__lodgement_completed_and_pashub_link_set__returns_true_2() -> ( - None -): - # arrange - deal_id = uuid.uuid4() - - old_deal = HubspotDealData( - id=deal_id, - deal_id="1", + new_deal = make_new_deal( + deal_id, pashub_link="www.google.co.uk", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "pashub_link": "www.google.co.uk", - "lodgement_status": "measures lodged", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = True - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal + lodgement_status=lodgement_status, ) - # assert - assert actual_output == expected_output + assert ( + HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + == expected + ) def test_pashub_trigger__lodgement_completed_and_pashub_link_not_set__returns_false() -> ( None ): - # arrange deal_id = uuid.uuid4() - old_deal = HubspotDealData( - id=deal_id, - deal_id="1", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "design_status": "lodgement complete", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } + old_deal = make_old_deal(id=deal_id) - expected_output = False - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal + new_deal = make_new_deal( + deal_id, + design_status="lodgement complete", ) - # assert - assert actual_output == expected_output + assert ( + HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + is False + ) + + +# ------------------------- +# Negative case +# ------------------------- def test_pashub_trigger__coordination_design_lodgement_not_completed_and_pashub_link_set__returns_false() -> ( None ): - # arrange deal_id = uuid.uuid4() - old_deal = HubspotDealData( + old_deal = make_old_deal( id=deal_id, - deal_id="1", pashub_link="www.google.co.uk", - created_at=datetime(2025, 12, 1, 12, 0, 0), - updated_at=datetime(2025, 12, 1, 12, 0, 0), - ) - new_deal: Dict[str, str] = { - "id": str(deal_id), - "deal_id": "1", - "pashub_link": "www.google.co.uk", - "coordination_status": "not uploaded", - "design_status": "not uploaded", - "lodgement_status": "not uploaded", - "created_at": datetime(2025, 12, 1, 12, 0, 0).isoformat(), - "updated_at": datetime(2025, 12, 1, 12, 30, 0).isoformat(), - } - - expected_output = False - - # act - actual_output: bool = HubspotDealDiffer.check_for_pashub_trigger( - new_deal=new_deal, old_deal=old_deal ) - # assert - assert actual_output == expected_output + new_deal = make_new_deal( + deal_id, + pashub_link="www.google.co.uk", + coordination_status="not uploaded", + design_status="not uploaded", + lodgement_status="not uploaded", + ) + + assert ( + HubspotDealDiffer.check_for_pashub_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + is False + ) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index f0beeee8..06cc3be9 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -291,54 +291,33 @@ class HubspotDataToDb: return False # Handle photo upload if it exists but S3 URL is missing - if ( - deal_in_db.major_condition_issue_photos - and not deal_in_db.major_condition_issue_evidence_s3_url - ): + if self._needs_photo_upload(deal_in_db): print( f"🖼️ Found photo for deal_id {deal_in_db.deal_id} — uploading to S3..." ) photo_url = hs_deal.get("major_condition_issue_photos") + if photo_url: - try: - # Download from HubSpot using fresh URL from hs_deal (not stale DB URL) - local_file = hubspot_client.download_file_from_url(photo_url) + self._upload_photo_to_s3( + deal_in_db, + photo_url, + hubspot_client, + verify=True, # 👈 key difference + ) - # Upload to S3 - bucket = "retrofit-data-dev" - s3_url = self.s3.upload_file( - local_file, bucket, prefix="hubspot/awaabs_law_evidence/" + # persist change + with db_read_session() as session: + db_record = session.get(HubspotDealData, deal_in_db.id) + db_record.major_condition_issue_evidence_s3_url = ( + deal_in_db.major_condition_issue_evidence_s3_url ) + session.add(db_record) + session.commit() - # Download again to verify integrity - downloaded = self.s3.download_from_url(s3_url) - if self._sha256(local_file) == self._sha256(downloaded): - print("✅ SHA256 match verified — upload successful.") - else: - print("❌ SHA256 mismatch — integrity check failed.") - raise ValueError("File integrity check failed after S3 upload.") - - # Update DB record with S3 URL - with db_read_session() as session: - db_record = session.get(HubspotDealData, deal_in_db.id) - db_record.major_condition_issue_evidence_s3_url = s3_url - session.add(db_record) - session.commit() - print( - f"✅ Updated DB with S3 URL for deal_id={deal_in_db.deal_id}" - ) - return False - except Exception as e: - print( - f"⚠️ Failed to download/upload photo for deal_id {deal_in_db.deal_id}: {e}" - ) - # Continue without the file — don't crash the entire update - finally: - if "local_file" in locals() and os.path.exists(local_file): - os.remove(local_file) + return False else: - print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}") + print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}") else: print(f"✅ No update or upload required for deal_id {deal_in_db.deal_id}.") @@ -534,10 +513,7 @@ class HubspotDataToDb: existing: HubspotDealData, hubspot_client: HubspotClient, ): - if ( - existing.major_condition_issue_photos - and not existing.major_condition_issue_evidence_s3_url - ): + if self._needs_photo_upload(existing): fresh_deal = hubspot_client.from_deal_id_get_info(existing.deal_id) photo_url = fresh_deal.get("major_condition_issue_photos") @@ -564,6 +540,7 @@ class HubspotDataToDb: record: HubspotDealData, photo_url: str, hubspot_client: HubspotClient, + verify: bool = False, ): try: local_file = hubspot_client.download_file_from_url(photo_url) @@ -574,6 +551,11 @@ class HubspotDataToDb: prefix="hubspot/awaabs_law_evidence/", ) + if verify: + downloaded = self.s3.download_from_url(s3_url) + if self._sha256(local_file) != self._sha256(downloaded): + raise ValueError("File integrity check failed after S3 upload.") + record.major_condition_issue_evidence_s3_url = s3_url except Exception as e: @@ -581,3 +563,9 @@ class HubspotDataToDb: finally: if "local_file" in locals() and os.path.exists(local_file): os.remove(local_file) + + def _needs_photo_upload(self, deal: HubspotDealData) -> bool: + return bool( + deal.major_condition_issue_photos + and not deal.major_condition_issue_evidence_s3_url + ) From c439d5f55794f8b1c8f9041d568f28d44f43d0fd Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 09:41:43 +0000 Subject: [PATCH 25/43] move everything to etl/hubspot/ --- .../hubspot_trigger_orchestrator/handler.py | 61 -------------- .../hubspot}/hubspot_deal_differ.py | 0 ...ot_trigger_orchestrator_trigger_request.py | 0 etl/hubspot/scripts/scraper/main.py | 80 +++++++++++++------ .../tests/test_hubspot_deal_differ.py | 2 +- 5 files changed, 57 insertions(+), 86 deletions(-) delete mode 100644 backend/hubspot_trigger_orchestrator/handler.py rename {backend/hubspot_trigger_orchestrator => etl/hubspot}/hubspot_deal_differ.py (100%) rename {backend/hubspot_trigger_orchestrator => etl/hubspot}/hubspot_trigger_orchestrator_trigger_request.py (100%) rename {backend/hubspot_trigger_orchestrator => etl/hubspot}/tests/test_hubspot_deal_differ.py (98%) diff --git a/backend/hubspot_trigger_orchestrator/handler.py b/backend/hubspot_trigger_orchestrator/handler.py deleted file mode 100644 index 38724812..00000000 --- a/backend/hubspot_trigger_orchestrator/handler.py +++ /dev/null @@ -1,61 +0,0 @@ -import json -from typing import Any, Dict, Mapping, Optional - -from backend.app.db.models.organisation import HubspotDealData -from backend.hubspot_trigger_orchestrator.hubspot_deal_differ import HubspotDealDiffer -from backend.hubspot_trigger_orchestrator.hubspot_trigger_orchestrator_trigger_request import ( - HubspotTriggerOrchestratorTriggerRequest, -) -from backend.utils.subtasks import task_handler -from etl.hubspot.hubspotClient import HubspotClient -from etl.hubspot.hubspotDataTodB import HubspotDataToDb -from utils.logger import setup_logger - -logger = setup_logger() - - -@task_handler() -def handler(event: Mapping[str, Any], context: Any) -> None: - - db_client = HubspotDataToDb() - hubspot_client = HubspotClient() - - for record in event.get("Records", []): - body_dict = json.loads(record["body"]) - - logger.debug("Validating request body") - payload = HubspotTriggerOrchestratorTriggerRequest.model_validate(body_dict) - logger.debug("Successfully validated request body") - - hubspot_deal_id: str = payload.hubspot_deal_id - - db_deal: Optional[HubspotDealData] = db_client.find_deal_with_deal_id( - hubspot_deal_id - ) - if not db_deal: - # new hubspot deal, no diffing to do - # TODO: trigger hubspot to db ETL - return - - hubspot_deal: Dict[str, str] - company: Optional[str] - listing: Optional[dict[str, str]] - - hubspot_deal, company, listing = ( - hubspot_client.get_deal_and_company_and_listing(hubspot_deal_id) - ) - - if HubspotDealDiffer.check_for_pashub_trigger( - new_deal=hubspot_deal, old_deal=db_deal - ): - # TODO: trigger pashub file fetcher - return - - if HubspotDealDiffer.check_for_db_update_trigger( - new_deal=hubspot_deal, - new_company=company, - new_listing=listing, - old_deal=db_deal, - ): - # TODO: trigger db upsert - return diff --git a/backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py similarity index 100% rename from backend/hubspot_trigger_orchestrator/hubspot_deal_differ.py rename to etl/hubspot/hubspot_deal_differ.py diff --git a/backend/hubspot_trigger_orchestrator/hubspot_trigger_orchestrator_trigger_request.py b/etl/hubspot/hubspot_trigger_orchestrator_trigger_request.py similarity index 100% rename from backend/hubspot_trigger_orchestrator/hubspot_trigger_orchestrator_trigger_request.py rename to etl/hubspot/hubspot_trigger_orchestrator_trigger_request.py diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index d8d4a357..8c4af1a7 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -1,39 +1,71 @@ from backend.app.db.models.organisation import HubspotDealData from etl.hubspot.hubspotClient import HubspotClient -from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb + +# from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb +from etl.hubspot.hubspotDataTodB import HubspotDataToDb from backend.utils.subtasks import task_handler from typing import Any, Dict, Optional +from etl.hubspot.hubspot_deal_differ import HubspotDealDiffer +from etl.hubspot.hubspot_trigger_orchestrator_trigger_request import ( + HubspotTriggerOrchestratorTriggerRequest, +) + @task_handler() def handler(body: dict[str, Any], context: Any) -> None: - hubspot_deal_id = body.get("hubspot_deal_id", "") - - if hubspot_deal_id == "": - raise RuntimeError( - "Missing Hubspot Deal ID in SQS body request, 'hubspot_deal_id'" - ) - hubspot_deal_id = "327170793707" - - hubspot_client = HubspotClient() db_client = HubspotDataToDb() + hubspot_client = HubspotClient() + + payload = HubspotTriggerOrchestratorTriggerRequest.model_validate(body) + hubspot_deal_id: str = payload.hubspot_deal_id + db_deal: Optional[HubspotDealData] = db_client.find_deal_with_deal_id( hubspot_deal_id ) - if db_deal: - db_client.update_deal_with_checks(db_deal, hubspot_client) - else: - hubspot_deal: Dict[str, str] - company: Optional[str] - listing: Optional[dict[str, str]] - hubspot_deal, company, listing = ( - hubspot_client.get_deal_and_company_and_listing(hubspot_deal_id) - ) + if not db_deal: + # New hubspot deal, no diffing to do + # TODO: Trigger hubspot to db ETL + return - if company: - company_data: CompanyData = hubspot_client.get_company_information(company) - db_client: HubspotDataToDb = HubspotDataToDb() - db_client.upsert_company(company_data) + hubspot_deal: Dict[str, str] + company: Optional[str] + listing: Optional[dict[str, str]] - db_client.upsert_deal(hubspot_deal, company, listing, hubspot_client) + hubspot_deal, company, listing = hubspot_client.get_deal_and_company_and_listing( + hubspot_deal_id + ) + + if HubspotDealDiffer.check_for_pashub_trigger( + new_deal=hubspot_deal, old_deal=db_deal + ): + # TODO: trigger pashub file fetcher + return + + if HubspotDealDiffer.check_for_db_update_trigger( + new_deal=hubspot_deal, + new_company=company, + new_listing=listing, + old_deal=db_deal, + ): + # TODO: trigger db upsert + return + + # if db_deal: + # db_client.update_deal_with_checks(db_deal, hubspot_client) + # else: + # hubspot_deal: Dict[str, str] + # company: Optional[str] + # listing: Optional[dict[str, str]] + + # hubspot_deal, company, listing = ( + # hubspot_client.get_deal_and_company_and_listing(hubspot_deal_id) + # ) + + # if company: + # company_data: CompanyData = hubspot_client.get_company_information(company) + # db_client: HubspotDataToDb = HubspotDataToDb() + # db_client.upsert_company(company_data) + + # db_client.upsert_deal(hubspot_deal, company, listing, hubspot_client) diff --git a/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py b/etl/hubspot/tests/test_hubspot_deal_differ.py similarity index 98% rename from backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py rename to etl/hubspot/tests/test_hubspot_deal_differ.py index 75fa7927..12c5a288 100644 --- a/backend/hubspot_trigger_orchestrator/tests/test_hubspot_deal_differ.py +++ b/etl/hubspot/tests/test_hubspot_deal_differ.py @@ -5,7 +5,7 @@ import uuid import pytest from backend.app.db.models.organisation import HubspotDealData -from backend.hubspot_trigger_orchestrator.hubspot_deal_differ import HubspotDealDiffer +from etl.hubspot.hubspot_deal_differ import HubspotDealDiffer BASE_TIME = datetime(2025, 12, 1, 12, 0, 0) From 605652b30969156a445bc5aebf8cf083246aabc9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 09:56:58 +0000 Subject: [PATCH 26/43] =?UTF-8?q?diff=20checker=20for=20db=20load=20trigge?= =?UTF-8?q?r=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspotDataTodB.py | 8 +- etl/hubspot/tests/test_hubspot_deal_differ.py | 138 ++++++++++++++---- 2 files changed, 116 insertions(+), 30 deletions(-) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 06cc3be9..4f43f1f7 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -2,7 +2,7 @@ from backend.app.db.connection import db_read_session from backend.app.db.models.organisation import Organisation, HubspotDealData from sqlmodel import select from datetime import datetime, timezone -from typing import Dict, Optional +from typing import Dict, Optional, Tuple from etl.hubspot.company_data import CompanyData from etl.hubspot.hubspotClient import HubspotClient from etl.hubspot.s3_uploader import S3Uploader @@ -103,7 +103,7 @@ class HubspotDataToDb: Also handles major_condition_issue_photos file upload to S3 with integrity check. """ - def soft_assert(condition, message="Assertion Failed"): + def soft_assert(condition: bool, message: str = "Assertion Failed"): if not condition: print(f"⚠️ Soft Assert Failed: {message}") return False @@ -111,6 +111,10 @@ class HubspotDataToDb: print(f"🔍 Checking if deal needs updating (deal_id={deal_in_db.deal_id})") + hs_deal: Dict[str, str] + hs_company_id: Optional[str] + hs_listing: Optional[Dict[str, str]] + hs_deal, hs_company_id, hs_listing = ( hubspot_client.get_deal_and_company_and_listing(deal_in_db.deal_id) ) diff --git a/etl/hubspot/tests/test_hubspot_deal_differ.py b/etl/hubspot/tests/test_hubspot_deal_differ.py index 12c5a288..876fcab9 100644 --- a/etl/hubspot/tests/test_hubspot_deal_differ.py +++ b/etl/hubspot/tests/test_hubspot_deal_differ.py @@ -31,9 +31,9 @@ def make_new_deal(deal_id: uuid.UUID, **overrides: Any) -> Dict[str, str]: } -# ------------------------------------- -# Random change we aren't interested in -# ------------------------------------- +# ==================== +# PASHUB TRIGGER TESTS +# ==================== @pytest.mark.parametrize( @@ -59,11 +59,6 @@ def test_pashub_trigger__outcome_note_added__returns_false( ) -# ------------------------- -# Pashub link changes -# ------------------------- - - @pytest.mark.parametrize( "old_overrides,new_overrides,expected", [ @@ -92,11 +87,6 @@ def test_pashub_trigger__pashub_link_changed__returns_true( ) -# ------------------------- -# Coordination -# ------------------------- - - @pytest.mark.parametrize( "coordination_status,expected", [ @@ -155,11 +145,6 @@ def test_pashub_trigger__coordination_completed_and_pashub_link_not_set__returns ) -# ------------------------- -# Design -# ------------------------- - - def test_pashub_trigger__design_completed_and_pashub_link_set__returns_true() -> None: deal_id = uuid.uuid4() @@ -204,11 +189,6 @@ def test_pashub_trigger__design_completed_and_pashub_link_not_set__returns_false ) -# ------------------------- -# Lodgement -# ------------------------- - - @pytest.mark.parametrize( "lodgement_status,expected", [ @@ -263,11 +243,6 @@ def test_pashub_trigger__lodgement_completed_and_pashub_link_not_set__returns_fa ) -# ------------------------- -# Negative case -# ------------------------- - - def test_pashub_trigger__coordination_design_lodgement_not_completed_and_pashub_link_set__returns_false() -> ( None ): @@ -293,3 +268,110 @@ def test_pashub_trigger__coordination_design_lodgement_not_completed_and_pashub_ ) is False ) + + +# ======================= +# DB UPDATE TRIGGER TESTS +# ======================= + + +def test_db_update_trigger__no_changes__returns_false() -> None: + deal_id = uuid.uuid4() + + old_deal = make_old_deal( + id=deal_id, + dealname="Test Deal", + dealstage="stage_1", + outcome="won", + ) + + new_deal = make_new_deal( + deal_id, + hs_object_id="1", + dealname="Test Deal", + dealstage="stage_1", + outcome="won", + ) + + result = HubspotDealDiffer.check_for_db_update_trigger( + new_deal=new_deal, + new_company=None, + new_listing=None, + old_deal=old_deal, + ) + + assert result is False + + +def test_db_update_trigger__dealname_changed__returns_true() -> None: + deal_id = uuid.uuid4() + + old_deal = make_old_deal( + id=deal_id, + dealname="Old Name", + ) + + new_deal = make_new_deal( + deal_id, + hs_object_id="1", + dealname="New Name", + ) + + result = HubspotDealDiffer.check_for_db_update_trigger( + new_deal=new_deal, + new_company=None, + new_listing=None, + old_deal=old_deal, + ) + + assert result is True + + +def test_db_update_trigger__company_changed__returns_true() -> None: + deal_id = uuid.uuid4() + + old_deal = make_old_deal( + id=deal_id, + company_id="old_company", + ) + + new_deal = make_new_deal( + deal_id, + hs_object_id="1", + ) + + new_company = "new_company" + + result = HubspotDealDiffer.check_for_db_update_trigger( + new_deal=new_deal, + new_company=new_company, + new_listing=None, + old_deal=old_deal, + ) + + assert result is True + + +def test_db_update_trigger__listing_changed__returns_true() -> None: + deal_id = uuid.uuid4() + + old_deal = make_old_deal( + id=deal_id, + listing_id="abc", + ) + + new_deal = make_new_deal( + deal_id, + hs_object_id="1", + ) + + new_listing = {"listing_id": "xyz"} + + result = HubspotDealDiffer.check_for_db_update_trigger( + new_deal=new_deal, + new_company=None, + new_listing=new_listing, + old_deal=old_deal, + ) + + assert result is True From 01636514aaa58f8749af194fac0ac31cd8a79284 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 10:32:59 +0000 Subject: [PATCH 27/43] pull diffing logic out of loading method --- etl/hubspot/hubspotDataTodB.py | 181 ++++++++++++++-------------- etl/hubspot/scripts/scraper/main.py | 37 ++++-- 2 files changed, 118 insertions(+), 100 deletions(-) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 4f43f1f7..1c4b6b54 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -96,13 +96,103 @@ class HubspotDataToDb: return sha256.hexdigest() def update_deal_with_checks( - self, deal_in_db: HubspotDealData, hubspot_client: HubspotClient + self, + deal_in_db: HubspotDealData, + hubspot_client: HubspotClient, + hs_deal: Dict[str, str], + hs_company_id: Optional[str], + hs_listing: Optional[Dict[str, str]], ) -> bool: """ - Checks if a deal needs updating and syncs it with HubSpot. - Also handles major_condition_issue_photos file upload to S3 with integrity check. + Updates deal in database and handles major_condition_issue_photos file upload to S3 with integrity check. """ + self.upsert_deal(hs_deal, hs_company_id, hs_listing, hubspot_client) + # Handle photo upload if it exists but S3 URL is missing + if self._needs_photo_upload(deal_in_db): + print( + f"🖼️ Found photo for deal_id {deal_in_db.deal_id} — uploading to S3..." + ) + + photo_url = hs_deal.get("major_condition_issue_photos") + + if photo_url: + self._upload_photo_to_s3( + deal_in_db, + photo_url, + hubspot_client, + verify=True, + ) + + # persist change + with db_read_session() as session: + db_record = session.get(HubspotDealData, deal_in_db.id) + db_record.major_condition_issue_evidence_s3_url = ( + deal_in_db.major_condition_issue_evidence_s3_url + ) + session.add(db_record) + session.commit() + + return False + else: + print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}") + + else: + print(f"✅ No update or upload required for deal_id {deal_in_db.deal_id}.") + + return True + + def upsert_deal( + self, + deal_data: Dict[str, str], + company: Optional[str], + listing: Optional[dict[str, str]], + hubspot_client: HubspotClient, + ): + """ + Inserts or updates a deal record. + Also uploads photos if present and adds S3 URL. + """ + with db_read_session() as session: + deal_id = deal_data.get("hs_object_id") + + statement = select(HubspotDealData).where( + HubspotDealData.deal_id == deal_id + ) + existing = session.exec(statement).first() + + if existing: + print(f"🔄 Updating existing deal (deal_id={deal_id})") + self._update_existing_deal(existing, deal_data, listing, company) + + self._handle_existing_photo_upload(existing, hubspot_client) + + session.add(existing) + session.commit() + session.refresh(existing) + return existing + + else: + print(f"🆕 Inserting new deal (deal_id={deal_id})") + new_record: HubspotDealData = self._build_new_deal( + deal_id, deal_data, listing, company + ) + + # Handle upload at insert time + self._handle_new_photo_upload(new_record, hubspot_client) + + session.add(new_record) + session.commit() + session.refresh(new_record) + return new_record + + def _deprecated_diff( + self, + deal_in_db: HubspotDealData, + hs_deal: Dict[str, str], + hs_company_id: Optional[str], + hs_listing: Optional[Dict[str, str]], + ): def soft_assert(condition: bool, message: str = "Assertion Failed"): if not condition: print(f"⚠️ Soft Assert Failed: {message}") @@ -111,14 +201,6 @@ class HubspotDataToDb: print(f"🔍 Checking if deal needs updating (deal_id={deal_in_db.deal_id})") - hs_deal: Dict[str, str] - hs_company_id: Optional[str] - hs_listing: Optional[Dict[str, str]] - - hs_deal, hs_company_id, hs_listing = ( - hubspot_client.get_deal_and_company_and_listing(deal_in_db.deal_id) - ) - # Soft compare key fields checks = [ soft_assert( @@ -291,87 +373,10 @@ class HubspotDataToDb: print( f"❗ Discrepancies found for deal_id {deal_in_db.deal_id} — syncing with HubSpot." ) - self.upsert_deal(hs_deal, hs_company_id, hs_listing, hubspot_client) return False - # Handle photo upload if it exists but S3 URL is missing - if self._needs_photo_upload(deal_in_db): - print( - f"🖼️ Found photo for deal_id {deal_in_db.deal_id} — uploading to S3..." - ) - - photo_url = hs_deal.get("major_condition_issue_photos") - - if photo_url: - self._upload_photo_to_s3( - deal_in_db, - photo_url, - hubspot_client, - verify=True, # 👈 key difference - ) - - # persist change - with db_read_session() as session: - db_record = session.get(HubspotDealData, deal_in_db.id) - db_record.major_condition_issue_evidence_s3_url = ( - deal_in_db.major_condition_issue_evidence_s3_url - ) - session.add(db_record) - session.commit() - - return False - else: - print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}") - - else: - print(f"✅ No update or upload required for deal_id {deal_in_db.deal_id}.") - return True - def upsert_deal( - self, - deal_data: Dict[str, str], - company: Optional[str], - listing: Optional[dict[str, str]], - hubspot_client: HubspotClient, - ): - """ - Inserts or updates a deal record. - Also uploads photos if present and adds S3 URL. - """ - with db_read_session() as session: - deal_id = deal_data.get("hs_object_id") - - statement = select(HubspotDealData).where( - HubspotDealData.deal_id == deal_id - ) - existing = session.exec(statement).first() - - if existing: - print(f"🔄 Updating existing deal (deal_id={deal_id})") - self._update_existing_deal(existing, deal_data, listing, company) - - self._handle_existing_photo_upload(existing, hubspot_client) - - session.add(existing) - session.commit() - session.refresh(existing) - return existing - - else: - print(f"🆕 Inserting new deal (deal_id={deal_id})") - new_record: HubspotDealData = self._build_new_deal( - deal_id, deal_data, listing, company - ) - - # Handle upload at insert time - self._handle_new_photo_upload(new_record, hubspot_client) - - session.add(new_record) - session.commit() - session.refresh(new_record) - return new_record - def _update_existing_deal( self, existing: HubspotDealData, diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 8c4af1a7..768a86eb 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -1,8 +1,7 @@ from backend.app.db.models.organisation import HubspotDealData from etl.hubspot.hubspotClient import HubspotClient -# from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb -from etl.hubspot.hubspotDataTodB import HubspotDataToDb +from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb from backend.utils.subtasks import task_handler from typing import Any, Dict, Optional @@ -24,11 +23,6 @@ def handler(body: dict[str, Any], context: Any) -> None: hubspot_deal_id ) - if not db_deal: - # New hubspot deal, no diffing to do - # TODO: Trigger hubspot to db ETL - return - hubspot_deal: Dict[str, str] company: Optional[str] listing: Optional[dict[str, str]] @@ -37,10 +31,14 @@ def handler(body: dict[str, Any], context: Any) -> None: hubspot_deal_id ) - if HubspotDealDiffer.check_for_pashub_trigger( - new_deal=hubspot_deal, old_deal=db_deal - ): - # TODO: trigger pashub file fetcher + if not db_deal: + # New hubspot deal, no diffing to do + if company: + company_data: CompanyData = hubspot_client.get_company_information(company) + db_client: HubspotDataToDb = HubspotDataToDb() + db_client.upsert_company(company_data) + + db_client.upsert_deal(hubspot_deal, company, listing, hubspot_client) return if HubspotDealDiffer.check_for_db_update_trigger( @@ -49,7 +47,22 @@ def handler(body: dict[str, Any], context: Any) -> None: new_listing=listing, old_deal=db_deal, ): - # TODO: trigger db upsert + db_client.update_deal_with_checks( + deal_in_db=db_deal, + hubspot_client=hubspot_client, + hs_deal=hubspot_deal, + hs_company_id=company, + hs_listing=listing, + ) + return + + # ============================== + # Orchestration of other lambdas + # ============================== + if HubspotDealDiffer.check_for_pashub_trigger( + new_deal=hubspot_deal, old_deal=db_deal + ): + # TODO: trigger pashub file fetcher return # if db_deal: From 125527baa9a56573c3f8120fbf9daca805d8b20e Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 10:46:52 +0000 Subject: [PATCH 28/43] move HubspotDealData object to dedicated file --- backend/app/db/models/hubspot_deal_data.py | 77 +++++++++++++++++++ backend/app/db/models/organisation.py | 77 +------------------ etl/hubspot/hubspotDataTodB.py | 10 ++- etl/hubspot/hubspot_deal_differ.py | 2 +- etl/hubspot/scripts/scraper/main.py | 9 +-- etl/hubspot/tests/test_hubspot_deal_differ.py | 2 +- 6 files changed, 91 insertions(+), 86 deletions(-) create mode 100644 backend/app/db/models/hubspot_deal_data.py diff --git a/backend/app/db/models/hubspot_deal_data.py b/backend/app/db/models/hubspot_deal_data.py new file mode 100644 index 00000000..d5a51ace --- /dev/null +++ b/backend/app/db/models/hubspot_deal_data.py @@ -0,0 +1,77 @@ +import uuid +from sqlmodel import SQLModel, Field, Column, text +from datetime import datetime +from typing import Optional +from sqlalchemy import DateTime +from sqlalchemy.sql import func + + +class HubspotDealData(SQLModel, table=True): + __tablename__ = "hubspot_deal_data" + + id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) + + # HubSpot Deal identifiers + deal_id: str = Field(index=True, nullable=False) + dealname: Optional[str] = Field(default=None) + dealstage: Optional[str] = Field(default=None) + company_id: Optional[str] = Field(default=None) + project_code: Optional[str] = Field(default=None) + + # HubSpot custom properties + landlord_property_id: Optional[str] = Field(default=None) + uprn: Optional[str] = Field(default=None) + outcome: Optional[str] = Field(default=None) + outcome_notes: Optional[str] = Field(default=None) + + major_condition_issue_description: Optional[str] = Field(default=None) + major_condition_issue_photos: Optional[str] = Field(default=None) + major_condition_issue_evidence_s3_url: Optional[str] = Field(default=None) + + coordination_status: Optional[str] = Field(default=None) + coordination_comments: Optional[str] = Field(default=None) + design_status: Optional[str] = Field(default=None) + + listing_id: Optional[str] = Field(default=None) + pashub_link: Optional[str] = Field(default=None) + sharepoint_link: Optional[str] = Field(default=None) + dampmould_growth: Optional[str] = Field(default=None) + damp_mould_and_repairs_comments: Optional[str] = Field(default=None) + pre_sap: Optional[str] = Field(default=None) + coordinator: Optional[str] = Field(default=None) + mtp_completion_date: Optional[datetime] = Field(default=None) + mtp_re_model_completion_date: Optional[datetime] = Field(default=None) + ioe_v3_completion_date: Optional[datetime] = Field(default=None) + proposed_measures: Optional[str] = Field(default=None) + approved_package: Optional[str] = Field(default=None) + designer: Optional[str] = Field(default=None) + design_completion_date: Optional[datetime] = Field(default=None) + actual_measures_installed: Optional[str] = Field(default=None) + installer: Optional[str] = Field(default=None) + installer_handover: Optional[str] = Field(default=None) + lodgement_status: Optional[str] = Field(default=None) + measures_lodgement_date: Optional[datetime] = Field(default=None) + lodgement_date: Optional[datetime] = Field(default=None) + expected_commencement_date: Optional[datetime] = Field(default=None) + surveyor: Optional[str] = Field(default=None) + confirmed_survey_date: Optional[datetime] = Field(default=None) + confirmed_survey_time: Optional[str] = Field(default=None) + surveyed_date: Optional[datetime] = Field(default=None) + design_type: Optional[str] = Field(default=None) + + created_at: datetime = Field( + sa_column=Column( + DateTime(timezone=True), + server_default=text("(NOW() AT TIME ZONE 'utc')"), + nullable=False, + ) + ) + + updated_at: datetime = Field( + sa_column=Column( + DateTime(timezone=True), + server_default=text("(NOW() AT TIME ZONE 'utc')"), + onupdate=func.now(), + nullable=False, + ) + ) diff --git a/backend/app/db/models/organisation.py b/backend/app/db/models/organisation.py index 784cc4ad..8afc5d63 100644 --- a/backend/app/db/models/organisation.py +++ b/backend/app/db/models/organisation.py @@ -1,9 +1,7 @@ -from sqlmodel import SQLModel, Field, Column, text +import uuid +from sqlmodel import SQLModel, Field from datetime import datetime, timezone from typing import Optional -from sqlalchemy import DateTime -from sqlalchemy.sql import func -import uuid class Organisation(SQLModel, table=True): @@ -13,74 +11,3 @@ class Organisation(SQLModel, table=True): updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) hubspot_company_id: Optional[str] = None name: Optional[str] = None - - -class HubspotDealData(SQLModel, table=True): - __tablename__ = "hubspot_deal_data" - - id: uuid.UUID = Field(default_factory=uuid.uuid4, primary_key=True) - - # HubSpot Deal identifiers - deal_id: str = Field(index=True, nullable=False) - dealname: Optional[str] = Field(default=None) - dealstage: Optional[str] = Field(default=None) - company_id: Optional[str] = Field(default=None) - project_code: Optional[str] = Field(default=None) - - # HubSpot custom properties - landlord_property_id: Optional[str] = Field(default=None) - uprn: Optional[str] = Field(default=None) - outcome: Optional[str] = Field(default=None) - outcome_notes: Optional[str] = Field(default=None) - - major_condition_issue_description: Optional[str] = Field(default=None) - major_condition_issue_photos: Optional[str] = Field(default=None) - major_condition_issue_evidence_s3_url: Optional[str] = Field(default=None) - - coordination_status: Optional[str] = Field(default=None) - coordination_comments: Optional[str] = Field(default=None) - design_status: Optional[str] = Field(default=None) - - listing_id: Optional[str] = Field(default=None) - pashub_link: Optional[str] = Field(default=None) - sharepoint_link: Optional[str] = Field(default=None) - dampmould_growth: Optional[str] = Field(default=None) - damp_mould_and_repairs_comments: Optional[str] = Field(default=None) - pre_sap: Optional[str] = Field(default=None) - coordinator: Optional[str] = Field(default=None) - mtp_completion_date: Optional[datetime] = Field(default=None) - mtp_re_model_completion_date: Optional[datetime] = Field(default=None) - ioe_v3_completion_date: Optional[datetime] = Field(default=None) - proposed_measures: Optional[str] = Field(default=None) - approved_package: Optional[str] = Field(default=None) - designer: Optional[str] = Field(default=None) - design_completion_date: Optional[datetime] = Field(default=None) - actual_measures_installed: Optional[str] = Field(default=None) - installer: Optional[str] = Field(default=None) - installer_handover: Optional[str] = Field(default=None) - lodgement_status: Optional[str] = Field(default=None) - measures_lodgement_date: Optional[datetime] = Field(default=None) - lodgement_date: Optional[datetime] = Field(default=None) - expected_commencement_date: Optional[datetime] = Field(default=None) - surveyor: Optional[str] = Field(default=None) - confirmed_survey_date: Optional[datetime] = Field(default=None) - confirmed_survey_time: Optional[str] = Field(default=None) - surveyed_date: Optional[datetime] = Field(default=None) - design_type: Optional[str] = Field(default=None) - - created_at: datetime = Field( - sa_column=Column( - DateTime(timezone=True), - server_default=text("(NOW() AT TIME ZONE 'utc')"), - nullable=False, - ) - ) - - updated_at: datetime = Field( - sa_column=Column( - DateTime(timezone=True), - server_default=text("(NOW() AT TIME ZONE 'utc')"), - onupdate=func.now(), - nullable=False, - ) - ) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 3c017f0e..5ebc8c73 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -1,13 +1,15 @@ -from backend.app.db.connection import db_read_session -from backend.app.db.models.organisation import Organisation, HubspotDealData +import hashlib +import os from sqlmodel import select from datetime import datetime, timezone from typing import Dict, Optional + +from backend.app.db.models.hubspot_deal_data import HubspotDealData from etl.hubspot.company_data import CompanyData from etl.hubspot.hubspotClient import HubspotClient from etl.hubspot.s3_uploader import S3Uploader -import hashlib -import os +from backend.app.db.connection import db_read_session +from backend.app.db.models.organisation import Organisation class HubspotDataToDb: diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index 1dd4ed51..dd992243 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -1,6 +1,6 @@ from typing import Dict, List, Optional -from backend.app.db.models.organisation import HubspotDealData +from backend.app.db.models.hubspot_deal_data import HubspotDealData class HubspotDealDiffer: diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 768a86eb..826d7e05 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -1,14 +1,13 @@ -from backend.app.db.models.organisation import HubspotDealData -from etl.hubspot.hubspotClient import HubspotClient - -from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb -from backend.utils.subtasks import task_handler from typing import Any, Dict, Optional +from etl.hubspot.hubspotClient import HubspotClient +from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb from etl.hubspot.hubspot_deal_differ import HubspotDealDiffer from etl.hubspot.hubspot_trigger_orchestrator_trigger_request import ( HubspotTriggerOrchestratorTriggerRequest, ) +from backend.utils.subtasks import task_handler +from backend.app.db.models.hubspot_deal_data import HubspotDealData @task_handler() diff --git a/etl/hubspot/tests/test_hubspot_deal_differ.py b/etl/hubspot/tests/test_hubspot_deal_differ.py index 876fcab9..74d3f057 100644 --- a/etl/hubspot/tests/test_hubspot_deal_differ.py +++ b/etl/hubspot/tests/test_hubspot_deal_differ.py @@ -4,7 +4,7 @@ import uuid import pytest -from backend.app.db.models.organisation import HubspotDealData +from backend.app.db.models.hubspot_deal_data import HubspotDealData from etl.hubspot.hubspot_deal_differ import HubspotDealDiffer From 36aaabb3cfa05d776484f2af8fa53973936dc5b5 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 11:19:33 +0000 Subject: [PATCH 29/43] =?UTF-8?q?diff=20checker=20for=20db=20load=20trigge?= =?UTF-8?q?r=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/db/models/hubspot_deal_data.py | 10 +- etl/hubspot/hubspotDataTodB.py | 247 ++------------------- etl/hubspot/hubspot_deal_differ.py | 90 +++++++- etl/hubspot/scripts/scraper/main.py | 18 -- etl/hubspot/utils.py | 11 + 5 files changed, 127 insertions(+), 249 deletions(-) create mode 100644 etl/hubspot/utils.py diff --git a/backend/app/db/models/hubspot_deal_data.py b/backend/app/db/models/hubspot_deal_data.py index d5a51ace..1d7607e0 100644 --- a/backend/app/db/models/hubspot_deal_data.py +++ b/backend/app/db/models/hubspot_deal_data.py @@ -59,19 +59,21 @@ class HubspotDealData(SQLModel, table=True): surveyed_date: Optional[datetime] = Field(default=None) design_type: Optional[str] = Field(default=None) - created_at: datetime = Field( + created_at: Optional[datetime] = Field( sa_column=Column( DateTime(timezone=True), server_default=text("(NOW() AT TIME ZONE 'utc')"), nullable=False, - ) + ), + default=None, # Nullable in db but optional here as value is set on db save for new record ) - updated_at: datetime = Field( + updated_at: Optional[datetime] = Field( sa_column=Column( DateTime(timezone=True), server_default=text("(NOW() AT TIME ZONE 'utc')"), onupdate=func.now(), nullable=False, - ) + ), + default=None, # Nullable in db but optional here as value is set on db save for new record ) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 5ebc8c73..210c9593 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -10,6 +10,7 @@ from etl.hubspot.hubspotClient import HubspotClient from etl.hubspot.s3_uploader import S3Uploader from backend.app.db.connection import db_read_session from backend.app.db.models.organisation import Organisation +from etl.hubspot.utils import parse_hs_date class HubspotDataToDb: @@ -60,11 +61,7 @@ class HubspotDataToDb: session.commit() return record - def new_record_to_hubspot_data(self, deal_data, company, listing, hubspot_client): - print("⚠️ Deprecated — use the new interface instead.") - return self.upsert_deal(deal_data, company, listing, hubspot_client) - - def find_all_deals_with_company_id(self, company_id): + def find_all_deals_with_company_id(self, company_id: str): """Returns a list of deals for a given company_id.""" with db_read_session() as session: return ( @@ -137,7 +134,7 @@ class HubspotDataToDb: return False else: - print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}") + print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}") else: print(f"✅ No update or upload required for deal_id {deal_in_db.deal_id}.") @@ -188,202 +185,6 @@ class HubspotDataToDb: session.refresh(new_record) return new_record - def _deprecated_diff( - self, - deal_in_db: HubspotDealData, - hs_deal: Dict[str, str], - hs_company_id: Optional[str], - hs_listing: Optional[Dict[str, str]], - ): - def soft_assert(condition: bool, message: str = "Assertion Failed"): - if not condition: - print(f"⚠️ Soft Assert Failed: {message}") - return False - return True - - print(f"🔍 Checking if deal needs updating (deal_id={deal_in_db.deal_id})") - - # Soft compare key fields - checks = [ - soft_assert( - deal_in_db.deal_id == hs_deal.get("hs_object_id"), "deal_id mismatch" - ), - soft_assert(deal_in_db.company_id == hs_company_id, "company_id mismatch"), - soft_assert( - deal_in_db.listing_id == hs_listing.get("listing_id"), - "listing_id mismatch", - ), - soft_assert( - deal_in_db.landlord_property_id == hs_listing.get("owner_property_id"), - "landlord_property_id mismatch", - ), - soft_assert( - deal_in_db.outcome == hs_deal.get("outcome"), "outcome mismatch" - ), - soft_assert( - deal_in_db.dealstage == hs_deal.get("dealstage"), "dealstage mismatch" - ), - soft_assert( - deal_in_db.dealname == hs_deal.get("dealname"), "dealname mismatch" - ), - soft_assert( - deal_in_db.project_code == hs_deal.get("project_code"), - "project_code mismatch", - ), - soft_assert( - deal_in_db.uprn == hs_listing.get("national_uprn"), "uprn mismatch" - ), - soft_assert( - deal_in_db.outcome_notes == hs_deal.get("outcome_notes"), - "outcome_notes mismatch", - ), - soft_assert( - deal_in_db.major_condition_issue_description - == hs_deal.get("major_condition_issue_description"), - "major condition description mismatch", - ), - soft_assert( - deal_in_db.major_condition_issue_photos - == hs_deal.get("major_condition_issue_photos"), - "major condition issue photos mismatch", - ), - soft_assert( - deal_in_db.coordination_status - == hs_deal.get("coordination_status__stage_1_"), - "coordination stage 1 status mismatch", - ), - soft_assert( - deal_in_db.coordination_comments - == hs_deal.get("coordination_comments"), - "coordination_comments mismatch", - ), - soft_assert( - deal_in_db.design_status == hs_deal.get("retrofit_design_status"), - "retrofit design mismatch", - ), - soft_assert( - deal_in_db.pashub_link == hs_deal.get("pashub_link"), - "pashub_link mismatch", - ), - soft_assert( - deal_in_db.sharepoint_link == hs_deal.get("sharepoint_link"), - "sharepoint_link mismatch", - ), - soft_assert( - deal_in_db.dampmould_growth == hs_deal.get("dampmould_growth"), - "dampmould_growth mismatch", - ), - soft_assert( - deal_in_db.damp_mould_and_repairs_comments - == hs_deal.get("damp_mould_and_repairs_comments"), - "damp_mould_and_repairs_comments mismatch", - ), - soft_assert( - deal_in_db.pre_sap == hs_deal.get("pre_sap"), - "pre_sap mismatch", - ), - soft_assert( - deal_in_db.coordinator == hs_deal.get("coordinator"), - "coordinator mismatch", - ), - soft_assert( - deal_in_db.mtp_completion_date - == self._parse_hs_date(hs_deal.get("mtp_completion_date")), - "mtp_completion_date mismatch", - ), - soft_assert( - deal_in_db.mtp_re_model_completion_date - == self._parse_hs_date(hs_deal.get("mtp_re_model_completion_date")), - "mtp_re_model_completion_date mismatch", - ), - soft_assert( - deal_in_db.ioe_v3_completion_date - == self._parse_hs_date(hs_deal.get("ioe_v3_completion_date")), - "ioe_v3_completion_date mismatch", - ), - soft_assert( - deal_in_db.proposed_measures == hs_deal.get("proposed_measures"), - "proposed_measures mismatch", - ), - soft_assert( - deal_in_db.approved_package == hs_deal.get("approved_package"), - "approved_package mismatch", - ), - soft_assert( - deal_in_db.designer == hs_deal.get("designer"), - "designer mismatch", - ), - soft_assert( - deal_in_db.design_completion_date - == self._parse_hs_date(hs_deal.get("design_completion_date")), - "design_completion_date mismatch", - ), - soft_assert( - deal_in_db.actual_measures_installed - == hs_deal.get("actual_measures_installed"), - "actual_measures_installed mismatch", - ), - soft_assert( - deal_in_db.installer == hs_deal.get("installer"), - "installer mismatch", - ), - soft_assert( - deal_in_db.installer_handover == hs_deal.get("installer_handover"), - "installer_handover mismatch", - ), - soft_assert( - deal_in_db.lodgement_status == hs_deal.get("lodgement_status"), - "lodgement_status mismatch", - ), - soft_assert( - deal_in_db.measures_lodgement_date - == self._parse_hs_date(hs_deal.get("measures_lodgement_date")), - "measures_lodgement_date mismatch", - ), - soft_assert( - deal_in_db.lodgement_date - == self._parse_hs_date(hs_deal.get("lodgement_date")), - "lodgement_date mismatch", - ), - soft_assert( - deal_in_db.expected_commencement_date - == self._parse_hs_date(hs_deal.get("expected_commencement_date")), - "expected_commencement_date mismatch", - ), - soft_assert( - deal_in_db.surveyor == hs_deal.get("surveyor"), - "surveyor mismatch", - ), - soft_assert( - deal_in_db.confirmed_survey_date - == self._parse_hs_date(hs_deal.get("confirmed_survey_date")), - "confirmed_survey_date mismatch", - ), - soft_assert( - deal_in_db.confirmed_survey_time - == hs_deal.get("confirmed_survey_time"), - "confirmed_survey_time mismatch", - ), - soft_assert( - deal_in_db.surveyed_date - == self._parse_hs_date(hs_deal.get("surveyed_date")), - "surveyed_date mismatch", - ), - soft_assert( - deal_in_db.design_type == hs_deal.get("design_type"), - "design_type mismatch", - ), - ] - - # If discrepancies found, update from HubSpot - if not all(checks): - print( - f"❗ Discrepancies found for deal_id {deal_in_db.deal_id} — syncing with HubSpot." - ) - return False - - return True - def _update_existing_deal( self, existing: HubspotDealData, @@ -420,38 +221,36 @@ class HubspotDataToDb: ), "pre_sap": deal_data.get("pre_sap"), "coordinator": deal_data.get("coordinator"), - "mtp_completion_date": self._parse_hs_date( - deal_data.get("mtp_completion_date") - ), - "mtp_re_model_completion_date": self._parse_hs_date( + "mtp_completion_date": parse_hs_date(deal_data.get("mtp_completion_date")), + "mtp_re_model_completion_date": parse_hs_date( deal_data.get("mtp_re_model_completion_date") ), - "ioe_v3_completion_date": self._parse_hs_date( + "ioe_v3_completion_date": parse_hs_date( deal_data.get("ioe_v3_completion_date") ), "proposed_measures": deal_data.get("proposed_measures"), "approved_package": deal_data.get("approved_package"), "designer": deal_data.get("designer"), - "design_completion_date": self._parse_hs_date( + "design_completion_date": parse_hs_date( deal_data.get("design_completion_date") ), "actual_measures_installed": deal_data.get("actual_measures_installed"), "installer": deal_data.get("installer"), "installer_handover": deal_data.get("installer_handover"), "lodgement_status": deal_data.get("lodgement_status"), - "measures_lodgement_date": self._parse_hs_date( + "measures_lodgement_date": parse_hs_date( deal_data.get("measures_lodgement_date") ), - "lodgement_date": self._parse_hs_date(deal_data.get("lodgement_date")), - "expected_commencement_date": self._parse_hs_date( + "lodgement_date": parse_hs_date(deal_data.get("lodgement_date")), + "expected_commencement_date": parse_hs_date( deal_data.get("expected_commencement_date") ), "surveyor": deal_data.get("surveyor"), - "confirmed_survey_date": self._parse_hs_date( + "confirmed_survey_date": parse_hs_date( deal_data.get("confirmed_survey_date") ), "confirmed_survey_time": deal_data.get("confirmed_survey_time"), - "surveyed_date": self._parse_hs_date(deal_data.get("surveyed_date")), + "surveyed_date": parse_hs_date(deal_data.get("surveyed_date")), "design_type": deal_data.get("design_type"), }.items(): setattr(existing, attr, value or getattr(existing, attr)) @@ -491,38 +290,34 @@ class HubspotDataToDb: ), pre_sap=deal_data.get("pre_sap"), coordinator=deal_data.get("coordinator"), - mtp_completion_date=self._parse_hs_date( - deal_data.get("mtp_completion_date") - ), - mtp_re_model_completion_date=self._parse_hs_date( + mtp_completion_date=parse_hs_date(deal_data.get("mtp_completion_date")), + mtp_re_model_completion_date=parse_hs_date( deal_data.get("mtp_re_model_completion_date") ), - ioe_v3_completion_date=self._parse_hs_date( + ioe_v3_completion_date=parse_hs_date( deal_data.get("ioe_v3_completion_date") ), proposed_measures=deal_data.get("proposed_measures"), approved_package=deal_data.get("approved_package"), designer=deal_data.get("designer"), - design_completion_date=self._parse_hs_date( + design_completion_date=parse_hs_date( deal_data.get("design_completion_date") ), actual_measures_installed=deal_data.get("actual_measures_installed"), installer=deal_data.get("installer"), installer_handover=deal_data.get("installer_handover"), lodgement_status=deal_data.get("lodgement_status"), - measures_lodgement_date=self._parse_hs_date( + measures_lodgement_date=parse_hs_date( deal_data.get("measures_lodgement_date") ), - lodgement_date=self._parse_hs_date(deal_data.get("lodgement_date")), - expected_commencement_date=self._parse_hs_date( + lodgement_date=parse_hs_date(deal_data.get("lodgement_date")), + expected_commencement_date=parse_hs_date( deal_data.get("expected_commencement_date") ), surveyor=deal_data.get("surveyor"), - confirmed_survey_date=self._parse_hs_date( - deal_data.get("confirmed_survey_date") - ), + confirmed_survey_date=parse_hs_date(deal_data.get("confirmed_survey_date")), confirmed_survey_time=deal_data.get("confirmed_survey_time"), - surveyed_date=self._parse_hs_date(deal_data.get("surveyed_date")), + surveyed_date=parse_hs_date(deal_data.get("surveyed_date")), design_type=deal_data.get("design_type"), ) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index dd992243..42def3b2 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -1,6 +1,7 @@ from typing import Dict, List, Optional from backend.app.db.models.hubspot_deal_data import HubspotDealData +from etl.hubspot.utils import parse_hs_date class HubspotDealDiffer: @@ -18,7 +19,94 @@ class HubspotDealDiffer: new_listing: Optional[Dict[str, str]], old_deal: HubspotDealData, ) -> bool: - raise NotImplementedError + """ + Returns True if ANY difference exists between HubSpot data and DB. + Returns False if everything matches (i.e. no update needed). + """ + + # --- Deal ID --- + if str(old_deal.deal_id) != str(new_deal.get("hs_object_id")): + return True + + # --- Company --- + if new_company is not None: + if old_deal.company_id != new_company: + return True + + # --- Listing --- + hs_listing = new_listing or {} + + if old_deal.listing_id != hs_listing.get("listing_id"): + return True + + if old_deal.landlord_property_id != hs_listing.get("owner_property_id"): + return True + + if old_deal.uprn != hs_listing.get("national_uprn"): + return True + + # --- Field mappings --- + FIELD_MAP = { + "outcome": "outcome", + "dealstage": "dealstage", + "dealname": "dealname", + "project_code": "project_code", + "outcome_notes": "outcome_notes", + "major_condition_issue_description": "major_condition_issue_description", + "major_condition_issue_photos": "major_condition_issue_photos", + "coordination_status__stage_1_": "coordination_status", + "coordination_comments": "coordination_comments", + "retrofit_design_status": "design_status", + "pashub_link": "pashub_link", + "sharepoint_link": "sharepoint_link", + "dampmould_growth": "dampmould_growth", + "damp_mould_and_repairs_comments": "damp_mould_and_repairs_comments", + "pre_sap": "pre_sap", + "coordinator": "coordinator", + "proposed_measures": "proposed_measures", + "approved_package": "approved_package", + "designer": "designer", + "actual_measures_installed": "actual_measures_installed", + "installer": "installer", + "installer_handover": "installer_handover", + "lodgement_status": "lodgement_status", + "design_type": "design_type", + "surveyor": "surveyor", + } + + for hs_field, db_field in FIELD_MAP.items(): + old_value = getattr(old_deal, db_field) + new_value = new_deal.get(hs_field) + + if old_value != new_value: + return True + + # --- Date fields --- + date_fields = [ + ("mtp_completion_date", "mtp_completion_date"), + ("mtp_re_model_completion_date", "mtp_re_model_completion_date"), + ("ioe_v3_completion_date", "ioe_v3_completion_date"), + ("design_completion_date", "design_completion_date"), + ("measures_lodgement_date", "measures_lodgement_date"), + ("lodgement_date", "lodgement_date"), + ("expected_commencement_date", "expected_commencement_date"), + ("confirmed_survey_date", "confirmed_survey_date"), + ("surveyed_date", "surveyed_date"), + ] + + for hs_field, db_field in date_fields: + old_value = getattr(old_deal, db_field) + new_value = parse_hs_date(new_deal.get(hs_field)) + + if old_value != new_value: + return True + + # --- Time field --- + if old_deal.confirmed_survey_time != new_deal.get("confirmed_survey_time"): + return True + + # No differences found + return False @staticmethod def check_for_pashub_trigger( diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 826d7e05..5d5b2b26 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -63,21 +63,3 @@ def handler(body: dict[str, Any], context: Any) -> None: ): # TODO: trigger pashub file fetcher return - - # if db_deal: - # db_client.update_deal_with_checks(db_deal, hubspot_client) - # else: - # hubspot_deal: Dict[str, str] - # company: Optional[str] - # listing: Optional[dict[str, str]] - - # hubspot_deal, company, listing = ( - # hubspot_client.get_deal_and_company_and_listing(hubspot_deal_id) - # ) - - # if company: - # company_data: CompanyData = hubspot_client.get_company_information(company) - # db_client: HubspotDataToDb = HubspotDataToDb() - # db_client.upsert_company(company_data) - - # db_client.upsert_deal(hubspot_deal, company, listing, hubspot_client) diff --git a/etl/hubspot/utils.py b/etl/hubspot/utils.py new file mode 100644 index 00000000..9fbeae62 --- /dev/null +++ b/etl/hubspot/utils.py @@ -0,0 +1,11 @@ +from datetime import datetime +from typing import Optional + + +def parse_hs_date(value: Optional[str]) -> Optional[datetime]: + if not value: + return None + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")) + except ValueError: + return None From f572dfd2b316c17636061c72d323803394f0343c Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 12:53:48 +0000 Subject: [PATCH 30/43] trigger pashub to ara lambda if necessary --- etl/hubspot/scripts/scraper/main.py | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 5d5b2b26..18e425a4 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -1,3 +1,5 @@ +import json +import boto3 from typing import Any, Dict, Optional from etl.hubspot.hubspotClient import HubspotClient @@ -8,6 +10,9 @@ from etl.hubspot.hubspot_trigger_orchestrator_trigger_request import ( ) from backend.utils.subtasks import task_handler from backend.app.db.models.hubspot_deal_data import HubspotDealData +from utils.logger import setup_logger + +logger = setup_logger() @task_handler() @@ -15,6 +20,9 @@ def handler(body: dict[str, Any], context: Any) -> None: db_client = HubspotDataToDb() hubspot_client = HubspotClient() + sqs_client = boto3.client("sqs") + PASHUB_TRIGGER_QUEUE_URL = "pashub_to_ara-queue-dev" # TODO: get from env var + payload = HubspotTriggerOrchestratorTriggerRequest.model_validate(body) hubspot_deal_id: str = payload.hubspot_deal_id @@ -40,6 +48,9 @@ def handler(body: dict[str, Any], context: Any) -> None: db_client.upsert_deal(hubspot_deal, company, listing, hubspot_client) return + deal_unchanged = True + + # Deal already in db, check whether anything has changed if HubspotDealDiffer.check_for_db_update_trigger( new_deal=hubspot_deal, new_company=company, @@ -53,6 +64,9 @@ def handler(body: dict[str, Any], context: Any) -> None: hs_company_id=company, hs_listing=listing, ) + deal_unchanged = False + + if deal_unchanged: return # ============================== @@ -62,4 +76,21 @@ def handler(body: dict[str, Any], context: Any) -> None: new_deal=hubspot_deal, old_deal=db_deal ): # TODO: trigger pashub file fetcher + message_body: Dict[str, Optional[str]] = { + "pashub_link": hubspot_deal["pashub_link"], + "address": None, # can we get this? + "sharepoint_link": hubspot_deal["sharepoint_link"], + "uprn": hubspot_deal["national_uprn"], + "landlord_property_id": hubspot_deal["owner_property_id"], + "deal_stage": hubspot_deal["deal_stage"], + } + + response = sqs_client.send_message( + QueueUrl=PASHUB_TRIGGER_QUEUE_URL, MessageBody=json.dumps(message_body) + ) + + logger.info( + f"Sent message to Pashub To Ara queue. MessageId: {response['MessageId']}" + ) + return From c718e36c1180a0fd2413a6a5d6cf93562eebc462 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 12:54:01 +0000 Subject: [PATCH 31/43] trigger pashub to ara lambda if necessary --- etl/hubspot/scripts/scraper/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 18e425a4..0bc285a7 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -92,5 +92,3 @@ def handler(body: dict[str, Any], context: Any) -> None: logger.info( f"Sent message to Pashub To Ara queue. MessageId: {response['MessageId']}" ) - - return From 2b93e06629e146c8d536d019414946cf958bf405 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 13:07:13 +0000 Subject: [PATCH 32/43] add todo --- etl/hubspot/hubspotClient.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index 8053b41f..6bdf71ed 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -230,7 +230,9 @@ class HubspotClient: self.logger.info(f"Listing info for deal {deal_id}: {listing_info}") return listing_info - def from_deal_id_get_info(self, deal_id: str) -> dict[str, str]: + def from_deal_id_get_info( + self, deal_id: str + ) -> dict[str, str]: # TODO: add dataclass for this deals_api: DealsBasicApi = self.client.crm.deals.basic_api # type: ignore[reportUnknownMemberType] deal: HubspotObject = self._call_with_retry( From ff0027dbc46f04c2383a2a8630f476c0c9b071f8 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 13:09:48 +0000 Subject: [PATCH 33/43] remove todo --- etl/hubspot/scripts/scraper/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 0bc285a7..cec03da8 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -75,7 +75,6 @@ def handler(body: dict[str, Any], context: Any) -> None: if HubspotDealDiffer.check_for_pashub_trigger( new_deal=hubspot_deal, old_deal=db_deal ): - # TODO: trigger pashub file fetcher message_body: Dict[str, Optional[str]] = { "pashub_link": hubspot_deal["pashub_link"], "address": None, # can we get this? From 4425b28d4fd15e8ab23fc20abcb25b1728dcc085 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 14:24:52 +0000 Subject: [PATCH 34/43] address review comments and add logging --- etl/hubspot/hubspotDataTodB.py | 24 +++---- etl/hubspot/scripts/scraper/main.py | 99 ++++++++++++++++------------- 2 files changed, 64 insertions(+), 59 deletions(-) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 210c9593..65fad572 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -78,22 +78,6 @@ class HubspotDataToDb: .one_or_none() ) - def _parse_hs_date(self, value: Optional[str]) -> Optional[datetime]: - if not value: - return None - try: - return datetime.fromisoformat(value.replace("Z", "+00:00")) - except ValueError: - return None - - def _sha256(self, file_path: str) -> str: - """Compute SHA-256 checksum of a file.""" - sha256 = hashlib.sha256() - with open(file_path, "rb") as f: - for chunk in iter(lambda: f.read(8192), b""): - sha256.update(chunk) - return sha256.hexdigest() - def update_deal_with_checks( self, deal_in_db: HubspotDealData, @@ -185,6 +169,14 @@ class HubspotDataToDb: session.refresh(new_record) return new_record + def _sha256(self, file_path: str) -> str: + """Compute SHA-256 checksum of a file.""" + sha256 = hashlib.sha256() + with open(file_path, "rb") as f: + for chunk in iter(lambda: f.read(8192), b""): + sha256.update(chunk) + return sha256.hexdigest() + def _update_existing_deal( self, existing: HubspotDealData, diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index cec03da8..8fa71bf7 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -38,56 +38,69 @@ def handler(body: dict[str, Any], context: Any) -> None: hubspot_deal_id ) + deal_changed = False if not db_deal: # New hubspot deal, no diffing to do + logger.info(f"New HubSpot deal of ID {hubspot_deal_id}. Loading to database...") if company: company_data: CompanyData = hubspot_client.get_company_information(company) db_client: HubspotDataToDb = HubspotDataToDb() db_client.upsert_company(company_data) db_client.upsert_deal(hubspot_deal, company, listing, hubspot_client) - return - - deal_unchanged = True - - # Deal already in db, check whether anything has changed - if HubspotDealDiffer.check_for_db_update_trigger( - new_deal=hubspot_deal, - new_company=company, - new_listing=listing, - old_deal=db_deal, - ): - db_client.update_deal_with_checks( - deal_in_db=db_deal, - hubspot_client=hubspot_client, - hs_deal=hubspot_deal, - hs_company_id=company, - hs_listing=listing, - ) - deal_unchanged = False - - if deal_unchanged: - return - - # ============================== - # Orchestration of other lambdas - # ============================== - if HubspotDealDiffer.check_for_pashub_trigger( - new_deal=hubspot_deal, old_deal=db_deal - ): - message_body: Dict[str, Optional[str]] = { - "pashub_link": hubspot_deal["pashub_link"], - "address": None, # can we get this? - "sharepoint_link": hubspot_deal["sharepoint_link"], - "uprn": hubspot_deal["national_uprn"], - "landlord_property_id": hubspot_deal["owner_property_id"], - "deal_stage": hubspot_deal["deal_stage"], - } - - response = sqs_client.send_message( - QueueUrl=PASHUB_TRIGGER_QUEUE_URL, MessageBody=json.dumps(message_body) - ) - + else: + # Deal already in db, check whether anything has changed logger.info( - f"Sent message to Pashub To Ara queue. MessageId: {response['MessageId']}" + f"HubSpot deal {hubspot_deal_id} already in database. Checking for changes..." ) + if HubspotDealDiffer.check_for_db_update_trigger( + new_deal=hubspot_deal, + new_company=company, + new_listing=listing, + old_deal=db_deal, + ): + logger.info( + f"Deal {hubspot_deal_id} has been changed, updating database..." + ) + db_client.update_deal_with_checks( + deal_in_db=db_deal, + hubspot_client=hubspot_client, + hs_deal=hubspot_deal, + hs_company_id=company, + hs_listing=listing, + ) + deal_changed = True + + if not deal_changed: + logger.info(f"No changes to deal {hubspot_deal_id}") + return + + # ============================== + # Orchestration of other lambdas + # ============================== + if HubspotDealDiffer.check_for_pashub_trigger( + new_deal=hubspot_deal, old_deal=db_deal + ): + logger.info( + f"Triggering Pas Hub file fetcher for HubSpot deal ID {hubspot_deal_id}" + ) + message_body: Dict[str, Optional[str]] = { + "pashub_link": hubspot_deal["pashub_link"], + "address": None, # potentially available from Listing, leave as None for now + "sharepoint_link": hubspot_deal["sharepoint_link"], + "uprn": hubspot_deal["national_uprn"], + "landlord_property_id": hubspot_deal["owner_property_id"], + "deal_stage": hubspot_deal["deal_stage"], + } + + response = sqs_client.send_message( + QueueUrl=PASHUB_TRIGGER_QUEUE_URL, MessageBody=json.dumps(message_body) + ) + + logger.info( + f"Sent message to Pashub To Ara queue. MessageId: {response['MessageId']}" + ) + else: + logger.info( + f"Not Triggering PasHub file fetcher for HubSpot deal ID {hubspot_deal_id}" + ) From bd891a7a85365b4fca64ad95ebc7c4fb0ed4b82a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 14:41:14 +0000 Subject: [PATCH 35/43] address JTK review comments --- backend/app/db/models/hubspot_deal_data.py | 8 ++++---- etl/hubspot/hubspotDataTodB.py | 2 +- etl/hubspot/hubspot_deal_differ.py | 6 ++---- etl/hubspot/scripts/onboarding/new_organisation.py | 2 +- etl/hubspot/scripts/scraper/main.py | 2 +- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/backend/app/db/models/hubspot_deal_data.py b/backend/app/db/models/hubspot_deal_data.py index 1d7607e0..758f688d 100644 --- a/backend/app/db/models/hubspot_deal_data.py +++ b/backend/app/db/models/hubspot_deal_data.py @@ -65,8 +65,8 @@ class HubspotDealData(SQLModel, table=True): server_default=text("(NOW() AT TIME ZONE 'utc')"), nullable=False, ), - default=None, # Nullable in db but optional here as value is set on db save for new record - ) + default=func.now(), + ) # Nullable in db but optional here as value is set on db save for new record updated_at: Optional[datetime] = Field( sa_column=Column( @@ -75,5 +75,5 @@ class HubspotDealData(SQLModel, table=True): onupdate=func.now(), nullable=False, ), - default=None, # Nullable in db but optional here as value is set on db save for new record - ) + default=func.now(), + ) # Nullable in db but optional here as value is set on db save for new record diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 65fad572..a50c99da 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -31,7 +31,7 @@ class HubspotDataToDb: records = self.read_org_table(limit) return [org.name for org in records if org.name] - def upsert_company(self, company_data: CompanyData) -> Organisation: + def upsert_organisation(self, company_data: CompanyData) -> Organisation: """Upserts a company record. Updates if hubspot_company_id exists, otherwise creates new.""" with db_read_session() as session: hubspot_id = company_data.get("hs_object_id") diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index 42def3b2..4db303ab 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -8,6 +8,7 @@ class HubspotDealDiffer: COORDINATION_COMPLETE: List[str] = [ "v1 ioe/mtp complete", "v2 ioe/mtp complete", + "v3 ioe/mtp complete", ] RETROFIT_DESIGN_COMPLETE = "uploaded" LODGEMENT_COMPLETE: List[str] = ["lodgement complete", "measures lodged"] @@ -72,6 +73,7 @@ class HubspotDealDiffer: "lodgement_status": "lodgement_status", "design_type": "design_type", "surveyor": "surveyor", + "confirmed_survey_time": "confirmed_survey_time", } for hs_field, db_field in FIELD_MAP.items(): @@ -101,10 +103,6 @@ class HubspotDealDiffer: if old_value != new_value: return True - # --- Time field --- - if old_deal.confirmed_survey_time != new_deal.get("confirmed_survey_time"): - return True - # No differences found return False diff --git a/etl/hubspot/scripts/onboarding/new_organisation.py b/etl/hubspot/scripts/onboarding/new_organisation.py index f8c6ba7a..0785949a 100644 --- a/etl/hubspot/scripts/onboarding/new_organisation.py +++ b/etl/hubspot/scripts/onboarding/new_organisation.py @@ -22,7 +22,7 @@ companies_to_add_or_ensure_it_exists = [ for company in companies_to_add_or_ensure_it_exists: company_info: CompanyData = hubspot.get_company_information(company.value) - dbRead.upsert_company(company_info) + dbRead.upsert_organisation(company_info) dbRead = HubspotDataToDb() diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 8fa71bf7..31945705 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -45,7 +45,7 @@ def handler(body: dict[str, Any], context: Any) -> None: if company: company_data: CompanyData = hubspot_client.get_company_information(company) db_client: HubspotDataToDb = HubspotDataToDb() - db_client.upsert_company(company_data) + db_client.upsert_organisation(company_data) db_client.upsert_deal(hubspot_deal, company, listing, hubspot_client) else: From 62fe46adc43d5c9ae98ad3b9de798bfaa1a82374 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 15:00:36 +0000 Subject: [PATCH 36/43] get queue name from settings --- .github/workflows/deploy_terraform.yml | 2 +- backend/app/config.py | 1 + etl/hubspot/scripts/scraper/main.py | 3 ++- .../terraform/lambda/hubspot_deal_etl/main.tf | 10 ++++++++++ .../terraform/lambda/pashub_to_ara/outputs.tf | 4 ++++ 5 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 infrastructure/terraform/lambda/pashub_to_ara/outputs.tf diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index fccc6da4..22f16fee 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -505,7 +505,7 @@ jobs: # Deploy Hubspot ETL Lambda # ============================================================ hubspot_etl_lambda: - needs: [hubspot_etl_image, determine_stage] + needs: [hubspot_etl_image, determine_stage, pashub_to_ara_lambda] uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: hubspot-etl-to-ara diff --git a/backend/app/config.py b/backend/app/config.py index 80a2d46a..9532ddd6 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -38,6 +38,7 @@ class Settings(BaseSettings): PLAN_TRIGGER_BUCKET: str = "changeme" ENGINE_SQS_URL: str = "changeme" CATEGORISATION_SQS_URL: str = "changeme" + PASHUB_TO_ARA_SQS_URL: str = "changeme" # Third parties EPC_AUTH_TOKEN: str = "changeme" diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 31945705..ea79bc18 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -2,6 +2,7 @@ import json import boto3 from typing import Any, Dict, Optional +from backend.app.config import get_settings from etl.hubspot.hubspotClient import HubspotClient from etl.hubspot.hubspotDataTodB import CompanyData, HubspotDataToDb from etl.hubspot.hubspot_deal_differ import HubspotDealDiffer @@ -21,7 +22,7 @@ def handler(body: dict[str, Any], context: Any) -> None: hubspot_client = HubspotClient() sqs_client = boto3.client("sqs") - PASHUB_TRIGGER_QUEUE_URL = "pashub_to_ara-queue-dev" # TODO: get from env var + PASHUB_TRIGGER_QUEUE_URL = get_settings().PASHUB_TO_ARA_SQS_URL payload = HubspotTriggerOrchestratorTriggerRequest.model_validate(body) hubspot_deal_id: str = payload.hubspot_deal_id diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf index 6ce7a386..518e1e05 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf @@ -7,6 +7,14 @@ data "terraform_remote_state" "shared" { } } +data "terraform_remote_state" "pashub_to_ara" { + backend = "s3" + config = { + bucket = "pashub-to-ara-terraform-state" + key = "ev:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} data "aws_secretsmanager_secret_version" "db_credentials" { secret_id = "${var.stage}/assessment_model/db_credentials" @@ -39,6 +47,8 @@ module "hubspot_deal_etl" { DB_NAME = var.db_name DB_PORT = var.db_port HUBSPOT_API_KEY = var.hubspot_api_key + + PASHUB_TO_ARA_SQS_URL = data.terraform_remote_state.pashub_to_ara.pashhub_to_ara_queue_url } } diff --git a/infrastructure/terraform/lambda/pashub_to_ara/outputs.tf b/infrastructure/terraform/lambda/pashub_to_ara/outputs.tf new file mode 100644 index 00000000..738aa4fc --- /dev/null +++ b/infrastructure/terraform/lambda/pashub_to_ara/outputs.tf @@ -0,0 +1,4 @@ +output "pashhub_to_ara_queue_url" { + value = module.lambda.queue_url + description = "URL of the PasHub to Ara SQS queue" +} From f1f3b84cbdadcecd4010658f0b119295a805e4ee Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 15:49:13 +0000 Subject: [PATCH 37/43] simplify photo upload logic --- etl/hubspot/hubspotDataTodB.py | 89 +++++------------------------ etl/hubspot/scripts/scraper/main.py | 9 ++- 2 files changed, 19 insertions(+), 79 deletions(-) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index a50c99da..6763f19c 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -78,53 +78,6 @@ class HubspotDataToDb: .one_or_none() ) - def update_deal_with_checks( - self, - deal_in_db: HubspotDealData, - hubspot_client: HubspotClient, - hs_deal: Dict[str, str], - hs_company_id: Optional[str], - hs_listing: Optional[Dict[str, str]], - ) -> bool: - """ - Updates deal in database and handles major_condition_issue_photos file upload to S3 with integrity check. - """ - self.upsert_deal(hs_deal, hs_company_id, hs_listing, hubspot_client) - - # Handle photo upload if it exists but S3 URL is missing - if self._needs_photo_upload(deal_in_db): - print( - f"🖼️ Found photo for deal_id {deal_in_db.deal_id} — uploading to S3..." - ) - - photo_url = hs_deal.get("major_condition_issue_photos") - - if photo_url: - self._upload_photo_to_s3( - deal_in_db, - photo_url, - hubspot_client, - verify=True, - ) - - # persist change - with db_read_session() as session: - db_record = session.get(HubspotDealData, deal_in_db.id) - db_record.major_condition_issue_evidence_s3_url = ( - deal_in_db.major_condition_issue_evidence_s3_url - ) - session.add(db_record) - session.commit() - - return False - else: - print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}") - - else: - print(f"✅ No update or upload required for deal_id {deal_in_db.deal_id}.") - - return True - def upsert_deal( self, deal_data: Dict[str, str], @@ -169,14 +122,6 @@ class HubspotDataToDb: session.refresh(new_record) return new_record - def _sha256(self, file_path: str) -> str: - """Compute SHA-256 checksum of a file.""" - sha256 = hashlib.sha256() - with open(file_path, "rb") as f: - for chunk in iter(lambda: f.read(8192), b""): - sha256.update(chunk) - return sha256.hexdigest() - def _update_existing_deal( self, existing: HubspotDealData, @@ -315,18 +260,20 @@ class HubspotDataToDb: def _handle_existing_photo_upload( self, - existing: HubspotDealData, + existing_deal: HubspotDealData, hubspot_client: HubspotClient, ): - if self._needs_photo_upload(existing): - fresh_deal = hubspot_client.from_deal_id_get_info(existing.deal_id) - photo_url = fresh_deal.get("major_condition_issue_photos") + # if self._needs_photo_upload(existing): - if not photo_url: - print(f"⚠️ Photo URL missing for deal_id {existing.deal_id}") - return + fresh_deal = hubspot_client.from_deal_id_get_info(existing_deal.deal_id) + fresh_photo_url = fresh_deal.get("major_condition_issue_photos") - self._upload_photo_to_s3(existing, photo_url, hubspot_client) + if not fresh_photo_url: + print(f"⚠️ Photo URL missing for deal_id {existing_deal.deal_id}") + return + + if fresh_photo_url != existing_deal.major_condition_issue_photos: + self._upload_photo_to_s3(existing_deal, fresh_photo_url, hubspot_client) def _handle_new_photo_upload( self, @@ -343,12 +290,11 @@ class HubspotDataToDb: def _upload_photo_to_s3( self, record: HubspotDealData, - photo_url: str, + hubspot_photo_url: str, hubspot_client: HubspotClient, - verify: bool = False, ): try: - local_file = hubspot_client.download_file_from_url(photo_url) + local_file = hubspot_client.download_file_from_url(hubspot_photo_url) s3_url = self.s3.upload_file( local_file, @@ -356,11 +302,6 @@ class HubspotDataToDb: prefix="hubspot/awaabs_law_evidence/", ) - if verify: - downloaded = self.s3.download_from_url(s3_url) - if self._sha256(local_file) != self._sha256(downloaded): - raise ValueError("File integrity check failed after S3 upload.") - record.major_condition_issue_evidence_s3_url = s3_url except Exception as e: @@ -369,8 +310,8 @@ class HubspotDataToDb: if "local_file" in locals() and os.path.exists(local_file): os.remove(local_file) - def _needs_photo_upload(self, deal: HubspotDealData) -> bool: + def _needs_photo_upload(self, old_deal: HubspotDealData) -> bool: return bool( - deal.major_condition_issue_photos - and not deal.major_condition_issue_evidence_s3_url + old_deal.major_condition_issue_photos + and not old_deal.major_condition_issue_evidence_s3_url ) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index ea79bc18..f41ef154 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -63,12 +63,11 @@ def handler(body: dict[str, Any], context: Any) -> None: logger.info( f"Deal {hubspot_deal_id} has been changed, updating database..." ) - db_client.update_deal_with_checks( - deal_in_db=db_deal, + db_client.upsert_deal( + deal_data=hubspot_deal, + company=company, + listing=listing, hubspot_client=hubspot_client, - hs_deal=hubspot_deal, - hs_company_id=company, - hs_listing=listing, ) deal_changed = True From a495c930a1bdc0510f7339890467cc0da050268f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 15:49:37 +0000 Subject: [PATCH 38/43] remove unused import --- etl/hubspot/hubspotDataTodB.py | 1 - 1 file changed, 1 deletion(-) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 6763f19c..c24d5813 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -1,4 +1,3 @@ -import hashlib import os from sqlmodel import select from datetime import datetime, timezone From 757c2241132a6796c5c75133bd5b8c14466340c5 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 15:54:33 +0000 Subject: [PATCH 39/43] add image update logging --- etl/hubspot/hubspotDataTodB.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index c24d5813..b7171290 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -10,6 +10,10 @@ from etl.hubspot.s3_uploader import S3Uploader from backend.app.db.connection import db_read_session from backend.app.db.models.organisation import Organisation from etl.hubspot.utils import parse_hs_date +from utils.logger import setup_logger + + +logger = setup_logger() class HubspotDataToDb: @@ -272,7 +276,12 @@ class HubspotDataToDb: return if fresh_photo_url != existing_deal.major_condition_issue_photos: + logger.info( + f"Hubspot image URL changed from {existing_deal.major_condition_issue_photos} to {fresh_photo_url}" + ) self._upload_photo_to_s3(existing_deal, fresh_photo_url, hubspot_client) + else: + logger.info(f"Hubspot iamge URL unchanged: {fresh_photo_url}") def _handle_new_photo_upload( self, From 3123723e8b284811d5459befdb277ed2b77e695a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 16:25:44 +0000 Subject: [PATCH 40/43] =?UTF-8?q?differ=20handles=20missing=20timezone=20f?= =?UTF-8?q?rom=20hubspot=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspotDataTodB.py | 6 ++--- etl/hubspot/scripts/scraper/main.py | 7 +++++ etl/hubspot/tests/test_hubspot_deal_differ.py | 27 ++++++++++++++++++- 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index b7171290..9756833b 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -101,11 +101,11 @@ class HubspotDataToDb: existing = session.exec(statement).first() if existing: + self._handle_existing_photo_upload(existing, hubspot_client) + print(f"🔄 Updating existing deal (deal_id={deal_id})") self._update_existing_deal(existing, deal_data, listing, company) - self._handle_existing_photo_upload(existing, hubspot_client) - session.add(existing) session.commit() session.refresh(existing) @@ -281,7 +281,7 @@ class HubspotDataToDb: ) self._upload_photo_to_s3(existing_deal, fresh_photo_url, hubspot_client) else: - logger.info(f"Hubspot iamge URL unchanged: {fresh_photo_url}") + logger.info(f"Hubspot image URL unchanged: {fresh_photo_url}") def _handle_new_photo_upload( self, diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index f41ef154..d754cbb1 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -104,3 +104,10 @@ def handler(body: dict[str, Any], context: Any) -> None: logger.info( f"Not Triggering PasHub file fetcher for HubSpot deal ID {hubspot_deal_id}" ) + + print("done") + + +if __name__ == "__main__": + handler({"hubspot_deal_id": "371470706915"}, "") + print("beep") diff --git a/etl/hubspot/tests/test_hubspot_deal_differ.py b/etl/hubspot/tests/test_hubspot_deal_differ.py index 74d3f057..9f41a5e6 100644 --- a/etl/hubspot/tests/test_hubspot_deal_differ.py +++ b/etl/hubspot/tests/test_hubspot_deal_differ.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone from typing import Any, Dict import uuid @@ -352,6 +352,31 @@ def test_db_update_trigger__company_changed__returns_true() -> None: assert result is True +def test_db_update_trigger__missing_hubspot_timezone__returns_false() -> None: + deal_id = uuid.uuid4() + + old_deal = make_old_deal( + id=deal_id, + design_completion_date=datetime(2025, 11, 3, 0, 0, tzinfo=timezone.utc), + ) + + new_deal = make_new_deal( + deal_id, + design_completion_date=datetime(2025, 11, 3, 0, 0), + ) + + new_company = "new_company" + + result = HubspotDealDiffer.check_for_db_update_trigger( + new_deal=new_deal, + new_company=new_company, + new_listing=None, + old_deal=old_deal, + ) + + assert result is False + + def test_db_update_trigger__listing_changed__returns_true() -> None: deal_id = uuid.uuid4() From 9852aa2809ad61667da39c2d612cadb79d55f9b2 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Thu, 9 Apr 2026 16:40:47 +0000 Subject: [PATCH 41/43] =?UTF-8?q?differ=20handles=20missing=20timezone=20f?= =?UTF-8?q?rom=20hubspot=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 4 ++++ etl/hubspot/tests/test_hubspot_deal_differ.py | 7 +++---- etl/hubspot/utils.py | 9 +++++++-- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index 4db303ab..b95b544c 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -103,6 +103,10 @@ class HubspotDealDiffer: if old_value != new_value: return True + # --- Time field --- + if old_deal.confirmed_survey_time != new_deal.get("confirmed_survey_time"): + return True + # No differences found return False diff --git a/etl/hubspot/tests/test_hubspot_deal_differ.py b/etl/hubspot/tests/test_hubspot_deal_differ.py index 9f41a5e6..69f7668b 100644 --- a/etl/hubspot/tests/test_hubspot_deal_differ.py +++ b/etl/hubspot/tests/test_hubspot_deal_differ.py @@ -362,14 +362,13 @@ def test_db_update_trigger__missing_hubspot_timezone__returns_false() -> None: new_deal = make_new_deal( deal_id, - design_completion_date=datetime(2025, 11, 3, 0, 0), + hs_object_id="1", + design_completion_date=datetime(2025, 11, 3, 0, 0).isoformat(), ) - new_company = "new_company" - result = HubspotDealDiffer.check_for_db_update_trigger( new_deal=new_deal, - new_company=new_company, + new_company=None, new_listing=None, old_deal=old_deal, ) diff --git a/etl/hubspot/utils.py b/etl/hubspot/utils.py index 9fbeae62..b7331f94 100644 --- a/etl/hubspot/utils.py +++ b/etl/hubspot/utils.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone from typing import Optional @@ -6,6 +6,11 @@ def parse_hs_date(value: Optional[str]) -> Optional[datetime]: if not value: return None try: - return datetime.fromisoformat(value.replace("Z", "+00:00")) + dt = datetime.fromisoformat(value.replace("Z", "+00:00")) + + if dt.tzinfo is None: + return dt.replace(tzinfo=timezone.utc) + + return dt.astimezone(timezone.utc) except ValueError: return None From 06503cd989b472b99627ff7c1aca6760cf5a984f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 10 Apr 2026 10:23:38 +0000 Subject: [PATCH 42/43] correct use of terraform state and fix typo --- infrastructure/terraform/lambda/hubspot_deal_etl/main.tf | 2 +- infrastructure/terraform/lambda/pashub_to_ara/outputs.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf index 518e1e05..516ec282 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf @@ -48,7 +48,7 @@ module "hubspot_deal_etl" { DB_PORT = var.db_port HUBSPOT_API_KEY = var.hubspot_api_key - PASHUB_TO_ARA_SQS_URL = data.terraform_remote_state.pashub_to_ara.pashhub_to_ara_queue_url + PASHUB_TO_ARA_SQS_URL = data.terraform_remote_state.pashub_to_ara.outputs.pashub_to_ara_queue_url } } diff --git a/infrastructure/terraform/lambda/pashub_to_ara/outputs.tf b/infrastructure/terraform/lambda/pashub_to_ara/outputs.tf index 738aa4fc..d44b8763 100644 --- a/infrastructure/terraform/lambda/pashub_to_ara/outputs.tf +++ b/infrastructure/terraform/lambda/pashub_to_ara/outputs.tf @@ -1,4 +1,4 @@ -output "pashhub_to_ara_queue_url" { +output "pashub_to_ara_queue_url" { value = module.lambda.queue_url description = "URL of the PasHub to Ara SQS queue" } From 9378c417e1240abd8d51d3a3a3d1961728175bb9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 10 Apr 2026 13:01:37 +0000 Subject: [PATCH 43/43] fix typo in terraform state key --- infrastructure/terraform/lambda/hubspot_deal_etl/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf index 516ec282..e8762337 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf @@ -11,7 +11,7 @@ data "terraform_remote_state" "pashub_to_ara" { backend = "s3" config = { bucket = "pashub-to-ara-terraform-state" - key = "ev:/${var.stage}/terraform.tfstate" + key = "env:/${var.stage}/terraform.tfstate" region = "eu-west-2" } }