skip file if already processed according to db

This commit is contained in:
Daniel Roth 2026-04-07 14:34:33 +00:00
parent 849a272974
commit 15f1fde16a
5 changed files with 71 additions and 4 deletions

View file

@ -0,0 +1,25 @@
from typing import Optional
from sqlalchemy import select
from backend.app.db.connection import db_read_session
from backend.app.db.models.uploaded_file import (
FileSourceEnum,
FileTypeEnum,
UploadedFile,
)
def get_uploaded_file_by_listing_type_and_source(
    hubspot_listing_id: int,
    file_type: FileTypeEnum,
    file_source: FileSourceEnum,
) -> Optional[UploadedFile]:
    """Look up the uploaded-file record for a listing, file type and source.

    Args:
        hubspot_listing_id: Compared against ``UploadedFile.hubspot_listing_id``.
        file_type: Compared against ``UploadedFile.file_type``.
        file_source: Compared against ``UploadedFile.file_source``.

    Returns:
        The result of ``one_or_none()`` on the executed query: the single
        match, or ``None`` when no row satisfies all three conditions.

    NOTE(review): ``one_or_none()`` on SQLAlchemy-style results raises when
    more than one row matches -- confirm that duplicates cannot exist for a
    given (listing, type, source) combination.
    """
    # All three predicates must hold; ``where`` combines them with AND.
    criteria = (
        UploadedFile.hubspot_listing_id == hubspot_listing_id,
        UploadedFile.file_type == file_type,
        UploadedFile.file_source == file_source,
    )
    query = select(UploadedFile).where(*criteria)

    with db_read_session() as session:
        result = session.exec(query)
        return result.one_or_none()

View file

@ -14,6 +14,8 @@ class FileTypeEnum(enum.Enum):
PAR_PHOTO_PACK = "par_photo_pack"
PAS_2023_PROPERTY = "pas_2023_property"
PAS_2023_OCCUPANCY = "pas_2023_occupancy"
ECMK_SITE_NOTE = "ecmk_site_note"
ECMK_RD_SAP_SITE_NOTE = "ecmk_rd_sap_site_note"
class FileSourceEnum(enum.Enum):

View file

@ -0,0 +1 @@
,daniel,daniel-Dell-15-DC15250,07.04.2026 11:47,/home/daniel/snap/onlyoffice-desktopeditors/1067/.local/share/onlyoffice;

View file

@ -8,6 +8,10 @@ from playwright.sync_api import (
BrowserContext,
)
from backend.app.db.functions.uploaded_files_functions import (
get_uploaded_file_by_listing_type_and_source,
)
from backend.app.db.models.uploaded_file import FileSourceEnum, FileTypeEnum
from backend.ecmk_fetcher.address_list import (
PropertyRow,
extract_addresses_from_spreadsheet,
@ -20,7 +24,11 @@ from backend.ecmk_fetcher.browser import (
go_to_next_page,
login,
)
from backend.ecmk_fetcher.reports import REPORT_TYPES, build_property_id
from backend.ecmk_fetcher.reports import (
REPORT_TYPES,
build_property_id,
map_report_type_to_db_file_type,
)
from backend.ecmk_fetcher.sharepoint import upload_file_to_sharepoint
from utils.logger import setup_logger
from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient
@ -48,6 +56,8 @@ def run_job() -> None:
sharepoint_base_path: str = "/Projects/Southern Housing/SH-SURV-26-001/Assessments"
# s3_bucket: str = "retrofit-energy-assessments-dev"
with sync_playwright() as p:
browser: Browser = p.chromium.launch(headless=True)
context: BrowserContext = browser.new_context()
@ -92,12 +102,29 @@ def run_job() -> None:
sharepoint_address: str = property_row.address
# Check whether files have already been processed before continuing with this property
# hubspot_listing_id: str = property_row.listing_id
go_to_assessment_details(page, row)
for report_type in REPORT_TYPES:
hubspot_listing_id: str = property_row.listing_id
try:
db_file_type: FileTypeEnum = (
map_report_type_to_db_file_type(report_type)
)
except ValueError:
logger.error(
f"Unknown report type {report_type}, skipping file"
)
continue
if get_uploaded_file_by_listing_type_and_source(
hubspot_listing_id=int(hubspot_listing_id),
file_type=db_file_type,
file_source=FileSourceEnum.ECMK,
):
logger.debug("File already uploaded to s3, skipping")
continue
file_path: str | None = download_with_retry(
page, report_type
)

View file

@ -1,5 +1,7 @@
from enum import Enum
from backend.app.db.models.uploaded_file import FileTypeEnum
class FileDownloadButtonType(Enum):
ASSESSOR_HUB_SITENOTE_REPORT = 11
@ -15,6 +17,16 @@ REPORT_TYPES = [
]
def map_report_type_to_db_file_type(report_type: int) -> FileTypeEnum:
    """Translate a download-button report type into its ``FileTypeEnum``.

    Args:
        report_type: Value compared against the ``.value`` of the
            ``FileDownloadButtonType`` members handled below.

    Returns:
        ``FileTypeEnum.ECMK_SITE_NOTE`` for the assessor-hub site note report,
        ``FileTypeEnum.ECMK_RD_SAP_SITE_NOTE`` for the site note report.

    Raises:
        ValueError: If ``report_type`` equals neither handled value.
    """
    if report_type == FileDownloadButtonType.ASSESSOR_HUB_SITENOTE_REPORT.value:
        return FileTypeEnum.ECMK_SITE_NOTE
    if report_type == FileDownloadButtonType.SITENOTE_REPORT.value:
        return FileTypeEnum.ECMK_RD_SAP_SITE_NOTE
    # Anything else has no database file type mapped to it.
    raise ValueError("Unknown report type")
def build_report_selector(report_type: int) -> str:
    """Return the CSS selector for the download link of ``report_type``.

    The selector targets ``a.download-report-btn`` elements whose
    ``data-report-type`` attribute equals the given value.
    """
    selector_template = "a.download-report-btn[data-report-type='{}']"
    return selector_template.format(report_type)