def upload_job_to_s3_and_update_db(
    job_files: List[str],
    uprn: str,
    *,
    bucket: str = "retrofit-energy-assessments-dev",
) -> None:
    """Upload a job's files to S3 and record each upload in the database.

    Each file is uploaded to ``documents/uprn/<uprn>/<basename>`` in
    ``bucket``. One ``UploadedFile`` row per file, tagged with the ECMK
    file source, is then added and committed in a single session once
    every upload call has returned.

    Args:
        job_files: Local paths of the files to upload. An empty list is a
            no-op: nothing is uploaded and no session is opened.
        uprn: Property reference as text. It is used verbatim in the S3
            key and converted to ``int`` for the database row.
        bucket: Destination S3 bucket. Defaults to the bucket name this
            function previously hard-coded, so existing callers are
            unaffected.

    Raises:
        ValueError: If ``uprn`` cannot be converted to ``int``. This is
            raised before any file is uploaded.
    """
    if not job_files:
        return

    # Convert once, up front. Doing this inside the loop (after the upload
    # call) meant a malformed UPRN left an object in S3 with no DB row.
    uprn_value = int(uprn)

    base_path = f"documents/uprn/{uprn}"
    uploaded_files: List[UploadedFile] = []

    for file_path in job_files:
        file_key = f"{base_path}/{os.path.basename(file_path)}"

        upload_file_to_s3(file_path, bucket, file_key)

        # NOTE(review): the row stores the enum's *value* ("ecmk"), not the
        # member itself - confirm this matches how the file_source column
        # is declared on UploadedFile.
        uploaded_files.append(
            UploadedFile(
                s3_file_bucket=bucket,
                s3_file_key=file_key,
                s3_upload_timestamp=datetime.now(timezone.utc),
                uprn=uprn_value,
                file_source=FileSourceEnum.ECMK.value,
            )
        )

    # Rows are written only after all uploads, in one commit.
    # NOTE(review): if an upload call raises part-way through, files already
    # uploaded stay in S3 without rows - confirm that is acceptable.
    with db_session() as session:
        session.add_all(uploaded_files)
        session.commit()