From fcf25f7cac88dd5790c2fb128d3b7e95e1a98144 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 31 Mar 2026 18:46:37 +0000 Subject: [PATCH] get rid of photos from local drive when i run it locally so its less verbose --- etl/hubspot/hubspotDataTodB.py | 9 +++++++++ etl/hubspot/scripts/scraper/bulk_load.py | 21 ++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/etl/hubspot/hubspotDataTodB.py b/etl/hubspot/hubspotDataTodB.py index 38ec3e35..0c38f483 100644 --- a/etl/hubspot/hubspotDataTodB.py +++ b/etl/hubspot/hubspotDataTodB.py @@ -311,6 +311,9 @@ class HubspotDataToDb: f"⚠️ Failed to download/upload photo for deal_id {deal_in_db.deal_id}: {e}" ) # Continue without the file — don't crash the entire update + finally: + if "local_file" in locals() and os.path.exists(local_file): + os.remove(local_file) else: print(f"⚠️ Photo URL missing for deal_id {deal_in_db.deal_id}") @@ -405,6 +408,9 @@ class HubspotDataToDb: f"⚠️ Failed to download photo for deal_id {existing.deal_id}: {e}" ) # Continue without the file — don't crash the update + finally: + if "local_file" in locals() and os.path.exists(local_file): + os.remove(local_file) else: print(f"⚠️ Photo URL missing for deal_id {existing.deal_id}") @@ -475,6 +481,9 @@ class HubspotDataToDb: f"⚠️ Failed to download photo for deal_id {new_record.deal_id}: {e}" ) # Continue without the file — don't crash the insert + finally: + if "local_file" in locals() and os.path.exists(local_file): + os.remove(local_file) session.add(new_record) session.commit() diff --git a/etl/hubspot/scripts/scraper/bulk_load.py b/etl/hubspot/scripts/scraper/bulk_load.py index fabf8a3f..6fac23ea 100644 --- a/etl/hubspot/scripts/scraper/bulk_load.py +++ b/etl/hubspot/scripts/scraper/bulk_load.py @@ -16,17 +16,24 @@ def bulk_load(companies: list[Companies] | None = None) -> None: hubspot = HubspotClient() targets = companies or list(Companies) - for company in tqdm(targets, desc="Companies"): + for company in tqdm(targets, desc="Companies", unit="co"): company_id = company.value deal_ids = hubspot.get_deal_ids_from_company(company_id) - for deal_id in tqdm(deal_ids, desc=f"{company.name}", leave=False): - deal_data = hubspot.from_deal_id_get_info(deal_id) - if deal_data.get("pipeline") != PIPELINE_ID: - continue + processed = 0 + with tqdm(deal_ids, desc=company.name, unit="deal", leave=False) as deal_bar: + for deal_id in deal_bar: + deal_data = hubspot.from_deal_id_get_info(deal_id) + if deal_data.get("pipeline") != PIPELINE_ID: + deal_bar.set_postfix({"status": "skip", "deal": deal_id}) + continue - handler({"hubspot_deal_id": deal_id}, context=None) - print(f"Processed deal {deal_id} (company: {company.name})") + deal_bar.set_postfix({"status": "uploading", "deal": deal_id}) + handler({"hubspot_deal_id": deal_id}, context=None) + processed += 1 + deal_bar.set_postfix({"status": "done", "deal": deal_id}) + + tqdm.write(f"[{company.name}] {processed}/{len(deal_ids)} deals in pipeline") if __name__ == "__main__":