From e8fa12824d175891ca088e3dd2534d22dc5938e8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 17 Dec 2025 11:25:10 +0000 Subject: [PATCH 1/3] made the script faster --- .vscode/settings.json | 22 +++++ etl/hubSpotClient/hubspotClient.py | 28 ++++++ .../scripts/hubspot_abri_etl_first_time.py | 98 ++++++++++++++----- 3 files changed, 121 insertions(+), 27 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 27782c1..ce943bf 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -15,5 +15,27 @@ // "%load_ext autoreload", "%autoreload 2" // ] + "vim.enableNeovim": false, + + // Allow VSCode native keybindings to override Vim when needed + "vim.handleKeys": { + "": false, + "": false, + "": false, + "": false, + "": false, + "": false, + "": false, + "": false, + "": false, + "": false + }, + + // Terminal copy/paste via Ctrl+Shift+C / Ctrl+Shift+V + "terminal.integrated.copyOnSelection": false, + "terminal.integrated.commandsToSkipShell": [ + "workbench.action.terminal.copySelection", + "workbench.action.terminal.paste" + ], } \ No newline at end of file diff --git a/etl/hubSpotClient/hubspotClient.py b/etl/hubSpotClient/hubspotClient.py index ca26f87..b0d74d0 100644 --- a/etl/hubSpotClient/hubspotClient.py +++ b/etl/hubSpotClient/hubspotClient.py @@ -15,6 +15,7 @@ class Companies(Enum): SOUTHERN_HOUSING_GROUP = "109343619305" LIVEWEST = "86205872354" SURESERVE = "301745289413" + HOMEGROUP = "94946071794" class DealStage(Enum): SURVEYED_COMPLETE_NEEDS_SIGN_OFF = "1617223914" @@ -83,6 +84,32 @@ class HubSpotClient(): self.logger.error(f"Error fetching associated company for deal {deal_id}: {e}") return None + def get_deals_from_company(self, company_id: str) -> list[str]: + associations_api = self.client.crm.associations.v4.basic_api + + deal_ids = [] + after = None + + while True: + response = associations_api.get_page( + object_type="companies", + object_id=company_id, + to_object_type="deals", + limit=100, + after=after + ) + + deal_ids.extend( + assoc.to_object_id for assoc in response.results + ) + + if not response.paging or not response.paging.next: + break + + after = response.paging.next.after + + return deal_ids + def from_deal_get_associated_listing(self, deal_id: str): """ Get the associated listing information for a given deal. @@ -126,6 +153,7 @@ class HubSpotClient(): properties=[ 'dealname', 'dealstage', + 'pipeline', 'outcome', #outcome, 'outcome_notes', #outcome notes 'project_code', diff --git a/etl/hubSpotClient/scripts/hubspot_abri_etl_first_time.py b/etl/hubSpotClient/scripts/hubspot_abri_etl_first_time.py index 936765a..cd7a52b 100644 --- a/etl/hubSpotClient/scripts/hubspot_abri_etl_first_time.py +++ b/etl/hubSpotClient/scripts/hubspot_abri_etl_first_time.py @@ -1,33 +1,77 @@ +# from etl.hubSpotClient.hubspotClient import HubSpotClient, Companies, Pipeline +# from tqdm import tqdm +# from etl.db.hubSpotLoad import HubspotTodb + +# # get ALL deals +# hubspot = HubSpotClient() + +# # All deals from a pipeline_id via filter +# deals = hubspot.get_deal_ids_by_pipeline( +# pipeline_id=Pipeline.OPERATIONS_SOCIAL_HOUSING.value, +# ) + +# # deals from companies we care about +# valueable_deals = [ +# # Companies.ABRI.value, +# # Companies.SOUTHERN_HOUSING_GROUP.value, +# # Companies.SURESERVE.value, +# # Companies.LIVEWEST.value, +# Companies.HOMEGROUP.value, +# ] +# deals_to_add = [] + + +# deal_to_companies = {} +# loader = HubspotTodb() +# # Get all deals we care about +# for i,deal in enumerate(tqdm(deals)): +# company = hubspot.from_deal_get_associated_company_id(deal) +# if company in valueable_deals: +# deals_to_add.append(deal) +# deal_to_companies.update({deal: company}) +# deal_data = hubspot.from_deal_get_info(deal_id=deal) +# listing_data = hubspot.from_deal_get_associated_listing(deal_id=deal) +# loader.new_record_to_hubspot_data(deal_data, deal_to_companies[deal], listing_data, hubspot) + + from etl.hubSpotClient.hubspotClient import HubSpotClient, Companies, Pipeline from tqdm import tqdm from etl.db.hubSpotLoad import HubspotTodb -# get ALL deals hubspot = HubSpotClient() - -# All deals from a pipeline_id via filter -deals = hubspot.get_deal_ids_by_pipeline( - pipeline_id=Pipeline.OPERATIONS_SOCIAL_HOUSING.value, - ) - -# deals from companies we care about -valueable_deals = [ - Companies.ABRI.value, - Companies.SOUTHERN_HOUSING_GROUP.value, - Companies.SURESERVE.value, - Companies.LIVEWEST.value, -] -deals_to_add = [] - - -deal_to_companies = {} loader = HubspotTodb() -# Get all deals we care about -for i,deal in enumerate(tqdm(deals)): - company = hubspot.from_deal_get_associated_company_id(deal) - if company in valueable_deals: - deals_to_add.append(deal) - deal_to_companies.update({deal: company}) - deal_data = hubspot.from_deal_get_info(deal_id=deal) - listing_data = hubspot.from_deal_get_associated_listing(deal_id=deal) - loader.new_record_to_hubspot_data(deal_data, deal_to_companies[deal], listing_data, hubspot) \ No newline at end of file + +PIPELINE_ID = Pipeline.OPERATIONS_SOCIAL_HOUSING.value + +valuable_companies = [ + Companies.HOMEGROUP.value, +] + +deals_to_add = [] +deal_to_companies = {} + +for company_id in valuable_companies: + # 🔥 Cheap: company → deals + deal_ids = hubspot.get_deals_from_company(company_id) + + for deal_id in tqdm(deal_ids, desc=f"Company {company_id}"): + # Fetch minimal deal info once + deal_data = hubspot.from_deal_get_info(deal_id) + print(f"working on deal {deal_id}") + # Filter by pipeline (small local filter) + if deal_data.get("pipeline") != PIPELINE_ID: + continue + + deals_to_add.append(deal_id) + deal_to_companies[deal_id] = company_id + + listing_data = hubspot.from_deal_get_associated_listing(deal_id) + + loader.new_record_to_hubspot_data( + deal_data, + company_id, + listing_data, + hubspot + ) + + print(f"Uploaded deal_id {deal_id} to db") \ No newline at end of file From 4c4b3b059c40f812564a4419de0643c0fb3fef8d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 17 Dec 2025 11:27:24 +0000 Subject: [PATCH 2/3] gather all deals --- .github/workflows/hubspot_sync.yml | 22 +++--- .../scripts/hubspot_abri_etl_first_time.py | 77 ------------------- .../scripts/hubspot_gather_all_deals.py | 45 +++++++++++ 3 files changed, 56 insertions(+), 88 deletions(-) delete mode 100644 etl/hubSpotClient/scripts/hubspot_abri_etl_first_time.py create mode 100644 etl/hubSpotClient/scripts/hubspot_gather_all_deals.py diff --git a/.github/workflows/hubspot_sync.yml b/.github/workflows/hubspot_sync.yml index ce97480..5d57a15 100644 --- a/.github/workflows/hubspot_sync.yml +++ b/.github/workflows/hubspot_sync.yml @@ -21,16 +21,6 @@ jobs: pip install poetry poetry install --no-root - # - name: Run scripts - # env: - # PYTHONPATH: ${{ github.workspace }} - # DATABASE_URL: ${{ secrets.PROD_DATABASE_URL }} - # run: | - # pwd - # ls -la - # poetry run python etl/hubSpotClient/scripts/hubspot_abri_etl_first_time.py - - - name: Run scripts env: PYTHONPATH: ${{ github.workspace }} @@ -38,6 +28,16 @@ jobs: run: | pwd ls -la - poetry run python etl/hubSpotClient/scripts/hubspot_update_script.py + poetry run python etl/hubSpotClient/scripts/hubspot_gather_all_deals.py + + + # - name: Run scripts + # env: + # PYTHONPATH: ${{ github.workspace }} + # DATABASE_URL: ${{ secrets.PROD_DATABASE_URL }} + # run: | + # pwd + # ls -la + # poetry run python etl/hubSpotClient/scripts/hubspot_update_script.py \ No newline at end of file diff --git a/etl/hubSpotClient/scripts/hubspot_abri_etl_first_time.py b/etl/hubSpotClient/scripts/hubspot_abri_etl_first_time.py deleted file mode 100644 index cd7a52b..0000000 --- a/etl/hubSpotClient/scripts/hubspot_abri_etl_first_time.py +++ /dev/null @@ -1,77 +0,0 @@ -# from etl.hubSpotClient.hubspotClient import HubSpotClient, Companies, Pipeline -# from tqdm import tqdm -# from etl.db.hubSpotLoad import HubspotTodb - -# # get ALL deals -# hubspot = HubSpotClient() - -# # All deals from a pipeline_id via filter -# deals = hubspot.get_deal_ids_by_pipeline( -# pipeline_id=Pipeline.OPERATIONS_SOCIAL_HOUSING.value, -# ) - -# # deals from companies we care about -# valueable_deals = [ -# # Companies.ABRI.value, -# # Companies.SOUTHERN_HOUSING_GROUP.value, -# # Companies.SURESERVE.value, -# # Companies.LIVEWEST.value, -# Companies.HOMEGROUP.value, -# ] -# deals_to_add = [] - - -# deal_to_companies = {} -# loader = HubspotTodb() -# # Get all deals we care about -# for i,deal in enumerate(tqdm(deals)): -# company = hubspot.from_deal_get_associated_company_id(deal) -# if company in valueable_deals: -# deals_to_add.append(deal) -# deal_to_companies.update({deal: company}) -# deal_data = hubspot.from_deal_get_info(deal_id=deal) -# listing_data = hubspot.from_deal_get_associated_listing(deal_id=deal) -# loader.new_record_to_hubspot_data(deal_data, deal_to_companies[deal], listing_data, hubspot) - - -from etl.hubSpotClient.hubspotClient import HubSpotClient, Companies, Pipeline -from tqdm import tqdm -from etl.db.hubSpotLoad import HubspotTodb - -hubspot = HubSpotClient() -loader = HubspotTodb() - -PIPELINE_ID = Pipeline.OPERATIONS_SOCIAL_HOUSING.value - -valuable_companies = [ - Companies.HOMEGROUP.value, -] - -deals_to_add = [] -deal_to_companies = {} - -for company_id in valuable_companies: - # 🔥 Cheap: company → deals - deal_ids = hubspot.get_deals_from_company(company_id) - - for deal_id in tqdm(deal_ids, desc=f"Company {company_id}"): - # Fetch minimal deal info once - deal_data = hubspot.from_deal_get_info(deal_id) - print(f"working on deal {deal_id}") - # Filter by pipeline (small local filter) - if deal_data.get("pipeline") != PIPELINE_ID: - continue - - deals_to_add.append(deal_id) - deal_to_companies[deal_id] = company_id - - listing_data = hubspot.from_deal_get_associated_listing(deal_id) - - loader.new_record_to_hubspot_data( - deal_data, - company_id, - listing_data, - hubspot - ) - - print(f"Uploaded deal_id {deal_id} to db") \ No newline at end of file diff --git a/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py b/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py new file mode 100644 index 0000000..eec8ae0 --- /dev/null +++ b/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py @@ -0,0 +1,45 @@ +from etl.hubSpotClient.hubspotClient import HubSpotClient, Companies, Pipeline +from tqdm import tqdm +from etl.db.hubSpotLoad import HubspotTodb + +hubspot = HubSpotClient() +loader = HubspotTodb() + +PIPELINE_ID = Pipeline.OPERATIONS_SOCIAL_HOUSING.value + +valuable_companies = [ + Companies.HOMEGROUP.value, + Companies.ABRI.value, + Companies.SOUTHERN_HOUSING_GROUP.value, + Companies.SURESERVE.value, + Companies.LIVEWEST.value, +] + +deals_to_add = [] +deal_to_companies = {} + +for company_id in valuable_companies: + # 🔥 Cheap: company → deals + deal_ids = hubspot.get_deals_from_company(company_id) + + for deal_id in tqdm(deal_ids, desc=f"Company {company_id}"): + # Fetch minimal deal info once + deal_data = hubspot.from_deal_get_info(deal_id) + print(f"working on deal {deal_id}") + # Filter by pipeline (small local filter) + if deal_data.get("pipeline") != PIPELINE_ID: + continue + + deals_to_add.append(deal_id) + deal_to_companies[deal_id] = company_id + + listing_data = hubspot.from_deal_get_associated_listing(deal_id) + + loader.new_record_to_hubspot_data( + deal_data, + company_id, + listing_data, + hubspot + ) + + print(f"Uploaded deal_id {deal_id} to db") \ No newline at end of file From 824e17dee622d481bb60ef04e53935f5eb7b6eb2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 18 Dec 2025 14:34:08 +0000 Subject: [PATCH 3/3] save --- etl/db/hubSpotLoad.py | 2 ++ etl/hubSpotClient/hubspotClient.py | 1 + etl/hubSpotClient/scripts/hubspot_gather_all_deals.py | 11 ++++++----- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/etl/db/hubSpotLoad.py b/etl/db/hubSpotLoad.py index dca83d5..0c47a40 100644 --- a/etl/db/hubSpotLoad.py +++ b/etl/db/hubSpotLoad.py @@ -117,6 +117,8 @@ class HubspotTodb: Also uploads photos if present and adds S3 URL. """ with get_db_session() as session: + print("junte was here ") + print(deal_data) deal_id = deal_data.get("hs_object_id") statement = select(HubspotDealData).where(HubspotDealData.deal_id == deal_id) diff --git a/etl/hubSpotClient/hubspotClient.py b/etl/hubSpotClient/hubspotClient.py index b0d74d0..591b128 100644 --- a/etl/hubSpotClient/hubspotClient.py +++ b/etl/hubSpotClient/hubspotClient.py @@ -16,6 +16,7 @@ class Companies(Enum): LIVEWEST = "86205872354" SURESERVE = "301745289413" HOMEGROUP = "94946071794" + APPLE = "184769046716" class DealStage(Enum): SURVEYED_COMPLETE_NEEDS_SIGN_OFF = "1617223914" diff --git a/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py b/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py index eec8ae0..f4826a0 100644 --- a/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py +++ b/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py @@ -8,11 +8,12 @@ loader = HubspotTodb() PIPELINE_ID = Pipeline.OPERATIONS_SOCIAL_HOUSING.value valuable_companies = [ - Companies.HOMEGROUP.value, - Companies.ABRI.value, - Companies.SOUTHERN_HOUSING_GROUP.value, - Companies.SURESERVE.value, - Companies.LIVEWEST.value, + # Companies.HOMEGROUP.value, + # Companies.ABRI.value, + # Companies.SOUTHERN_HOUSING_GROUP.value, + # Companies.SURESERVE.value, + # Companies.LIVEWEST.value, + Companies.APPLE.value, ] deals_to_add = []