From 2fafc0d4fb4b218c0345b2fa6492e93ae1c72cc4 Mon Sep 17 00:00:00 2001
From: Jun-te Kim
Date: Wed, 16 Apr 2025 17:48:01 +0000
Subject: [PATCH] added a script that allows domna notes to be extracted from
 deals

The SharePoint client secret is read from the environment instead of being
hard-coded in the script; the previously committed value must be rotated.
---
 etl/dimitra_hubspot_notes_gather.py | 167 ++++++++++++++++++++++++++++
 etl/hubSpotClient/hubspot.py        |  89 +++++++++++++++
 poetry.lock                         |  37 ++++++-
 pyproject.toml                      |   1 +
 4 files changed, 293 insertions(+), 1 deletion(-)
 create mode 100644 etl/dimitra_hubspot_notes_gather.py

diff --git a/etl/dimitra_hubspot_notes_gather.py b/etl/dimitra_hubspot_notes_gather.py
new file mode 100644
index 0000000..923c395
--- /dev/null
+++ b/etl/dimitra_hubspot_notes_gather.py
@@ -0,0 +1,167 @@
+"""Export HubSpot deal notes to an Excel workbook and upload it to SharePoint.
+
+Notes attached to deals in the included pipelines (minus the excluded stages)
+are converted from HTML to plain text, written to one worksheet per pipeline
+and uploaded through the Domna SharePoint client.
+"""
+import os
+import re
+import time
+
+# Non-secret SharePoint identifiers. ``setdefault`` lets the real environment
+# override them. They are set before the ``etl`` imports below, as in the
+# first revision, in case those modules read the environment on import.
+os.environ.setdefault("SHAREPOINT_CLIENT_ID", "895e3b77-b1d7-43ec-b18f-dcfe07cdfeaf")
+os.environ.setdefault("SHAREPOINT_TENANT_ID", "c3f7519c-2719-4547-af04-6da6cbfd8f8f")
+os.environ.setdefault("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", "b5a51507-9427-4ee0-b03e-90ec7681e2d3")
+os.environ.setdefault("JJC_SERVICE_SHAREPOINT_ID", "7fdd0485-bbf3-4b29-b30f-98c81c2a6284")
+
+# SECURITY: the client secret must come from the environment (or a secret
+# manager), never from source control. The value hard-coded in the first
+# revision of this script has been exposed and must be rotated.
+if not os.environ.get("SHAREPOINT_CLIENT_SECRET"):
+    raise RuntimeError("SHAREPOINT_CLIENT_SECRET must be set in the environment")
+
+import pandas as pd
+from bs4 import BeautifulSoup
+from openpyxl import Workbook
+from openpyxl.styles import Font
+
+from etl.scraper.scraper import SharePointScraper, SharePointInstaller, previous_monday
+from etl.hubSpotClient.hubspot import HubSpotClient
+
+# Pipelines whose deals are exported (matched case-insensitively).
+PIPELINES_TO_INCLUDE = [
+    "SALES - SOCIAL HOUSING",
+    "PVT PAY",
+    "NRLA GENERAL ENQUIRIES",
+    # "OSMOSIS - SALES",
+]
+
+# Stages skipped within each pipeline (matched case-insensitively).
+EXCLUDE_STAGE = {
+    "SALES - SOCIAL HOUSING": [
+        "HA TO REENGAGE",
+        "APPOINTMENT SCHEDULED",
+        "AWAITING ASSET LIST",
+        "ASSET LIST RECEIVED",
+        "ASSET LIST STANDARDISED",
+        "ROUTE MARCH CREATED",
+        "HA WEEKLY REPORTING",
+    ],
+    "PVT PAY": [
+        "LIVE OPPORTUNITY",
+        "CLOSED LOST",
+        "INVOICED",
+        "COLD - KIT",
+        "CLOSED WON",
+    ],
+    "NRLA GENERAL ENQUIRIES": [
+        "CUSTOMER CONTACTED",
+        "LOST",
+        "COLD",
+    ],
+}
+
+OUTPUT_COLUMNS = ["note_body", "deal_name", "pipeline_name"]
+FILE_NAME = "DEAL_NOTES_FROM_HUBSPOT.xlsx"
+# Seconds to pause after each deal so the HubSpot API is not flooded.
+REQUEST_DELAY_SECONDS = 0.75
+# Characters that Excel does not allow in worksheet titles.
+INVALID_SHEET_TITLE_CHARS = re.compile(r"[\\/*?:\[\]]")
+
+
+def _normalise(label):
+    """Return *label* upper-cased and stripped for case-insensitive matching."""
+    return label.upper().strip()
+
+
+def _html_to_text(html_body):
+    """Convert an HTML note body to plain text, keeping line breaks."""
+    return BeautifulSoup(html_body or "", "html.parser").get_text(separator="\n")
+
+
+def collect_notes(hubspot):
+    """Return one row dict per note for deals in the exported stages."""
+    included = {_normalise(name) for name in PIPELINES_TO_INCLUDE}
+    excluded = {
+        _normalise(name): {_normalise(stage) for stage in stages}
+        for name, stages in EXCLUDE_STAGE.items()
+    }
+    rows = []
+    pipelines = hubspot.client.crm.pipelines.pipelines_api.get_all(object_type="deals")
+    for pipeline in pipelines.results:
+        pipeline_name = _normalise(pipeline.label)
+        if pipeline_name not in included:
+            continue
+        skipped_stages = excluded.get(pipeline_name, set())
+        for stage in pipeline.stages:
+            if _normalise(stage.label) in skipped_stages:
+                continue
+            for deal_id in hubspot.get_all_deals_from_stage_id(stage.id):
+                notes = hubspot.get_notes_from_deals_id(deal_id)
+                if notes:
+                    # Fetch the name once per deal rather than once per note.
+                    deal_name = hubspot.get_deal_name_by_id(deal_id)
+                    rows.extend(
+                        {
+                            "note_body": _html_to_text(note["note"]),
+                            "deal_name": deal_name,
+                            "pipeline_name": pipeline.label,
+                        }
+                        for note in notes
+                    )
+                # Throttle between deals so the HubSpot API is not flooded.
+                time.sleep(REQUEST_DELAY_SECONDS)
+    return rows
+
+
+def build_workbook(notes_df):
+    """Build a workbook with one sheet per pipeline and a section per deal."""
+    wb = Workbook()
+    wb.remove(wb.active)  # Remove the default sheet.
+
+    for pipeline_name, pipeline_df in notes_df.groupby("pipeline_name"):
+        # Excel caps sheet titles at 31 characters and forbids \ / * ? : [ ].
+        title = INVALID_SHEET_TITLE_CHARS.sub("-", pipeline_name)[:31]
+        ws = wb.create_sheet(title=title)
+
+        current_row = 1
+        # groupby yields the deals sorted by name.
+        for deal_name, deal_notes in pipeline_df.groupby("deal_name"):
+            header = ws.cell(row=current_row, column=1, value=f"Deal: {deal_name}")
+            header.font = Font(bold=True)
+            current_row += 1
+
+            for note in deal_notes["note_body"]:
+                ws.cell(row=current_row, column=2, value=note)
+                current_row += 1
+
+            current_row += 1  # Blank row between deals.
+
+    if not wb.sheetnames:
+        # openpyxl refuses to save a workbook without any sheet.
+        wb.create_sheet(title="No notes found")
+    return wb
+
+
+def main():
+    """Export the notes, write the CSV and workbook, and upload the workbook."""
+    notes_df = pd.DataFrame(collect_notes(HubSpotClient()), columns=OUTPUT_COLUMNS)
+    notes_df.to_csv("output.csv")
+
+    build_workbook(notes_df).save(FILE_NAME)
+    output_path = os.path.abspath(FILE_NAME)
+
+    sharepoint_client = SharePointScraper(SharePointInstaller.DOMNA)
+    # NOTE(review): previous_monday is interpolated exactly as before; if it is
+    # a function rather than a value it needs to be called here - confirm.
+    sharepoint_client.upload_file(
+        output_path,
+        f"/02. Sales and Marketing/02. DEAL Notes from Hubspot/{previous_monday}",
+        FILE_NAME,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/etl/hubSpotClient/hubspot.py b/etl/hubSpotClient/hubspot.py
index cf022e6..4924d27 100644
--- a/etl/hubSpotClient/hubspot.py
+++ b/etl/hubSpotClient/hubspot.py
@@ -4,6 +4,8 @@ from hubspot.crm.deals import PublicObjectSearchRequest
 from hubspot.crm.deals.models import SimplePublicObjectInput
 from etl.hubSpotClient.types import SubmissionInfoFromDeal
 
+
+
 class DealStage(Enum):
     SURVEYED_COMPLETE_NEEDS_SIGN_OFF = "1617223914"
     SURVEYED_NO_ACCESS_NEED_SIGN_OFF = "1617223915"
@@ -17,7 +19,94 @@ class HubSpotClient():
 
     def get_all_deals(self):
         return self.client.crm.deals.get_all()
+
+    def get_deal_name_by_id(self, deal_id):
+        """Return the deal's ``dealname``, or a placeholder if unavailable.
+
+        Returns "No deal name" when the deal has no name and "Unknown Deal"
+        when the lookup fails, so one bad deal does not abort a bulk export.
+        """
+        import logging
+
+        try:
+            deal = self.client.crm.deals.basic_api.get_by_id(deal_id)
+            # The property may be present but empty, so fall back explicitly.
+            return deal.properties.get("dealname") or "No deal name"
+        except Exception:
+            # Best-effort by design, but log the failure instead of hiding it.
+            logging.getLogger(__name__).exception("Could not fetch deal %s", deal_id)
+            return "Unknown Deal"
+
+    def get_notes_from_deals_id(self, deals_id):
+        """Return all notes associated with a deal.
+
+        Pages through the CRM search API and returns a list of dicts with the
+        keys ``note_id`` and ``note`` (the raw ``hs_note_body`` value, or
+        "No content" when the body is missing or empty).
+        """
+        # Imported locally, as the module-level PublicObjectSearchRequest is
+        # the one from hubspot.crm.deals.
+        from hubspot.crm.objects import PublicObjectSearchRequest
+
+        found_notes = []
+        after = None
+        while True:
+            search_request = PublicObjectSearchRequest(
+                filter_groups=[{
+                    "filters": [{
+                        "propertyName": "associations.deal",
+                        "operator": "EQ",
+                        "value": deals_id,
+                    }]
+                }],
+                properties=["hs_note_body", "hubspot_owner_id"],
+                limit=200,
+                after=after,
+            )
+            response = self.client.crm.objects.search_api.do_search(
+                object_type="notes",
+                public_object_search_request=search_request,
+            )
+            found_notes.extend(response.results)
+
+            # Follow the paging cursor until there are no more pages.
+            if not response.paging or not response.paging.next:
+                break
+            after = response.paging.next.after
+
+        return [
+            {
+                "note_id": note.id,
+                "note": note.properties.get("hs_note_body") or "No content",
+            }
+            for note in found_notes
+        ]
+
+    def get_all_deals_from_stage_id(self, stage_id):
+        """Return the ids of all deals currently in the given stage."""
+        found_deals = []
+        after = None
+        while True:
+            search_request = PublicObjectSearchRequest(
+                filter_groups=[{
+                    "filters": [{
+                        "propertyName": "dealstage",
+                        "operator": "EQ",
+                        "value": stage_id,
+                    }]
+                }],
+                properties=["dealname"],
+                limit=200,
+                after=after,
+            )
+            response = self.client.crm.deals.search_api.do_search(search_request)
+            found_deals.extend(response.results)
+            if not response.paging or not response.paging.next:
+                break
+            after = response.paging.next.after
+
+        return [deal.id for deal in found_deals]
 
 
     def get_deals_from_deal_stage(self, deal_stage: DealStage):
         found_deals = []
diff --git a/poetry.lock b/poetry.lock
index b055ca9..ddd2ad3 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -61,6 +61,29 @@ files = [
 astroid = ["astroid (>=2,<4)"]
 test = ["astroid (>=2,<4)", "pytest", "pytest-cov", "pytest-xdist"]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.13.4"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.7.0"
+groups = ["main"]
+files = [
+    {file = "beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b"},
+    {file = "beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"
+
+[package.extras]
+cchardet = ["cchardet"]
+chardet = ["chardet"]
+charset-normalizer = ["charset-normalizer"]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "certifi"
 version = "2025.1.31"
@@ -1684,6 +1707,18 @@ files = [
     {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"},
 ]
 
+[[package]]
+name = "soupsieve"
+version = "2.6"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"},
+    {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"},
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.40"
@@ -1925,4 +1960,4 @@ files = [
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.12"
-content-hash = "55a974b3a81d57c429f61ee6a12a84d38f5c703fdfdfdf2553bec6ba21c29bf5"
+content-hash = "9b3e5a8f963d63fbb5fafd8595901358d10aba9f5261b398b9051504ce9320c2"
diff --git a/pyproject.toml b/pyproject.toml
index c86b613..c26faba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
     "pytest (>=8.3.5,<9.0.0)",
     "hubspot-api-client (>=11.1.0,<12.0.0)",
     "monday (>=2.0.1,<3.0.0)",
+    "beautifulsoup4 (>=4.13.4,<5.0.0)",
 ]
 
 [tool.poetry]