From 2fafc0d4fb4b218c0345b2fa6492e93ae1c72cc4 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <junte@domna.homes>
Date: Wed, 16 Apr 2025 17:48:01 +0000
Subject: [PATCH] added a script that allows domna notes to be extracted from
 deals

---
 etl/dimitra_hubspot_notes_gather.py | 109 ++++++++++++++++++++++++++++
 etl/hubSpotClient/hubspot.py        |  80 ++++++++++++++++++++
 poetry.lock                         |  37 +++++++++-
 pyproject.toml                      |   1 +
 4 files changed, 226 insertions(+), 1 deletion(-)
 create mode 100644 etl/dimitra_hubspot_notes_gather.py

diff --git a/etl/dimitra_hubspot_notes_gather.py b/etl/dimitra_hubspot_notes_gather.py
new file mode 100644
index 0000000..923c395
--- /dev/null
+++ b/etl/dimitra_hubspot_notes_gather.py
@@ -0,0 +1,109 @@
+"""Export HubSpot deal notes for selected pipelines to Excel and upload them to SharePoint.
+
+Deals in the stages listed in ``exclude_stage`` are skipped; one sheet is written per pipeline.
+"""
+import os
+import time
+
+# Non-secret SharePoint identifiers; a value already present in the environment wins.
+os.environ.setdefault("SHAREPOINT_CLIENT_ID", "895e3b77-b1d7-43ec-b18f-dcfe07cdfeaf")
+os.environ.setdefault("SHAREPOINT_TENANT_ID", "c3f7519c-2719-4547-af04-6da6cbfd8f8f")
+os.environ.setdefault("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", "b5a51507-9427-4ee0-b03e-90ec7681e2d3")
+os.environ.setdefault("JJC_SERVICE_SHAREPOINT_ID", "7fdd0485-bbf3-4b29-b30f-98c81c2a6284")
+# The client secret must not live in source control: require it from the environment.
+if not os.environ.get("SHAREPOINT_CLIENT_SECRET"):
+    raise RuntimeError("SHAREPOINT_CLIENT_SECRET must be set in the environment")
+
+# Imported after the environment is set up; presumably the scraper reads it on import (TODO confirm).
+from etl.scraper.scraper import SharePointScraper, SharePointInstaller, previous_monday
+from etl.hubSpotClient.hubspot import HubSpotClient, DealStage
+import pandas as pd
+from bs4 import BeautifulSoup
+from openpyxl import Workbook
+from openpyxl.styles import Font
+
+hubspot = HubSpotClient()
+
+pipelines_to_include = [
+    "SALES - SOCIAL HOUSING",
+    "PVT PAY",
+    "NRLA GENERAL ENQUIRIES",
+    # "OSMOSIS - SALES",
+]
+exclude_stage = {
+    "SALES - SOCIAL HOUSING": [
+        "HA TO REENGAGE",
+        "APPOINTMENT SCHEDULED",
+        "AWAITING ASSET LIST",
+        "ASSET LIST RECEIVED",
+        "ASSET LIST STANDARDISED",
+        "ROUTE MARCH CREATED",
+        "HA WEEKLY REPORTING",
+    ],
+    "PVT PAY": [
+        "LIVE OPPORTUNITY",
+        "CLOSED LOST",
+        "INVOICED",
+        "COLD - KIT",
+        "CLOSED WON",
+    ],
+    "NRLA GENERAL ENQUIRIES": [
+        "CUSTOMER CONTACTED",
+        "LOST",
+        "COLD",
+    ],
+}
+
+# Compare labels in one normalised form (upper-cased, stripped) on both sides.
+include_pipeline_upper = {s.upper().strip() for s in pipelines_to_include}
+exclude_stage_upper = {
+    name.upper().strip(): {s.upper().strip() for s in stages}
+    for name, stages in exclude_stage.items()
+}
+
+notes_data = []
+pipelines = hubspot.client.crm.pipelines.pipelines_api.get_all(object_type="deals")
+for pipeline in pipelines.results:
+    pipeline_name = pipeline.label.upper().strip()
+    if pipeline_name not in include_pipeline_upper:
+        continue
+    skipped_stages = exclude_stage_upper.get(pipeline_name, set())
+    for stage in pipeline.stages:
+        if stage.label.upper().strip() in skipped_stages:
+            continue
+        for deal_id in hubspot.get_all_deals_from_stage_id(stage.id):
+            notes = hubspot.get_notes_from_deals_id(deal_id)
+            deal_name = hubspot.get_deal_name_by_id(deal_id) if notes else None  # one lookup per deal
+            for note in notes:
+                soup = BeautifulSoup(note["note"] or "", "html.parser")  # body may be None
+                notes_data.append({
+                    "note_body": soup.get_text(separator="\n"),  # keeps line breaks
+                    "deal_name": deal_name,
+                    "pipeline_name": pipeline.label,
+                })
+            time.sleep(0.75)  # throttle once per deal, after its API calls
+            print("delay to not bombard the server")
+
+# Explicit columns keep the groupby below valid even when no notes were found.
+notes_df = pd.DataFrame(notes_data, columns=["note_body", "deal_name", "pipeline_name"])
+notes_df.to_csv("output.csv")
+df = notes_df
+wb = Workbook()
+wb.remove(wb.active)  # Remove default sheet
+for pipeline, group_df in df.groupby("pipeline_name"):
+    ws = wb.create_sheet(title=pipeline[:31])  # Excel sheet name limit = 31 chars
+    current_row = 1
+    for deal_name, deal_notes in group_df.groupby("deal_name"):  # groups come out sorted by name
+        ws.cell(row=current_row, column=1, value=f"Deal Stage: {deal_name}").font = Font(bold=True)
+        current_row += 1
+        for note in deal_notes["note_body"]:
+            ws.cell(row=current_row, column=2, value=note)
+            current_row += 1
+        current_row += 1  # blank row between deals
+if not wb.sheetnames:
+    wb.create_sheet(title="No notes")  # openpyxl cannot save a workbook without a sheet
+file_name = "DEAL_NOTES_FROM_HUBSPOT.xlsx"
+wb.save(file_name)
+output_path = os.path.abspath(file_name)
+sharepoint_client = SharePointScraper(SharePointInstaller.DOMNA)
+sharepoint_client.upload_file(output_path, f"/02. Sales and Marketing/02. DEAL Notes from Hubspot/{previous_monday}", file_name)
diff --git a/etl/hubSpotClient/hubspot.py b/etl/hubSpotClient/hubspot.py
index cf022e6..4924d27 100644
--- a/etl/hubSpotClient/hubspot.py
+++ b/etl/hubSpotClient/hubspot.py
@@ -4,6 +4,8 @@ from hubspot.crm.deals import PublicObjectSearchRequest
 from hubspot.crm.deals.models import SimplePublicObjectInput
 from etl.hubSpotClient.types import SubmissionInfoFromDeal
 
+
+
 class DealStage(Enum):
     SURVEYED_COMPLETE_NEEDS_SIGN_OFF = "1617223914"
     SURVEYED_NO_ACCESS_NEED_SIGN_OFF = "1617223915"
@@ -17,7 +19,85 @@ class HubSpotClient():
 
     def get_all_deals(self):
         return self.client.crm.deals.get_all()
+
+        
+    def get_deal_name_by_id(self, deal_id):
+        """Return the deal's name; "No deal name" if unset, "Unknown Deal" if the lookup fails."""
+        try:
+            return self.client.crm.deals.basic_api.get_by_id(deal_id).properties.get("dealname") or "No deal name"
+        except Exception:  # deliberately best-effort: any failure yields the fallback label
+            return "Unknown Deal"
     
+    def get_notes_from_deals_id(self, deals_id):
+        """Return every note associated with a deal as ``{"note_id", "note"}`` dicts.
+
+        Pages through the notes search endpoint until no ``paging.next`` cursor
+        is returned. ``note`` holds the ``hs_note_body`` value, or
+        ``"No content"`` when that body is missing, None or empty, so callers
+        can always treat it as a string.
+        """
+        # Local import: the module-level PublicObjectSearchRequest comes from
+        # hubspot.crm.deals, while this search goes through the generic objects API.
+        from hubspot.crm.objects import PublicObjectSearchRequest
+        found_notes = []
+        after = None
+        while True:
+            search_request = PublicObjectSearchRequest(
+                filter_groups=[{
+                    "filters": [{
+                        "propertyName": "associations.deal",  # notes linked to this deal
+                        "operator": "EQ",
+                        "value": deals_id,
+                    }]
+                }],
+                properties=["hs_note_body", "hubspot_owner_id"],
+                limit=200,
+                after=after,
+            )
+            response = self.client.crm.objects.search_api.do_search(
+                object_type="notes", public_object_search_request=search_request
+            )
+            found_notes.extend(response.results)
+            if not response.paging or not response.paging.next:
+                break
+            after = response.paging.next.after
+
+        # dict.get's default only covers a missing key; ``or`` also replaces a body
+        # that is present but None/empty, which callers could not parse as text.
+        return [
+            {"note_id": note.id, "note": note.properties.get("hs_note_body") or "No content"}
+            for note in found_notes
+        ]
+    
+    
+    def get_all_deals_from_stage_id(self, stage_id):
+        """Return the ids of every deal whose ``dealstage`` equals ``stage_id``.
+
+        Follows the search API's ``paging.next.after`` cursor until the last
+        page has been read.
+        """
+        stage_filter = {
+            "propertyName": "dealstage",
+            "operator": "EQ",
+            "value": stage_id,
+        }
+        deal_ids = []
+        cursor = None
+        while True:
+            page = self.client.crm.deals.search_api.do_search(
+                PublicObjectSearchRequest(
+                    filter_groups=[{"filters": [stage_filter]}],
+                    properties=["dealname"],
+                    limit=200,
+                    after=cursor,
+                )
+            )
+            deal_ids.extend(deal.id for deal in page.results)
+            if page.paging and page.paging.next:
+                cursor = page.paging.next.after
+            else:
+                break
+        return deal_ids
 
     def get_deals_from_deal_stage(self, deal_stage: DealStage):
         found_deals = []
diff --git a/poetry.lock b/poetry.lock
index b055ca9..ddd2ad3 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -61,6 +61,29 @@ files = [
 astroid = ["astroid (>=2,<4)"]
 test = ["astroid (>=2,<4)", "pytest", "pytest-cov", "pytest-xdist"]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.13.4"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.7.0"
+groups = ["main"]
+files = [
+    {file = "beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b"},
+    {file = "beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+typing-extensions = ">=4.0.0"
+
+[package.extras]
+cchardet = ["cchardet"]
+chardet = ["chardet"]
+charset-normalizer = ["charset-normalizer"]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
 [[package]]
 name = "certifi"
 version = "2025.1.31"
@@ -1684,6 +1707,18 @@ files = [
     {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"},
 ]
 
+[[package]]
+name = "soupsieve"
+version = "2.6"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"},
+    {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"},
+]
+
 [[package]]
 name = "sqlalchemy"
 version = "2.0.40"
@@ -1925,4 +1960,4 @@ files = [
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.12"
-content-hash = "55a974b3a81d57c429f61ee6a12a84d38f5c703fdfdfdf2553bec6ba21c29bf5"
+content-hash = "9b3e5a8f963d63fbb5fafd8595901358d10aba9f5261b398b9051504ce9320c2"
diff --git a/pyproject.toml b/pyproject.toml
index c86b613..c26faba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dependencies = [
     "pytest (>=8.3.5,<9.0.0)",
     "hubspot-api-client (>=11.1.0,<12.0.0)",
     "monday (>=2.0.1,<3.0.0)",
+    "beautifulsoup4 (>=4.13.4,<5.0.0)",
 ]
 
 [tool.poetry]