From 1031cf63b18fb5fbb64dcd4c6acccaf51532d480 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 3 Mar 2026 14:52:21 +0000 Subject: [PATCH] updated script to do live tracking and folder generation better --- .github/workflows/hubspot_sync.yml | 2 +- etl/hubSpotClient/scripts/hubspot_company.py | 1 + .../scripts/hubspot_gather_all_deals.py | 1 + .../scripts/hubspot_update_script.py | 1 + ...osmosis_monday_to_sharepoint_automation.py | 166 ++++++++++++------ etl/scraper/scraper.py | 23 ++- 6 files changed, 130 insertions(+), 64 deletions(-) diff --git a/.github/workflows/hubspot_sync.yml b/.github/workflows/hubspot_sync.yml index c6bb1b3..e1fb5cd 100644 --- a/.github/workflows/hubspot_sync.yml +++ b/.github/workflows/hubspot_sync.yml @@ -3,7 +3,7 @@ name: Hubspot Sync on: schedule: # Every 15 minutes, 07:00โ€“18:59, Mondayโ€“Friday (UTC) - - cron: '*/15 7-18 * * 1-5' + - cron: '0 7-18/2 * * 1-5' # Once on Saturday at 09:00 UTC - cron: '0 9 * * 6' diff --git a/etl/hubSpotClient/scripts/hubspot_company.py b/etl/hubSpotClient/scripts/hubspot_company.py index 788f2e2..d6482f3 100644 --- a/etl/hubSpotClient/scripts/hubspot_company.py +++ b/etl/hubSpotClient/scripts/hubspot_company.py @@ -6,6 +6,7 @@ hubspot = HubSpotClient() companies = [ Companies.THE_GUINESS_PARTNERSHIP, + Companies.SOUTHERN_HOUSING_GROUP, ] # All deals from a pipeline_id via filter diff --git a/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py b/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py index 904ed38..df53c46 100644 --- a/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py +++ b/etl/hubSpotClient/scripts/hubspot_gather_all_deals.py @@ -9,6 +9,7 @@ PIPELINE_ID = Pipeline.OPERATIONS_SOCIAL_HOUSING.value valuable_companies = [ Companies.THE_GUINESS_PARTNERSHIP.value, + Companies.SOUTHERN_HOUSING_GROUP.value, ] deals_to_add = [] diff --git a/etl/hubSpotClient/scripts/hubspot_update_script.py b/etl/hubSpotClient/scripts/hubspot_update_script.py index ed893cb..02be7f6 100644 --- a/etl/hubSpotClient/scripts/hubspot_update_script.py +++ b/etl/hubSpotClient/scripts/hubspot_update_script.py @@ -8,6 +8,7 @@ db = HubspotTodb() companies = [ Companies.THE_GUINESS_PARTNERSHIP, + Companies.SOUTHERN_HOUSING_GROUP, ] # Global trackers diff --git a/etl/osmosis_monday_to_sharepoint_automation.py b/etl/osmosis_monday_to_sharepoint_automation.py index 18e544f..7647672 100644 --- a/etl/osmosis_monday_to_sharepoint_automation.py +++ b/etl/osmosis_monday_to_sharepoint_automation.py @@ -1,71 +1,126 @@ import os # osmsis keys -os.environ["SHAREPOINT_CLIENT_ID"] = "6832a4c5-fb8c-4082-a746-4f51e1020f0d" -os.environ["SHAREPOINT_CLIENT_SECRET"] = "xpC8Q~Frww48SM1V-D8lGy5iOY7P_cJ7FF3jgarQ" -os.environ["SHAREPOINT_TENANT_ID"] = "10d5af8b-2cfd-4882-9ccd-b96e4812dacf" +os.environ["SHAREPOINT_CLIENT_ID"] = "0e28c4f9-3e77-4571-8d63-df1857f4266a" +os.environ["SHAREPOINT_CLIENT_SECRET"] = "2s48Q~t8.pI-~rbtQaSCFcYY97Z3LiMYhuo0GaOb" +os.environ["SHAREPOINT_TENANT_ID"] = "6f080c63-8a66-4bbc-9d72-b85d5df30555" -from etl.scraper.scraper import SharePointInstaller -from etl.scraper.scraper import SharePointScraper +import time import pandas as pd from tqdm import tqdm -import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from functools import wraps +from typing import Callable + +from etl.scraper.scraper import SharePointInstaller, SharePointScraper + +osmosis = SharePointScraper(SharePointInstaller.PRIVATE_PAY) + +parent_folder = "/Projects/Southern Housing/SH-SURV-26-001/Assessments" + +excel_path = "/workspaces/survey-extractor/example_data/SH-SURV-26-001-monday.com.xlsx" +asset_list = pd.read_excel(excel_path, sheet_name="SH-SURV-26-001-monday.com") + +# -------------------------------------------------- +# Retry Decorator (3 attempts + exponential backoff) +# -------------------------------------------------- + +def retry(max_attempts: int = 3, base_delay: float = 1.0): + def decorator(func: Callable): + @wraps(func) + def wrapper(*args, **kwargs): + attempt = 0 + while attempt < max_attempts: + try: + return func(*args, **kwargs) + except Exception as e: + attempt += 1 + if attempt >= max_attempts: + print(f"โŒ Failed after {max_attempts} attempts: {e}") + raise + sleep_time = base_delay * (2 ** (attempt - 1)) + print(f"โš  Retry {attempt}/{max_attempts} after error: {e}") + time.sleep(sleep_time) + return wrapper + return decorator -osmosis = SharePointScraper(SharePointInstaller.OSMOSIS_WAVE_3) +# -------------------------------------------------- +# Folder Creation Logic (Wrapped with retry) +# -------------------------------------------------- + +@retry(max_attempts=3) +def process_asset(address: pd.Series): + folder_name = f"{address['Name']} {address['Postcode']}" + print(f"\n๐Ÿ“ Processing {folder_name}") + + web_url = osmosis.create_dir(folder_name, parent_folder) + time.sleep(0.5) # small throttle + + base_path = f"{parent_folder}/{folder_name}" + + # ----------------------- + # Folder Structure + # ----------------------- + + osmosis.create_dir("1. Retrofit Assessment", base_path) + osmosis.create_dir("A. Assessment", f"{base_path}/1. Retrofit Assessment") + osmosis.create_dir("B. Air Tightness Tests", f"{base_path}/1. Retrofit Assessment") + + osmosis.create_dir("2. RC Mid-Term Plan", base_path) + osmosis.create_dir("SAP", f"{base_path}/2. RC Mid-Term Plan") + + osmosis.create_dir("3. Retrofit Design", base_path) + + osmosis.create_dir("4. Post EPC", base_path) + osmosis.create_dir( + f"{address['Name']} - POST EPC Photos", + f"{base_path}/4. Post EPC" + ) + + osmosis.create_dir("5. Trustmark Lodgement", base_path) + trust_path = f"{base_path}/5. Trustmark Lodgement" + osmosis.create_dir("1. Works", trust_path) + osmosis.create_dir("2. Required Documents", trust_path) + osmosis.create_dir("3. Additional Documents", trust_path) + + return { + "Name": address["Name"], + "Postcode": address["Postcode"], + "Sharepoint": web_url, + } -parent_folder = "/Osmosis-ACD Projects/NCHA WHSHF Wave 3/Property Folders" +# -------------------------------------------------- +# Parallel Execution +# -------------------------------------------------- -asset_list = pd.read_excel("osmosis_data/asset_list.xlsx", sheet_name="Sheet 1") +results = [] +failed_rows = [] +MAX_WORKERS = 5 -new_asset_list = [] -# Create asset list and location -for index, address in tqdm(asset_list.iterrows()): - if index > 39: - folder_name = address['Name'] + " " + address['Postcode'] - webUrl = osmosis.create_dir(folder_name, parent_folder) - time.sleep(1) - print(f"building folders insidea {folder_name}") +with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: + futures = [ + executor.submit(process_asset, row) + for _, row in asset_list.iterrows() + ] - print("building retrofit assessment") - first_folder = "1. Retrofit Assessment" - osmosis.create_dir(first_folder, parent_folder + f"/{folder_name}") - osmosis.create_dir("A. Assessment", parent_folder + f"/{folder_name}/{first_folder}") - osmosis.create_dir("B. Air Tightness Tests", parent_folder + f"/{folder_name}/{first_folder}") + # โœ… Progress bar tied to completed futures + for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Assets"): + try: + result = future.result() + results.append(result) + except Exception as e: + failed_rows.append(e) - print("building RC MID Term plan") - second_folder = "2. RC Mid-Term Plan" - osmosis.create_dir(second_folder, parent_folder + f"/{folder_name}") - osmosis.create_dir("SAP", parent_folder + f"/{folder_name}/{second_folder}") - - print("building Retrofit Design") - third_folder = "3. Retrofit Design" - osmosis.create_dir(third_folder, parent_folder + f"/{folder_name}") - - print("building post epc") - fourth_folder = "4. Post EPC" - osmosis.create_dir(fourth_folder, parent_folder + f"/{folder_name}") - osmosis.create_dir(f"{address['Name']} - POST EPC Photos", parent_folder + f"/{folder_name}/{fourth_folder}") - - print("Building Trust mark Lodgement") - fifth_folder = "5. Trustmark Lodgement" - osmosis.create_dir(fifth_folder, parent_folder + f"/{folder_name}") - osmosis.create_dir("1. Works", parent_folder + f"/{folder_name}/{fifth_folder}") - - osmosis.create_dir("2. Required Documents", parent_folder + f"/{folder_name}/{fifth_folder}") - osmosis.create_dir("3. Additional Documents", parent_folder + f"/{folder_name}/{fifth_folder}") - - asset_data = { - "Name": address['Name'], - "Postcode": address['Postcode'], - "Sharepoint": webUrl, - } - print(asset_data) - - new_asset_list.append(asset_data) +print("\nโœ… Finished") +print(f"โœ” Successful: {len(results)}") +print(f"โŒ Failed: {len(failed_rows)}") +if failed_rows: + pd.DataFrame(failed_rows).to_excel("failed_rows.xlsx", index=False) + print("๐Ÿ“„ Saved failed rows to failed_rows.xlsx") # # Run this is you just want to get url # def just_url(asset_list): # new_asset_list = [] @@ -82,9 +137,10 @@ for index, address in tqdm(asset_list.iterrows()): # new_asset_list.append(asset_data) # return new_asset_list -# new_asset_list = just_url(asset_list=asset_list) -df = pd.DataFrame(new_asset_list) -df.to_csv("output.csv", index=False) +# # new_asset_list = just_url(asset_list=asset_list) + +# df = pd.DataFrame(new_asset_list) +# df.to_csv("output.csv", index=False) diff --git a/etl/scraper/scraper.py b/etl/scraper/scraper.py index 2fc5545..542378f 100644 --- a/etl/scraper/scraper.py +++ b/etl/scraper/scraper.py @@ -30,6 +30,7 @@ class SharePointInstaller(Enum): # WARMFRONT = os.getenv("WARMFRONT_SHARPOINT_ID", "bea71c30-d366-454c-a484-ae4d6fd95bc4") # NEW_JJC = os.getenv("NEW JJC", "10d96eba-b4f9-4e30-804f-05a8b60507b0") OSMOSIS_ACD = os.getenv("OSMOSIS_ACD_SHAREPOINT_ID", "931c4361-681b-44e4-86f6-1a54aba3ae8a") + PRIVATE_PAY = os.getenv("PRIVATE_PAY_SHAREPOINT_ID", "16812ae4-5898-4fec-a6f6-382d1435586f") class SharePointScraper(): """ @@ -122,7 +123,6 @@ class SharePointScraper(): return False def create_dir(self, file_name, at_path="/"): - sharepoint_client = SharePointClient( tenant_id=self.sharepoint_tenant_id, client_id=self.sharepoint_client_id, @@ -130,13 +130,20 @@ class SharePointScraper(): site_id=self.sharepoint_drive.value, ) - if self.does_folder_exists_at(file_name, at_path) is False: - return sharepoint_client.create_folder(file_name, at_path)['webUrl'] - else: - for folders in self.get_folders_in_path(at_path)['value']: - if file_name.upper() in folders["name"].upper(): - return folders["webUrl"] - + folders = self.get_folders_in_path(at_path) + + # Check if folder already exists (case-insensitive match) + if "value" in folders: + for folder in folders["value"]: + if "name" in folder and folder["name"].lower() == file_name.lower(): + self.logger.info(f"Folder already exists: {file_name} at {at_path}") + return folder["webUrl"] # โœ… return existing folder + + # Folder does NOT exist โ†’ create it + self.logger.info(f"Creating folder: {file_name} at {at_path}") + created = sharepoint_client.create_folder(file_name, at_path) + + return created["webUrl"] def upload_file(self, file_path, sharepoint_path, file_name): sharepoint_client = SharePointClient( tenant_id=self.sharepoint_tenant_id,