"""Copy the files attached to a Monday.com board into the Osmosis SharePoint.

For every item on the configured board, the URLs listed in the item's
"file(s)" column are downloaded and uploaded into a SharePoint folder named
after the item, underneath ``parent_folder``.

Provenance: introduced as ``etl/osmosis_complaince_address_to_files.py`` by
commit a75ab60e3a2e4d85defd9f07586f34bdd14f1aa1 (Jun-te Kim, 2025-05-16,
"script to automate complaince done@").

Required environment variables:
    MONDAY_API_KEY            API token passed to ``MondayClient`` and sent as
                              the ``Authorization`` header when downloading.
    SHAREPOINT_CLIENT_SECRET  Client secret for the SharePoint app.

Optional environment variables (defaults are the values this script has
always used):
    SHAREPOINT_CLIENT_ID, SHAREPOINT_TENANT_ID

SECURITY: earlier revisions of this script embedded the Monday token and the
SharePoint client secret in source. Those values must be treated as leaked
and rotated; never commit credentials to the repository again.
"""
import json
import os
import tempfile
import time
from urllib.parse import urlsplit

import requests
from monday import MondayClient
from tqdm import tqdm

from etl.scraper.scraper import SharePointInstaller
from etl.scraper.scraper import SharePointScraper


def _require_env(name):
    """Return environment variable *name*, raising if it is unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} must be set")
    return value


board_id = "6097548932"
monday_key = _require_env("MONDAY_API_KEY")
monday = MondayClient(monday_key)

# Osmosis SharePoint keys. These were exported into the environment right
# before the scraper was built, so presumably SharePointScraper reads them
# from os.environ -- TODO confirm against etl.scraper.scraper.
# The client and tenant IDs are identifiers rather than credentials, so they
# remain as overridable defaults; the secret must come from the environment.
os.environ.setdefault("SHAREPOINT_CLIENT_ID", "6832a4c5-fb8c-4082-a746-4f51e1020f0d")
os.environ.setdefault("SHAREPOINT_TENANT_ID", "10d5af8b-2cfd-4882-9ccd-b96e4812dacf")
_require_env("SHAREPOINT_CLIENT_SECRET")
osmosis = SharePointScraper(SharePointInstaller.OSMOSIS_WAVE_2)
parent_folder = "/Osmosis ACD/Osmosis ACD Projects/Installer Documentation/"

# Change this per installer
parent_folder += "Platform Housing Group/Broadoak"

# (connect, read) timeouts in seconds so a stalled download cannot hang the run.
REQUEST_TIMEOUT = (10, 120)


def _download_headers():
    """Build the browser-like, authenticated headers used for file downloads."""
    # NOTE(review): advertising "br" means the body is only decoded when a
    # Brotli library is installed alongside requests/urllib3 -- confirm, or
    # drop "br" if downloaded files ever turn out to be corrupt.
    return {
        "Authorization": monday_key,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/125.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                  "image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer": "https://osmosis-acd-team.monday.com/",  # Optional but helpful
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
    }


def _split_file_urls(text):
    """Split a comma-separated column value into a list of non-empty URLs.

    Returns an empty list when *text* is ``None`` or blank, which is what an
    item without attachments is expected to carry.
    """
    if not text:
        return []
    return [part.strip() for part in text.split(",") if part.strip()]


def download_file(url, dest_dir="/tmp"):
    """Stream *url* to a file inside *dest_dir* and return the local path.

    Args:
        url: Address of the file; surrounding whitespace is ignored.
        dest_dir: Directory the file is written to (defaults to ``/tmp``).

    Returns:
        Path of the downloaded file. Its name is the last path segment of the
        URL, ignoring any query string or fragment.

    Raises:
        ValueError: If no file name can be derived from the URL.
        requests.HTTPError: If the server answers with an error status.
    """
    url = url.strip()
    # Use the URL *path* only, so "?query" / "#fragment" never leak into the
    # local (and later the uploaded) file name.
    filename = os.path.basename(urlsplit(url).path)
    if not filename:
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")

    local_filename = os.path.join(dest_dir, filename)
    with requests.get(
        url, headers=_download_headers(), stream=True, timeout=REQUEST_TIMEOUT
    ) as r:
        r.raise_for_status()
        with open(local_filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename


def get_all_items(board_id, monday, limit=25):
    """Fetch every item of a board by following the pagination cursor.

    Args:
        board_id: Identifier of the board to read.
        monday: Client exposing ``boards.fetch_items_by_board_id``.
        limit: Number of items requested per page.

    Returns:
        List of all item dicts found under ``items_page.items`` of each page.
    """
    all_items = []
    cursor = None  # No cursor on the first request.

    while True:
        response = monday.boards.fetch_items_by_board_id(
            board_ids=board_id,
            limit=limit,
            cursor=cursor,
        )
        items_page = response["data"]["boards"][0]["items_page"]
        items = items_page["items"]

        # An empty page means there is nothing left to read.
        if not items:
            break
        all_items.extend(items)

        # A missing/empty cursor marks the last page.
        cursor = items_page.get("cursor")
        if not cursor:
            break
        print(f"cursor {cursor}")

    print(f"len all_itemms {len(all_items)}")
    return all_items


def upload_to_sharepoint(to_upload, master_folder_name):
    """Create *master_folder_name* under ``parent_folder`` and upload files.

    Args:
        to_upload: Local paths of the files to upload.
        master_folder_name: Name of the folder to create and upload into.
    """
    osmosis.create_dir(master_folder_name, parent_folder)
    target_folder = f"{parent_folder}/{master_folder_name}"
    for file_path in to_upload:
        # basename() instead of slicing off a hard-coded "/tmp/" prefix, so
        # the uploaded name is right wherever the file was downloaded to.
        osmosis.upload_file(file_path, target_folder, os.path.basename(file_path))


def main():
    """Download each board item's attachments and upload them to SharePoint."""
    # Step 1: Fetch column IDs
    board_data = monday.boards.fetch_boards_by_id(board_id)
    columns = board_data["data"]["boards"][0]["columns"]
    col_id_map = {col["title"].lower(): col["id"] for col in columns}

    name_id = col_id_map.get("name")  # Replace with actual title if different
    files_id = col_id_map.get("file(s)")  # Replace with actual title if different

    if not name_id or not files_id:
        raise RuntimeError("Could not find 'name' or 'file(s)' columns")

    # Step 2: Mirror every item's files into a SharePoint folder of its own.
    for item in tqdm(get_all_items(board_id, monday)):
        item_name = item["name"]

        print(f"Downloading '{item_name}'...")
        for val in item["column_values"]:
            if val["id"] != files_id:
                continue

            files = _split_file_urls(val["text"])
            if not files:
                # Nothing attached: skip instead of requesting an empty URL.
                print(f"No files for '{item_name}', skipping")
                continue

            # A per-item scratch directory keeps /tmp from filling up.
            # NOTE(review): assumes upload_file finishes reading the file
            # before it returns -- confirm against SharePointScraper.
            with tempfile.TemporaryDirectory() as scratch_dir:
                to_upload = []
                for file in tqdm(files):
                    print(f"Downloading {file}")
                    to_upload.append(download_file(file, dest_dir=scratch_dir))
                upload_to_sharepoint(to_upload, item_name)


if __name__ == "__main__":
    main()