added sharepoint functionality for mr roth - enjoy

This commit is contained in:
Jun-te Kim 2026-03-24 13:36:47 +00:00
parent 21f5cd40da
commit a13ab09ab9
6 changed files with 537 additions and 0 deletions

View file

@ -13,6 +13,7 @@ boto3==1.35.44
openpyxl==3.1.5
# Basic
pytz
msal
uvicorn[standard]
sqlmodel
# Testing

View file

View file

@ -0,0 +1,172 @@
from pprint import pformat
from enum import Enum
import os
from utils.logger import setup_logger
from utils.sharepoint.sharepoint_client import SharePointClient
from functools import wraps
import re
from datetime import datetime, timedelta
from io import BytesIO
class DomnaSites(Enum):
    """SharePoint site IDs, one member per site.

    Every value is read from an environment variable at the moment this
    module is imported, so a member's value is ``None`` when its variable is
    not set at that time.

    A site ID can be looked up at:
    https://{tenant}.sharepoint.com/sites/{site}/_api/site/id
    """

    # TODO: Add these to github secrets!!!
    # NOTE(review): Enum members that share a value become aliases of the
    # first such member, so when several variables are unset (all ``None``)
    # the later names resolve to the first unset member.
    DOMNA = os.environ.get("DOMNA_SHAREPOINT_ID")
    OSMOSIS_ACD = os.environ.get("OSMOSIS_ACD_SHAREPOINT_ID")
    PRIVATE_PAY = os.environ.get("PRIVATE_PAY_SHAREPOINT_ID")
    SOCIAL_HOUSING_WAVE_3 = os.environ.get("SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID")
class DomnaSharepointClient:
    """
    Thin convenience layer over ``SharePointClient`` for a single site.

    Credentials are read from the ``SHAREPOINT_CLIENT_ID``,
    ``SHAREPOINT_CLIENT_SECRET`` and ``SHAREPOINT_TENANT_ID`` environment
    variables. The site is selected by ``sharepoint_location``, an object
    whose ``.value`` is the site ID (e.g. a ``DomnaSites`` member).
    """

    # Local directory under which every downloaded/staged file is written.
    TEMP_ROOT = "/tmp/sharepoint"

    # Extensions skipped by ``download_files_from_path`` when the caller does
    # not supply ``avoid``. Matching is case-insensitive.
    DEFAULT_AVOID = (".jpg", ".jpeg", ".png", ".heic", ".mov", ".mp4")

    def __init__(self, sharepoint_location, development=False):
        """
        Read the credentials from the environment and validate them.

        Args:
            sharepoint_location: Object whose ``.value`` holds the site ID.
            development: Currently unused; kept for backward compatibility.

        Raises:
            AssertionError: If a credential variable or the site ID is unset.
        """
        self.logger = setup_logger()
        self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None)
        self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
        self.sharepoint_tenant_id = os.getenv("SHAREPOINT_TENANT_ID", None)
        self.sharepoint_drive = sharepoint_location

        self._require(
            self.sharepoint_client_id,
            "Please assign SHAREPOINT_CLIENT_ID env variable",
        )
        self._require(
            self.sharepoint_client_secret,
            "Please assign SHAREPOINT_CLIENT_SECRET env variable",
        )
        self._require(
            self.sharepoint_tenant_id,
            "Please assign SHAREPOINT_TENANT_ID env variable",
        )
        self._require(
            self.sharepoint_drive.value,
            "Please set sharepoint driver id env variable. See SharePointInstaller for more information",
        )

    @staticmethod
    def _require(value, message):
        """Raise ``AssertionError(message)`` when ``value`` is ``None``."""
        # Raised explicitly instead of via ``assert`` so the check still runs
        # under ``python -O``; the exception type is kept for existing callers.
        if value is None:
            raise AssertionError(message)

    def _new_client(self):
        """Build a ``SharePointClient`` for the configured site."""
        return SharePointClient(
            tenant_id=self.sharepoint_tenant_id,
            client_id=self.sharepoint_client_id,
            client_secret=self.sharepoint_client_secret,
            site_id=self.sharepoint_drive.value,
        )

    def get_folders_in_path(self, path):
        """Return the listing of the items found at ``path``."""
        return self._new_client().list_folder_contents(path)

    def get_file_content(self, url):
        """Download ``url`` and return its content as an in-memory stream."""
        # ``download_sharepoint_file`` is a static method, so no authenticated
        # client has to be constructed just to call it.
        return SharePointClient.download_sharepoint_file(url)

    def does_folder_exists_at(self, file_name, file_path):
        """
        Tell whether an item whose name contains ``file_name`` is at ``file_path``.

        The comparison is a case-insensitive substring match.
        NOTE(review): ``create_dir`` uses an exact match instead - confirm
        which of the two behaviours callers expect here.
        """
        listing = self.get_folders_in_path(file_path)
        wanted = file_name.upper()
        return any(
            wanted in entry["name"].upper() for entry in listing.get("value", [])
        )

    def create_dir(self, file_name, at_path="/"):
        """
        Return the web URL of folder ``file_name`` under ``at_path``.

        An existing item with the same name (case-insensitive) is reused;
        otherwise the folder is created.
        """
        wanted = file_name.lower()
        for entry in self.get_folders_in_path(at_path).get("value", []):
            if "name" in entry and entry["name"].lower() == wanted:
                self.logger.info(f"Folder already exists: {file_name} at {at_path}")
                return entry["webUrl"]

        # Only build another client once we know a folder must be created.
        self.logger.info(f"Creating folder: {file_name} at {at_path}")
        created = self._new_client().create_folder(file_name, at_path)
        return created["webUrl"]

    def makedir(self, dir_name, at_path="/"):
        """Alias of ``create_dir``."""
        return self.create_dir(dir_name, at_path)

    def upload_file(self, file_path, sharepoint_path, file_name):
        """
        Upload the local file ``file_path`` to ``sharepoint_path`` as ``file_name``.

        Returns:
            Whatever the underlying client's ``upload_file`` returns.
        """
        sharepoint_client = self._new_client()
        # The context manager closes the handle even when the upload fails.
        with open(file_path, "rb") as file_stream:
            return sharepoint_client.upload_file(
                file_name, file_stream, sharepoint_path
            )

    def download_files_from_path(self, path, avoid=None):
        """
        Download every file directly inside ``path``, skipping media files.

        Args:
            path (str): Folder path to download from.
            avoid (Iterable[str] | None): File-name suffixes to skip, compared
                case-insensitively. Defaults to ``DEFAULT_AVOID``.

        Returns:
            List[str]: Local paths of the downloaded files.

        Raises:
            RuntimeError: If the folder listing has no ``"value"`` entry.
        """
        if avoid is None:
            avoid = self.DEFAULT_AVOID
        skipped_suffixes = tuple(suffix.lower() for suffix in avoid)

        files_info = self.get_folders_in_path(path)
        if "value" not in files_info:
            raise RuntimeError(f"Failed to get files from {path}")

        file_names_to_download = {
            item["name"]: item["@microsoft.graph.downloadUrl"]
            for item in files_info["value"]
            if "file" in item and not item["name"].lower().endswith(skipped_suffixes)
        }

        downloaded_files = []
        for file_name, url in file_names_to_download.items():
            # The download URL is deliberately not logged: it grants access
            # to the file without further authentication.
            self.logger.info(f"Downloading {file_name} from {path}")
            content = self.get_file_content(url)
            downloaded_files.append(
                self.create_temp_file(content, f"{path}/{file_name}")
            )
        return downloaded_files

    def create_temp_file(self, content: BytesIO, path: str):
        """
        Write ``content`` to ``path`` underneath ``TEMP_ROOT``.

        Args:
            content: In-memory stream whose full value is written.
            path: Path relative to ``TEMP_ROOT``; leading slashes are ignored.

        Returns:
            str: The path of the file that was written.

        Raises:
            ValueError: If ``path`` would resolve outside ``TEMP_ROOT``.
        """
        # ``os.path.join`` discards the base for an absolute second argument,
        # so leading slashes are removed to keep the file under TEMP_ROOT.
        new_path = os.path.normpath(os.path.join(self.TEMP_ROOT, path.lstrip("/")))
        inside_root = new_path == self.TEMP_ROOT or new_path.startswith(
            self.TEMP_ROOT + os.sep
        )
        if not inside_root:
            raise ValueError(f"Refusing to write outside {self.TEMP_ROOT}: {path}")

        os.makedirs(os.path.dirname(new_path), exist_ok=True)
        with open(new_path, "wb+") as temp_file:
            temp_file.write(content.getvalue())
        self.logger.debug(f"Temporary file created at: {new_path}")
        return new_path

25
utils/sharepoint/main.py Normal file
View file

@ -0,0 +1,25 @@
# This is a small script to see if the Domna SharePoint client works
# for basic functionality.
# Can we import it?
from io import BytesIO

from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient, DomnaSites


def main():
    """Run the smoke test: list folders, create a folder and upload a file."""
    # Can we initialise it?
    client = DomnaSharepointClient(sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3)

    # Can we get an example of the root path?
    client.get_folders_in_path("/")
    client.get_folders_in_path("/JTK Test Folder")

    # Can we make a folder appear in JTK Test Folder?
    client.makedir("Dan is the best", "/JTK Test Folder")

    content = BytesIO(b"Hello, this is some file content!")
    path = client.create_temp_file(content, "some/place/over/the/rainbow")
    client.upload_file(
        path, "/JTK Test Folder/Dan is the best", "junte_is_the_worst_at_python.txt"
    )


# Guarded so that importing this module no longer performs network calls or
# creates folders as a side effect; running it as a script behaves as before.
if __name__ == "__main__":
    main()

View file

@ -0,0 +1,339 @@
"""
This file contains the functions which enable interaction with SharePoint via the API.
Documentation to get api_id:
https://answers.microsoft.com/en-us/msoffice/forum/all/what-is-the-best-way-to-findout-the-share-point/7b2d4183-4188-4cd5-8441-dd93207c5a01
"""
from msal import ConfidentialClientApplication
from datetime import datetime, timedelta
import requests
from functools import wraps
import time
import logging
from io import BytesIO
import tempfile
import os
# Api Documentation: https://learn.microsoft.com/en-us/graph/api/drive-get?view=graph-rest-1.0&tabs=http
def _retry_after_seconds(response, default=5):
    """
    Return the number of seconds to wait before retrying ``response``'s request.

    The ``Retry-After`` header may be a number of seconds or an HTTP-date;
    ``default`` is used when the header is absent or cannot be parsed.
    """
    raw_value = response.headers.get("Retry-After")
    if raw_value is None:
        return default
    try:
        return max(0, int(raw_value))
    except (TypeError, ValueError):
        pass

    # Not an integer: try the HTTP-date form. Imported locally so this helper
    # does not depend on names the module does not already import.
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    try:
        retry_at = parsedate_to_datetime(str(raw_value))
        if retry_at.tzinfo is None:
            retry_at = retry_at.replace(tzinfo=timezone.utc)
    except (TypeError, ValueError, AttributeError):
        return default
    return max(0, (retry_at - datetime.now(timezone.utc)).total_seconds())


def handle_error(response):
    """
    Handle errors based on HTTP status codes and log detailed information.

    :param response: The failed HTTP response (needs ``status_code``,
        ``headers`` and ``json()``).
    :return: The string ``"retry"`` for 429, 500 and 503 responses, after
        sleeping for the delay taken from the ``Retry-After`` header.
    :raises ValueError: For every other status code.
    """
    # Resolved here because the module defines no module-level ``logger``.
    logger = logging.getLogger(__name__)

    try:
        payload = response.json()
    except ValueError:
        payload = {}
    error_json = payload.get("error", {}) if isinstance(payload, dict) else {}
    if not isinstance(error_json, dict):
        # Some error bodies carry a plain string under "error"; keep it as the
        # code instead of failing on ``.get``.
        error_json = {"code": str(error_json)}

    error_code = error_json.get("code", "unknownError")
    error_message = error_json.get("message", "No detailed error message provided.")
    inner_error = error_json.get("innererror", {})
    details = error_json.get("details", [])

    logger.error(f"Error Code: {error_code}")
    logger.error(f"Error Message: {error_message}")
    if inner_error:
        logger.error(f"Inner Error: {inner_error}")
    if details:
        logger.error(f"Error Details: {details}")

    status = response.status_code
    if status == 429:
        retry_after = _retry_after_seconds(response)
        logger.warning(f"Too Many Requests. Retrying after {retry_after} seconds...")
        time.sleep(retry_after)
        return "retry"
    if status in (500, 503):
        retry_after = _retry_after_seconds(response)
        logger.error(f"Server error. Retrying after {retry_after} seconds...")
        time.sleep(retry_after)
        return "retry"

    if status == 401:
        logger.error("Unauthorized. Token might be invalid.")
    elif status == 403:
        logger.error("Forbidden. Access denied to the requested resource.")
    elif status == 404:
        logger.error("Not Found. The requested resource doesnt exist.")

    raise ValueError(
        f"API request failed with status code {response.status_code} - {error_message}"
    )
def api_call_decorator(func):
    """
    Turn a method that describes a request into one that performs it.

    The decorated method must return ``(http_method, url, data)``. The wrapper
    refreshes the access token when it has expired, sends the request with
    ``self.headers``, retries when ``handle_error`` asks for it, and follows
    ``@odata.nextLink`` so that multi-page collections are returned in full.

    :param func: The function to be decorated.
    :return: The wrapped function. It returns the response JSON; for a
        collection spanning several pages the ``"value"`` list holds the items
        of every page. When ``page_size`` is passed as a keyword argument the
        result is ``{"value": [...]}`` only.
    """
    # Resolved here because the module defines no module-level ``logger``.
    logger = logging.getLogger(__name__)
    # Upper bound on consecutive retries of one request, so a persistently
    # failing endpoint cannot keep the caller looping forever.
    max_retries = 5
    # Seconds before an unresponsive request is abandoned.
    request_timeout = 60

    @wraps(func)
    def wrapper(self, *args, **kwargs):
        try:
            # Check and refresh the access token if needed
            if self.is_access_token_expired():
                self.retrieve_access_token()
                logger.debug("Access token refreshed.")

            # Get the HTTP method, URL, and optionally data from the function
            http_method, url, data = func(self, *args, **kwargs)

            explicit_paging = bool(kwargs.get("page_size", None))
            # ``paged`` also becomes true when a response announces a next
            # page, which covers callers relying on a default ``page_size``.
            paged = explicit_paging
            results = []
            response_data = {}
            retries = 0

            while url:
                response = requests.request(
                    http_method,
                    url,
                    headers=self.headers,
                    json=data,
                    timeout=request_timeout,
                )

                if response.status_code not in (200, 201):
                    outcome = handle_error(response)
                    retries += 1
                    if outcome != "retry" or retries > max_retries:
                        raise ValueError(
                            f"API request failed with status code {response.status_code} after {retries} attempts"
                        )
                    continue

                retries = 0
                response_json = response.json()
                next_link = (
                    response_json.get("@odata.nextLink", None)
                    if isinstance(response_json, dict)
                    else None
                )

                if not paged and not next_link:
                    # Single, complete response: hand it back untouched.
                    response_data = response_json
                    break

                if not paged:
                    # First page of a collection: keep its other fields and
                    # gather the items of all pages under "value".
                    paged = True
                    response_data = {
                        key: value
                        for key, value in response_json.items()
                        if key not in ("value", "@odata.nextLink")
                    }

                results.extend(response_json.get("value", []))
                url = next_link

            if explicit_paging:
                response_data = {"value": results}
            elif paged:
                response_data["value"] = results

            return response_data

        except Exception:
            logger.exception("An error occurred during the API call.")
            raise

    return wrapper
class SharePointClient:
    """
    Client for the document drive of one SharePoint site via Microsoft Graph.

    On construction it obtains an access token (unless one is supplied) and
    looks up the site's drive, whose ID is used by the other methods.
    """

    access_token = None
    access_token_request_timestamp = None
    access_token_expiry = None
    headers = None
    # NOTE(review): the trailing "Z" suggests UTC, but the timestamps are
    # produced with the naive ``datetime.now()`` - confirm before exchanging
    # serialized expiration details between machines.
    TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    # Seconds before an unresponsive upload or download is abandoned.
    TRANSFER_TIMEOUT = 300
    # Maximum number of times one upload is attempted.
    MAX_UPLOAD_ATTEMPTS = 5

    # Class-level logger, because the module defines no ``logger`` name.
    _log = logging.getLogger(__name__)

    def __init__(
        self,
        tenant_id,
        client_id,
        client_secret,
        site_id,
        access_token=None,
        access_token_expiration_details=None,
    ):
        """
        Initializes the SharePointClient with necessary credentials and site information.

        :param tenant_id: The tenant ID.
        :param client_id: The client ID.
        :param client_secret: The client secret.
        :param site_id: The site ID.
        :param access_token: The access token (optional)
        :param access_token_expiration_details: The access token expiration details (optional)
        :raises ValueError: If ``access_token`` is given without its expiration details.
        """
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret

        if access_token:
            if not access_token_expiration_details:
                raise ValueError("Access token expiration details must be provided.")
            self.access_token = access_token
            self.set_access_token_expiration_details(access_token_expiration_details)
            self.headers = {
                "Authorization": f"Bearer {self.access_token['access_token']}"
            }
        else:
            self.retrieve_access_token()

        # Retrieve static identifiers
        self.site_id = site_id
        self.document_drive = self.get_documents_drive()
        self.document_drive_id = self.document_drive["id"]

    def get_token_expiration_details(self):
        """
        Returns the access token expiration details as strings for serialization.

        :return: Dict with ``access_token_request_timestamp`` and
            ``access_token_expiry`` formatted with ``TIMESTAMP_FORMAT``.
        """
        return {
            "access_token_request_timestamp": datetime.strftime(
                self.access_token_request_timestamp, self.TIMESTAMP_FORMAT
            ),
            "access_token_expiry": datetime.strftime(
                self.access_token_expiry, self.TIMESTAMP_FORMAT
            ),
        }

    def set_access_token_expiration_details(self, access_token_expiration_details):
        """
        Sets the access token expiration details from a serialized dictionary.

        :param access_token_expiration_details: The serialized access token
            expiration details, as produced by ``get_token_expiration_details``.
        :return: None
        """
        self.access_token_request_timestamp = datetime.strptime(
            access_token_expiration_details["access_token_request_timestamp"],
            self.TIMESTAMP_FORMAT,
        )
        self.access_token_expiry = datetime.strptime(
            access_token_expiration_details["access_token_expiry"],
            self.TIMESTAMP_FORMAT,
        )

    def is_access_token_expired(self):
        """
        Checks if the access token has expired. It does not refresh the token.

        :return: True if expired, False otherwise.
        """
        return datetime.now() >= self.access_token_expiry

    def retrieve_access_token(self, refresh=False):
        """
        Implements authentication using MSAL and stores the token and headers.

        :param refresh: If True, skip the cache lookup and request a token directly.
        :return: None
        :raises ValueError: If no access token was returned.
        """
        app = ConfidentialClientApplication(
            self.client_id,
            authority=f"https://login.microsoftonline.com/{self.tenant_id}",
            client_credential=self.client_secret,
        )
        scope = ["https://graph.microsoft.com/.default"]
        access_token_request_timestamp = datetime.now()

        if refresh:
            self._log.debug("Forcing refresh of access token.")
            token = app.acquire_token_for_client(scopes=scope)
        else:
            # Check if a token is already cached
            # NOTE(review): ``app`` is created anew on every call, so its
            # cache is presumably empty here - confirm this lookup can hit.
            token = app.acquire_token_silent(scope, account=None)
            if not token:
                token = app.acquire_token_for_client(scopes=scope)

        if "access_token" not in token:
            self._log.error("Authentication failed.")
            raise ValueError("Authentication failed")

        # Treat the token as expired 20 seconds early, leaving a safety margin.
        access_token_expiry = access_token_request_timestamp + timedelta(
            seconds=token["expires_in"] - 20
        )

        self.access_token = token
        self.access_token_request_timestamp = access_token_request_timestamp
        self.access_token_expiry = access_token_expiry
        self.headers = {"Authorization": f"Bearer {self.access_token['access_token']}"}

    def _drive_item_url(self, item_path, suffix):
        """
        Build the Graph URL for ``suffix`` of the drive item at ``item_path``.

        Surrounding slashes are removed; an empty path addresses the drive
        root itself (``/root/{suffix}``) rather than ``/root:/{path}:/{suffix}``.
        """
        drive_root = (
            f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/root"
        )
        relative_path = str(item_path).strip("/")
        if not relative_path:
            return f"{drive_root}/{suffix}"
        return f"{drive_root}:/{relative_path}:/{suffix}"

    @api_call_decorator
    def get_documents_drive(self):
        """
        Get the document drive of the SharePoint site.

        :return: Tuple containing HTTP method, URL, and None for data.
        """
        url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive"
        return "GET", url, None

    @api_call_decorator
    def list_folder_contents(self, folder_path: str, page_size: int = 100):
        """
        GET drive/root/children

        This function will list the contents of a folder in SharePoint.

        :param folder_path: The path of the folder ("/" or "" for the root).
        :param page_size: The number of items per page (default is 100).
        :return: Tuple containing HTTP method, URL, and None for data.
        """
        url = f"{self._drive_item_url(folder_path, 'children')}?$top={page_size}"
        return "GET", url, None

    @api_call_decorator
    def create_folder(self, file_name, folder_path):
        """
        Create the folder ``file_name`` inside ``folder_path``.

        POST .../children
        Content-Type: application/json
        {
            "name": "New Folder",
            "folder": { },
            "@microsoft.graph.conflictBehavior": "rename"
        }

        :param file_name: Name of the folder to create.
        :param folder_path: Path of the parent folder ("/" or "" for the root).
        :return: Tuple containing HTTP method, URL, and the request body.
        """
        data = {
            "name": file_name,
            "folder": {},
            "@microsoft.graph.conflictBehavior": "rename",
        }
        return "POST", self._drive_item_url(folder_path, "children"), data

    def upload_file(self, file_name, file_stream, sharepoint_parent_id):
        """
        Uploads a file to SharePoint using the Graph API.

        PUT /drives/{drive-id}/root:/{path-to-file}:/content

        :param file_name: Name of the file to upload
        :param file_stream: File content as a binary stream (e.g., BytesIO or open(file, 'rb'))
        :param sharepoint_parent_id: Path of the destination folder within the drive
        :return: Response JSON from the API
        :raises ValueError: If the upload fails, including after
            ``MAX_UPLOAD_ATTEMPTS`` retryable failures.
        """
        if self.is_access_token_expired():
            self.retrieve_access_token()

        parent_path = str(sharepoint_parent_id).strip("/")
        url = self._drive_item_url(f"{parent_path}/{file_name}", "content")

        # Remember where the stream starts so that a retry resends the whole
        # payload instead of whatever is left after the failed attempt.
        rewindable = hasattr(file_stream, "seek") and hasattr(file_stream, "tell")
        start_position = file_stream.tell() if rewindable else None

        for _attempt in range(self.MAX_UPLOAD_ATTEMPTS):
            if rewindable:
                file_stream.seek(start_position)
            response = requests.put(
                url,
                headers=self.headers,
                data=file_stream,
                timeout=self.TRANSFER_TIMEOUT,
            )
            if response.status_code in (200, 201):
                return response.json()
            # ``handle_error`` raises for non-retryable statuses and sleeps
            # before returning "retry" for the retryable ones.
            if handle_error(response) != "retry":
                return None

        raise ValueError(
            f"Upload of '{file_name}' failed after {self.MAX_UPLOAD_ATTEMPTS} attempts"
        )

    @staticmethod
    def download_sharepoint_file(download_url):
        """
        Downloads a file from the given URL and returns its content.

        :param download_url: The URL to download the file from.
        :return: A ``BytesIO`` holding the file content, positioned at the start.
        """
        file_content = BytesIO()
        # The context manager releases the connection once the body is read.
        with requests.get(
            download_url, stream=True, timeout=SharePointClient.TRANSFER_TIMEOUT
        ) as response:
            response.raise_for_status()  # Check if the request was successful
            # Read the file content into memory
            for chunk in response.iter_content(chunk_size=8192):
                file_content.write(chunk)
        file_content.seek(0)  # Reset the file pointer to the beginning
        return file_content

0
utils/sharepoint/temp Normal file
View file