class DomnaSites(Enum):
    """SharePoint site IDs for the Domna sites, read from environment variables.

    Each member's value is whatever the matching ``*_SHAREPOINT_ID`` variable
    held when this module was imported, or ``None`` if it was unset.

    NOTE: ``Enum`` treats members with equal values as aliases of the first
    such member, so if several variables are unset (all ``None``) or share an
    ID, the later names resolve to the first member with that value.
    """

    # A site's ID can be read from
    # https://{tenant}.sharepoint.com/sites/{site}/_api/site/id
    # TODO: Add these to github secrets!!!
    DOMNA = os.getenv("DOMNA_SHAREPOINT_ID")
    OSMOSIS_ACD = os.getenv("OSMOSIS_ACD_SHAREPOINT_ID")
    PRIVATE_PAY = os.getenv("PRIVATE_PAY_SHAREPOINT_ID")
    SOCIAL_HOUSING_WAVE_3 = os.getenv("SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID")


class DomnaSharepointClient:
    """Convenience wrapper around ``SharePointClient`` bound to one Domna site.

    Credentials are read from the ``SHAREPOINT_CLIENT_ID``,
    ``SHAREPOINT_CLIENT_SECRET`` and ``SHAREPOINT_TENANT_ID`` environment
    variables. A fresh ``SharePointClient`` is built for every operation
    rather than being cached on the instance.
    """

    # Media extensions skipped by download_files_from_path when the caller
    # does not pass its own list. Matching is case-insensitive.
    DEFAULT_AVOIDED_EXTENSIONS = (
        ".jpg",
        ".jpeg",
        ".png",
        ".heic",
        ".mov",
        ".mp4",
    )

    def __init__(self, sharepoint_location, development=False):
        """Read credentials from the environment and validate them.

        :param sharepoint_location: A ``DomnaSites`` member; its ``.value`` is
            used as the SharePoint site ID.
        :param development: Currently unused; kept for interface compatibility.
        :raises ValueError: If a credential variable or the site ID is unset.
        """
        self.logger = setup_logger()
        self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None)
        self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
        self.sharepoint_tenant_id = os.getenv("SHAREPOINT_TENANT_ID", None)
        self.sharepoint_drive = sharepoint_location

        # Explicit raises instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would silently disable this validation.
        required_settings = (
            (
                self.sharepoint_client_id,
                "Please assign SHAREPOINT_CLIENT_ID env variable",
            ),
            (
                self.sharepoint_client_secret,
                "Please assign SHAREPOINT_CLIENT_SECRET env variable",
            ),
            (
                self.sharepoint_tenant_id,
                "Please assign SHAREPOINT_TENANT_ID env variable",
            ),
            (
                self.sharepoint_drive.value,
                "Please set sharepoint driver id env variable. "
                "See SharePointInstaller for more information",
            ),
        )
        for value, message in required_settings:
            if value is None:
                raise ValueError(message)

    def _build_client(self):
        """Return a new ``SharePointClient`` for the configured site."""
        return SharePointClient(
            tenant_id=self.sharepoint_tenant_id,
            client_id=self.sharepoint_client_id,
            client_secret=self.sharepoint_client_secret,
            site_id=self.sharepoint_drive.value,
        )

    def get_folders_in_path(self, path):
        """Return the listing of the folder at ``path``.

        :param path: Folder path inside the site's document drive.
        :return: Whatever ``SharePointClient.list_folder_contents`` returns;
            callers in this class expect a dict with a ``"value"`` list.
        """
        return self._build_client().list_folder_contents(path)

    def get_file_content(self, url):
        """Download the file behind ``url`` and return its content.

        ``SharePointClient.download_sharepoint_file`` is a static method that
        sends no credentials, so no authenticated client is built here.

        :param url: Download URL of the file.
        :return: The return value of ``download_sharepoint_file``.
        """
        return SharePointClient.download_sharepoint_file(url)

    def does_folder_exists_at(self, file_name, file_path):
        """Tell whether any item under ``file_path`` has a matching name.

        The comparison is a case-insensitive *substring* test: ``file_name``
        only has to occur somewhere inside an item's name.
        NOTE(review): ``create_dir`` uses an exact match instead - confirm the
        looser match here is intended.

        :param file_name: Name (or fragment) to look for.
        :param file_path: Folder whose children are searched.
        :return: ``True`` if a child's name contains ``file_name``.
        """
        folders = self.get_folders_in_path(file_path)
        wanted = file_name.upper()
        return any(
            wanted in folder["name"].upper() for folder in folders.get("value", [])
        )

    def create_dir(self, file_name, at_path="/"):
        """Create folder ``file_name`` under ``at_path`` unless it exists.

        :param file_name: Name of the folder to create.
        :param at_path: Parent folder path (defaults to the drive root).
        :return: The ``webUrl`` of the existing or newly created item.
        """
        # One client serves both the lookup and the creation.
        sharepoint_client = self._build_client()
        folders = sharepoint_client.list_folder_contents(at_path)

        # Reuse an existing item whose name matches case-insensitively.
        wanted = file_name.lower()
        for folder in folders.get("value", []):
            if "name" in folder and folder["name"].lower() == wanted:
                self.logger.info(f"Folder already exists: {file_name} at {at_path}")
                return folder["webUrl"]

        self.logger.info(f"Creating folder: {file_name} at {at_path}")
        created = sharepoint_client.create_folder(file_name, at_path)
        return created["webUrl"]

    def makedir(self, dir_name, at_path="/"):
        """Alias for :meth:`create_dir`."""
        return self.create_dir(dir_name, at_path)

    def upload_file(self, file_path, sharepoint_path, file_name):
        """Upload the local file ``file_path`` to SharePoint.

        :param file_path: Path of the local file to send.
        :param sharepoint_path: Destination folder path in SharePoint.
        :param file_name: Name the file gets in SharePoint.
        :return: The return value of ``SharePointClient.upload_file``.
        """
        # The context manager closes the handle even if the upload raises.
        with open(file_path, "rb") as file_stream:
            return self._build_client().upload_file(
                file_name, file_stream, sharepoint_path
            )

    def download_files_from_path(self, path, avoid=None):
        """Download every file directly inside ``path``, skipping media files.

        Only items carrying a ``"file"`` key are downloaded; sub-folders are
        not descended into.

        :param path: SharePoint folder path to download from.
        :param avoid: File-name suffixes to skip, matched case-insensitively.
            Defaults to ``DEFAULT_AVOIDED_EXTENSIONS``.
        :return: List of local paths of the downloaded files.
        :raises RuntimeError: If the folder listing has no ``"value"`` key.
        """
        if avoid is None:
            avoid = self.DEFAULT_AVOIDED_EXTENSIONS
        skipped_suffixes = tuple(ext.lower() for ext in avoid)

        files_info = self.get_folders_in_path(path)
        if "value" not in files_info:
            raise RuntimeError(f"Failed to get files from {path}")

        downloaded_files = []
        for item in files_info["value"]:
            if "file" not in item:
                continue
            file_name = item["name"]
            if file_name.lower().endswith(skipped_suffixes):
                continue
            # Log the name only: the download URL is fetched without
            # credentials, so it should not be written to logs.
            self.logger.info(f"Downloading {file_name}")
            content = self.get_file_content(item["@microsoft.graph.downloadUrl"])
            downloaded_files.append(
                self.create_temp_file(content, f"{path}/{file_name}")
            )

        return downloaded_files

    def create_temp_file(
        self, content: BytesIO, path: str, base_dir: str = "/tmp/sharepoint"
    ):
        """Write ``content`` to ``path`` underneath ``base_dir``.

        :param content: In-memory buffer whose full value is written.
        :param path: Path relative to ``base_dir``; leading slashes are ignored.
        :param base_dir: Directory every file must stay inside.
        :return: Absolute path of the written file.
        :raises ValueError: If ``path`` is empty or resolves outside ``base_dir``.
        """
        base = os.path.abspath(base_dir)
        # os.path.join discards ``base`` when the second part is absolute, so
        # leading slashes are stripped before joining.
        target = os.path.abspath(os.path.join(base, path.lstrip("/")))
        if target == base or os.path.commonpath([base, target]) != base:
            raise ValueError(f"Refusing to write outside {base}: {path!r}")

        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, "wb") as temp_file:
            temp_file.write(content.getvalue())

        self.logger.debug(f"Temporary file created at: {target}")
        return target
"""Manual smoke test for the basic functionality of DomnaSharepointClient.

Running this talks to the live SharePoint site: it lists folders, creates a
folder and uploads a file. The work sits behind a ``__main__`` guard so that
importing the module has no side effects.
"""

# Can we import it?
from io import BytesIO

from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient, DomnaSites


def main():
    """Exercise listing, folder creation, temp-file writing and upload."""
    # Can we initialise it?
    client = DomnaSharepointClient(sharepoint_location=DomnaSites.SOCIAL_HOUSING_WAVE_3)

    # Can we list the root path and the test folder?
    client.get_folders_in_path("/")
    client.get_folders_in_path("/JTK Test Folder")

    # Can we make a folder appear in JTK Test Folder?
    client.makedir("Dan is the best", "/JTK Test Folder")

    # Can we write a local temp file and upload it into the new folder?
    content = BytesIO(b"Hello, this is some file content!")
    path = client.create_temp_file(content, "some/place/over/the/rainbow")
    client.upload_file(
        path, "/JTK Test Folder/Dan is the best", "junte_is_the_worst_at_python.txt"
    )


if __name__ == "__main__":
    main()
# Module logger used by the error handler below.
logger = logging.getLogger(__name__)


def _retry_after_seconds(response, default=5):
    """Return the ``Retry-After`` delay in whole seconds.

    Falls back to ``default`` when the header is missing or is not an integer,
    and never returns a negative number (``time.sleep`` rejects those).
    """
    try:
        return max(0, int(response.headers.get("Retry-After", default)))
    except (TypeError, ValueError):
        return default


def handle_error(response):
    """Log a failed API response, then either ask for a retry or raise.

    For 429, 500 and 503 the function sleeps for the ``Retry-After`` delay
    (5 seconds when absent or unparseable) and returns ``"retry"``. Every
    other status raises.

    :param response: Response object exposing ``status_code``, ``headers``
        and ``json()``.
    :return: The string ``"retry"`` when the caller should repeat the request.
    :raises ValueError: For every status that is not retried.
    """
    try:
        payload = response.json()
    except ValueError:
        # Body is not JSON; continue with generic messages.
        payload = {}

    # Guard against bodies that are valid JSON but not the expected
    # {"error": {...}} object.
    error_json = payload.get("error", {}) if isinstance(payload, dict) else {}
    if not isinstance(error_json, dict):
        error_json = {"message": str(error_json)}

    error_code = error_json.get("code", "unknownError")
    error_message = error_json.get("message", "No detailed error message provided.")
    inner_error = error_json.get("innererror", {})
    details = error_json.get("details", [])

    logger.error(f"Error Code: {error_code}")
    logger.error(f"Error Message: {error_message}")
    if inner_error:
        logger.error(f"Inner Error: {inner_error}")
    if details:
        logger.error(f"Error Details: {details}")

    status = response.status_code

    if status == 429:
        retry_after = _retry_after_seconds(response)
        logger.warning(f"Too Many Requests. Retrying after {retry_after} seconds...")
        time.sleep(retry_after)
        return "retry"

    if status in (500, 503):
        retry_after = _retry_after_seconds(response)
        logger.error(f"Server error. Retrying after {retry_after} seconds...")
        time.sleep(retry_after)
        return "retry"

    if status == 401:
        logger.error("Unauthorized. Token might be invalid.")
    elif status == 403:
        logger.error("Forbidden. Access denied to the requested resource.")
    elif status == 404:
        logger.error("Not Found. The requested resource doesn’t exist.")

    raise ValueError(
        f"API request failed with status code {status} - {error_message}"
    )
# Module logger used by the decorator and the client below.
logger = logging.getLogger(__name__)

# Upper bound on automatic retries of one request, so a persistently failing
# server cannot keep a call looping forever.
_MAX_RETRIES = 5

# Timeout (seconds) passed to every HTTP request so a stalled connection
# cannot hang the caller indefinitely.
_REQUEST_TIMEOUT_SECONDS = 60


def api_call_decorator(func):
    """Turn a request-describing method into one that performs the request.

    The decorated method must return ``(http_method, url, data)``. The wrapper
    refreshes the access token when expired, sends the request with
    ``self.headers`` and returns the decoded JSON.

    If the method has a truthy ``page_size`` argument (passed by keyword,
    positionally, or through its default), every ``@odata.nextLink`` is
    followed and the result is ``{"value": [...all items...]}``. Otherwise the
    JSON of the single response is returned unchanged.

    :param func: The function to be decorated.
    :return: The wrapped function.
    :raises ValueError: When the API reports an error or retries run out.
    """
    import inspect  # only needed here, at decoration time

    signature = inspect.signature(func)

    @wraps(func)
    def wrapper(self, *args, **kwargs):
        try:
            # Check and refresh the access token if needed
            if self.is_access_token_expired():
                self.retrieve_access_token()
                logger.debug("Access token refreshed.")

            http_method, url, data = func(self, *args, **kwargs)

            # Resolve page_size however it was supplied; reading only kwargs
            # would miss positional values and the signature default.
            bound = signature.bind(self, *args, **kwargs)
            bound.apply_defaults()
            page_size = bound.arguments.get("page_size")

            results = []
            response_data = {}
            retries = 0

            while url:
                response = requests.request(
                    http_method,
                    url,
                    headers=self.headers,
                    json=data,
                    timeout=_REQUEST_TIMEOUT_SECONDS,
                )

                if response.status_code in (200, 201):
                    retries = 0
                    response_json = response.json()
                    if not page_size:
                        response_data = response_json
                        break
                    results.extend(response_json.get("value", []))
                    url = response_json.get("@odata.nextLink")
                    continue

                # handle_error raises for non-retryable statuses and sleeps
                # before returning "retry" for the retryable ones.
                if handle_error(response) == "retry" and retries < _MAX_RETRIES:
                    retries += 1
                    continue
                raise ValueError(
                    f"API request failed with status code {response.status_code} "
                    f"after {retries} retries"
                )

            if page_size:
                response_data = {"value": results}

            return response_data

        except Exception:
            logger.exception("An error occurred during the API call.")
            raise

    return wrapper


class SharePointClient:
    """Client for one SharePoint site's document drive via Microsoft Graph."""

    access_token = None
    access_token_request_timestamp = None
    access_token_expiry = None
    headers = None

    TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    def __init__(
        self,
        tenant_id,
        client_id,
        client_secret,
        site_id,
        access_token=None,
        access_token_expiration_details=None,
    ):
        """
        Initializes the client, authenticates and looks up the document drive.

        :param tenant_id: The tenant ID.
        :param client_id: The client ID.
        :param client_secret: The client secret.
        :param site_id: The site ID.
        :param access_token: A previously obtained token dict (optional).
        :param access_token_expiration_details: Serialized expiry details as
            produced by ``get_token_expiration_details``; required when
            ``access_token`` is given.
        :raises ValueError: If ``access_token`` is given without expiry
            details, or if authentication fails.
        """
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret

        if access_token:
            if not access_token_expiration_details:
                raise ValueError("Access token expiration details must be provided.")
            self.access_token = access_token
            self.set_access_token_expiration_details(access_token_expiration_details)
            self.headers = {
                "Authorization": f"Bearer {self.access_token['access_token']}"
            }
        else:
            self.retrieve_access_token()

        # Retrieve static identifiers (this performs one API request).
        self.site_id = site_id
        self.document_drive = self.get_documents_drive()
        self.document_drive_id = self.document_drive["id"]

    def _drive_path_url(self, item_path, segment):
        """Build the Graph URL for ``segment`` of the item at ``item_path``.

        Leading/trailing slashes are ignored and the path is percent-encoded.
        An empty path (or ``"/"``) addresses the drive root as
        ``.../root/{segment}``; anything else uses ``.../root:/{path}:/{segment}``.
        """
        from urllib.parse import quote  # local import: stdlib, used only here

        root = (
            f"https://graph.microsoft.com/v1.0/drives/{self.document_drive_id}/root"
        )
        relative = str(item_path).strip("/")
        if not relative:
            return f"{root}/{segment}"
        return f"{root}:/{quote(relative)}:/{segment}"

    def get_token_expiration_details(self):
        """
        Returns the token timestamps as strings so they can be serialized.

        :return: Dict with ``access_token_request_timestamp`` and
            ``access_token_expiry`` formatted with ``TIMESTAMP_FORMAT``.
        """
        return {
            "access_token_request_timestamp": datetime.strftime(
                self.access_token_request_timestamp, self.TIMESTAMP_FORMAT
            ),
            "access_token_expiry": datetime.strftime(
                self.access_token_expiry, self.TIMESTAMP_FORMAT
            ),
        }

    def set_access_token_expiration_details(self, access_token_expiration_details):
        """
        Restores the token timestamps from a serialized dictionary.

        :param access_token_expiration_details: Dict in the shape returned by
            ``get_token_expiration_details``.
        :return: None
        """
        self.access_token_request_timestamp = datetime.strptime(
            access_token_expiration_details["access_token_request_timestamp"],
            self.TIMESTAMP_FORMAT,
        )
        self.access_token_expiry = datetime.strptime(
            access_token_expiration_details["access_token_expiry"],
            self.TIMESTAMP_FORMAT,
        )

    def is_access_token_expired(self):
        """
        Checks whether the access token has expired (it does not refresh it).

        Timestamps are naive ``datetime.now()`` values. A client with no
        recorded expiry is treated as expired.

        :return: True if expired, False otherwise.
        """
        if self.access_token_expiry is None:
            return True
        return datetime.now() >= self.access_token_expiry

    def retrieve_access_token(self, refresh=False):
        """
        Authenticates with MSAL and stores the token, its expiry and headers.

        :param refresh: If True, skip the silent cache lookup.
        :return: None
        :raises ValueError: If no access token is returned.
        """
        app = ConfidentialClientApplication(
            self.client_id,
            authority=f"https://login.microsoftonline.com/{self.tenant_id}",
            client_credential=self.client_secret,
        )

        scope = ["https://graph.microsoft.com/.default"]

        access_token_request_timestamp = datetime.now()

        if refresh:
            logger.debug("Forcing refresh of access token.")
            token = None
        else:
            # Check if a token is already cached
            token = app.acquire_token_silent(scope, account=None)

        if not token:
            token = app.acquire_token_for_client(scopes=scope)

        if "access_token" not in token:
            logger.error("Authentication failed.")
            raise ValueError("Authentication failed")

        # Expire 20 seconds early so a token is not used right at its limit.
        access_token_expiry = access_token_request_timestamp + timedelta(
            seconds=token["expires_in"] - 20
        )

        self.access_token = token
        self.access_token_request_timestamp = access_token_request_timestamp
        self.access_token_expiry = access_token_expiry
        self.headers = {"Authorization": f"Bearer {self.access_token['access_token']}"}

    @api_call_decorator
    def get_documents_drive(self):
        """
        Get the document drive of the SharePoint site.

        :return: Tuple containing HTTP method, URL, and None for data
            (the decorator turns this into the response JSON).
        """
        url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive"
        return "GET", url, None

    @api_call_decorator
    def list_folder_contents(self, folder_path: str, page_size: int = 100):
        """
        List the children of a folder in the document drive.

        :param folder_path: The path of the folder (``"/"`` for the root).
        :param page_size: The number of items per page (default is 100).
        :return: Tuple containing HTTP method, URL, and None for data
            (the decorator turns this into ``{"value": [...]}``).
        """
        children_url = self._drive_path_url(folder_path, "children")
        url = f"{children_url}?$top={page_size}"
        return "GET", url, None

    @api_call_decorator
    def create_folder(self, file_name, folder_path):
        """
        Create folder ``file_name`` inside ``folder_path``.

        The request body asks for ``"rename"`` conflict behavior.

        :param file_name: Name of the new folder.
        :param folder_path: Path of the parent folder (``"/"`` for the root).
        :return: Tuple containing HTTP method, URL, and the JSON body
            (the decorator turns this into the response JSON).
        """
        data = {
            "name": file_name,
            "folder": {},
            "@microsoft.graph.conflictBehavior": "rename",
        }
        url = self._drive_path_url(folder_path, "children")

        return "POST", url, data

    def upload_file(self, file_name, file_stream, sharepoint_parent_id):
        """
        Uploads a file with a single PUT to
        ``/drives/{drive-id}/root:/{path-to-file}:/content``.

        :param file_name: Name of the file to upload
        :param file_stream: File content as a binary stream (e.g., BytesIO or open(file, 'rb'))
        :param sharepoint_parent_id: Folder path within the drive that receives
            the file (despite the name, it is used as a path, not an ID).
        :return: Response JSON from the API
        :raises ValueError: If the API reports an error or retries run out.
        """
        # This method bypasses api_call_decorator, so refresh the token here.
        if self.is_access_token_expired():
            self.retrieve_access_token()

        url = self._drive_path_url(f"{sharepoint_parent_id}/{file_name}", "content")

        # Remember the start position: a failed attempt consumes the stream,
        # so each retry must rewind or it would upload an empty body.
        can_rewind = hasattr(file_stream, "seek") and hasattr(file_stream, "tell")
        start = file_stream.tell() if can_rewind else None

        retries = 0
        while True:
            response = requests.put(
                url,
                headers=self.headers,
                data=file_stream,
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )

            if response.status_code in (200, 201):
                return response.json()

            if handle_error(response) == "retry" and retries < _MAX_RETRIES:
                retries += 1
                if can_rewind:
                    file_stream.seek(start)
                continue
            raise ValueError(
                f"API request failed with status code {response.status_code} "
                f"after {retries} retries"
            )

    @staticmethod
    def download_sharepoint_file(download_url):
        """
        Downloads a file from the given URL into memory.

        No authorization header is sent with the request.

        :param download_url: The URL to download the file from.
        :return: ``BytesIO`` holding the file content, positioned at the start.
        :raises requests.HTTPError: If the server answers with an error status.
        """
        file_content = BytesIO()

        # The context manager releases the connection even on failure.
        with requests.get(
            download_url, stream=True, timeout=_REQUEST_TIMEOUT_SECONDS
        ) as response:
            response.raise_for_status()  # Check if the request was successful
            for chunk in response.iter_content(chunk_size=8192):
                file_content.write(chunk)

        file_content.seek(0)  # Reset the file pointer to the beginning

        return file_content