# survey-extraction/etl/utils/utils.py

from urllib.parse import unquote, urlsplit

def get_sharepoint_path(url):
    """Return the library-relative path named by a SharePoint ``Forms`` URL.

    The URL is expected to contain a ``Forms`` path segment (for example
    ``.../Shared%20Documents/Forms/AllItems.aspx``) and an ``id`` query
    parameter holding the percent-encoded server path of the item.

    Args:
        url: The SharePoint view URL as a string.

    Returns:
        The decoded value of the ``id`` parameter with everything up to and
        including the first ``'Shared Documents'`` removed and surrounding
        slashes stripped. If ``'Shared Documents'`` does not occur in the
        decoded value, the whole decoded value (slashes stripped) is
        returned. ``None`` is returned when the URL has no ``id`` query
        parameter.

    Raises:
        ValueError: If the URL path has no ``Forms`` segment.
    """
    split_url = urlsplit(url)
    if 'Forms' not in split_url.path.split('/'):
        raise ValueError(f"No 'Forms' segment in SharePoint URL: {url!r}")

    # Match the parameter by its exact name: a plain substring search for
    # 'id=' would also hit other parameters such as 'viewid='.
    for param in split_url.query.split('&'):
        name, sep, value = param.partition('=')
        if sep and name == 'id':
            break
    else:
        return None

    # unquote (rather than parse_qs) keeps a literal '+' in file names intact.
    decoded_path = unquote(value)

    # Cut only at the FIRST 'Shared Documents' so that nested folders with
    # the same name stay part of the returned path.
    _, marker, remainder = decoded_path.partition('Shared Documents')
    return (remainder if marker else decoded_path).strip('/')