mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
43 lines
1.5 KiB
Python
43 lines
1.5 KiB
Python
"""Parse S3 URIs into ``(bucket, key)`` pairs.
|
|
|
|
A pure-stdlib helper for the infrastructure layer. It deliberately pulls in
|
|
neither pandas, boto3, nor the legacy ``utils`` package, so slim Lambda images
|
|
that only need URI parsing do not drag the wider data stack along.
|
|
|
|
Two input shapes are supported:
|
|
|
|
* canonical S3 URIs --- ``s3://bucket/key``
|
|
* AWS S3 console URLs --- ``https://.../s3/object/bucket?prefix=key``
|
|
"""
|
|
|
|
from urllib.parse import unquote
|
|
|
|
|
|
def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
    """Return the ``(bucket, key)`` pair addressed by ``s3_uri``.

    Two input shapes are accepted:

    * ``s3://bucket/key`` -- the text after the scheme is split on its first
      ``/``; the key is returned verbatim (it is not percent-decoded).
    * console URLs shaped like ``.../s3/object/bucket?prefix=key`` -- the
      bucket is whatever follows ``/s3/object/`` before the ``?``, and the
      key is the percent-decoded value of the ``prefix`` query parameter.

    Args:
        s3_uri: The URI or console URL to parse.

    Returns:
        A ``(bucket, key)`` tuple; both elements are non-empty.

    Raises:
        ValueError: if ``s3_uri`` is neither a well-formed ``s3://`` URI nor
            an AWS console URL carrying a ``prefix`` query parameter, or if
            the bucket or key it addresses is empty.
    """
    scheme = "s3://"
    if s3_uri.startswith(scheme):
        bucket, _, key = s3_uri[len(scheme):].partition("/")
        if not bucket or not key:
            raise ValueError("S3 URI must include both a bucket and a key")
        return bucket, key

    if "?" not in s3_uri:
        raise ValueError(f"Not an s3:// URI and has no query string: {s3_uri!r}")
    base, query = s3_uri.split("?", 1)

    marker = "/s3/object/"
    if marker not in base:
        raise ValueError(f"Console URL has no '/s3/object/' segment: {s3_uri!r}")
    bucket = base.split(marker, 1)[1]
    # Mirror the s3:// branch: an empty bucket is never a usable address.
    if not bucket:
        raise ValueError(f"Console URL has no bucket after '/s3/object/': {s3_uri!r}")

    # Hand-rolled query parsing on purpose: values are decoded with ``unquote``
    # so a literal ``+`` in a key is preserved rather than turned into a space.
    # Items without ``=`` are ignored; a repeated name keeps its last value.
    params: dict[str, str] = {}
    for item in query.split("&"):
        name, sep, value = item.partition("=")
        if sep:
            params[name] = value

    key = unquote(params.get("prefix", ""))
    # A missing or empty ``prefix`` used to yield ``(bucket, "")`` silently,
    # contradicting the documented contract; reject it explicitly instead.
    if not key:
        raise ValueError(
            f"Console URL has no non-empty 'prefix' query parameter: {s3_uri!r}"
        )
    return bucket, key
|