Model/infrastructure/s3_uri.py
2026-05-20 11:07:40 +00:00

43 lines
1.5 KiB
Python

"""Parse S3 URIs into ``(bucket, key)`` pairs.
A pure-stdlib helper for the infrastructure layer. It deliberately avoids
importing pandas, boto3, or the legacy ``utils`` package, so slim Lambda
images that only need URI parsing do not drag the wider data stack along.

Two input shapes are supported:

* canonical S3 URIs --- ``s3://bucket/key``
* AWS S3 console URLs --- ``https://.../s3/object/bucket?prefix=key``
"""
from urllib.parse import unquote
def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
    """Return the ``(bucket, key)`` pair addressed by ``s3_uri``.

    Two input shapes are accepted:

    * ``s3://bucket/key`` -- split on the first ``/`` after the scheme; the
      key is returned verbatim (no percent-decoding is applied).
    * console URLs of the form ``.../s3/object/bucket?...&prefix=key`` -- the
      bucket is the text after ``/s3/object/`` (up to the ``?``) and the key
      is the percent-decoded ``prefix`` query parameter.

    Raises:
        ValueError: if ``s3_uri`` is neither a well-formed ``s3://`` URI nor
            an AWS console URL carrying a non-empty bucket and a non-empty
            ``prefix`` query parameter.
    """
    scheme = "s3://"
    if s3_uri.startswith(scheme):
        bucket, _, key = s3_uri[len(scheme):].partition("/")
        if not bucket or not key:
            raise ValueError("S3 URI must include both a bucket and a key")
        return bucket, key

    if "?" not in s3_uri:
        raise ValueError(f"Not an s3:// URI and has no query string: {s3_uri!r}")
    base, query = s3_uri.split("?", 1)

    marker = "/s3/object/"
    if marker not in base:
        raise ValueError(f"Console URL has no '/s3/object/' segment: {s3_uri!r}")
    bucket = base.split(marker, 1)[1]

    # Items without "=" are ignored; if a name repeats, the last value wins.
    params = dict(
        item.split("=", 1) for item in query.split("&") if "=" in item
    )
    key = unquote(params.get("prefix", ""))

    # Mirror the s3:// branch: an empty bucket or key is never a usable
    # address, so reject it instead of returning a half-filled pair.
    if not bucket or not key:
        raise ValueError(
            f"Console URL must include both a bucket and a 'prefix' key: {s3_uri!r}"
        )
    return bucket, key