s3_client

Classes:

S3Client

S3Client()

Methods:

  • download

    Download a file from S3 to a local directory.

  • exists

    Check if a file exists in S3 at the given URI.

  • generate_presigned_uri

    Generate a presigned URL from a given S3 URI with a default expiration of 7 days.

  • traverse

    Traverse through an S3 "directory" and return entries under it.

  • upload

    Upload a local file to S3.

  • walk

    Generator that walks all objects under the given S3 URI.

Source code in src/unibox/utils/s3_client.py
def __init__(self) -> None:
    import boto3

    # Simple S3 client init; if you need custom credentials or region,
    # pass them directly via environment variables or create a custom session.
    session = boto3.Session()
    self.s3 = session.client("s3")
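
A minimal usage sketch: credentials are resolved through boto3's default chain (environment variables, shared config, or an instance role), as the constructor's comment notes.

from unibox.utils.s3_client import S3Client

client = S3Client()  # no explicit credentials; boto3's default session resolution applies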

download

download(s3_uri: str, target_dir: str | Path) -> str

Download a file from S3 to a local directory.

:param s3_uri: S3 URI (e.g. s3://bucket/key)
:param target_dir: Local directory path
:return: Local file path

Source code in src/unibox/utils/s3_client.py
def download(self, s3_uri: str, target_dir: str | Path) -> str:
    """Download a file from S3 to a local directory.
    :param s3_uri: S3 URI (e.g. s3://bucket/key)
    :param target_dir: Local directory path
    :return: Local file path
    """
    bucket, key = parse_s3_url(s3_uri)
    filename = os.path.basename(s3_uri)
    path = os.path.join(target_dir, filename)
    self.s3.download_file(bucket, key, path)
    return path
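
A usage sketch with a hypothetical bucket and key; the file lands in the target directory under its original basename:

client = S3Client()
# "my-bucket" and the key below are hypothetical examples
local_path = client.download("s3://my-bucket/data/report.csv", "/tmp")
print(local_path)  # /tmp/report.csv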

exists

exists(s3_uri: str) -> bool

Check if a file exists in S3 at the given URI.

:param s3_uri: S3 URI
:return: True if object exists, False otherwise.

Source code in src/unibox/utils/s3_client.py
def exists(self, s3_uri: str) -> bool:
    """Check if a file exists in S3 at the given URI.
    :param s3_uri: S3 URI
    :return: True if object exists, False otherwise.
    """
    bucket, key = parse_s3_url(s3_uri)
    try:
        self.s3.head_object(Bucket=bucket, Key=key)
        return True
    except self.s3.exceptions.ClientError:
        return False
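
A usage sketch with a hypothetical URI; since the method catches any ClientError, a 403 from missing permissions also reads as False here:

client = S3Client()
# hypothetical object URI
if client.exists("s3://my-bucket/data/report.csv"):
    print("object found")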

generate_presigned_uri

generate_presigned_uri(
    s3_uri: str, expiration: int = 604800
) -> str | None

Generate a presigned URL from a given S3 URI with a default expiration of 7 days.

:param s3_uri: S3 URI (e.g., 's3://bucket-name/object-key')
:param expiration: Time in seconds for the presigned URL to remain valid (default 7 days, the AWS maximum).
:return: Presigned URL as a string. On error, returns None.

Source code in src/unibox/utils/s3_client.py
def generate_presigned_uri(self, s3_uri: str, expiration: int = 604800) -> str | None:
    """Generate a presigned URL from a given S3 URI with a default expiration of 7 days.

    :param s3_uri: S3 URI (e.g., 's3://bucket-name/object-key')
    :param expiration: Time in seconds for the presigned URL to remain valid (default 7 days).
    :return: Presigned URL as a string. On error, returns None.
    """
    bucket, key = parse_s3_url(s3_uri)

    # Constrain expiration to AWS max if needed.
    expiration = min(expiration, 604800)

    try:
        response = self.s3.generate_presigned_url(
            "get_object",
            Params={"Bucket": bucket, "Key": key},
            ExpiresIn=expiration,
        )
        return response
    except ClientError as e:
        logging.exception(f"Failed to generate presigned URL for {s3_uri}: {e}")
        return None
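
A usage sketch; the URI is hypothetical, and the one-hour expiration is an arbitrary value under the 7-day cap:

client = S3Client()
# hypothetical URI; 3600 seconds = 1 hour
url = client.generate_presigned_uri("s3://my-bucket/data/report.csv", expiration=3600)
if url is not None:
    print(url)  # shareable link, valid for one hour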

traverse

traverse(
    s3_uri: str,
    include_extensions=None,
    exclude_extensions=None,
    relative_unix=False,
    debug_print=True,
)

Traverse through an S3 "directory" and return entries under it.

:param s3_uri: S3 URI of the prefix to traverse.
:param include_extensions: list of file extensions to include (e.g. ['.jpg', '.png']).
:param exclude_extensions: list of file extensions to exclude (e.g. ['.txt', '.json']).
:param relative_unix: if True, return paths relative to the prefix; otherwise full s3:// URIs.
:param debug_print: whether to show a tqdm progress bar.
:return: list of keys or URIs.

Source code in src/unibox/utils/s3_client.py
def traverse(
    self,
    s3_uri: str,
    include_extensions=None,
    exclude_extensions=None,
    relative_unix=False,
    debug_print=True,
):
    """Traverse through an S3 "directory" and return entries under it.

    :param s3_uri: S3 URI of the prefix to traverse.
    :param include_extensions: list of file extensions to include (e.g. ['.jpg', '.png']).
    :param exclude_extensions: list of file extensions to exclude (e.g. ['.txt', '.json']).
    :param relative_unix: if True, return paths relative to the prefix; otherwise full s3:// URIs.
    :param debug_print: whether to show a tqdm progress bar.
    :return: list of keys or URIs.
    """
    bucket, prefix = parse_s3_url(s3_uri)

    if not prefix.endswith("/"):
        prefix += "/"

    paginator = self.s3.get_paginator("list_objects_v2")
    response_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/")

    all_entries = []

    if debug_print:
        response_iterator = tqdm(response_iterator, desc="Traversing S3", unit="page")

    for page in response_iterator:
        # Subdirectories
        for d in page.get("CommonPrefixes", []):
            dir_key = d["Prefix"]
            dir_entry = dir_key if relative_unix else f"s3://{bucket}/{dir_key}"
            all_entries.append(dir_entry)

        # Files
        for obj in page.get("Contents", []):
            file_key = obj["Key"]
            if file_key == prefix:
                continue  # skip the directory itself

            # Check include/exclude
            if (include_extensions is None or any(file_key.endswith(ext) for ext in include_extensions)) and (
                exclude_extensions is None or not any(file_key.endswith(ext) for ext in exclude_extensions)
            ):
                file_entry = file_key[len(prefix) :] if relative_unix else f"s3://{bucket}/{file_key}"
                all_entries.append(file_entry)

    return all_entries
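
A usage sketch with a hypothetical prefix; because the paginator uses Delimiter="/", traverse lists a single level, returning both subdirectory prefixes and matching files:

client = S3Client()
# "my-bucket/images/" is a hypothetical prefix
entries = client.traverse(
    "s3://my-bucket/images/",
    include_extensions=[".jpg", ".png"],
    relative_unix=True,   # keys relative to the prefix, e.g. "cat.jpg"
    debug_print=False,    # suppress the tqdm progress bar
)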

upload

upload(file_path: str, s3_uri: str) -> None

Upload a local file to S3.

:param file_path: Local file path
:param s3_uri: S3 URI (e.g. s3://bucket/key)

Source code in src/unibox/utils/s3_client.py
def upload(self, file_path: str, s3_uri: str) -> None:
    """Upload a local file to S3.
    :param file_path: Local file path
    :param s3_uri: S3 URI (e.g. s3://bucket/key)
    """
    bucket, key = parse_s3_url(s3_uri)
    self.s3.upload_file(file_path, bucket, key)
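
A usage sketch with hypothetical paths; the destination key is taken verbatim from the URI:

client = S3Client()
# hypothetical local file and destination URI
client.upload("/tmp/report.csv", "s3://my-bucket/data/report.csv")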

walk

walk(s3_uri: str)

Generator that walks all objects under the given S3 URI. Yields metadata dictionaries for each object.

Source code in src/unibox/utils/s3_client.py
def walk(self, s3_uri: str):
    """Generator that walks all objects under the given S3 URI.
    Yields metadata dictionaries for each object.
    """
    bucket, key = parse_s3_url(s3_uri)
    paginator = self.s3.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket, Prefix=key)
    for page in pages:
        for obj in page.get("Contents", []):  # a prefix with no matches yields pages without "Contents"
            yield {
                "key": obj["Key"],
                "size": obj["Size"],
                "last_modified": obj["LastModified"],
                "etag": obj["ETag"],
                "storage_class": obj["StorageClass"],
            }
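
A usage sketch with a hypothetical prefix; unlike traverse, walk recurses through everything under the prefix (no delimiter) and yields objects lazily:

client = S3Client()
# hypothetical prefix; sum object sizes without materializing the full listing
total_bytes = sum(obj["size"] for obj in client.walk("s3://my-bucket/data/"))
print(f"{total_bytes} bytes under prefix")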