import mimetypes
|
|
import os
|
|
from pathlib import Path, PurePosixPath
|
|
from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union
|
|
|
|
from ..client import Client, register_client_class
|
|
from ..cloudpath import implementation_registry
|
|
from ..enums import FileCacheMode
|
|
from ..exceptions import CloudPathException
|
|
from .s3path import S3Path
|
|
|
|
try:
    from boto3.session import Session
    from boto3.s3.transfer import TransferConfig, S3Transfer
    from botocore.config import Config
    from botocore.exceptions import ClientError
    import botocore.session
except ModuleNotFoundError:
    # boto3/botocore are optional dependencies. If they are not installed,
    # flag the "s3" implementation as unavailable so cloudpathlib can raise
    # an informative error when an S3 path is actually used.
    implementation_registry["s3"].dependencies_loaded = False
|
|
|
|
|
|
@register_client_class("s3")
class S3Client(Client):
    """Client class for AWS S3 which handles authentication with AWS for [`S3Path`](../s3path/)
    instances. See documentation for the [`__init__` method][cloudpathlib.s3.s3client.S3Client.__init__]
    for detailed authentication options."""

    def __init__(
        self,
        aws_access_key_id: Optional[str] = None,
        aws_secret_access_key: Optional[str] = None,
        aws_session_token: Optional[str] = None,
        no_sign_request: Optional[bool] = False,
        botocore_session: Optional["botocore.session.Session"] = None,
        profile_name: Optional[str] = None,
        boto3_session: Optional["Session"] = None,
        file_cache_mode: Optional[Union[str, FileCacheMode]] = None,
        local_cache_dir: Optional[Union[str, os.PathLike]] = None,
        endpoint_url: Optional[str] = None,
        boto3_transfer_config: Optional["TransferConfig"] = None,
        content_type_method: Optional[Callable] = mimetypes.guess_type,
        extra_args: Optional[dict] = None,
    ):
        """Class constructor. Sets up a boto3 [`Session`](
        https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html).
        Directly supports the same authentication interface, as well as the same environment
        variables supported by boto3. See [boto3 Session documentation](
        https://boto3.amazonaws.com/v1/documentation/api/latest/guide/session.html).

        If no authentication arguments or environment variables are provided, then the client will
        be instantiated as anonymous, which will only have access to public buckets.

        Args:
            aws_access_key_id (Optional[str]): AWS access key ID.
            aws_secret_access_key (Optional[str]): AWS secret access key.
            aws_session_token (Optional[str]): Session key for your AWS account. This is only
                needed when you are using temporary credentials.
            no_sign_request (Optional[bool]): If `True`, credentials are not looked for and we use unsigned
                requests to fetch resources. This will only allow access to public resources. This is equivalent
                to `--no-sign-request` in the [AWS CLI](https://docs.aws.amazon.com/cli/latest/reference/).
            botocore_session (Optional[botocore.session.Session]): An already instantiated botocore
                Session.
            profile_name (Optional[str]): Profile name of a profile in a shared credentials file.
            boto3_session (Optional[Session]): An already instantiated boto3 Session.
            file_cache_mode (Optional[Union[str, FileCacheMode]]): How often to clear the file cache; see
                [the caching docs](https://cloudpathlib.drivendata.org/stable/caching/) for more information
                about the options in cloudpathlib.enums.FileCacheMode.
            local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
                for downloaded files. If None, will use a temporary directory. Default can be set with
                the `CLOUDPATHLIB_LOCAL_CACHE_DIR` environment variable.
            endpoint_url (Optional[str]): S3 server endpoint URL to use for the constructed boto3 S3 resource and client.
                Parameterize it to access a customly deployed S3-compatible object store such as MinIO, Ceph or any other.
            boto3_transfer_config (Optional[TransferConfig]): Instantiated TransferConfig for managing
                [s3 transfers](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig)
            content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
                writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
            extra_args (Optional[dict]): A dictionary of extra args passed to download, upload, and list functions as relevant. You
                can include any keys supported by upload or download, and we will pass on only the relevant args. To see the extra
                args that are supported look at the upload and download lists in the
                [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.S3Transfer).
        """
        # Fall back to the environment variable when no endpoint is passed explicitly.
        endpoint_url = endpoint_url or os.getenv("AWS_ENDPOINT_URL")
        if boto3_session is not None:
            self.sess = boto3_session
        else:
            self.sess = Session(
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                aws_session_token=aws_session_token,
                botocore_session=botocore_session,
                profile_name=profile_name,
            )

        if no_sign_request:
            # Anonymous mode: skip the credential chain and send unsigned requests
            # (only public resources will be accessible).
            self.s3 = self.sess.resource(
                "s3",
                endpoint_url=endpoint_url,
                config=Config(signature_version=botocore.session.UNSIGNED),
            )
            self.client = self.sess.client(
                "s3",
                endpoint_url=endpoint_url,
                config=Config(signature_version=botocore.session.UNSIGNED),
            )
        else:
            self.s3 = self.sess.resource("s3", endpoint_url=endpoint_url)
            self.client = self.sess.client("s3", endpoint_url=endpoint_url)

        self.boto3_transfer_config = boto3_transfer_config

        if extra_args is None:
            extra_args = {}

        self._extra_args = extra_args
        # Partition the user-supplied extra args by which boto3 transfer
        # operations accept them, so each call gets only the relevant subset.
        self.boto3_dl_extra_args = {
            k: v for k, v in extra_args.items() if k in S3Transfer.ALLOWED_DOWNLOAD_ARGS
        }
        self.boto3_ul_extra_args = {
            k: v for k, v in extra_args.items() if k in S3Transfer.ALLOWED_UPLOAD_ARGS
        }

        # listing ops (list_objects_v2, filter, delete) only accept these extras:
        # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html
        self.boto3_list_extra_args = {
            k: self._extra_args[k]
            for k in ["RequestPayer", "ExpectedBucketOwner"]
            if k in self._extra_args
        }

        super().__init__(
            local_cache_dir=local_cache_dir,
            content_type_method=content_type_method,
            file_cache_mode=file_cache_mode,
        )

    def _get_metadata(self, cloud_path: S3Path) -> Dict[str, Any]:
        """Fetch last-modified time, size, etag, content type, and custom metadata for an object."""
        # get accepts all download extra args
        data = self.s3.ObjectSummary(cloud_path.bucket, cloud_path.key).get(
            **self.boto3_dl_extra_args
        )

        return {
            "last_modified": data["LastModified"],
            "size": data["ContentLength"],
            "etag": data["ETag"],
            "content_type": data.get("ContentType", None),
            "extra": data["Metadata"],
        }

    def _download_file(self, cloud_path: S3Path, local_path: Union[str, os.PathLike]) -> Path:
        """Download the object at `cloud_path` to `local_path` and return the local path."""
        local_path = Path(local_path)
        obj = self.s3.Object(cloud_path.bucket, cloud_path.key)

        obj.download_file(
            str(local_path), Config=self.boto3_transfer_config, ExtraArgs=self.boto3_dl_extra_args
        )
        return local_path

    def _is_file_or_dir(self, cloud_path: S3Path) -> Optional[str]:
        """Return "file", "dir", or None depending on what `cloud_path` refers to."""
        # short-circuit the root-level bucket
        if not cloud_path.key:
            return "dir"

        # get first item by listing at least one key
        return self._s3_file_query(cloud_path)

    def _exists(self, cloud_path: S3Path) -> bool:
        """Return True if `cloud_path` refers to an existing bucket, object, or prefix."""
        # check if this is a bucket
        if not cloud_path.key:
            # head_bucket only supports a subset of the extra args
            extra = {
                k: self._extra_args[k] for k in ["ExpectedBucketOwner"] if k in self._extra_args
            }

            try:
                self.client.head_bucket(Bucket=cloud_path.bucket, **extra)
                return True
            except ClientError:
                # any failure (404, 403, etc.) is treated as "does not exist"
                return False

        return self._s3_file_query(cloud_path) is not None

    def _s3_file_query(self, cloud_path: S3Path):
        """Boto3 query used for quick checks of existence and if path is file/dir"""
        # check if this is an object that we can access directly
        try:
            # head_object accepts all download extra args (note: Object.load does not accept extra args so we do not use it for this check)
            self.client.head_object(
                Bucket=cloud_path.bucket,
                Key=cloud_path.key.rstrip("/"),
                **self.boto3_dl_extra_args,
            )
            return "file"

        # else, confirm it is a dir by filtering to the first item under the prefix plus a "/"
        except (ClientError, self.client.exceptions.NoSuchKey):
            key = cloud_path.key.rstrip("/") + "/"

            return next(
                (
                    "dir"  # always a dir if we find anything with this query
                    for obj in (
                        self.s3.Bucket(cloud_path.bucket)
                        .objects.filter(Prefix=key, **self.boto3_list_extra_args)
                        .limit(1)
                    )
                ),
                None,
            )

    def _list_dir(self, cloud_path: S3Path, recursive: bool = False) -> Iterable[Tuple[S3Path, bool]]:
        """Yield (path, is_dir) pairs for entries under `cloud_path`.

        Directories are synthesized from common prefixes and from key parents,
        since S3 has no real directories; each is yielded at most once.
        """
        # shortcut if listing all available buckets
        if not cloud_path.bucket:
            if recursive:
                raise NotImplementedError(
                    "Cannot recursively list all buckets and contents; you can get all the buckets then recursively list each separately."
                )

            yield from (
                (self.CloudPath(f"s3://{b['Name']}"), True)
                for b in self.client.list_buckets().get("Buckets", [])
            )
            return

        prefix = cloud_path.key
        if prefix and not prefix.endswith("/"):
            prefix += "/"

        # track directories already emitted so they are yielded only once
        yielded_dirs = set()

        paginator = self.client.get_paginator("list_objects_v2")

        for result in paginator.paginate(
            Bucket=cloud_path.bucket,
            Prefix=prefix,
            Delimiter=("" if recursive else "/"),
            **self.boto3_list_extra_args,
        ):
            # yield everything in common prefixes as directories
            for result_prefix in result.get("CommonPrefixes", []):
                canonical = result_prefix.get("Prefix").rstrip("/")  # keep a canonical form
                if canonical not in yielded_dirs:
                    yield (
                        self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
                        True,
                    )
                    yielded_dirs.add(canonical)

            # check all the keys
            for result_key in result.get("Contents", []):
                # yield all the parents of any key that have not been yielded already
                o_relative_path = result_key.get("Key")[len(prefix) :]
                for parent in PurePosixPath(o_relative_path).parents:
                    parent_canonical = prefix + str(parent).rstrip("/")
                    if parent_canonical not in yielded_dirs and str(parent) != ".":
                        yield (
                            self.CloudPath(f"s3://{cloud_path.bucket}/{parent_canonical}"),
                            True,
                        )
                        yielded_dirs.add(parent_canonical)

                # if we already yielded this dir, go to next item in contents
                canonical = result_key.get("Key").rstrip("/")
                if canonical in yielded_dirs:
                    continue

                # s3 fake directories have 0 size and end with "/"
                if result_key.get("Key").endswith("/") and result_key.get("Size") == 0:
                    yield (
                        self.CloudPath(f"s3://{cloud_path.bucket}/{canonical}"),
                        True,
                    )
                    yielded_dirs.add(canonical)

                # yield object as file
                else:
                    yield (
                        self.CloudPath(f"s3://{cloud_path.bucket}/{result_key.get('Key')}"),
                        False,
                    )

    def _move_file(self, src: S3Path, dst: S3Path, remove_src: bool = True) -> S3Path:
        """Copy `src` to `dst` (server-side) and optionally remove the source.

        When src == dst, this acts as a "touch": the object is copied onto
        itself with its metadata re-applied.
        """
        # just a touch, so "REPLACE" metadata
        if src == dst:
            o = self.s3.Object(src.bucket, src.key)
            o.copy_from(
                CopySource={"Bucket": src.bucket, "Key": src.key},
                Metadata=self._get_metadata(src).get("extra", {}),
                MetadataDirective="REPLACE",
                **self.boto3_ul_extra_args,
            )

        else:
            target = self.s3.Object(dst.bucket, dst.key)
            target.copy(
                {"Bucket": src.bucket, "Key": src.key},
                ExtraArgs=self.boto3_dl_extra_args,
                Config=self.boto3_transfer_config,
            )

            if remove_src:
                self._remove(src)
        return dst

    def _remove(self, cloud_path: S3Path, missing_ok: bool = True) -> None:
        """Delete the object or prefix at `cloud_path`.

        Raises CloudPathException if S3 reports a non-success status, and
        FileNotFoundError when the path does not exist and missing_ok is False.
        """
        file_or_dir = self._is_file_or_dir(cloud_path=cloud_path)
        if file_or_dir == "file":
            resp = self.s3.Object(cloud_path.bucket, cloud_path.key).delete(
                **self.boto3_list_extra_args
            )
            if resp.get("ResponseMetadata").get("HTTPStatusCode") not in (204, 200):
                raise CloudPathException(
                    f"Delete operation failed for {cloud_path} with response: {resp}"
                )

        elif file_or_dir == "dir":
            # try to delete as a directory instead
            bucket = self.s3.Bucket(cloud_path.bucket)

            prefix = cloud_path.key
            if prefix and not prefix.endswith("/"):
                prefix += "/"

            # bulk-delete everything under the prefix; responses come back as a list
            resp = bucket.objects.filter(Prefix=prefix, **self.boto3_list_extra_args).delete(
                **self.boto3_list_extra_args
            )
            if resp[0].get("ResponseMetadata").get("HTTPStatusCode") not in (204, 200):
                raise CloudPathException(
                    f"Delete operation failed for {cloud_path} with response: {resp}"
                )

        else:
            if not missing_ok:
                raise FileNotFoundError(
                    f"Cannot delete file that does not exist: {cloud_path} (consider passing missing_ok=True)"
                )

    def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: S3Path) -> S3Path:
        """Upload `local_path` to `cloud_path`, guessing content type/encoding if configured."""
        obj = self.s3.Object(cloud_path.bucket, cloud_path.key)

        # copy so the per-client defaults are not mutated by the additions below
        extra_args = self.boto3_ul_extra_args.copy()

        if self.content_type_method is not None:
            content_type, content_encoding = self.content_type_method(str(local_path))
            if content_type is not None:
                extra_args["ContentType"] = content_type
            if content_encoding is not None:
                extra_args["ContentEncoding"] = content_encoding

        obj.upload_file(str(local_path), Config=self.boto3_transfer_config, ExtraArgs=extra_args)
        return cloud_path
|
|
|
|
|
# Expose the registered path class under an S3-specific attribute name
# (set by register_client_class as `CloudPath`).
S3Client.S3Path = S3Client.CloudPath  # type: ignore