ai-content-maker/.venv/Lib/site-packages/torch/multiprocessing/reductions.py

import multiprocessing
import os
import threading
from multiprocessing.reduction import ForkingPickler
from multiprocessing.util import register_after_fork
from typing import Union

import torch
import torch.utils.hooks
from torch._namedtensor_internals import check_serializing_named_tensor

try:
    # Early load resource_sharer to prevent a partially initialized instance
    # from being inherited in a forked child process. The reduce_storage method
    # requires this module indirectly through DupFd(). The built-in mp.Queue
    # class pickles arguments in a background thread which may overlap with the
    # fork.
    import multiprocessing.resource_sharer
except ImportError:
    pass


class StorageWeakRef:
    r"""A weak reference to a Storage.

    The cdata member is a Python number containing the integer representation of
    the Storage pointer.

    Instances hash and compare by ``cdata``, so two references wrapping the same
    weak pointer value are interchangeable as dictionary keys.
    """

    __slots__ = ["cdata", "_free_weak_ref"]

    def __init__(self, storage):
        """Take a weak reference to ``storage`` via ``storage._weak_ref()``."""
        self.cdata = storage._weak_ref()
        # Save a direct reference to _free_weak_ref because the `torch` module
        # might be cleared during Python shutdown before this module is cleared.
        self._free_weak_ref = torch.Storage._free_weak_ref  # type: ignore[attr-defined]

    @classmethod
    def from_weakref(cls, cdata):
        """Wrap an existing weak pointer value without calling ``_weak_ref()``."""
        instance = cls.__new__(cls)
        instance.cdata = cdata
        instance._free_weak_ref = torch.Storage._free_weak_ref  # type: ignore[attr-defined]
        return instance

    def expired(self):
        """Return the result of ``torch.Storage._expired`` for this weak pointer."""
        return torch.Storage._expired(self.cdata)  # type: ignore[attr-defined]

    def __del__(self):
        # __init__ may have raised before both slots were assigned; in that
        # case there is nothing that can be released, so return quietly
        # instead of raising AttributeError from the finalizer.
        try:
            free_weak_ref = self._free_weak_ref
            cdata = self.cdata
        except AttributeError:
            return
        free_weak_ref(cdata)

    def __hash__(self):
        return self.cdata

    def __eq__(self, other):
        if self is other:
            return True
        # Objects without a `cdata` attribute (None, strings, ...) are simply
        # not comparable; returning NotImplemented lets Python fall back to its
        # default handling (`==` -> False) instead of raising AttributeError.
        try:
            other_cdata = other.cdata
        except AttributeError:
            return NotImplemented
        return self.cdata == other_cdata


class SharedCache(dict):
    """Dictionary from multiprocessing handles to StorageWeakRef.

    Lookups through ``get`` and insertions through ``__setitem__`` are
    serialized with ``self.lock``. Once the number of entries exceeds
    ``self.limit``, an insertion also drops every entry whose reference
    reports ``expired()``.
    """

    def __init__(self):
        super().__init__()
        # free_dead_references() is called if the len exceeds the current
        # limit. The limit scales with the number of remaining live objects.
        self.limit = 128
        # `fork` inherits lock state, so in case we fork when the lock is held,
        # we register a function to reset the lock to a new object to avoid
        # possible deadlocks, following python multiprocessing library design.
        self._after_fork()
        register_after_fork(self, SharedCache._after_fork)

    def _after_fork(self):
        """Replace ``self.lock`` with a fresh, unlocked ``threading.Lock``."""
        self.lock = threading.Lock()

    def get(self, key, default=None):
        """Return the entry for ``key`` under the lock, or ``default`` if absent.

        ``default`` mirrors ``dict.get``; it is ``None`` when omitted, so
        existing single-argument callers see no change.
        """
        with self.lock:
            return dict.get(self, key, default)

    def __setitem__(self, key, storage_ref):
        """Store ``storage_ref`` under ``key`` and prune once over the limit."""
        with self.lock:
            dict.__setitem__(self, key, storage_ref)
            if len(self) > self.limit:
                self.free_dead_references()

    def free_dead_references(self):
        """Delete entries whose reference is expired and rescale ``self.limit``.

        This method does not take ``self.lock`` itself; ``__setitem__`` calls it
        while already holding the lock.
        """
        live = 0
        # Iterate over a snapshot because entries are deleted during the walk.
        for key, storage_ref in list(self.items()):
            if storage_ref.expired():
                del self[key]
            else:
                live += 1
        self.limit = max(128, live * 2)


# Module-level SharedCache instance: mapping from handles to StorageWeakRef
# objects. The reduce_* / rebuild_* functions in this module read and write it.
shared_cache = SharedCache()


def rebuild_event(device, handle):
    """Recreate a CUDA event from ``device`` and an IPC ``handle``.

    This is the rebuild callable returned by ``reduce_event``.
    """
    event_cls = torch.cuda.Event
    return event_cls.from_ipc_handle(device, handle)


def reduce_event(event):
    """Reduce a CUDA event to ``(rebuild_event, (device, ipc_handle))``."""
    ipc_handle = event.ipc_handle()
    rebuild_args = (event.device, ipc_handle)
    return (rebuild_event, rebuild_args)


def rebuild_tensor(cls, storage, metadata):
    """Rebuild a tensor on top of ``storage``.

    Args:
        cls: the tensor class recorded when the tensor was reduced.
        storage: the storage the tensor is viewed onto.
        metadata: ``(storage_offset, size, stride, requires_grad)``.

    Returns:
        A ``Parameter`` when ``cls`` is ``torch.nn.parameter.Parameter``,
        otherwise the tensor produced by ``torch._utils._rebuild_tensor``.
    """
    offset, shape, strides, needs_grad = metadata
    rebuilt = torch._utils._rebuild_tensor(storage, offset, shape, strides)

    if cls == torch.nn.parameter.Parameter:
        # requires_grad has to go through the constructor rather than being
        # set as an attribute afterwards: integer tensors must be created
        # with requires_grad=False, otherwise they raise an error.
        return torch.nn.parameter.Parameter(rebuilt, requires_grad=needs_grad)

    rebuilt.requires_grad = needs_grad
    return rebuilt


def rebuild_cuda_tensor(
    tensor_cls,
    tensor_size,
    tensor_stride,
    tensor_offset,
    storage_cls,
    dtype,
    storage_device,
    storage_handle,
    storage_size_bytes,
    storage_offset_bytes,
    requires_grad,
    ref_counter_handle,
    ref_counter_offset,
    event_handle,
    event_sync_required,
):
    """Rebuild a tensor from the arguments emitted by ``reduce_tensor`` for CUDA.

    The storage is taken from ``shared_cache`` when an entry for
    ``(storage_handle, storage_offset_bytes)`` is available; otherwise it is
    opened with ``storage_cls._new_shared_cuda`` and added to the cache. A
    ``None`` handle or a zero byte size yields a fresh empty storage instead.

    Returns:
        A ``Parameter`` when ``tensor_cls`` is ``torch.nn.parameter.Parameter``,
        otherwise a plain tensor with ``requires_grad`` applied.
    """
    if storage_handle is None or storage_size_bytes == 0:
        # If storage_handle is None, storage points to nullptr.
        storage = storage_cls(0, dtype=dtype, device=storage_device, _internal=True)
    else:
        cache_key = (storage_handle, storage_offset_bytes)
        storage = storage_from_cache(storage_cls, cache_key)
        if storage is not None:
            # This storage is already being ref-counted here, but the producer
            # still needs the new ref-counter to be released.
            storage_cls._release_ipc_counter(
                ref_counter_handle, ref_counter_offset, device=storage_device
            )
        else:
            torch.cuda._lazy_init()
            storage = storage_cls._new_shared_cuda(
                storage_device,
                storage_handle,
                storage_size_bytes,
                storage_offset_bytes,
                ref_counter_handle,
                ref_counter_offset,
                event_handle,
                event_sync_required,
            )
            shared_cache[cache_key] = StorageWeakRef(storage)

    # Normalize to an untyped storage before wrapping it with the tensor dtype.
    if isinstance(storage, torch.UntypedStorage):
        untyped = storage
    else:
        untyped = storage._untyped_storage

    typed = torch.storage.TypedStorage(
        wrap_storage=untyped, dtype=dtype, _internal=True
    )
    rebuilt = torch._utils._rebuild_tensor(
        typed, tensor_offset, tensor_size, tensor_stride
    )

    if tensor_cls == torch.nn.parameter.Parameter:
        # It is crucial for integer tensors to receive
        # the requires_grad=False as an argument in the constructor
        return torch.nn.parameter.Parameter(rebuilt, requires_grad=requires_grad)

    rebuilt.requires_grad = requires_grad
    return rebuilt


def reduce_tensor(tensor):
    """Reduce ``tensor`` to a ``(rebuild_callable, args)`` pair for pickling.

    Dispatches to ``reduce_nested_tensor`` or ``reduce_sparse_tensor`` for
    nested / sparse inputs, returns ``rebuild_cuda_tensor`` arguments for
    tensors whose storage lives on a CUDA device, and ``rebuild_tensor``
    arguments otherwise.

    Raises:
        RuntimeError: if ``tensor`` requires grad and is not a leaf.
    """
    if tensor.requires_grad and not tensor.is_leaf:
        raise RuntimeError(
            "Cowardly refusing to serialize non-leaf tensor which requires_grad, "
            "since autograd does not support crossing process boundaries.  "
            "If you just want to transfer the data, call detach() on the tensor "
            "before serializing (e.g., putting it on the queue)."
        )

    check_serializing_named_tensor(tensor)
    torch.utils.hooks.warn_if_has_hooks(tensor)

    # Note [CUDA IPC and the caching allocator]
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # When you send a CUDA tensor over IPC, you might expect that you will
    # get out the same storage from the other end.  However, the CUDA caching
    # allocator makes it difficult to preserve this invariant.  Consider
    # the following situation: a tensor of size 0x100 points to offset 0x20 of
    # a storage at 0xA100 of size 0x100.  (For simplicity, all of these
    # sizes are given in bytes).  HOWEVER, with the caching allocator, this storage
    # might be part of a larger cudaMalloc allocation 0xA000 of size 0x4000.
    #
    # When we want to send this CUDA tensor over IPC, we must send the
    # *entire* cudaMalloc allocation, i.e., the 0xA000 region, not just
    # the storage 0xA100 (because that is what CUDA supports).  So, on the
    # other end, there simply isn't any way to say, "Wait, you gave me
    # a bigger region (0xA000) than the one I wanted (0xA100)".
    #
    # OK, so if you sent the cudaMalloc allocation, can you just wrap that up as
    # one storage itself? No, because this cudaMalloc allocation might contain
    # storages of mixed types: float, bytes, double... If you make the entire
    # allocation a single storage of a type A, we'll hit an error when constructing
    # a tensor of type B on the storage.
    #
    # cudaIpcMemHandle is an identifier to access the sender cudaMalloc allocation on the
    # receiver side. However, cudaIpcMemHandles from each device in a given process may
    # only be opened by one context per device per other process.
    # If we open and close a memory handle multiple times in a process, CUDA is allowed
    # to give it a different address; similarly, once we close the memory, we're not
    # allowed to access it (and the storage/tensor built on top of it), even if it is
    # still live in the original process. As we cannot make a cudaMalloc allocation
    # to a single storage in one go, this requires us to cache the device pointer for
    # each cudaIpcMemHandle on C++ side to reconstruct types of storages, while keeping
    # the old ones alive.
    # See [https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html]
    #
    # This is fine, because all we need to do is to save our position in the allocation,
    # and reconstruct storage and tensor from it.
    # 0xA000 ->  -------CUDA Allocation------
    #           |                            |
    #           |                            |
    #           |                            |
    #           |                            |
    # 0xA100 ->  --------storage1 begin------
    #           |                            |
    # 0xA120 ->  --------tensor1 begin ------
    #           |                            |
    #           |                            |
    #           |                            |
    #           |                            |
    #           |                            |
    # 0xA160 ->  --------tensor1 end---------
    #           |                            |
    #           |                            |
    #           |                            |
    # 0xA200 ->  --------storage1 end--------
    #           |                            |
    # 0xE000 ->  --------CUDA allocation-----
    #
    # To send tensor1, the following info is required from sender to receiver for
    # storage reconstruction.
    #   1. cudaIpcMemHandle of 0xA000 (which can be mapped to a basePtr in receiver process).
    #      basePtr may not be exactly 0xA000 since it's a different process.
    #   2. offset (0xA100) of storage1 in the CUDA allocation.
    #   3. size of storage1 (0x100).
    #
    # On receiver side:
    #   1. Get the devPtr of the MemHandle to access the memory, reconstruct a storage
    #      of the same type using (basePtr, offset, size).
    #   2. we can reconstruct the tensor on top of the reconstructed storage
    #   Tensor(size=0x040, offset=0x020, storage=Storage(data=basePtr+0xA100, size=0x0100))
    #
    # This strategy has a few implications:
    #
    # 1. When we serialize a CUDA tensor for IPC, we cannot do it all in one
    #    go (non-compositionally), and this requires us to have a global map
    #    memHandle -> devPtr for each process.
    #
    # 2. We MUST NOT let the new IPC tensor be resizable.  Originally, a resize
    #    of the storage beyond 0x100 would merely have caused us to do a
    #    reallocation.  You don't really want to do this, but if you did,
    #    all that would happen is that you would lose IPC sharing.  But if
    #    you do this in the new world, we will happily let you write out of
    #    bounds of your "allocation", clobbering unrelated data in the cached
    #    allocator block.  BAD!
    #
    # By the way, in old versions of PyTorch, we supported this situation
    # natively using a "storage view", which permitted multiple storages to be
    # views on each other.  But this was the *only* use of storage views, so we
    # eliminated it so that we could just use tensor views to implement the same
    # thing.
    #

    # TODO: Handle distinguishing between subclass and non-subclass versions of NT better
    # https://github.com/pytorch/pytorch/issues/110543
    from torch.nested._internal.nested_tensor import NestedTensor

    if tensor.is_nested and not isinstance(tensor, NestedTensor):
        return reduce_nested_tensor(tensor)

    sparse_layouts = {
        torch.sparse_coo,
        torch.sparse_csr,
        torch.sparse_bsr,
        torch.sparse_csc,
        torch.sparse_bsc,
    }
    if tensor.layout in sparse_layouts:
        return reduce_sparse_tensor(tensor)

    typed_storage = tensor._typed_storage()

    if typed_storage._untyped_storage.device.type != "cuda":
        # _backward_hooks purposely omitted here, see Note [Don't serialize hooks]
        cpu_metadata = (
            tensor.storage_offset(),
            tensor.size(),
            tensor.stride(),
            tensor.requires_grad,
        )
        return (rebuild_tensor, (type(tensor), typed_storage, cpu_metadata))

    (
        device,
        handle,
        storage_size_bytes,
        storage_offset_bytes,
        ref_counter_handle,
        ref_counter_offset,
        event_handle,
        event_sync_required,
    ) = typed_storage._share_cuda_()
    offset_in_storage = tensor.storage_offset()
    shared_cache[handle] = StorageWeakRef(typed_storage)

    # _backward_hooks purposely omitted here, see
    # Note [Don't serialize hooks]
    cuda_args = (
        type(tensor),
        tensor.size(),
        tensor.stride(),
        offset_in_storage,  # tensor offset in its storage
        type(typed_storage),
        tensor.dtype,
        device,
        handle,  # identifier which CUDA allocation is the storage in.
        storage_size_bytes,  # size(in bytes) of the storage
        storage_offset_bytes,  # offset(in bytes) of the storage in the CUDA allocation
        tensor.requires_grad,
        ref_counter_handle,
        ref_counter_offset,
        event_handle,
        event_sync_required,
    )
    return (rebuild_cuda_tensor, cuda_args)


def rebuild_nested_tensor(
    rebuild_buffer_func,
    rebuild_buffer_args,
    rebuild_sizes_func,
    rebuild_sizes_args,
    rebuild_strides_func,
    rebuild_strides_args,
    rebuild_offsets_func,
    rebuild_offsets_args,
):
    buffer = rebuild_buffer_func(*rebuild_buffer_args)
    sizes = rebuild_sizes_func(*rebuild_sizes_args)
    strides = rebuild_strides_func(*rebuild_strides_args)
    offsets = rebuild_offsets_func(*rebuild_offsets_args)
    return torch._nested_view_from_buffer_copy(buffer, sizes, strides, offsets)


def reduce_nested_tensor(nt):
    """Reduce a nested tensor by reducing its four components individually.

    The values buffer, nested sizes, nested strides and storage offsets are
    each passed through ``reduce_tensor``; the resulting ``(func, args)`` pairs
    are flattened, in that order, into the arguments of
    ``rebuild_nested_tensor``.
    """
    component_accessors = (
        "values",
        "_nested_tensor_size",
        "_nested_tensor_strides",
        "_nested_tensor_storage_offsets",
    )
    flattened = ()
    for accessor_name in component_accessors:
        # Fetch and reduce one component at a time, in declaration order.
        rebuild_func, rebuild_args = reduce_tensor(getattr(nt, accessor_name)())
        flattened += (rebuild_func, rebuild_args)

    return (rebuild_nested_tensor, flattened)


def rebuild_sparse_coo_tensor(
    rebuild_indices_func,
    rebuild_indices_args,
    rebuild_values_func,
    rebuild_values_args,
    shape,
    is_coalesced,
):
    indices = rebuild_indices_func(*rebuild_indices_args)
    values = rebuild_values_func(*rebuild_values_args)
    return torch.sparse_coo_tensor(indices, values, shape, is_coalesced=is_coalesced)


def rebuild_sparse_compressed_tensor(
    rebuild_compressed_indices_func,
    rebuild_compressed_indices_args,
    rebuild_plain_indices_func,
    rebuild_plain_indices_args,
    rebuild_values_func,
    rebuild_values_args,
    shape,
    layout,
):
    """Rebuild a compressed sparse tensor from its three reduced components.

    The compressed indices, plain indices and values are rebuilt in that
    order and passed to ``torch.sparse_compressed_tensor`` with ``shape`` and
    ``layout``.
    """
    reduced_parts = (
        (rebuild_compressed_indices_func, rebuild_compressed_indices_args),
        (rebuild_plain_indices_func, rebuild_plain_indices_args),
        (rebuild_values_func, rebuild_values_args),
    )
    compressed, plain, vals = (rebuild(*args) for rebuild, args in reduced_parts)
    return torch.sparse_compressed_tensor(compressed, plain, vals, shape, layout=layout)


def reduce_sparse_tensor(sparse):
    """Reduce a sparse tensor by reducing each of its component tensors.

    COO tensors are split into indices and values; CSR/BSR and CSC/BSC
    tensors are split into compressed indices, plain indices and values.
    Every component goes through ``reduce_tensor``.

    Raises:
        NotImplementedError: if ``sparse.layout`` is none of the five sparse
            layouts handled here.
    """
    layout = sparse.layout

    if layout is torch.sparse_coo:
        indices_func, indices_args = reduce_tensor(sparse._indices())
        values_func, values_args = reduce_tensor(sparse._values())
        coo_args = (
            indices_func,
            indices_args,
            values_func,
            values_args,
            sparse.shape,
            sparse.is_coalesced(),
        )
        return (rebuild_sparse_coo_tensor, coo_args)

    # Names of the (compressed, plain) index accessors for each layout.
    index_accessors = {
        torch.sparse_csr: ("crow_indices", "col_indices"),
        torch.sparse_bsr: ("crow_indices", "col_indices"),
        torch.sparse_csc: ("ccol_indices", "row_indices"),
        torch.sparse_bsc: ("ccol_indices", "row_indices"),
    }
    accessor_names = index_accessors.get(layout)
    if accessor_names is None:
        raise NotImplementedError(sparse.layout)

    compressed_name, plain_name = accessor_names
    compressed_indices = getattr(sparse, compressed_name)()
    plain_indices = getattr(sparse, plain_name)()

    compressed_func, compressed_args = reduce_tensor(compressed_indices)
    plain_func, plain_args = reduce_tensor(plain_indices)
    values_func, values_args = reduce_tensor(sparse.values())
    compressed_tensor_args = (
        compressed_func,
        compressed_args,
        plain_func,
        plain_args,
        values_func,
        values_args,
        sparse.shape,
        sparse.layout,
    )
    return (rebuild_sparse_compressed_tensor, compressed_tensor_args)


def fd_id(fd):
    """Return ``(st_ino, st_dev)`` from ``os.fstat(fd)`` as the descriptor's identity."""
    # The (inode, device) pair uniquely identifies a file descriptor. In Mac OS,
    # this doesn't work with shared memory handles, which is why we don't
    # support the "file_descriptor" sharing method on that platform.
    info = os.fstat(fd)
    inode, device = info.st_ino, info.st_dev
    return (inode, device)


def storage_from_cache(cls, key):
    """Look up ``key`` in ``shared_cache`` and rebuild a storage from the hit.

    Returns ``None`` when there is no cache entry; otherwise returns the result
    of ``torch.UntypedStorage._new_with_weak_ptr`` on the cached weak pointer
    (callers in this module also treat a ``None`` result as a miss).
    ``cls`` is accepted but not used.
    """
    cached_ref = shared_cache.get(key)
    if cached_ref is not None:
        return torch.UntypedStorage._new_with_weak_ptr(cached_ref.cdata)
    return None


def rebuild_storage_fd(cls, df, size):
    """Rebuild a CPU storage shared through a file descriptor.

    ``df.detach()`` yields the descriptor. A storage already cached under that
    descriptor's ``fd_id`` is reused; otherwise one is created with
    ``cls._new_shared_fd_cpu`` and cached. The descriptor is closed in every
    case, including when an exception is raised.
    """
    fd = df.detach()
    try:
        cached = storage_from_cache(cls, fd_id(fd))
        if cached is None:
            fresh = cls._new_shared_fd_cpu(fd, size)
            shared_cache[fd_id(fd)] = StorageWeakRef(fresh)
            return fresh
        return cached
    finally:
        os.close(fd)


def rebuild_storage_filename(cls, manager, handle, size, dtype=None):
    """Rebuild a CPU storage shared through a filename ``handle``.

    A storage cached under ``handle`` is reused. Otherwise an untyped storage
    is opened with ``_new_shared_filename_cpu``; when ``dtype`` is given,
    ``size`` is treated as an element count, converted to bytes, and the result
    is wrapped in a ``TypedStorage``. The new storage is cached under
    ``handle``.

    Returns:
        The result of calling ``_shared_decref()`` on the storage.
    """
    cached: Union[torch.TypedStorage, torch.UntypedStorage] = storage_from_cache(
        cls, handle
    )
    if cached is not None:
        return cached._shared_decref()

    storage: Union[torch.TypedStorage, torch.UntypedStorage]
    if dtype is not None:
        size_in_bytes = size * torch._utils._element_size(dtype)
        raw_storage: torch.UntypedStorage = (
            torch.UntypedStorage._new_shared_filename_cpu(
                manager, handle, size_in_bytes
            )
        )
        storage = torch.TypedStorage(
            wrap_storage=raw_storage, dtype=dtype, _internal=True
        )
    else:
        storage = torch.UntypedStorage._new_shared_filename_cpu(manager, handle, size)

    shared_cache[handle] = StorageWeakRef(storage)
    return storage._shared_decref()


def rebuild_storage_empty(cls):
    """Build an empty storage by calling ``cls`` with no arguments."""
    empty_storage = cls()
    return empty_storage


def rebuild_typed_storage(storage, dtype):
    """Wrap ``storage`` in a ``torch.storage.TypedStorage`` of the given ``dtype``."""
    typed_storage_cls = torch.storage.TypedStorage
    return typed_storage_cls(wrap_storage=storage, dtype=dtype, _internal=True)


# Use for torch.storage.TypedStorage
def reduce_typed_storage(storage):
    """Reduce a ``TypedStorage`` to its untyped storage plus its dtype."""
    rebuild_args = (storage._untyped_storage, storage.dtype)
    return (rebuild_typed_storage, rebuild_args)


def rebuild_typed_storage_child(storage, storage_type):
    """Construct ``storage_type`` around ``storage`` via ``wrap_storage=``."""
    constructor_kwargs = {"wrap_storage": storage, "_internal": True}
    return storage_type(**constructor_kwargs)


# Use for child classes of torch.storage.TypedStorage, like torch.FloatStorage
def reduce_typed_storage_child(storage):
    """Reduce a ``TypedStorage`` subclass instance to its untyped storage and class."""
    rebuild_args = (storage._untyped_storage, type(storage))
    return (rebuild_typed_storage_child, rebuild_args)


def reduce_storage(storage):
    """Reduce a CPU storage to a ``(rebuild_callable, args)`` pair.

    With the "file_system" sharing strategy the storage is shared by filename;
    otherwise it is shared through a duplicated file descriptor, except that a
    zero-sized storage is rebuilt as a fresh empty storage. Shared storages
    are recorded in ``shared_cache``.

    Raises:
        RuntimeError: if ``storage.is_cuda`` is true.
    """
    from . import get_sharing_strategy

    if storage.is_cuda:
        raise RuntimeError(
            "Cannot pickle CUDA storage; try pickling a CUDA tensor instead"
        )

    if get_sharing_strategy() == "file_system":
        metadata = storage._share_filename_cpu_()
        cache_key = metadata[1]
        rebuild = rebuild_storage_filename
        if isinstance(storage, torch.TypedStorage):
            # Typed storages also ship their dtype to the rebuild function.
            metadata += (storage.dtype,)
        storage._shared_incref()
    else:
        if storage.size() == 0:
            # This is special cased because Empty tensors
            # (with size 0) cannot be mmapped.
            return (rebuild_storage_empty, (type(storage),))
        fd, size = storage._share_fd_cpu_()
        df = multiprocessing.reduction.DupFd(fd)
        cache_key = fd_id(fd)
        metadata = (df, size)
        rebuild = rebuild_storage_fd  # type: ignore[assignment]

    shared_cache[cache_key] = StorageWeakRef(storage)
    rebuild_args = (type(storage),) + metadata
    return (rebuild, rebuild_args)


def init_reductions():
    """Register this module's reducers with ``ForkingPickler``.

    Covers ``torch.cuda.Event``, every class in ``torch._storage_classes``,
    ``torch.storage.TypedStorage``, every class in ``torch._tensor_classes``,
    ``torch.Tensor`` and ``torch.nn.parameter.Parameter``.
    """
    register = ForkingPickler.register

    register(torch.cuda.Event, reduce_event)

    for storage_cls in torch._storage_classes:
        # UntypedStorage is reduced directly; every other storage class goes
        # through the typed-child reducer.
        is_untyped = storage_cls.__name__ == "UntypedStorage"
        register(
            storage_cls,
            reduce_storage if is_untyped else reduce_typed_storage_child,
        )

    register(torch.storage.TypedStorage, reduce_typed_storage)

    for tensor_cls in torch._tensor_classes:
        register(tensor_cls, reduce_tensor)

    # TODO: Maybe this should be in tensor_classes? :)
    register(torch.Tensor, reduce_tensor)
    register(torch.nn.parameter.Parameter, reduce_tensor)
first commit 2024-05-03 04:18:51 +03:00			`import multiprocessing`
			`import os`
			`import threading`
			`from multiprocessing.reduction import ForkingPickler`
			`from multiprocessing.util import register_after_fork`
			`from typing import Union`

			`import torch`
			`import torch.utils.hooks`
			`from torch._namedtensor_internals import check_serializing_named_tensor`

			`try:`
			`# Early load resource_sharer to prevent a partially initialized instance`
			`# from being inherited in a forked child process. The reduce_storage method`
			`# requires this module indirectly through DupFd(). The built-in mp.Queue`
			`# class pickles arguments in a background thread which may overlap with the`
			`# fork.`
			`import multiprocessing.resource_sharer`
			`except ImportError:`
			`pass`


			`class StorageWeakRef:`
			`r"""A weak reference to a Storage.`

			`The cdata member is a Python number containing the integer representation of`
			`the Storage pointer.`
			`"""`

			`__slots__ = ["cdata", "_free_weak_ref"]`

			`def __init__(self, storage):`
			`self.cdata = storage._weak_ref()`
			# Save a direct reference to _free_weak_ref because the `torch` module
			`# might be cleared during Python shutdown before this module is cleared.`
			`self._free_weak_ref = torch.Storage._free_weak_ref # type: ignore[attr-defined]`

			`@classmethod`
			`def from_weakref(cls, cdata):`
			`instance = cls.__new__(cls)`
			`instance.cdata = cdata`
			`instance._free_weak_ref = torch.Storage._free_weak_ref # type: ignore[attr-defined]`
			`return instance`

			`def expired(self):`
			`return torch.Storage._expired(self.cdata) # type: ignore[attr-defined]`

			`def __del__(self):`
			`self._free_weak_ref(self.cdata)`

			`def __hash__(self):`
			`return self.cdata`

			`def __eq__(self, other):`
			`if id(self) == id(other):`
			`return True`
			`return self.cdata == other.cdata`


			`class SharedCache(dict):`
			`"""Dictionary from multiprocessing handles to StorageWeakRef."""`

			`def __init__(self):`
			`# free_dead_references() is called if the len exceeds the current`
			`# limit. The limit scales with the number of remaining live objects.`
			`self.limit = 128`
			# `fork` inherits lock state, so in case we fork when the lock is held,
			`# we register a function to reset the lock to a new object to avoid`
			`# possible deadlocks, following python multiprocessing library design.`
			`self._after_fork()`
			`register_after_fork(self, SharedCache._after_fork)`

			`def _after_fork(self):`
			`self.lock = threading.Lock()`

			`def get(self, key):`
			`with self.lock:`
			`return dict.get(self, key)`

			`def __setitem__(self, key, storage_ref):`
			`with self.lock:`
			`dict.__setitem__(self, key, storage_ref)`
			`if len(self) > self.limit:`
			`self.free_dead_references()`

			`def free_dead_references(self):`
			`live = 0`
			`for key, storage_ref in list(self.items()):`
			`if storage_ref.expired():`
			`del self[key]`
			`else:`
			`live += 1`
			`self.limit = max(128, live * 2)`


			`# mapping from handles to StorageWeakRef objects`
			`shared_cache = SharedCache()`


			`def rebuild_event(device, handle):`
			`return torch.cuda.Event.from_ipc_handle(device, handle)`


			`def reduce_event(event):`
			`handle = event.ipc_handle()`
			`return (rebuild_event, (event.device, handle))`


			`def rebuild_tensor(cls, storage, metadata):`
			`storage_offset, size, stride, requires_grad = metadata`
			`t = torch._utils._rebuild_tensor(storage, storage_offset, size, stride)`
			`if cls == torch.nn.parameter.Parameter:`
			`# we have to pass requires_grad into constructor, rather than set it as an`
			`# attribute later, because it's an important check for Integer Tensors to`
			`# have requires_grad=False (or else they raise an error)`
			`t = torch.nn.parameter.Parameter(t, requires_grad=requires_grad)`
			`else:`
			`t.requires_grad = requires_grad`
			`return t`


			`def rebuild_cuda_tensor(`
			`tensor_cls,`
			`tensor_size,`
			`tensor_stride,`
			`tensor_offset,`
			`storage_cls,`
			`dtype,`
			`storage_device,`
			`storage_handle,`
			`storage_size_bytes,`
			`storage_offset_bytes,`
			`requires_grad,`
			`ref_counter_handle,`
			`ref_counter_offset,`
			`event_handle,`
			`event_sync_required,`
			`):`
			`# If storage_handle is None, storage points to nullptr.`
			`if storage_handle is None or storage_size_bytes == 0:`
			`storage = storage_cls(0, dtype=dtype, device=storage_device, _internal=True)`
			`else:`
			`storage = storage_from_cache(`
			`storage_cls, (storage_handle, storage_offset_bytes)`
			`)`
			`if storage is None:`
			`torch.cuda._lazy_init()`
			`storage = storage_cls._new_shared_cuda(`
			`storage_device,`
			`storage_handle,`
			`storage_size_bytes,`
			`storage_offset_bytes,`
			`ref_counter_handle,`
			`ref_counter_offset,`
			`event_handle,`
			`event_sync_required,`
			`)`
			`shared_cache[(storage_handle, storage_offset_bytes)] = StorageWeakRef(`
			`storage`
			`)`
			`else:`
			`# We already ref counting this Storage, but producer needs new ref-counters to be released.`
			`storage_cls._release_ipc_counter(`
			`ref_counter_handle, ref_counter_offset, device=storage_device`
			`)`

			`_storage = (`
			`storage`
			`if isinstance(storage, torch.UntypedStorage)`
			`else storage._untyped_storage`
			`)`

			`t = torch._utils._rebuild_tensor(`
			`torch.storage.TypedStorage(wrap_storage=_storage, dtype=dtype, _internal=True),`
			`tensor_offset,`
			`tensor_size,`
			`tensor_stride,`
			`)`

			`if tensor_cls == torch.nn.parameter.Parameter:`
			`# It is crucial for integer tensors to receive`
			`# the requires_grad=False as an argument in the constructor`
			`t = torch.nn.parameter.Parameter(t, requires_grad=requires_grad)`
			`else:`
			`t.requires_grad = requires_grad`

			`return t`


			`def reduce_tensor(tensor):`
			`if tensor.requires_grad and not tensor.is_leaf:`
			`raise RuntimeError(`
			`"Cowardly refusing to serialize non-leaf tensor which requires_grad, "`
			`"since autograd does not support crossing process boundaries. "`
			`"If you just want to transfer the data, call detach() on the tensor "`
			`"before serializing (e.g., putting it on the queue)."`
			`)`

			`check_serializing_named_tensor(tensor)`
			`torch.utils.hooks.warn_if_has_hooks(tensor)`

			`# Note [CUDA IPC and the caching allocator]`
			`# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
			`# When you send a CUDA tensor over IPC, you might expect that you will`
			`# get out the same storage from the other end. However, the CUDA caching`
			`# allocator makes it difficult to preserve this invariant. Consider`
			`# the following situation: a tensor of size 0x100 points to offset 0x20 of`
			`# a storage at 0xA100 of size 0x100. (For simplicity, all of these`
			`# sizes are given in bytes). HOWEVER, with the caching allocator, this storage`
			`# might be part of a larger cudaMalloc allocation 0xA000 of size 0x4000.`
			`#`
			`# When we want to send this CUDA tensor over IPC, we must send the`
			`# entire cudaMalloc allocation, i.e., the 0xA000 region, not just`
			`# the storage 0xA100 (because that is what CUDA supports). So, on the`
			`# other end, there simply isn't any way to say, "Wait, you gave me`
			`# a bigger region (0xA000) than the one I wanted (0xA100)".`
			`#`
			`# OK, so if you sent the cudaMalloc allocation, can you just wrap that up as`
			`# one storage itself? No, because this cudaMalloc allocation might contain`
			`# storages of mixed types: float, bytes, double... If you make the entire`
			`# allocation a single storage of a type A, we'll hit an error when constructing`
			`# a tensor of type B on the storage.`
			`#`
			`# cudaIpcMemHandle is an identifier to access the sender cudaMalloc allocation on the`
			`# receiver side. However, cudaIpcMemHandles from each device in a given process may`
			`# only be opened by one context per device per other process.`
			`# If we open and close a memory handle multiples times in a process, CUDA is allowed`
			`# to give it a different address; similarly, once we close the memory, we're not`
			`# allowed to access it(and the storage/tensor built on top of it), even if it is`
			`# still live in the original process. As we cannot make a cudaMalloc allocation`
			`# to a single storage in one go, this requires us to cache the device pointer for`
			`# each cudaIpcMemHandle on C++ side to reconstruct types of storages, while keep`
			`# the old ones alives.`
			`# See [https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html]`
			`#`
			`# This is fine, because all we need to do is to save our position in the allocation,`
			`# and reconstruct storage and tensor from it.`
			`# 0xA000 -> -------CUDA Allocation------`
			`# \| \|`
			`# \| \|`
			`# \| \|`
			`# \| \|`
			`# 0xA100 -> --------storage1 begin------`
			`# \| \|`
			`# 0xA120 -> --------tensor1 begin ------`
			`# \| \|`
			`# \| \|`
			`# \| \|`
			`# \| \|`
			`# \| \|`
			`# 0xA160 -> --------tensor1 end---------`
			`# \| \|`
			`# \| \|`
			`# \| \|`
			`# 0xA200 -> --------storage1 end--------`
			`# \| \|`
			`# 0xE000 -> --------CUDA allocation-----`
			`#`
			`# To send tensor1, the following info are required from sender to receiver for`
			`# storage recontruction.`
			`# 1. cudaIpcMemHandle of 0xA000(which can be mapped to a basePtr in receiver process).`
			`# basePtr may not be exactly 0xA000 since it's a different process.`
			`# 2. offset(0xA100) of storage1 in the CUDA allocation.`
			`# 3. size of storage1(0x100).`
			`#`
			`# On receiver side:`
			`# 1. Get the devPtr of the MemHandle to access the memory, reconstruct a storage`
			`# of the same type using (basePtr, offset, size).`
			`# 2. we can reconstruct the tensor on top of the reconstructed storage`
			`# Tensor(size=0x040, offset=0x020, storage=Storage(data=basePtr+0xA100, size=0x0100))`
			`#`
			`# This strategy has a few implications:`
			`#`
			`# 1. When we serialize a CUDA tensor for IPC, we cannot do it all in one`
			`# go (non-compositionally), and this requires to have a global map`
			`# memHandle -> devPtr for each process.`
			`#`
			`# 2. We MUST NOT let the new IPC tensor be resizable. Originally, a resize`
			`# of the storage beyond 0x100 would merely have caused us to do a`
			`# reallocation. You don't really want to do this, but if you did,`
			`# all that would happen is that you would lose IPC sharing. But if`
			`# you do this in the new world, we will happily let you write out of`
			`# bounds of your "allocation", clobbering unrelated data in the cached`
			`# allocator block. BAD!`
			`#`
			`# By the way, in old versions of PyTorch, we supported this situation`
			`# natively using a "storage view", which permitted multiple storages to be`
			`# views on each other. But this was the only use of storage views, so we`
			`# eliminated it so that we could just use tensor views to implement the same`
			`# thing.`
			`#`

			`# TODO: Handle distinguishing between subclass and non-subclass versions of NT better`
			`# https://github.com/pytorch/pytorch/issues/110543`
			`from torch.nested._internal.nested_tensor import NestedTensor`

			`if tensor.is_nested and not isinstance(tensor, NestedTensor):`
			`return reduce_nested_tensor(tensor)`

			`if tensor.layout in {`
			`torch.sparse_coo,`
			`torch.sparse_csr,`
			`torch.sparse_bsr,`
			`torch.sparse_csc,`
			`torch.sparse_bsc,`
			`}:`
			`return reduce_sparse_tensor(tensor)`

			`storage = tensor._typed_storage()`

			`if storage._untyped_storage.device.type == "cuda":`
			`(`
			`device,`
			`handle,`
			`storage_size_bytes,`
			`storage_offset_bytes,`
			`ref_counter_handle,`
			`ref_counter_offset,`
			`event_handle,`
			`event_sync_required,`
			`) = storage._share_cuda_()`
			`tensor_offset = tensor.storage_offset()`
			`shared_cache[handle] = StorageWeakRef(storage)`
			`# _backward_hooks purposely omitted here, see`
			`# Note [Don't serialize hooks]`
			`return (`
			`rebuild_cuda_tensor,`
			`(`
			`type(tensor),`
			`tensor.size(),`
			`tensor.stride(),`
			`tensor_offset, # tensor offset in its storage`
			`type(storage),`
			`tensor.dtype,`
			`device,`
			`handle, # identifier which CUDA allocation is the storage in.`
			`storage_size_bytes, # size(in bytes) of the storage`
			`storage_offset_bytes, # offset(in bytes) of the storage in the CUDA allocation`
			`tensor.requires_grad,`
			`ref_counter_handle,`
			`ref_counter_offset,`
			`event_handle,`
			`event_sync_required,`
			`),`
			`)`

			`# _backward_hooks purposely omitted here, see Note [Don't serialize hooks]`
			`metadata = (`
			`tensor.storage_offset(),`
			`tensor.size(),`
			`tensor.stride(),`
			`tensor.requires_grad,`
			`)`
			`return (rebuild_tensor, (type(tensor), storage, metadata))`


			`def rebuild_nested_tensor(`
			`rebuild_buffer_func,`
			`rebuild_buffer_args,`
			`rebuild_sizes_func,`
			`rebuild_sizes_args,`
			`rebuild_strides_func,`
			`rebuild_strides_args,`
			`rebuild_offsets_func,`
			`rebuild_offsets_args,`
			`):`
			`buffer = rebuild_buffer_func(*rebuild_buffer_args)`
			`sizes = rebuild_sizes_func(*rebuild_sizes_args)`
			`strides = rebuild_strides_func(*rebuild_strides_args)`
			`offsets = rebuild_offsets_func(*rebuild_offsets_args)`
			`return torch._nested_view_from_buffer_copy(buffer, sizes, strides, offsets)`


			def reduce_nested_tensor(nt):
			    """Reduce a nested tensor to ``(rebuild_nested_tensor, args)``.

			    The values buffer, nested sizes, nested strides and nested storage
			    offsets of ``nt`` are each reduced with ``reduce_tensor``. The resulting
			    ``(rebuild_func, rebuild_args)`` pairs are flattened, in that order,
			    into the argument tuple consumed by ``rebuild_nested_tensor``.
			    """
			    component_getters = (
			        nt.values,
			        nt._nested_tensor_size,
			        nt._nested_tensor_strides,
			        nt._nested_tensor_storage_offsets,
			    )
			    flat_args = []
			    for get_component in component_getters:
			        # Fetch and reduce one component at a time, mirroring the order in
			        # which rebuild_nested_tensor will recreate them.
			        rebuild_func, rebuild_args = reduce_tensor(get_component())
			        flat_args.extend((rebuild_func, rebuild_args))
			    return (rebuild_nested_tensor, tuple(flat_args))


			`def rebuild_sparse_coo_tensor(`
			`rebuild_indices_func,`
			`rebuild_indices_args,`
			`rebuild_values_func,`
			`rebuild_values_args,`
			`shape,`
			`is_coalesced,`
			`):`
			`indices = rebuild_indices_func(*rebuild_indices_args)`
			`values = rebuild_values_func(*rebuild_values_args)`
			`return torch.sparse_coo_tensor(indices, values, shape, is_coalesced=is_coalesced)`


			def rebuild_sparse_compressed_tensor(
			    rebuild_compressed_indices_func,
			    rebuild_compressed_indices_args,
			    rebuild_plain_indices_func,
			    rebuild_plain_indices_args,
			    rebuild_values_func,
			    rebuild_values_args,
			    shape,
			    layout,
			):
			    """Recreate a compressed sparse tensor from its reduced components.

			    The compressed indices, plain indices and values are rebuilt, in that
			    order, by calling each ``rebuild_*_func`` with its unpacked
			    ``rebuild_*_args``. They are then passed together with ``shape`` and
			    ``layout`` to ``torch.sparse_compressed_tensor``.
			    """
			    component_recipes = (
			        (rebuild_compressed_indices_func, rebuild_compressed_indices_args),
			        (rebuild_plain_indices_func, rebuild_plain_indices_args),
			        (rebuild_values_func, rebuild_values_args),
			    )
			    rebuilt = [rebuild(*args) for rebuild, args in component_recipes]
			    return torch.sparse_compressed_tensor(*rebuilt, shape, layout=layout)


			def reduce_sparse_tensor(sparse):
			    """Reduce a sparse tensor to a ``(rebuild_func, args)`` pair.

			    COO tensors are split into ``_indices()`` and ``_values()`` and rebuilt
			    by ``rebuild_sparse_coo_tensor``. CSR/BSR tensors use
			    ``crow_indices()``/``col_indices()`` and CSC/BSC tensors use
			    ``ccol_indices()``/``row_indices()``; both are rebuilt by
			    ``rebuild_sparse_compressed_tensor``. Every component tensor is itself
			    reduced with ``reduce_tensor``.

			    Raises:
			        NotImplementedError: if ``sparse.layout`` is none of the layouts above.
			    """
			    layout = sparse.layout
			    # COO is handled first and returns early so the compressed layouts
			    # below do not need an extra nesting level.
			    if layout is torch.sparse_coo:
			        indices_func, indices_args = reduce_tensor(sparse._indices())
			        values_func, values_args = reduce_tensor(sparse._values())
			        coo_args = (
			            indices_func,
			            indices_args,
			            values_func,
			            values_args,
			            sparse.shape,
			            sparse.is_coalesced(),
			        )
			        return (rebuild_sparse_coo_tensor, coo_args)
			    # Compressed layouts: choose the accessor pair matching the
			    # compressed dimension (rows for CSR/BSR, columns for CSC/BSC).
			    if layout in {torch.sparse_csr, torch.sparse_bsr}:
			        compressed_indices = sparse.crow_indices()
			        plain_indices = sparse.col_indices()
			    elif layout in {torch.sparse_csc, torch.sparse_bsc}:
			        compressed_indices = sparse.ccol_indices()
			        plain_indices = sparse.row_indices()
			    else:
			        raise NotImplementedError(layout)
			    compressed_func, compressed_args = reduce_tensor(compressed_indices)
			    plain_func, plain_args = reduce_tensor(plain_indices)
			    values_func, values_args = reduce_tensor(sparse.values())
			    compressed_rebuild_args = (
			        compressed_func,
			        compressed_args,
			        plain_func,
			        plain_args,
			        values_func,
			        values_args,
			        sparse.shape,
			        layout,
			    )
			    return (rebuild_sparse_compressed_tensor, compressed_rebuild_args)


			def fd_id(fd):
			    """Return an ``(st_ino, st_dev)`` tuple identifying a file descriptor.

			    The pair is taken from ``os.fstat(fd)``.
			    """
			    # In Mac OS this doesn't work with shared memory handles, which is why
			    # the "file_descriptor" sharing method is not supported on that platform.
			    fd_stat = os.fstat(fd)
			    inode, device = fd_stat.st_ino, fd_stat.st_dev
			    return (inode, device)


			def storage_from_cache(cls, key):
			    """Look up ``key`` in ``shared_cache`` and revive the cached storage.

			    Args:
			        cls: accepted for signature compatibility; not used by the lookup.
			        key: cache key under which a ``StorageWeakRef`` may be stored.

			    Returns:
			        ``None`` when ``shared_cache`` has no entry for ``key``; otherwise
			        the result of ``torch.UntypedStorage._new_with_weak_ptr`` applied
			        to the cached reference's ``cdata``.
			    """
			    cached_ref = shared_cache.get(key)
			    if cached_ref is not None:
			        return torch.UntypedStorage._new_with_weak_ptr(cached_ref.cdata)
			    return None


			def rebuild_storage_fd(cls, df, size):
			    """Rebuild a CPU storage shared through a file descriptor.

			    Args:
			        cls: storage class providing ``_new_shared_fd_cpu``.
			        df: object whose ``detach()`` yields the received file descriptor.
			        size: size forwarded to ``cls._new_shared_fd_cpu``.

			    Returns:
			        The storage already cached for this descriptor's identity, or a
			        newly created one (which is then recorded in ``shared_cache``).

			    The detached descriptor is always closed before returning, including
			    when an exception is raised.
			    """
			    fd = df.detach()
			    try:
			        # Compute the (inode, device) key once; it is needed both for the
			        # lookup and for the insert, and each computation costs an fstat.
			        cache_key = fd_id(fd)
			        storage = storage_from_cache(cls, cache_key)
			        if storage is None:
			            storage = cls._new_shared_fd_cpu(fd, size)
			            shared_cache[cache_key] = StorageWeakRef(storage)
			        return storage
			    finally:
			        os.close(fd)


			def rebuild_storage_filename(cls, manager, handle, size, dtype=None):
			    """Rebuild a CPU storage shared through a named file.

			    Args:
			        cls: forwarded to ``storage_from_cache``.
			        manager: forwarded to ``_new_shared_filename_cpu``.
			        handle: identifies the shared file; also the ``shared_cache`` key.
			        size: byte count when ``dtype`` is ``None``, otherwise an element
			            count that is multiplied by the dtype's element size.
			        dtype: when given, the result is wrapped in a ``torch.TypedStorage``.

			    Returns:
			        Whatever ``_shared_decref()`` returns for the cached or newly
			        created storage.
			    """
			    cached: Union[torch.TypedStorage, torch.UntypedStorage] = storage_from_cache(
			        cls, handle
			    )
			    if cached is not None:
			        return cached._shared_decref()
			    if dtype is not None:
			        # Typed request: ``size`` counts elements, so convert to bytes
			        # before mapping the file, then wrap the raw storage.
			        nbytes = size * torch._utils._element_size(dtype)
			        raw_storage: torch.UntypedStorage = (
			            torch.UntypedStorage._new_shared_filename_cpu(manager, handle, nbytes)
			        )
			        rebuilt = torch.TypedStorage(
			            wrap_storage=raw_storage, dtype=dtype, _internal=True
			        )
			    else:
			        rebuilt = torch.UntypedStorage._new_shared_filename_cpu(
			            manager, handle, size
			        )
			    shared_cache[handle] = StorageWeakRef(rebuilt)
			    # NOTE(review): presumably balances the _shared_incref() done by
			    # reduce_storage on the sending side - confirm.
			    return rebuilt._shared_decref()


			def rebuild_storage_empty(cls):
			    """Rebuild an empty storage by calling ``cls`` with no arguments."""
			    empty_storage = cls()
			    return empty_storage


			def rebuild_typed_storage(storage, dtype):
			    """Wrap ``storage`` in a ``torch.storage.TypedStorage`` of ``dtype``."""
			    typed_storage_cls = torch.storage.TypedStorage
			    return typed_storage_cls(
			        wrap_storage=storage,
			        dtype=dtype,
			        _internal=True,
			    )


			`# Use for torch.storage.TypedStorage`
			def reduce_typed_storage(storage):
			    """Reduce a typed storage to ``(rebuild_typed_storage, args)``.

			    ``args`` holds the wrapped untyped storage and the dtype, which is all
			    ``rebuild_typed_storage`` needs to recreate the wrapper.
			    """
			    rebuild_args = (storage._untyped_storage, storage.dtype)
			    return (rebuild_typed_storage, rebuild_args)


			def rebuild_typed_storage_child(storage, storage_type):
			    """Wrap ``storage`` by calling ``storage_type`` with internal kwargs.

			    ``storage_type`` is called as
			    ``storage_type(wrap_storage=storage, _internal=True)``.
			    """
			    wrapper_kwargs = {"wrap_storage": storage, "_internal": True}
			    return storage_type(**wrapper_kwargs)


			`# Use for child classes of torch.storage.TypedStorage, like torch.FloatStorage`
			def reduce_typed_storage_child(storage):
			    """Reduce a typed-storage subclass instance to ``(rebuild_func, args)``.

			    ``args`` holds the wrapped untyped storage and the concrete class of
			    ``storage``, so ``rebuild_typed_storage_child`` can recreate an instance
			    of that same class.
			    """
			    concrete_cls = type(storage)
			    return (rebuild_typed_storage_child, (storage._untyped_storage, concrete_cls))


			def reduce_storage(storage):
			    """Reduce a CPU storage to a ``(rebuild_func, args)`` pair.

			    Behaviour by case, checked in this order:

			    * CUDA storage: rejected with ``RuntimeError``.
			    * ``"file_system"`` sharing strategy: shared via
			      ``_share_filename_cpu_()`` and rebuilt by ``rebuild_storage_filename``;
			      the dtype is appended to the arguments for ``torch.TypedStorage``.
			    * zero-size storage: rebuilt by ``rebuild_storage_empty`` and not cached.
			    * otherwise: shared via ``_share_fd_cpu_()`` and rebuilt by
			      ``rebuild_storage_fd``.

			    Except for the empty case, a ``StorageWeakRef`` to ``storage`` is
			    recorded in ``shared_cache`` before returning.

			    Raises:
			        RuntimeError: if ``storage.is_cuda`` is true.
			    """
			    from . import get_sharing_strategy

			    if storage.is_cuda:
			        raise RuntimeError(
			            "Cannot pickle CUDA storage; try pickling a CUDA tensor instead"
			        )
			    if get_sharing_strategy() == "file_system":
			        rebuild_args = storage._share_filename_cpu_()
			        # The second element of the share metadata is used as cache key.
			        key = rebuild_args[1]
			        rebuild_func = rebuild_storage_filename
			        if isinstance(storage, torch.TypedStorage):
			            rebuild_args += (storage.dtype,)
			        storage._shared_incref()
			    elif storage.size() == 0:
			        # This is special cased because Empty tensors
			        # (with size 0) cannot be mmapped.
			        return (rebuild_storage_empty, (type(storage),))
			    else:
			        fd, size = storage._share_fd_cpu_()
			        dup_fd = multiprocessing.reduction.DupFd(fd)
			        key = fd_id(fd)
			        rebuild_args = (dup_fd, size)
			        rebuild_func = rebuild_storage_fd  # type: ignore[assignment]
			    shared_cache[key] = StorageWeakRef(storage)
			    return (rebuild_func, (type(storage),) + rebuild_args)


			def init_reductions():
			    """Register this module's reducers with ``ForkingPickler``.

			    Covers ``torch.cuda.Event``, every class in ``torch._storage_classes``,
			    ``torch.storage.TypedStorage``, every class in ``torch._tensor_classes``,
			    ``torch.Tensor`` and ``torch.nn.parameter.Parameter``.
			    """
			    register = ForkingPickler.register
			    register(torch.cuda.Event, reduce_event)
			    for storage_cls in torch._storage_classes:
			        # Only the untyped storage is shared directly; every other storage
			        # class is reduced through its wrapped untyped storage.
			        if storage_cls.__name__ == "UntypedStorage":
			            reducer = reduce_storage
			        else:
			            reducer = reduce_typed_storage_child
			        register(storage_cls, reducer)
			    # Keep this after the loop: if TypedStorage is itself listed in
			    # torch._storage_classes, this registration replaces the child reducer.
			    register(torch.storage.TypedStorage, reduce_typed_storage)
			    for tensor_cls in torch._tensor_classes:
			        register(tensor_cls, reduce_tensor)
			    # TODO: Maybe this should be in tensor_classes? :)
			    register(torch.Tensor, reduce_tensor)
			    register(torch.nn.parameter.Parameter, reduce_tensor)