Caffe2 - Python API
A deep learning, cross-platform ML framework
reductions.py
import torch
import torch.utils.hooks
import os
import weakref
import threading
import multiprocessing
from multiprocessing.reduction import ForkingPickler
import sys
try:
    # Early load resource_sharer to prevent a partially initialized instance
    # from being inherited in a forked child process. The reduce_storage method
    # requires this module indirectly through DupFd(). The built-in mp.Queue
    # class pickles arguments in a background thread which may overlap with the
    # fork.
    import multiprocessing.resource_sharer
except ImportError:
    pass


class StorageWeakRef(object):
    r"""A weak reference to a Storage.

    The cdata member is a Python number containing the integer representation of
    the Storage pointer."""

    def __init__(self, storage):
        self.cdata = storage._weak_ref()
        # Save a direct reference to _free_weak_ref because the `torch` module
        # might be cleared during Python shutdown before this module is cleared.
        self._free_weak_ref = torch.Storage._free_weak_ref

    def expired(self):
        return torch.Storage._expired(self.cdata)

    def __del__(self):
        self._free_weak_ref(self.cdata)
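

# Illustrative sketch (hypothetical helper, not used by the module): a
# StorageWeakRef observes a storage without keeping it alive, which is what
# lets SharedCache below hold many entries safely.
def _storage_weakref_example():
    storage = torch.FloatStorage(16)
    ref = StorageWeakRef(storage)
    alive = not ref.expired()   # True while a strong reference exists
    del storage                 # drop the last strong reference
    # expired() should now report True, so a cache holding `ref` can evict it.
    return alive, ref.expired()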


class SharedCache(dict):
    """dictionary from multiprocessing handles to StorageWeakRef"""

    def __init__(self):
        # free_dead_references() is called if the len exceeds the current
        # limit. The limit scales with the number of remaining live objects.
        self.limit = 128
        self.lock = threading.Lock()

    def __setitem__(self, key, storage_ref):
        dict.__setitem__(self, key, storage_ref)
        if len(self) > self.limit:
            self.free_dead_references()

    def free_dead_references(self):
        # Multiple Python threads may call free_dead_references() concurrently.
        # Without a lock, they may try deleting the same entry multiple times.
        with self.lock:
            live = 0
            for key, storage_ref in list(self.items()):
                if storage_ref.expired():
                    del self[key]
                else:
                    live += 1
            self.limit = max(128, live * 2)


# mapping from handles to StorageWeakRef objects
shared_cache = SharedCache()
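

# Illustrative sketch (hypothetical helper; the string key stands in for a real
# fd/handle key): SharedCache only ever stores weak references, so it never
# extends a storage's lifetime, and free_dead_references() trims entries whose
# storage has already been freed.
def _shared_cache_example():
    storage = torch.FloatStorage(16)
    shared_cache["example-handle"] = StorageWeakRef(storage)
    del storage                          # the cache does not keep it alive
    shared_cache.free_dead_references()  # dead entries are evicted here
    return "example-handle" in shared_cache   # expected to be False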


def rebuild_event(device, handle):
    return torch.cuda.Event.from_ipc_handle(device, handle)


def reduce_event(event):
    handle = event.ipc_handle()
    return (rebuild_event, (event.device, handle))


def rebuild_tensor(cls, storage, metadata):
    storage_offset, size, stride, requires_grad = metadata
    t = torch._utils._rebuild_tensor(storage, storage_offset, size, stride)
    if cls == torch.nn.parameter.Parameter:
        t = torch.nn.parameter.Parameter(t)
    t.requires_grad = requires_grad
    return t


def rebuild_cuda_tensor(tensor_cls, tensor_size, tensor_stride, tensor_offset,
                        storage_cls, storage_device, storage_handle, storage_size_bytes,
                        storage_offset_bytes, requires_grad):
    # If storage_handle is None, storage points to nullptr.
    if storage_handle is None or storage_size_bytes == 0:
        storage = storage_cls(0)
    else:
        storage = storage_from_cache(storage_cls, (storage_handle, storage_offset_bytes))
        if storage is None:
            torch.cuda._lazy_init()
            storage = storage_cls._new_shared_cuda(
                storage_device,
                storage_handle,
                storage_size_bytes,
                storage_offset_bytes)
            shared_cache[(storage_handle, storage_offset_bytes)] = StorageWeakRef(storage)

    t = torch._utils._rebuild_tensor(storage, tensor_offset, tensor_size, tensor_stride)
    if tensor_cls == torch.nn.parameter.Parameter:
        t = torch.nn.parameter.Parameter(t)
    t.requires_grad = requires_grad
    return t


def reduce_tensor(tensor):
    storage = tensor.storage()

    if tensor.requires_grad and not tensor.is_leaf:
        raise RuntimeError("Cowardly refusing to serialize non-leaf tensor which requires_grad, "
                           "since autograd does not support crossing process boundaries. "
                           "If you just want to transfer the data, call detach() on the tensor "
                           "before serializing (e.g., putting it on the queue).")

    torch.utils.hooks.warn_if_has_hooks(tensor)

    # Note [CUDA IPC and the caching allocator]
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # When you send a CUDA tensor over IPC, you might expect that you will
    # get out the same storage from the other end. However, the CUDA caching
    # allocator makes it difficult to preserve this invariant. Consider
    # the following situation: a tensor of size 0x40 points to offset 0x20 of
    # a storage at 0xA100 of size 0x100. (For simplicity, all of these
    # sizes are given in bytes). HOWEVER, with the caching allocator, this storage
    # might be part of a larger cudaMalloc allocation 0xA000 of size 0x4000.
    #
    # When we want to send this CUDA tensor over IPC, we must send the
    # *entire* cudaMalloc allocation, i.e., the 0xA000 region, not just
    # the storage 0xA100 (because that is what CUDA supports). So, on the
    # other end, there simply isn't any way to say, "Wait, you gave me
    # a bigger region (0xA000) than the one I wanted (0xA100)".
    #
    # OK, so if you sent the cudaMalloc allocation, can you just wrap that up as
    # one storage itself? No, because this cudaMalloc allocation might contain
    # storages of mixed types: float, bytes, double... If you make the entire
    # allocation a single storage of type A, we'll hit an error when constructing
    # a tensor of type B on top of it.
    #
    # A cudaIpcMemHandle is an identifier that lets the receiver access the
    # sender's cudaMalloc allocation. However, cudaIpcMemHandles from each device
    # in a given process may only be opened by one context per device per other
    # process. If we open and close a memory handle multiple times in a process,
    # CUDA is allowed to give it a different address; similarly, once we close
    # the memory, we're not allowed to access it (or the storage/tensor built on
    # top of it), even if it is still live in the original process. Since we
    # cannot map a cudaMalloc allocation onto a single storage in one go, this
    # requires us to cache the device pointer for each cudaIpcMemHandle on the
    # C++ side so that storages of different types can be reconstructed while
    # keeping the old ones alive.
    # See https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html
    #
    # This is fine, because all we need to do is to save our position in the
    # allocation, and reconstruct the storage and tensor from it.
    #
    # 0xA000 -> -------CUDA Allocation------
    #           |                          |
    #           |                          |
    #           |                          |
    #           |                          |
    # 0xA100 -> --------storage1 begin------
    #           |                          |
    # 0xA120 -> --------tensor1 begin ------
    #           |                          |
    #           |                          |
    #           |                          |
    #           |                          |
    #           |                          |
    # 0xA160 -> --------tensor1 end---------
    #           |                          |
    #           |                          |
    #           |                          |
    # 0xA200 -> --------storage1 end--------
    #           |                          |
    # 0xE000 -> --------CUDA allocation-----
    #
    # To send tensor1, the sender must pass the following info to the receiver
    # for storage reconstruction:
    #   1. The cudaIpcMemHandle of 0xA000 (which can be mapped to a basePtr in
    #      the receiver process; basePtr may not be exactly 0xA000, since it's
    #      a different process).
    #   2. The offset of storage1 within the CUDA allocation (0x100 = 0xA100 - 0xA000).
    #   3. The size of storage1 (0x100).
    #
    # On the receiver side:
    #   1. Get the devPtr of the MemHandle to access the memory, and reconstruct
    #      a storage of the same type using (basePtr, offset, size).
    #   2. Reconstruct the tensor on top of the reconstructed storage:
    #      Tensor(size=0x040, offset=0x020, storage=Storage(data=basePtr+0x100, size=0x100))
    #
    # This strategy has a few implications:
    #
    # 1. When we serialize a CUDA tensor for IPC, we cannot do it all in one
    #    go (non-compositionally), and this requires us to have a global map
    #    memHandle -> devPtr for each process.
    #
    # 2. We MUST NOT let the new IPC tensor be resizable. Originally, a resize
    #    of the storage beyond 0x100 would merely have caused us to do a
    #    reallocation. You don't really want to do this, but if you did,
    #    all that would happen is that you would lose IPC sharing. But if
    #    you do this in the new world, we will happily let you write out of
    #    bounds of your "allocation", clobbering unrelated data in the cached
    #    allocator block. BAD!
    #
    # By the way, in old versions of PyTorch, we supported this situation
    # natively using a "storage view", which permitted multiple storages to be
    # views on each other. But this was the *only* use of storage views, so we
    # eliminated it so that we could just use tensor views to implement the same
    # thing.
    #
    if storage.is_cuda:
        (device, handle, storage_size_bytes, storage_offset_bytes) = storage._share_cuda_()
        tensor_offset = tensor.storage_offset()

        shared_cache[handle] = StorageWeakRef(storage)

        # _backward_hooks purposely omitted here, see
        # Note [Don't serialize hooks]
        return (rebuild_cuda_tensor,
                (type(tensor),
                 tensor.size(),
                 tensor.stride(),
                 tensor_offset,  # tensor offset in its storage
                 type(storage),
                 device,
                 handle,  # identifier which CUDA allocation is the storage in.
                 storage_size_bytes,  # size (in bytes) of the storage
                 storage_offset_bytes,  # offset (in bytes) of the storage in the CUDA allocation
                 tensor.requires_grad))

    # _backward_hooks purposely omitted here, see Note [Don't serialize hooks]
    metadata = (tensor.storage_offset(), tensor.size(), tensor.stride(), tensor.requires_grad)
    return (rebuild_tensor, (type(tensor), storage, metadata))
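

# Illustrative sketch (hypothetical helper): the reducer returns a
# (callable, args) pair, which is exactly the __reduce__-style contract
# ForkingPickler expects, and applying the callable reproduces the tensor.
# Across processes the storage argument is itself pickled via reduce_storage,
# so no tensor data travels through the pickle stream.
def _reduce_tensor_roundtrip_example():
    t = torch.arange(4.)
    rebuild, args = reduce_tensor(t)   # (rebuild_tensor, (cls, storage, metadata))
    t2 = rebuild(*args)                # same storage, offset, size and stride
    return torch.equal(t, t2)          # True; t2 is a view of the same storage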


def fd_id(fd):
    # Returns a tuple which uniquely identifies a file descriptor. On Mac OS,
    # this doesn't work with shared memory handles, which is why we don't
    # support the "file_descriptor" sharing method on that platform.
    stat = os.fstat(fd)
    return (stat.st_ino, stat.st_dev)


def storage_from_cache(cls, key):
    storage_ref = shared_cache.get(key)
    if storage_ref is None:
        return None
    return cls._new_with_weak_ptr(storage_ref.cdata)
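

# Illustrative sketch (hypothetical helper; plain pipe fds used purely as an
# example): two descriptors for the same underlying file report the same
# (st_ino, st_dev) pair, which is how duplicated descriptors arriving over IPC
# collapse onto a single shared_cache entry.
def _fd_id_example():
    r, w = os.pipe()
    r_dup = os.dup(r)
    try:
        return fd_id(r) == fd_id(r_dup)   # True: same underlying open file
    finally:
        os.close(r)
        os.close(w)
        os.close(r_dup)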


def rebuild_storage_fd(cls, df, size):
    if sys.version_info[0] == 2:
        fd = multiprocessing.reduction.rebuild_handle(df)
    else:
        fd = df.detach()
    try:
        storage = storage_from_cache(cls, fd_id(fd))
        if storage is not None:
            return storage
        storage = cls._new_shared_fd(fd, size)
        shared_cache[fd_id(fd)] = StorageWeakRef(storage)
        return storage
    finally:
        os.close(fd)


def rebuild_storage_filename(cls, manager, handle, size):
    storage = storage_from_cache(cls, handle)
    if storage is not None:
        return storage._shared_decref()
    storage = cls._new_shared_filename(manager, handle, size)
    shared_cache[handle] = StorageWeakRef(storage)
    return storage._shared_decref()


def rebuild_storage_empty(cls):
    return cls()


def reduce_storage(storage):
    from . import get_sharing_strategy
    if storage.is_cuda:
        raise RuntimeError("Cannot pickle CUDA storage; try pickling a CUDA tensor instead")
    elif get_sharing_strategy() == 'file_system':
        metadata = storage._share_filename_()
        cache_key = metadata[1]
        rebuild = rebuild_storage_filename
        storage._shared_incref()
    elif storage.size() == 0:
        # This is special-cased because empty tensors
        # (with size 0) cannot be mmapped.
        return (rebuild_storage_empty, (type(storage),))
    else:
        fd, size = storage._share_fd_()
        if sys.version_info[0] == 2:
            df = multiprocessing.reduction.reduce_handle(fd)
        else:
            df = multiprocessing.reduction.DupFd(fd)
        cache_key = fd_id(fd)
        metadata = (df, size)
        rebuild = rebuild_storage_fd

    shared_cache[cache_key] = StorageWeakRef(storage)
    return (rebuild, (type(storage),) + metadata)
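

# Illustrative sketch (hypothetical helper): what the storage reducer hands to
# ForkingPickler for a non-empty CPU storage under the 'file_descriptor'
# strategy (the default on Linux). _share_fd_() moves the storage into shared
# memory, and the rebuild args carry a duplicated fd plus a size instead of the
# actual data.
def _reduce_storage_example():
    storage = torch.FloatStorage(16)
    rebuild, args = reduce_storage(storage)
    # rebuild is rebuild_storage_fd and args is (torch.FloatStorage, df, size);
    # the unpickling process calls rebuild(*args), detaches the fd, and maps
    # the same shared-memory block (deduplicated through fd_id / shared_cache).
    return rebuild, args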


def init_reductions():
    ForkingPickler.register(torch.cuda.Event, reduce_event)

    for t in torch._storage_classes:
        ForkingPickler.register(t, reduce_storage)

    for t in torch._tensor_classes:
        ForkingPickler.register(t, reduce_tensor)

    # TODO: Maybe this should be in tensor_classes? :)
    ForkingPickler.register(torch.Tensor, reduce_tensor)
    ForkingPickler.register(torch.nn.parameter.Parameter, reduce_tensor)
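

# Illustrative sketch (hypothetical helper; assumes the default 'fork' start
# method on Linux): once init_reductions() has run -- torch.multiprocessing
# calls it at import time -- any ForkingPickler-based channel such as a
# multiprocessing Queue pickles tensors through the reducers above, so the
# child maps the same shared memory instead of receiving a copy.
def _send_tensor_to_child_example():
    import torch.multiprocessing as mp

    def consumer(queue):
        t = queue.get()   # rebuilt via rebuild_tensor / rebuild_storage_fd
        t.add_(1)         # in-place writes are visible to the parent

    queue = mp.Queue()
    process = mp.Process(target=consumer, args=(queue,))
    process.start()
    shared = torch.ones(3)
    queue.put(shared)     # reduced via reduce_tensor / reduce_storage
    process.join()
    return shared         # now tensor([2., 2., 2.]) because memory was shared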