Caffe2 - C++ API
A deep learning, cross-platform ML framework
Handle.cpp
#include <ATen/cudnn/Handle.h>

#include <ATen/cuda/Exceptions.h>

#include <unordered_map>
#include <vector>
#include <utility>
#include <mutex>

namespace at { namespace native {

namespace {

struct Handle {
  cudnnHandle_t handle;
  Handle(bool create = false) : handle(nullptr)
  {
    if(create)
      AT_CUDNN_CHECK(cudnnCreate(&handle));
  }
  // std::vector.emplace() and push_back() may route through temporaries and call
  // copy/move constructors along the way.  If this is the case, we don't want
  // the destructors of temporaries to call cudnnDestroy on the handle.
  // We can achieve safety (for the narrow case of stashing within std::vectors)
  // by making Handle moveable but not copyable, and transferring handle ownership
  // to the latest constructed object.  This is not a substitute for full-blown
  // reference counting, but reference counting may be overkill here.
  // Another alternative is to wrap the saved Handles in unique_ptrs, i.e.,
  // unordered_map<int, vector<unique_ptr<Handle>>> created_handles;
  Handle(const Handle& rhs) = delete;
  // Following https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom
  Handle(Handle&& rhs) : Handle() { std::swap(handle, rhs.handle); }
  // operator= takes its argument by value
  Handle& operator=(Handle rhs) { std::swap(handle, rhs.handle); return *this; }
  ~Handle() {
    if(handle)
    {
      // This is because of something dumb in the ordering of destruction:
      // sometimes, at process exit, the CUDA context (or something) has already
      // been destroyed by the time this destructor runs.  It happens in the
      // fbcode setting.  @colesbury and I decided not to destroy the handle
      // as a workaround.
      //   - @soumith
#ifdef NO_CUDNN_DESTROY_HANDLE
#else
      cudnnDestroy(handle);
#endif
    }
  }
};
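
// Note: the net effect of the declarations above is that Handle is
// move-constructible but not copy-constructible.  With <type_traits> available,
// that intent could be checked as:
//   static_assert( std::is_move_constructible<Handle>::value, "Handle should be moveable");
//   static_assert(!std::is_copy_constructible<Handle>::value, "Handle should not be copyable");
// A std::vector<Handle> can therefore only relocate elements through the move
// constructor, and only the element that currently owns the cudnnHandle_t
// calls cudnnDestroy in its destructor.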

std::mutex mutex;

// Handles are lazily created as different threads request them,
// but are never destroyed until the end of the process.
// The maximum number of handles this process will create for each device is equal
// to the high-water mark of the number of concurrently active threads that request
// handles for that device.
// When threads terminate, they release their handles back into the pool for reuse.
// Otherwise, new handles would be created every time new threads were spawned,
// resulting in poor performance for Python modules that repeatedly or frequently
// spawned new sets of threads (like DataParallel, which creates a new set of threads
// for each forward pass).
//
// To prevent potential deadlocks, we explicitly choose not to cap the number
// of handles that are created per device.
// Example of danger: if we cap the max handles at 4, and 5 threads are sharing a device,
// only 4 can make forward progress at any time.  Those 4 will not release their
// handles until they exit, so the fifth cannot make progress until then.  This is
// not a problem...UNLESS all 5 threads attempt some sort of synchronization at an
// intermediate point (i.e., before any of them have exited).  We have no way to anticipate
// or enforce that user threads will not attempt such intermediate synchronization.
// The only way to ensure safety is to avoid imposing a cap on the number of handles.
std::unordered_map<int, std::vector<Handle>> created_handles;
std::unordered_map<int, std::vector<cudnnHandle_t>> available_handles;
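
// For example: if two worker threads each call getCudnnHandle() on device 0,
// the pool starts empty, so created_handles[0] grows to two Handles and each
// thread caches one raw cudnnHandle_t.  When both threads exit, their handles
// are pushed into available_handles[0]; a third thread that later requests
// device 0 reuses a pooled handle instead of paying for another cudnnCreate call.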

// PoolWindow lazily creates and caches the handles that a particular thread is using,
// so in the common case handle access doesn't incur either handle creation or a mutex lock.
class PoolWindow
{
  public:
  PoolWindow() {}
  ~PoolWindow() { release(); }

  cudnnHandle_t reserve(int device)
  {
    // If this thread already has a handle for this device, return it
    if(my_handles.find(device) != my_handles.end())
      return my_handles[device];

    // otherwise, either grab a handle from the pool if one is available,
    // or if not, create a new one.
    std::lock_guard<std::mutex> guard(mutex);

    if(available_handles[device].size() > 0)
    {
      my_handles[device] = available_handles[device].back();
      available_handles[device].pop_back();
    }
    else
    {
      // In local testing, I do observe that emplace_back sometimes routes through temporaries
      // that incur move-constructor and destructor calls.  See comments in Handle above.
      created_handles[device].emplace_back(true /*create*/);
      my_handles[device] = created_handles[device].back().handle;
    }

    return my_handles[device];
  }

  private:
  // Stores the per-device handles currently owned by this thread
  std::unordered_map<int, cudnnHandle_t> my_handles;

  // Called by the destructor.  Releases this thread's handles back into the pool.
  void release()
  {
    // The conditional check below is certainly worthwhile for efficiency.
    // However, it also serves another purpose:  Without the conditional,
    // as of cuda V9.0.176 and torch.backends.cudnn.version() = 7005,
    // we observe weird nondeterministic hangs on Windows when the process first
    // attempts to create a cudnn handle.  Example with added debug print statements:
    // https://ci.pytorch.org/jenkins/job/pytorch-builds/job/pytorch-win-ws2016-cuda9-cudnn7-py3-test2/19238/console
    // The print statements reveal that when these hangs occur, the thread that is attempting
    // to create a cudnn handle for the first time hangs on the call to cudnnCreate itself.
    // These hangs have never manifested on anything but that particular Windows build.
    // All other builds seem fine.
    if(my_handles.size() > 0)
    {
      std::lock_guard<std::mutex> guard(mutex);
      for(auto d_h : my_handles)
        available_handles[d_h.first].push_back(d_h.second);
    }
  }
};

// This will be destroyed when the thread terminates,
// releasing its reserved handles back to the pool.
thread_local PoolWindow myPoolWindow;
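
// For a given thread, only the first reserve() for each device takes the mutex
// (to create or adopt a handle); later calls hit my_handles and return the
// cached cudnnHandle_t without locking:
//   cudnnHandle_t h1 = myPoolWindow.reserve(0);  // slow path: lock, create or reuse
//   cudnnHandle_t h2 = myPoolWindow.reserve(0);  // fast path: cache hit, h2 == h1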
} // namespace


cudnnHandle_t getCudnnHandle()
{
  int device;
  AT_CUDA_CHECK(cudaGetDevice(&device));

  return myPoolWindow.reserve(device);
}

}} // namespace at::native
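
A minimal usage sketch, not taken from this file: callers in ATen's cuDNN bindings obtain the per-thread handle through getCudnnHandle() and typically bind it to the stream they are working on before issuing cuDNN calls. The function name example_cudnn_call and its stream parameter below are illustrative placeholders.

#include <ATen/cudnn/Handle.h>
#include <ATen/cuda/Exceptions.h>

#include <cudnn.h>
#include <cuda_runtime.h>

void example_cudnn_call(cudaStream_t stream) {
  // Lazily creates (or reuses) the handle cached for this thread and the
  // current device, as implemented above.
  cudnnHandle_t handle = at::native::getCudnnHandle();
  // Associate the handle with the caller's stream so that subsequent cuDNN
  // calls made through it are ordered on that stream.
  AT_CUDNN_CHECK(cudnnSetStream(handle, stream));
  // ... issue cuDNN calls with `handle` ...
}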