Caffe2 - C++ API
A deep learning, cross platform ML framework
context_gpu.h
1 
17 #ifndef CAFFE2_CORE_CONTEXT_GPU_H_
18 #define CAFFE2_CORE_CONTEXT_GPU_H_
19 
20 #include <ctime>
21 #include <mutex>
22 
23 #include "caffe2/core/common_cudnn.h"
24 #include "caffe2/core/common_gpu.h"
25 #include "caffe2/core/context.h"
26 #include "caffe2/core/logging.h"
27 #include "caffe2/core/tensor.h"
28 #include "caffe2/core/types.h"
29 #include "caffe2/proto/caffe2.pb.h"
30 
31 namespace caffe2 {
32 
// Selects which memory-pool strategy the CUDA allocator uses.
enum class CudaMemoryPoolType {
  NONE = 0, // No pooling.
  CUB = 1,  // Pool backed by the CUB caching allocator.
};
37 
// Gets the current memory pool type used by Caffe2.
CudaMemoryPoolType GetCudaMemoryPoolType();
44 
55  friend class CUDAContext;
56 
57  private:
59  for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
60  cuda_streams_[i] = vector<cudaStream_t>();
61  cublas_handles_[i] = vector<cublasHandle_t>();
62  cudnn_handles_[i] = vector<cudnnHandle_t>();
63  }
64  }
65 
66  cudaStream_t GetStream(int gpu, int stream_id) {
67  vector<cudaStream_t>& gpu_streams = cuda_streams_[gpu];
68  if (gpu_streams.size() <= stream_id) {
69  gpu_streams.resize(stream_id + 1, nullptr);
70  }
71  if (!gpu_streams[stream_id]) {
72  DeviceGuard guard(gpu);
73  CUDA_ENFORCE(cudaStreamCreateWithFlags(
74  &gpu_streams[stream_id], cudaStreamNonBlocking));
75  }
76  return gpu_streams[stream_id];
77  }
78 
79  cublasHandle_t GetHandle(int gpu, int stream_id) {
80  DeviceGuard guard(gpu);
81  vector<cublasHandle_t>& gpu_handles = cublas_handles_[gpu];
82  if (gpu_handles.size() <= stream_id) {
83  gpu_handles.resize(stream_id + 1, nullptr);
84  }
85  if (!gpu_handles[stream_id]) {
86  CUBLAS_ENFORCE(cublasCreate(&gpu_handles[stream_id]));
87  // The default is CUBLAS_POINTER_MODE_HOST. You can override
88  // it after obtaining the cublas handle, but do that with
89  // caution.
90  CUBLAS_ENFORCE(cublasSetPointerMode(
91  gpu_handles[stream_id], CUBLAS_POINTER_MODE_HOST));
92  CUBLAS_ENFORCE(
93  cublasSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
94  }
95  return gpu_handles[stream_id];
96  }
97 
98  cudnnHandle_t GetCudnnHandle(int gpu, int stream_id) {
99  DeviceGuard guard(gpu);
100  vector<cudnnHandle_t>& gpu_handles = cudnn_handles_[gpu];
101  if (gpu_handles.size() <= stream_id) {
102  gpu_handles.resize(stream_id + 1, nullptr);
103  }
104  if (!gpu_handles[stream_id]) {
105  CUDNN_ENFORCE(cudnnCreate(&gpu_handles[stream_id]));
106  CUDNN_ENFORCE(
107  cudnnSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
108  }
109  return gpu_handles[stream_id];
110  }
111 
  // Destroys every lazily-created cuBLAS handle, CUDA stream, and cuDNN
  // handle owned by this thread. The destructor is noexcept, so the
  // *_CHECK macros (rather than the throwing *_ENFORCE ones) are used.
  ~ThreadLocalCUDAObjects() noexcept {
    for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
      for (auto& handle : cublas_handles_[i]) {
        if (handle) {
          CUBLAS_CHECK(cublasDestroy(handle));
        }
      }
      for (auto& stream : cuda_streams_[i]) {
        if (stream) {
          CUDA_CHECK(cudaStreamDestroy(stream));
        }
      }
      for (auto& handle : cudnn_handles_[i]) {
        if (handle) {
          CUDNN_CHECK(cudnnDestroy(handle));
        }
      }
    }
  }
  // Per-GPU tables of lazily-created objects, indexed by stream id within
  // each vector. The outer array is sized by the compile-time GPU cap.
  vector<cudaStream_t> cuda_streams_[CAFFE2_COMPILE_TIME_MAX_GPUS];
  vector<cublasHandle_t> cublas_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
  vector<cudnnHandle_t> cudnn_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
134 };
135 
136 class CUDAContext final {
137  public:
138  // The default cuda context constructor.
139  explicit CUDAContext(const int gpu_id = -1);
140  explicit CUDAContext(const DeviceOption& option);
141 
142  ~CUDAContext() {
143  if (curand_generator_) {
144  CURAND_CHECK(curandDestroyGenerator(curand_generator_));
145  }
146  FinishDeviceComputation();
147  }
148 
149  inline void SwitchToDevice(int stream_id) {
150  set_stream_id(stream_id);
151  CaffeCudaSetDevice(gpu_id_);
152  }
153  inline void SwitchToDevice() {
154  SwitchToDevice(0);
155  }
156 
157  inline void WaitEvent(const Event& ev) {
158  ev.Wait(CUDA, this);
159  }
160 
161  inline void Record(Event* ev, const char* err_msg = nullptr) const {
162  CAFFE_ENFORCE(ev, "Event must not be null.");
163  ev->Record(CUDA, this, err_msg);
164  }
165 
166  void FinishDeviceComputation() {
167  cudaStreamSynchronize(cuda_objects_.GetStream(gpu_id_, stream_id_));
168  cudaError_t error = cudaGetLastError();
169  if (error != cudaSuccess) {
170  CAFFE_THROW("Encountered CUDA error: ", cudaGetErrorString(error));
171  }
172  }
173 
174  inline int cuda_gpu_id() const {
175  return gpu_id_;
176  }
177 
178  inline cudaStream_t cuda_stream() {
179  return cuda_stream(gpu_id_, stream_id_);
180  }
181 
182  inline cudaStream_t cuda_stream() const {
183  return cuda_stream(gpu_id_, stream_id_);
184  }
185 
186  static cudaStream_t cuda_stream(int gpu_id, int stream_id) {
187  return cuda_objects_.GetStream(gpu_id, stream_id);
188  }
189 
190  cublasHandle_t cublas_handle() {
191  return cuda_objects_.GetHandle(gpu_id_, stream_id_);
192  }
193 
194  cudnnHandle_t cudnn_handle() {
195  return cuda_objects_.GetCudnnHandle(gpu_id_, stream_id_);
196  }
197 
198  curandGenerator_t& curand_generator() {
199  if (!curand_generator_) {
200  DeviceGuard guard(gpu_id_);
201  CURAND_ENFORCE(
202  curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
203  CURAND_ENFORCE(
204  curandSetPseudoRandomGeneratorSeed(curand_generator_, random_seed_));
205  CHECK_NOTNULL(curand_generator_);
206  }
207  CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
208  return curand_generator_;
209  }
210 
211  static std::pair<void*, MemoryDeleter> New(size_t nbytes);
212 
213  // Get a mutex to lock out cudaMalloc / cudaFree calls when
214  // NCCL kernels are being launched. Should remove threat of
215  // deadlocks
216  static std::mutex& mutex();
217 
218  // Functions to query memory stats. Only available if flag
219  // --caffe2_gpu_memory_tracking is enabled.
220  static std::vector<long> TotalMemoryByGpu();
221  static std::vector<long> MaxMemoryByGpu();
222 
223  template <class SrcContext, class DstContext>
224  inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
225  CUDA_ENFORCE(cudaMemcpyAsync(
226  dst,
227  src,
228  nbytes,
229  cudaMemcpyDefault,
230  cuda_objects_.GetStream(gpu_id_, stream_id_)));
231  }
232 
233  template <typename T, class SrcContext, class DstContext>
234  inline void Copy(int n, const T* src, T* dst) {
235  CopyBytes<SrcContext, DstContext>(n * sizeof(T),
236  static_cast<const void*>(src),
237  static_cast<void*>(dst));
238  }
239 
240  template <class SrcContext, class DstContext>
241  inline void
242  CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
243  CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental types.");
244  CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
245  }
246 
247  // By default CUDA operators have async device parts
248  static bool HasAsyncPartDefault() {
249  return true;
250  }
251 
252  static bool SupportsAsyncScheduling() {
253  return true;
254  }
255 
256  static bool IsStreamFree(const DeviceOption& option, int stream_id) {
257  auto stream = CUDAContext::cuda_stream(option.cuda_gpu_id(), stream_id);
258  return cudaStreamQuery(stream) == cudaSuccess;
259  }
260 
261  protected:
262  static void Delete(void* data);
263  void set_stream_id(int stream_id) {
264  stream_id_ = stream_id;
265  }
266 
267  int gpu_id_;
268  int stream_id_ = 0;
269  int random_seed_;
270  curandGenerator_t curand_generator_{nullptr};
271  static thread_local ThreadLocalCUDAObjects cuda_objects_;
272 };
273 
274 // For the CPU context, we also allow a (probably expensive) function
275 // to copy the data from a cuda context. Inside the function, we create
276 // a temporary CUDAContext object to carry out the copy. From the caller's
277 // side, these functions are synchronous with respect to the host, similar
278 // to a normal CPUContext::CopyBytes<CPUContext, CPUContext> call.
279 template<>
280 inline void CPUContext::CopyBytes<CUDAContext, CPUContext>(
281  size_t nbytes, const void* src, void* dst) {
282  CUDAContext context(GetGPUIDForPointer(src));
283  context.CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
284 }
285 template<>
286 inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
287  size_t nbytes, const void* src, void* dst) {
288  CUDAContext context(GetGPUIDForPointer(dst));
289  context.CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
290 }
291 
  // The allocator holds no state of its own; default construction/teardown.
  PinnedCPUAllocator() {}
  ~PinnedCPUAllocator() override {}
  // Allocates nbytes of page-locked (pinned) host memory via
  // cudaMallocHost, zero-fills it, and returns the pointer together with
  // the matching deleter. Serialized against concurrent CUDA alloc/free
  // calls through CUDAContext::mutex().
  std::pair<void*, MemoryDeleter> New(size_t nbytes) override {
    void* data;
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
    memset(data, 0, nbytes);
    return {data, Delete};
  }
311 
  // Returns the deleter that matches allocations made by New().
  MemoryDeleter GetDeleter() override {
    return Delete;
  }
315 
316  private:
  // Frees memory returned by New(). Falls back to plain free() when the
  // pointer was not pinned (see comment below); any other CUDA error is
  // fatal.
  static void Delete(void* data) {
    // Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
    // or not. If a CUDAContext::New() call is made, inside the CUDAContext
    // function we will switch the cpu side allocator to a PinnedCPUAllocator.
    // But, if one calls CPUContext::New() before any cuda allocations,
    // PinnedCPUAllocator can still delete the corresponding memory.
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    cudaError_t err = cudaFreeHost(data);
    if (err == cudaErrorInvalidValue) {
      // cudaErrorInvalidValue here means the pointer was not allocated with
      // cudaMallocHost, so it must have come from the regular CPU allocator.
      free(data);
      // Calling cudaGetLastError will reset the cuda error.
      cudaGetLastError();
    } else {
      // For all other errors, still do a cuda check.
      CUDA_ENFORCE(err);
    }
  }
334 };
335 
336 // For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
338 
339 } // namespace caffe2
340 
341 #endif // CAFFE2_CORE_CONTEXT_GPU_H_
An allocator that performs CPU memory allocation using pinned (page-locked) memory.
Definition: context_gpu.h:301
A struct to host thread-local cuda objects.
Definition: context_gpu.h:54
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
Definition: context.h:82
CudaMemoryPoolType GetCudaMemoryPoolType()
Gets the current memory pool type used by Caffe2.
Copyright (c) 2016-present, Facebook, Inc.
TypedCopy copy() const
Returns the typed copy function pointer for individual items.
Definition: typeid.h:171
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:149
void CaffeCudaSetDevice(const int id)
Sets the current GPU id.
Definition: common_gpu.cc:138
TypeMeta is a thin class that allows us to store the type of a container such as a blob...
Definition: typeid.h:104
const size_t & itemsize() const
Returns the size of the item.
Definition: typeid.h:159