Caffe2 - C++ API
A deep learning, cross platform ML framework
common_gpu.h
1 #ifndef CAFFE2_CORE_COMMON_GPU_H_
2 #define CAFFE2_CORE_COMMON_GPU_H_
3 
4 #include <assert.h>
5 #include <cuda.h>
6 #include <cuda_runtime.h>
7 
8 // Disable strict aliasing errors for CUDA 9.
9 // The cuda_fp16.h header in CUDA 9 RC triggers this diagnostic.
10 // It is included by cusparse.h as well, so guarding the
11 // inclusion of that header here is not enough.
12 #if CUDA_VERSION >= 9000
13 #ifdef __GNUC__
14 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
15 #pragma GCC diagnostic push
16 #endif
17 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
18 #endif // __GNUC__
19 #endif // CUDA_VERSION >= 9000
20 
21 #include <cublas_v2.h>
22 #include <curand.h>
23 #include <driver_types.h>
24 
25 #include "caffe2/core/common.h"
26 #include "caffe2/core/logging.h"
27 
28 #include "c10/cuda/CUDAMacros.h"
29 #include "c10/cuda/CUDAMathCompat.h"
30 #include <c10/cuda/CUDAGuard.h>
31 
32 // Defines CAFFE2_CUDA_EXPORT and CAFFE2_CUDA_IMPORT. On Windows, this
33 // corresponds to different declarations (dllexport and dllimport). On
34 // Linux/Mac, it just resolves to the same "default visibility" setting.
35 #if defined(_MSC_VER)
36 #if defined(CAFFE2_BUILD_SHARED_LIBS)
37 #define CAFFE2_CUDA_EXPORT __declspec(dllexport)
38 #define CAFFE2_CUDA_IMPORT __declspec(dllimport)
39 #else
40 #define CAFFE2_CUDA_EXPORT
41 #define CAFFE2_CUDA_IMPORT
42 #endif
43 #else
44 #if defined(__GNUC__)
45 #define CAFFE2_CUDA_EXPORT __attribute__((__visibility__("default")))
46 #else
47 #define CAFFE2_CUDA_EXPORT
48 #endif
49 #define CAFFE2_CUDA_IMPORT CAFFE2_CUDA_EXPORT
50 #endif
51 
52 // CAFFE2_CUDA_API is a macro that, depending on whether you are building the
53 // main caffe2 library or not, resolves to either CAFFE2_CUDA_EXPORT or
54 // CAFFE2_CUDA_IMPORT.
55 //
56 // This is used in e.g. Caffe2's protobuf files: when building the main library,
57 // it is defined as CAFFE2_CUDA_EXPORT to fix a Windows global-variable-in-dll
58 // issue, and for anyone dependent on Caffe2 it will be defined as
59 // CAFFE2_CUDA_IMPORT.
60 
61 #ifdef CAFFE2_CUDA_BUILD_MAIN_LIB
62 #define CAFFE2_CUDA_API CAFFE2_CUDA_EXPORT
63 #else
64 #define CAFFE2_CUDA_API CAFFE2_CUDA_IMPORT
65 #endif
66 
67 // This is a macro defined for cuda fp16 support. By default, cuda fp16 is
68 // supported by NVCC 7.5, but it is also included in the Tegra X1 platform with
69 // a (custom?) NVCC 7.0. As a result, we would normally just check the cuda
70 // version here, but we also allow a user to pass in the flag
71 // CAFFE_HAS_CUDA_FP16 manually.
72 
73 #ifndef CAFFE_HAS_CUDA_FP16
74 #if CUDA_VERSION >= 7050 || defined(__HIP_PLATFORM_HCC__)
75 #define CAFFE_HAS_CUDA_FP16
76 #endif // CUDA_VERSION >= 7050
77 #endif // CAFFE_HAS_CUDA_FP16
78 
79 #ifdef CAFFE_HAS_CUDA_FP16
80 #include <cuda_fp16.h>
81 #endif
82 
83 // cuda major revision number below which fp16 compute is not supported
84 #ifndef __HIP_PLATFORM_HCC__
85 constexpr int kFp16CUDADevicePropMajor = 6;
86 #else
87 constexpr int kFp16CUDADevicePropMajor = 3;
88 #endif
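// A minimal usage sketch for this constant, assuming the device-introspection
// helpers declared further down in this header (GetDeviceProperty and
// CaffeCudaGetDevice); it gates fp16 math on the device's major compute capability:
//
//   const cudaDeviceProp& prop =
//       caffe2::GetDeviceProperty(caffe2::CaffeCudaGetDevice());
//   const bool fp16_math_supported = prop.major >= kFp16CUDADevicePropMajor;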
89 
90 // Re-enable strict aliasing diagnostic if it was disabled.
91 #if CUDA_VERSION >= 9000
92 #ifdef __GNUC__
93 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
94 #pragma GCC diagnostic pop
95 #endif
96 #endif // __GNUC__
97 #endif // CUDA_VERSION >= 9000
98 
106 #define CAFFE2_CUDA_MAX_PEER_SIZE 8
107 
108 namespace caffe2 {
109 
110 #if CUDA_VERSION >= 9000
111 
114 class TensorCoreEngine {};
115 #endif
116 
117 #if CUDA_VERSION >= 10000
118 #define CAFFE2_CUDA_PTRATTR_MEMTYPE type
119 #else
120 #define CAFFE2_CUDA_PTRATTR_MEMTYPE memoryType
121 #endif
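// Usage sketch for CAFFE2_CUDA_PTRATTR_MEMTYPE: it lets pointer-attribute
// queries compile both before and after the CUDA 10 rename of
// cudaPointerAttributes::memoryType to ::type. Here ptr is a placeholder and
// CUDA_ENFORCE is the checking macro defined later in this header:
//
//   cudaPointerAttributes attr;
//   CUDA_ENFORCE(cudaPointerGetAttributes(&attr, ptr));
//   if (attr.CAFFE2_CUDA_PTRATTR_MEMTYPE == cudaMemoryTypeDevice) {
//     // ptr points to device memory.
//   }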
122 
126 inline int CudaVersion() {
127  return CUDA_VERSION;
128 }
129 
133 CAFFE2_CUDA_API int NumCudaDevices();
134 
149 inline bool HasCudaGPU() {
150  return NumCudaDevices() > 0;
151 }
152 
156 CAFFE2_CUDA_API int CaffeCudaGetDevice();
157 
161 CAFFE2_CUDA_API void CaffeCudaSetDevice(const int id);
162 
166 CAFFE2_CUDA_API int GetGPUIDForPointer(const void* ptr);
167 
171 CAFFE2_CUDA_API const cudaDeviceProp& GetDeviceProperty(const int device);
172 
176 CAFFE2_CUDA_API void DeviceQuery(const int deviceid);
177 
185 CAFFE2_CUDA_API bool GetCudaPeerAccessPattern(vector<vector<bool>>* pattern);
186 
190 CAFFE2_CUDA_API bool TensorCoreAvailable();
191 
195 CAFFE2_CUDA_API const char* cublasGetErrorString(cublasStatus_t error);
196 
200 CAFFE2_CUDA_API const char* curandGetErrorString(curandStatus_t error);
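// A usage sketch that combines the introspection helpers declared above;
// the logging output is illustrative:
//
//   if (caffe2::HasCudaGPU()) {
//     const int device = caffe2::CaffeCudaGetDevice();
//     const cudaDeviceProp& prop = caffe2::GetDeviceProperty(device);
//     LOG(INFO) << "Device " << device << " of " << caffe2::NumCudaDevices()
//               << ": " << prop.name
//               << " (TensorCores available: " << caffe2::TensorCoreAvailable()
//               << ")";
//     caffe2::DeviceQuery(device);
//   }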
201 
202 // CUDA: various checks for different function calls.
203 #define CUDA_ENFORCE(condition, ...) \
204  do { \
205  cudaError_t error = condition; \
206  CAFFE_ENFORCE_EQ( \
207  error, \
208  cudaSuccess, \
209  "Error at: ", \
210  __FILE__, \
211  ":", \
212  __LINE__, \
213  ": ", \
214  cudaGetErrorString(error), \
215  ##__VA_ARGS__); \
216  } while (0)
217 #define CUDA_CHECK(condition) \
218  do { \
219  cudaError_t error = condition; \
220  CHECK(error == cudaSuccess) << cudaGetErrorString(error); \
221  } while (0)
222 
223 #define CUDA_DRIVERAPI_ENFORCE(condition) \
224  do { \
225  CUresult result = condition; \
226  if (result != CUDA_SUCCESS) { \
227  const char* msg; \
228  cuGetErrorName(result, &msg); \
229  CAFFE_THROW("Error at: ", __FILE__, ":", __LINE__, ": ", msg); \
230  } \
231  } while (0)
232 #define CUDA_DRIVERAPI_CHECK(condition) \
233  do { \
234  CUresult result = condition; \
235  if (result != CUDA_SUCCESS) { \
236  const char* msg; \
237  cuGetErrorName(result, &msg); \
238  LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
239  << msg; \
240  } \
241  } while (0)
242 
243 #define CUBLAS_ENFORCE(condition) \
244  do { \
245  cublasStatus_t status = condition; \
246  CAFFE_ENFORCE_EQ( \
247  status, \
248  CUBLAS_STATUS_SUCCESS, \
249  "Error at: ", \
250  __FILE__, \
251  ":", \
252  __LINE__, \
253  ": ", \
254  ::caffe2::cublasGetErrorString(status)); \
255  } while (0)
256 #define CUBLAS_CHECK(condition) \
257  do { \
258  cublasStatus_t status = condition; \
259  CHECK(status == CUBLAS_STATUS_SUCCESS) \
260  << ::caffe2::cublasGetErrorString(status); \
261  } while (0)
262 
263 #define CURAND_ENFORCE(condition) \
264  do { \
265  curandStatus_t status = condition; \
266  CAFFE_ENFORCE_EQ( \
267  status, \
268  CURAND_STATUS_SUCCESS, \
269  "Error at: ", \
270  __FILE__, \
271  ":", \
272  __LINE__, \
273  ": ", \
274  ::caffe2::curandGetErrorString(status)); \
275  } while (0)
276 #define CURAND_CHECK(condition) \
277  do { \
278  curandStatus_t status = condition; \
279  CHECK(status == CURAND_STATUS_SUCCESS) \
280  << ::caffe2::curandGetErrorString(status); \
281  } while (0)
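// Usage sketch for the checking macros above; gpu_data, handle, and gen are
// placeholders for the caller's own resources:
//
//   CUDA_DRIVERAPI_ENFORCE(cuInit(0));
//
//   float* gpu_data = nullptr;
//   CUDA_ENFORCE(cudaMalloc(&gpu_data, 1024 * sizeof(float)));
//
//   cublasHandle_t handle;
//   CUBLAS_ENFORCE(cublasCreate(&handle));
//
//   curandGenerator_t gen;
//   CURAND_ENFORCE(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
//   CURAND_ENFORCE(curandGenerateUniform(gen, gpu_data, 1024));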
282 
283 #define CUDA_1D_KERNEL_LOOP(i, n) \
284  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
285  i += blockDim.x * gridDim.x)
286 
287 #define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
288  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
289  i += blockDim.x * gridDim.x) \
290  for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
291  j += blockDim.y * gridDim.y)
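// A minimal grid-stride kernel written with CUDA_1D_KERNEL_LOOP; the kernel
// name and arguments are placeholders:
//
//   __global__ void ScaleKernel(const int n, const float alpha, float* x) {
//     CUDA_1D_KERNEL_LOOP(i, n) {
//       x[i] *= alpha;
//     }
//   }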
292 
293 // CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda
294 // kernels. This is not supported on Apple platforms, so we special-case it.
295 // See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion
296 #if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__)
297 #define CUDA_KERNEL_ASSERT(...)
298 #else // __APPLE__
299 #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
300 #endif // __APPLE__
301 
302 // The following helper functions are here so that you can write a kernel call
303 // when you are not particularly interested in maxing out the kernels'
304 // performance. Usually, this will give you reasonable speed, but if you
305 // really want the best performance, it is advised that you tune the block
306 // and grid sizes yourself.
307 // A legacy note: this is derived from the good old Caffe days, when I simply
308 // hard-coded the number of threads and wanted to keep backward compatibility
309 // for different compute capabilities.
310 // For more info on CUDA compute capabilities, visit the NVIDIA website at:
311 // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
312 
313 // The number of cuda threads to use. Since work is assigned to SMs at the
314 // granularity of a block, 128 is chosen to allow utilizing more SMs for
315 // smaller input sizes.
316 // 1D grid
317 constexpr int CAFFE_CUDA_NUM_THREADS = 128;
318 // 2D grid
319 constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMX = 16;
320 constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMY = 16;
321 
322 // The maximum number of blocks to use in the default kernel call. We set it to
323 // 4096 which would work for compute capability 2.x (where 65536 is the limit).
324 // This number is very carelessly chosen. Ideally, one would like to look at
325 // the hardware at runtime, and pick the number of blocks that makes the most
326 // sense for the specific runtime environment. This is a todo item.
327 // 1D grid
328 constexpr int CAFFE_MAXIMUM_NUM_BLOCKS = 4096;
329 // 2D grid
330 constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMX = 128;
331 constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMY = 128;
332 
333 constexpr int kCUDAGridDimMaxX = 2147483647;
334 constexpr int kCUDAGridDimMaxY = 65535;
335 constexpr int kCUDAGridDimMaxZ = 65535;
336 
340 inline int CAFFE_GET_BLOCKS(const int N) {
341  return std::max(
342  std::min(
343  (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS,
344  CAFFE_MAXIMUM_NUM_BLOCKS),
345  // Use at least 1 block, since CUDA does not allow empty block
346  1);
347 }
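// A typical 1D launch built from the helpers above; ScaleKernel (from the
// sketch next to CUDA_1D_KERNEL_LOOP), N, alpha, x, and stream are placeholders:
//
//   ScaleKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0, stream>>>(
//       N, alpha, x);
//   CUDA_CHECK(cudaGetLastError());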
348 
352 inline dim3 CAFFE_GET_BLOCKS_2D(const int N, const int /* M */) {
353  dim3 grid;
354  // Not calling the 1D version for each dim to keep all constants as literals
355 
356  grid.x = std::max(
357  std::min(
358  (N + CAFFE_CUDA_NUM_THREADS_2D_DIMX - 1) /
359  CAFFE_CUDA_NUM_THREADS_2D_DIMX,
360  CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMX),
361  // Use at least 1 block, since CUDA does not allow empty block
362  1);
363 
364  grid.y = std::max(
365  std::min(
366  (N + CAFFE_CUDA_NUM_THREADS_2D_DIMY - 1) /
367  CAFFE_CUDA_NUM_THREADS_2D_DIMY,
368  CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMY),
369  // Use at least 1 block, since CUDA does not allow empty block
370  1);
371 
372  return grid;
373 }
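// The 2D analogue: the block comes from the 2D thread-count constants and the
// grid from CAFFE_GET_BLOCKS_2D; MyKernel2D and its arguments are placeholders:
//
//   const dim3 block(
//       CAFFE_CUDA_NUM_THREADS_2D_DIMX, CAFFE_CUDA_NUM_THREADS_2D_DIMY);
//   const dim3 grid = CAFFE_GET_BLOCKS_2D(N, M);
//   MyKernel2D<<<grid, block, 0, stream>>>(N, M, data);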
374 
376 
377 template <typename T, int N>
378 struct SimpleArray {
379  T data[N];
380 };
381 
382 constexpr int kCUDATensorMaxDims = 8;
383 
384 #define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(val, Func, T, ...) \
385  do { \
386  CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \
387  switch (val) { \
388  case 1: { \
389  Func<T, 1>(__VA_ARGS__); \
390  break; \
391  } \
392  case 2: { \
393  Func<T, 2>(__VA_ARGS__); \
394  break; \
395  } \
396  case 3: { \
397  Func<T, 3>(__VA_ARGS__); \
398  break; \
399  } \
400  case 4: { \
401  Func<T, 4>(__VA_ARGS__); \
402  break; \
403  } \
404  case 5: { \
405  Func<T, 5>(__VA_ARGS__); \
406  break; \
407  } \
408  case 6: { \
409  Func<T, 6>(__VA_ARGS__); \
410  break; \
411  } \
412  case 7: { \
413  Func<T, 7>(__VA_ARGS__); \
414  break; \
415  } \
416  case 8: { \
417  Func<T, 8>(__VA_ARGS__); \
418  break; \
419  } \
420  default: { \
421  break; \
422  } \
423  } \
424  } while (false)
425 
426 #define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(val, Func, T1, T2, ...) \
427  do { \
428  CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \
429  switch (val) { \
430  case 1: { \
431  Func<T1, T2, 1>(__VA_ARGS__); \
432  break; \
433  } \
434  case 2: { \
435  Func<T1, T2, 2>(__VA_ARGS__); \
436  break; \
437  } \
438  case 3: { \
439  Func<T1, T2, 3>(__VA_ARGS__); \
440  break; \
441  } \
442  case 4: { \
443  Func<T1, T2, 4>(__VA_ARGS__); \
444  break; \
445  } \
446  case 5: { \
447  Func<T1, T2, 5>(__VA_ARGS__); \
448  break; \
449  } \
450  case 6: { \
451  Func<T1, T2, 6>(__VA_ARGS__); \
452  break; \
453  } \
454  case 7: { \
455  Func<T1, T2, 7>(__VA_ARGS__); \
456  break; \
457  } \
458  case 8: { \
459  Func<T1, T2, 8>(__VA_ARGS__); \
460  break; \
461  } \
462  default: { \
463  break; \
464  } \
465  } \
466  } while (false)
467 
468 #define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(val, Func, T1, T2, T3, ...) \
469  do { \
470  CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims); \
471  switch (val) { \
472  case 1: { \
473  Func<T1, T2, T3, 1>(__VA_ARGS__); \
474  break; \
475  } \
476  case 2: { \
477  Func<T1, T2, T3, 2>(__VA_ARGS__); \
478  break; \
479  } \
480  case 3: { \
481  Func<T1, T2, T3, 3>(__VA_ARGS__); \
482  break; \
483  } \
484  case 4: { \
485  Func<T1, T2, T3, 4>(__VA_ARGS__); \
486  break; \
487  } \
488  case 5: { \
489  Func<T1, T2, T3, 5>(__VA_ARGS__); \
490  break; \
491  } \
492  case 6: { \
493  Func<T1, T2, T3, 6>(__VA_ARGS__); \
494  break; \
495  } \
496  case 7: { \
497  Func<T1, T2, T3, 7>(__VA_ARGS__); \
498  break; \
499  } \
500  case 8: { \
501  Func<T1, T2, T3, 8>(__VA_ARGS__); \
502  break; \
503  } \
504  default: { \
505  break; \
506  } \
507  } \
508  } while (false)
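// Sketch of how these dispatch macros map a runtime dimension count (at most
// kCUDATensorMaxDims) onto a compile-time template parameter; CopyImpl and
// Copy are hypothetical:
//
//   template <typename T, int D>
//   void CopyImpl(const int* dims, const T* src, T* dst);
//
//   template <typename T>
//   void Copy(const int ndim, const int* dims, const T* src, T* dst) {
//     DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(ndim, CopyImpl, T, dims, src, dst);
//   }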
509 
510 } // namespace caffe2
511 
512 #endif // CAFFE2_CORE_COMMON_GPU_H_
int CudaVersion()
A runtime function to report the cuda version that Caffe2 is built with.
Definition: common_gpu.h:126
bool HasCudaGPU()
Check if the current running session has a cuda gpu present.
Definition: common_gpu.h:149
int CaffeCudaGetDevice()
Gets the current GPU id.
Definition: common_gpu.cc:96
bool TensorCoreAvailable()
Return the availability of TensorCores for math.
Definition: common_gpu.cc:217
const char * curandGetErrorString(curandStatus_t error)
Return a human readable curand error string.
Definition: common_gpu.cc:266
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:106
dim3 CAFFE_GET_BLOCKS_2D(const int N, const int)
Compute the number of blocks needed to run N threads for a 2D grid.
Definition: common_gpu.h:352
const char * cublasGetErrorString(cublasStatus_t error)
Return a human readable cublas error string.
Definition: common_gpu.cc:229
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.
Definition: common_gpu.cc:139
int CAFFE_GET_BLOCKS(const int N)
Compute the number of blocks needed to run N threads.
Definition: common_gpu.h:340
A variant of DeviceGuard that is specialized for CUDA.
Definition: CUDAGuard.h:20
void CaffeCudaSetDevice(const int id)
Sets the current GPU id.
Definition: common_gpu.cc:102
int NumCudaDevices()
Returns the number of devices.
Definition: common_gpu.cc:15
void DeviceQuery(const int device)
Runs a device query function and prints out the results to LOG(INFO).
Definition: common_gpu.cc:156