Caffe2 - C++ API
A deep learning, cross platform ML framework
common_gpu.h
1 
17 #ifndef CAFFE2_CORE_COMMON_GPU_H_
18 #define CAFFE2_CORE_COMMON_GPU_H_
19 
20 #include <assert.h>
21 #include <cuda.h>
22 #include <cuda_runtime.h>
23 
24 // Disable strict aliasing errors for CUDA 9.
25 // The cuda_fp16.h header in CUDA 9 RC triggers this diagnostic.
26 // It is included by cusparse.h as well, so guarding the
27 // inclusion of that header here is not enough.
28 #if CUDA_VERSION >= 9000
29 #ifdef __GNUC__
30 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
31 #pragma GCC diagnostic push
32 #endif
33 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
34 #endif // __GNUC__
35 #endif // CUDA_VERSION >= 9000
36 
37 #include <cublas_v2.h>
38 #include <curand.h>
39 #include <driver_types.h>
40 
41 #include "caffe2/core/logging.h"
42 #include "caffe2/core/common.h"
43 
44 // This is a macro defined for cuda fp16 support. By default, cuda fp16 is
45 // supported by NVCC 7.5, but it is also included in the Tegra X1 platform with
46 // a (custom?) NVCC 7.0. As a result, we normally just check the cuda
47 // version here, but also allow a user to pass in the flag
48 // CAFFE_HAS_CUDA_FP16 manually.
49 
50 #ifndef CAFFE_HAS_CUDA_FP16
51 #if CUDA_VERSION >= 7050
52 #define CAFFE_HAS_CUDA_FP16
53 #endif // CUDA_VERSION >= 7050
54 #endif // CAFFE_HAS_CUDA_FP16
55 
56 #ifdef CAFFE_HAS_CUDA_FP16
57 #include <cuda_fp16.h>
58 #endif
59 
60 // Re-enable strict aliasing diagnostic if it was disabled.
61 #if CUDA_VERSION >= 9000
62 #ifdef __GNUC__
63 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
64 #pragma GCC diagnostic pop
65 #endif
66 #endif // __GNUC__
67 #endif // CUDA_VERSION >= 9000
68 
// Compile-time upper bound on the number of GPUs Caffe2 supports.
// NOTE(review): the sizing role of this constant is not visible in this
// header; presumably it bounds fixed-size per-device arrays -- confirm at
// the users of this constant.
72 #define CAFFE2_COMPILE_TIME_MAX_GPUS 16
73 
// NOTE(review): looks like a cap on the size of a GPU peer-access group;
// semantics not visible in this header -- confirm at call sites.
80 #define CAFFE2_CUDA_MAX_PEER_SIZE 8
81 
82 namespace caffe2 {
83 
84 #if CUDA_VERSION >= 9000
85 
// Empty tag type, only defined when building against CUDA 9+.
// NOTE(review): presumably used as an engine marker to request
// TensorCore-enabled code paths (cf. TensorCoreAvailable() below) --
// confirm at operator registration sites.
88 class TensorCoreEngine {};
89 #endif
90 
94 inline int CudaVersion() { return CUDA_VERSION; }
95 
// Returns the number of CUDA devices visible to this process
// (definition in common_gpu.cc).
99 int NumCudaDevices();
100 
115 inline bool HasCudaGPU() { return NumCudaDevices() > 0; }
116 
// Gets the current GPU id (definition in common_gpu.cc).
120 int CaffeCudaGetDevice();
121 
// Sets the current GPU to the given id (definition in common_gpu.cc).
125 void CaffeCudaSetDevice(const int id);
126 
// Gets the GPU id that the given pointer is located at.
130 int GetGPUIDForPointer(const void* ptr);
131 
// Gets the device property for the given device.
// NOTE(review): lifetime of the returned reference is determined in
// common_gpu.cc -- verify before storing it long-term.
135 const cudaDeviceProp& GetDeviceProperty(const int device);
136 
// Runs a device query on the given device and prints the results to
// LOG(INFO).
140 void DeviceQuery(const int deviceid);
141 
// Fills *pattern with a device-by-device matrix (nested vector) of booleans
// describing peer access; the return value reports whether the query
// itself succeeded. Exact matrix semantics live in common_gpu.cc.
149 bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern);
150 
// Returns the availability of TensorCores for math on this machine.
154 bool TensorCoreAvailable();
155 
// Returns a human-readable string for a cublas status code.
159 const char* cublasGetErrorString(cublasStatus_t error);
160 
// Returns a human-readable string for a curand status code.
164 const char* curandGetErrorString(curandStatus_t error);
166 // CUDA: various checks for different function calls.
// CUDA_ENFORCE(expr, ...): evaluates a CUDA runtime-API call and, when the
// result is not cudaSuccess, raises via CAFFE_ENFORCE_EQ with file/line,
// the cudaGetErrorString text, and any extra message arguments appended.
167 #define CUDA_ENFORCE(condition, ...) \
168  do { \
169  cudaError_t error = condition; \
170  CAFFE_ENFORCE_EQ( \
171  error, \
172  cudaSuccess, \
173  "Error at: ", \
174  __FILE__, \
175  ":", \
176  __LINE__, \
177  ": ", \
178  cudaGetErrorString(error), ##__VA_ARGS__); \
179  } while (0)
// CUDA_CHECK(expr): fatal variant of CUDA_ENFORCE -- uses glog-style
// CHECK, which aborts the process on failure instead of throwing.
180 #define CUDA_CHECK(condition) \
181  do { \
182  cudaError_t error = condition; \
183  CHECK(error == cudaSuccess) << cudaGetErrorString(error); \
184  } while (0)
185 
// CUDA_DRIVERAPI_ENFORCE(expr): checks a CUDA *driver* API call (CUresult);
// on failure it looks up the error name via cuGetErrorName and throws with
// CAFFE_THROW, including file and line in the message.
186 #define CUDA_DRIVERAPI_ENFORCE(condition) \
187  do { \
188  CUresult result = condition; \
189  if (result != CUDA_SUCCESS) { \
190  const char* msg; \
191  cuGetErrorName(result, &msg); \
192  CAFFE_THROW("Error at: ", __FILE__, ":", __LINE__, ": ", msg); \
193  } \
194  } while (0)
// CUDA_DRIVERAPI_CHECK(expr): driver-API variant of CUDA_CHECK; aborts the
// process via LOG(FATAL) rather than throwing.
195 #define CUDA_DRIVERAPI_CHECK(condition) \
196  do { \
197  CUresult result = condition; \
198  if (result != CUDA_SUCCESS) { \
199  const char* msg; \
200  cuGetErrorName(result, &msg); \
201  LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
202  << msg; \
203  } \
204  } while (0)
205 
// CUBLAS_ENFORCE(expr): throws (via CAFFE_ENFORCE_EQ) when a cuBLAS call
// does not return CUBLAS_STATUS_SUCCESS, with a human-readable status
// string from ::caffe2::cublasGetErrorString.
206 #define CUBLAS_ENFORCE(condition) \
207  do { \
208  cublasStatus_t status = condition; \
209  CAFFE_ENFORCE_EQ( \
210  status, \
211  CUBLAS_STATUS_SUCCESS, \
212  "Error at: ", \
213  __FILE__, \
214  ":", \
215  __LINE__, \
216  ": ", \
217  ::caffe2::cublasGetErrorString(status)); \
218  } while (0)
// CUBLAS_CHECK(expr): fatal (CHECK/abort) variant of CUBLAS_ENFORCE.
219 #define CUBLAS_CHECK(condition) \
220  do { \
221  cublasStatus_t status = condition; \
222  CHECK(status == CUBLAS_STATUS_SUCCESS) \
223  << ::caffe2::cublasGetErrorString(status); \
224  } while (0)
225 
// CURAND_ENFORCE(expr): throws (via CAFFE_ENFORCE_EQ) when a curand call
// does not return CURAND_STATUS_SUCCESS, with a human-readable status
// string from ::caffe2::curandGetErrorString.
226 #define CURAND_ENFORCE(condition) \
227  do { \
228  curandStatus_t status = condition; \
229  CAFFE_ENFORCE_EQ( \
230  status, \
231  CURAND_STATUS_SUCCESS, \
232  "Error at: ", \
233  __FILE__, \
234  ":", \
235  __LINE__, \
236  ": ", \
237  ::caffe2::curandGetErrorString(status)); \
238  } while (0)
// CURAND_CHECK(expr): fatal (CHECK/abort) variant of CURAND_ENFORCE.
239 #define CURAND_CHECK(condition) \
240  do { \
241  curandStatus_t status = condition; \
242  CHECK(status == CURAND_STATUS_SUCCESS) \
243  << ::caffe2::curandGetErrorString(status); \
244  } while (0)
245 
// CUDA_1D_KERNEL_LOOP(i, n): grid-stride loop over the half-open range
// [0, n), usable from any launch configuration.
// Fix: blockIdx.x, blockDim.x and gridDim.x are unsigned ints, so the
// original `blockIdx.x * blockDim.x` (and the stride product) were computed
// in 32-bit arithmetic and could wrap before being widened to the size_t
// induction variable; casting the first operand makes the whole expression
// evaluate in size_t.
#define CUDA_1D_KERNEL_LOOP(i, n)                                        \
  for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x +         \
           threadIdx.x;                                                  \
       i < (n);                                                          \
       i += static_cast<size_t>(blockDim.x) * gridDim.x)
249 
250 // CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda
251 // kernels. This is not supported by Apple platforms so we special case it.
252 // See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion
// NOTE: on Apple the macro expands to nothing, so the asserted expression
// is never evaluated there -- do not put side effects in the argument.
253 #ifdef __APPLE__
254 #define CUDA_KERNEL_ASSERT(...)
255 #else // __APPLE__
256 #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
257 #endif // __APPLE__
258 
// The following helper functions are here so that you can write a kernel call
// when you are not particularly interested in maxing out the kernels'
// performance. Usually, this will give you a reasonable speed, but if you
// really want to find the best performance, it is advised that you tune the
// size of the blocks and grids more reasonably.
// A legacy note: this is derived from the old good Caffe days, when I simply
// hard-coded the number of threads and wanted to keep backward compatibility
// for different computation capabilities.
// For more info on CUDA compute capabilities, visit the NVidia website at:
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities

// The number of cuda threads to use. 512 is used for backward compatibility,
// and it is observed that setting it to 1024 usually does not bring much
// performance gain (which makes sense, because warp size being 32 means that
// blindly setting a huge block for a random kernel isn't optimal).
constexpr int CAFFE_CUDA_NUM_THREADS = 512;
// The maximum number of blocks to use in the default kernel call. We set it to
// 4096 which would work for compute capability 2.x (where 65536 is the limit).
// This number is very carelessly chosen. Ideally, one would like to look at
// the hardware at runtime, and pick the number of blocks that makes most
// sense for the specific runtime environment. This is a todo item.
constexpr int CAFFE_MAXIMUM_NUM_BLOCKS = 4096;

// Compute the number of blocks needed to cover N threads, with
// CAFFE_CUDA_NUM_THREADS threads per block, capped at
// CAFFE_MAXIMUM_NUM_BLOCKS.
//
// Fixes over the previous version:
//  * `(N + CAFFE_CUDA_NUM_THREADS - 1)` overflowed (UB) for N within
//    CAFFE_CUDA_NUM_THREADS - 1 of INT_MAX; the `(N - 1) / threads + 1`
//    form cannot overflow for positive N.
//  * Very negative N used to produce a negative block count; non-positive
//    N now returns 0 blocks (matching the old result for N == 0).
inline int CAFFE_GET_BLOCKS(const int N) {
  if (N <= 0) {
    return 0;
  }
  return std::min(
      (N - 1) / CAFFE_CUDA_NUM_THREADS + 1, CAFFE_MAXIMUM_NUM_BLOCKS);
}
289 
290 class DeviceGuard {
291  public:
292  explicit DeviceGuard(int newDevice) : previous_(CaffeCudaGetDevice()) {
293  if (previous_ != newDevice) {
294  CaffeCudaSetDevice(newDevice);
295  }
296  }
297 
298  ~DeviceGuard() noexcept {
299  CaffeCudaSetDevice(previous_);
300  }
301 
302  private:
303  int previous_;
304 };
305 
306 } // namespace caffe2
307 #endif // CAFFE2_CORE_COMMON_GPU_H_
void DeviceQuery(const int device)
Runs a device query function and prints out the results to LOG(INFO).
Definition: common_gpu.cc:199
int CudaVersion()
A runtime function to report the cuda version that Caffe2 is built with.
Definition: common_gpu.h:94
bool HasCudaGPU()
Check if the current running session has a cuda gpu present.
Definition: common_gpu.h:115
bool GetCudaPeerAccessPattern(vector< vector< bool > > *pattern)
Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean values indicating whether peer access is enabled between each pair of devices.
Definition: common_gpu.cc:234
int NumCudaDevices()
Returns the number of devices.
Definition: common_gpu.cc:42
Copyright (c) 2016-present, Facebook, Inc.
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:149
int CaffeCudaGetDevice()
Gets the current GPU id.
Definition: common_gpu.cc:125
void CaffeCudaSetDevice(const int id)
Sets the current GPU to the given id.
Definition: common_gpu.cc:138
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.
Definition: common_gpu.cc:182
const char * curandGetErrorString(curandStatus_t error)
Return a human readable curand error string.
Definition: common_gpu.cc:297
int CAFFE_GET_BLOCKS(const int N)
Compute the number of blocks needed to run N threads.
Definition: common_gpu.h:285
const char * cublasGetErrorString(cublasStatus_t error)
Return a human readable cublas error string.
Definition: common_gpu.cc:266
bool TensorCoreAvailable()
Return the availability of TensorCores for math.
Definition: common_gpu.cc:254