Caffe2 - C++ API
A deep learning, cross-platform ML framework
common_gpu.cc
17 #include "caffe2/core/common_gpu.h"
18 
19 #include <atomic>
20 #include <cstdlib>
21 #include <sstream>
22 
23 #include "caffe2/core/asan.h"
24 #include "caffe2/core/common.h"
25 #include "caffe2/core/init.h"
26 #include "caffe2/core/logging.h"
27 
28 CAFFE2_DEFINE_bool(
29  caffe2_cuda_full_device_control,
30  false,
31  "If true, assume all the cudaSetDevice and cudaGetDevice calls will be "
32  "controlled by Caffe2, and non-Caffe2 code will ensure that the entry and "
33  "exit points have the same cuda device. Under the hood, Caffe2 will use "
34  "thread local variables to cache the device, in order to speed up set and "
35  "get device calls. This is an experimental feature that may have "
36  "non-trivial side effects, so use it with care and only enable it if you "
37  "are absolutely sure. Also, this flag should not be changed after the "
38  "program initializes.");
39 
40 namespace caffe2 {
41 
42 int NumCudaDevices() {
43  if (getenv("CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
44  static bool first = true;
45  if (first) {
46  first = false;
47  std::cerr << "DEBUG: caffe2::NumCudaDevices() invoked for the first time"
48  << std::endl;
49  }
50  }
51  static int count = -1;
52  if (count < 0) {
53  auto err = cudaGetDeviceCount(&count);
54  switch (err) {
55  case cudaSuccess:
56  // Everything is good.
57  break;
58  case cudaErrorNoDevice:
59  count = 0;
60  break;
61  case cudaErrorInsufficientDriver:
62  LOG(WARNING) << "Insufficient cuda driver. Cannot use cuda.";
63  count = 0;
64  break;
65  case cudaErrorInitializationError:
66  LOG(WARNING) << "Cuda driver initialization failed, you might not "
67  "have a cuda gpu.";
68  count = 0;
69  break;
70  case cudaErrorUnknown:
71  LOG(ERROR) << "Found an unknown error - this may be due to an "
72  "incorrectly set up environment, e.g. changing env "
73  "variable CUDA_VISIBLE_DEVICES after program start. "
74  "I will set the available devices to be zero.";
75  count = 0;
76  break;
77  case cudaErrorMemoryAllocation:
78 #if CAFFE2_ASAN_ENABLED
79  // In ASAN mode, we know that a cudaErrorMemoryAllocation error will
80  // pop up.
81  LOG(ERROR) << "It is known that CUDA does not work well with ASAN. As "
82  "a result we will simply shut down CUDA support. If you "
83  "would like to use GPUs, turn off ASAN.";
84  count = 0;
85  break;
86 #else // CAFFE2_ASAN_ENABLED
87  // If we are not in ASAN mode and we get cudaErrorMemoryAllocation,
88  // this means that something is wrong before NumCudaDevices() call.
89  LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
90  "some cuda functions before calling NumCudaDevices() "
91  "that might have already set an error? Error: "
92  << err;
93  break;
94 #endif // CAFFE2_ASAN_ENABLED
95  default:
96  LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
97  "some cuda functions before calling NumCudaDevices() "
98  "that might have already set an error? Error: "
99  << err;
100  }
101  }
102  return count;
103 }
104 
105 namespace {
106 int gDefaultGPUID = 0;
107 // Only used when FLAGS_caffe2_cuda_full_device_control is set to true.
108 thread_local int gCurrentDevice = -1;
109 } // namespace
110 
111 void SetDefaultGPUID(const int deviceid) {
112  CAFFE_ENFORCE_LT(
113  deviceid,
114  NumCudaDevices(),
115  "The default gpu id should be smaller than the number of gpus "
116  "on this machine: ",
117  deviceid,
118  " vs ",
119  NumCudaDevices());
120  gDefaultGPUID = deviceid;
121 }
122 
123 int GetDefaultGPUID() { return gDefaultGPUID; }
124 
125 int CaffeCudaGetDevice() {
126  if (FLAGS_caffe2_cuda_full_device_control) {
127  if (gCurrentDevice < 0) {
128  CUDA_ENFORCE(cudaGetDevice(&gCurrentDevice));
129  }
130  return gCurrentDevice;
131  } else {
132  int gpu_id = 0;
133  CUDA_ENFORCE(cudaGetDevice(&gpu_id));
134  return gpu_id;
135  }
136 }
137 
138 void CaffeCudaSetDevice(const int id) {
139  if (FLAGS_caffe2_cuda_full_device_control) {
140  if (gCurrentDevice != id) {
141  CUDA_ENFORCE(cudaSetDevice(id));
142  }
143  gCurrentDevice = id;
144  } else {
145  CUDA_ENFORCE(cudaSetDevice(id));
146  }
147 }
148 
149 int GetGPUIDForPointer(const void* ptr) {
150  cudaPointerAttributes attr;
151  cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
152 
153  if (err == cudaErrorInvalidValue) {
154  // Occurs when the pointer is in the CPU address space that is
155  // unmanaged by CUDA; make sure the last error state is cleared,
156  // since it is persistent
157  err = cudaGetLastError();
158  CHECK(err == cudaErrorInvalidValue);
159  return -1;
160  }
161 
162  // Otherwise, there must be no error
163  CUDA_ENFORCE(err);
164 
165  if (attr.memoryType == cudaMemoryTypeHost) {
166  return -1;
167  }
168 
169  return attr.device;
170 }
171 
172 struct CudaDevicePropWrapper {
173  CudaDevicePropWrapper() : props(NumCudaDevices()) {
174  for (int i = 0; i < NumCudaDevices(); ++i) {
175  CUDA_ENFORCE(cudaGetDeviceProperties(&props[i], i));
176  }
177  }
178 
179  vector<cudaDeviceProp> props;
180 };
181 
182 const cudaDeviceProp& GetDeviceProperty(const int deviceid) {
183  // According to C++11 standard section 6.7, static local variable init is
184  // thread safe. See
185  // https://stackoverflow.com/questions/8102125/is-local-static-variable-initialization-thread-safe-in-c11
186  // for details.
187  static CudaDevicePropWrapper props;
188  CAFFE_ENFORCE_LT(
189  deviceid,
190  NumCudaDevices(),
191  "The gpu id should be smaller than the number of gpus ",
192  "on this machine: ",
193  deviceid,
194  " vs ",
195  NumCudaDevices());
196  return props.props[deviceid];
197 }
198 
199 void DeviceQuery(const int device) {
200  const cudaDeviceProp& prop = GetDeviceProperty(device);
201  std::stringstream ss;
202  ss << std::endl;
203  ss << "Device id: " << device << std::endl;
204  ss << "Major revision number: " << prop.major << std::endl;
205  ss << "Minor revision number: " << prop.minor << std::endl;
206  ss << "Name: " << prop.name << std::endl;
207  ss << "Total global memory: " << prop.totalGlobalMem << std::endl;
208  ss << "Total shared memory per block: " << prop.sharedMemPerBlock
209  << std::endl;
210  ss << "Total registers per block: " << prop.regsPerBlock << std::endl;
211  ss << "Warp size: " << prop.warpSize << std::endl;
212  ss << "Maximum memory pitch: " << prop.memPitch << std::endl;
213  ss << "Maximum threads per block: " << prop.maxThreadsPerBlock
214  << std::endl;
215  ss << "Maximum dimension of block: "
216  << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
217  << prop.maxThreadsDim[2] << std::endl;
218  ss << "Maximum dimension of grid: "
219  << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
220  << prop.maxGridSize[2] << std::endl;
221  ss << "Clock rate: " << prop.clockRate << std::endl;
222  ss << "Total constant memory: " << prop.totalConstMem << std::endl;
223  ss << "Texture alignment: " << prop.textureAlignment << std::endl;
224  ss << "Concurrent copy and execution: "
225  << (prop.deviceOverlap ? "Yes" : "No") << std::endl;
226  ss << "Number of multiprocessors: " << prop.multiProcessorCount
227  << std::endl;
228  ss << "Kernel execution timeout: "
229  << (prop.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
230  LOG(INFO) << ss.str();
231  return;
232 }
233 
234 bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
235  int gpu_count;
236  if (cudaGetDeviceCount(&gpu_count) != cudaSuccess) return false;
237  pattern->clear();
238  pattern->resize(gpu_count, vector<bool>(gpu_count, false));
239  for (int i = 0; i < gpu_count; ++i) {
240  for (int j = 0; j < gpu_count; ++j) {
241  int can_access = true;
242  if (i != j) {
243  if (cudaDeviceCanAccessPeer(&can_access, i, j)
244  != cudaSuccess) {
245  return false;
246  }
247  }
248  (*pattern)[i][j] = static_cast<bool>(can_access);
249  }
250  }
251  return true;
252 }
253 
254 bool TensorCoreAvailable() {
255  // requires CUDA 9.0 and above
256 #if CUDA_VERSION < 9000
257  return false;
258 #else
259  int device = CaffeCudaGetDevice();
260  auto& prop = GetDeviceProperty(device);
261 
262  return prop.major >= 7;
263 #endif
264 }
265 
266 const char* cublasGetErrorString(cublasStatus_t error) {
267  switch (error) {
268  case CUBLAS_STATUS_SUCCESS:
269  return "CUBLAS_STATUS_SUCCESS";
270  case CUBLAS_STATUS_NOT_INITIALIZED:
271  return "CUBLAS_STATUS_NOT_INITIALIZED";
272  case CUBLAS_STATUS_ALLOC_FAILED:
273  return "CUBLAS_STATUS_ALLOC_FAILED";
274  case CUBLAS_STATUS_INVALID_VALUE:
275  return "CUBLAS_STATUS_INVALID_VALUE";
276  case CUBLAS_STATUS_ARCH_MISMATCH:
277  return "CUBLAS_STATUS_ARCH_MISMATCH";
278  case CUBLAS_STATUS_MAPPING_ERROR:
279  return "CUBLAS_STATUS_MAPPING_ERROR";
280  case CUBLAS_STATUS_EXECUTION_FAILED:
281  return "CUBLAS_STATUS_EXECUTION_FAILED";
282  case CUBLAS_STATUS_INTERNAL_ERROR:
283  return "CUBLAS_STATUS_INTERNAL_ERROR";
284 #if CUDA_VERSION >= 6000
285  case CUBLAS_STATUS_NOT_SUPPORTED:
286  return "CUBLAS_STATUS_NOT_SUPPORTED";
287 #if CUDA_VERSION >= 6050
288  case CUBLAS_STATUS_LICENSE_ERROR:
289  return "CUBLAS_STATUS_LICENSE_ERROR";
290 #endif // CUDA_VERSION >= 6050
291 #endif // CUDA_VERSION >= 6000
292  }
293  // To suppress compiler warning.
294  return "Unrecognized cublas error string";
295 }
296 
297 const char* curandGetErrorString(curandStatus_t error) {
298  switch (error) {
299  case CURAND_STATUS_SUCCESS:
300  return "CURAND_STATUS_SUCCESS";
301  case CURAND_STATUS_VERSION_MISMATCH:
302  return "CURAND_STATUS_VERSION_MISMATCH";
303  case CURAND_STATUS_NOT_INITIALIZED:
304  return "CURAND_STATUS_NOT_INITIALIZED";
305  case CURAND_STATUS_ALLOCATION_FAILED:
306  return "CURAND_STATUS_ALLOCATION_FAILED";
307  case CURAND_STATUS_TYPE_ERROR:
308  return "CURAND_STATUS_TYPE_ERROR";
309  case CURAND_STATUS_OUT_OF_RANGE:
310  return "CURAND_STATUS_OUT_OF_RANGE";
311  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
312  return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
313  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
314  return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
315  case CURAND_STATUS_LAUNCH_FAILURE:
316  return "CURAND_STATUS_LAUNCH_FAILURE";
317  case CURAND_STATUS_PREEXISTING_FAILURE:
318  return "CURAND_STATUS_PREEXISTING_FAILURE";
319  case CURAND_STATUS_INITIALIZATION_FAILED:
320  return "CURAND_STATUS_INITIALIZATION_FAILED";
321  case CURAND_STATUS_ARCH_MISMATCH:
322  return "CURAND_STATUS_ARCH_MISMATCH";
323  case CURAND_STATUS_INTERNAL_ERROR:
324  return "CURAND_STATUS_INTERNAL_ERROR";
325  }
326  // To suppress compiler warning.
327  return "Unrecognized curand error string";
328 }
329 
330 // Turn on the flag g_caffe2_has_cuda_linked to true for HasCudaRuntime()
331 // function.
332 namespace {
333 class CudaRuntimeFlagFlipper {
334  public:
335  CudaRuntimeFlagFlipper() {
336  internal::SetCudaRuntimeFlag();
337  }
338 };
339 static CudaRuntimeFlagFlipper g_flipper;
340 } // namespace
341 
342 } // namespace caffe2
void DeviceQuery(const int device)
Runs a device query function and prints out the results to LOG(INFO).
Definition: common_gpu.cc:199
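A minimal usage sketch, assuming a CUDA-enabled Caffe2 build; the PrintAllDevices helper name is illustrative:

#include "caffe2/core/common_gpu.h"

// Hypothetical helper: log the report for every visible CUDA device.
void PrintAllDevices() {
  for (int i = 0; i < caffe2::NumCudaDevices(); ++i) {
    caffe2::DeviceQuery(i);  // each call writes one report to LOG(INFO)
  }
}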
bool GetCudaPeerAccessPattern(vector< vector< bool > > *pattern)
Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean values specifying whether peer access is possible.
Definition: common_gpu.cc:234
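A sketch of reading the returned matrix; LogPeerAccess is a hypothetical helper, and the call assumes a CUDA-enabled build:

#include <vector>
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/logging.h"

// Hypothetical helper: log every ordered pair of devices with peer access.
void LogPeerAccess() {
  std::vector<std::vector<bool>> pattern;
  if (!caffe2::GetCudaPeerAccessPattern(&pattern)) {
    LOG(WARNING) << "Could not query the CUDA peer access pattern.";
    return;
  }
  for (size_t i = 0; i < pattern.size(); ++i) {
    for (size_t j = 0; j < pattern[i].size(); ++j) {
      if (i != j && pattern[i][j]) {
        LOG(INFO) << "GPU " << i << " can access peer GPU " << j;
      }
    }
  }
}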
int NumCudaDevices()
Returns the number of devices.
Definition: common_gpu.cc:42
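One common pattern, sketched with a hypothetical ShouldUseGpu helper: guard GPU code paths on the device count so the binary can fall back to CPU execution.

#include "caffe2/core/common_gpu.h"

// Hypothetical guard: true only if at least one usable CUDA device exists.
bool ShouldUseGpu() {
  return caffe2::NumCudaDevices() > 0;
}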
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:149
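A short sketch with a hypothetical LogPointerLocation helper; it relies on the convention in the definition above that -1 denotes CPU (host) memory:

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/logging.h"

// Hypothetical helper: report whether a pointer lives on the host or a GPU.
void LogPointerLocation(const void* ptr) {
  const int gpu_id = caffe2::GetGPUIDForPointer(ptr);
  if (gpu_id < 0) {
    LOG(INFO) << "Pointer is in host (CPU) memory.";
  } else {
    LOG(INFO) << "Pointer is on GPU " << gpu_id;
  }
}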
int CaffeCudaGetDevice()
Gets the current GPU id.
Definition: common_gpu.cc:125
void CaffeCudaSetDevice(const int id)
Sets the current GPU id.
Definition: common_gpu.cc:138
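A sketch of how the two wrappers pair up; ScopedDevice is a hypothetical RAII guard, purely illustrative and not a replacement for any guard class the library may already provide:

#include "caffe2/core/common_gpu.h"

// Hypothetical RAII guard: switch to a device for the lifetime of the object,
// then restore the caller's device.
class ScopedDevice {
 public:
  explicit ScopedDevice(int device) : previous_(caffe2::CaffeCudaGetDevice()) {
    caffe2::CaffeCudaSetDevice(device);
  }
  ~ScopedDevice() {
    caffe2::CaffeCudaSetDevice(previous_);
  }

 private:
  int previous_;
};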
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.
Definition: common_gpu.cc:182
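A small sketch, with a hypothetical ComputeCapability helper, of reading the cached cudaDeviceProp:

#include "caffe2/core/common_gpu.h"

// Hypothetical helper: compute capability as a single integer (e.g. 61 for 6.1).
int ComputeCapability(int device) {
  const cudaDeviceProp& prop = caffe2::GetDeviceProperty(device);
  return prop.major * 10 + prop.minor;
}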
const char * curandGetErrorString(curandStatus_t error)
Return a human readable curand error string.
Definition: common_gpu.cc:297
const char * cublasGetErrorString(cublasStatus_t error)
Return a human readable cublas error string.
Definition: common_gpu.cc:266
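A sketch showing the intended use of both translators when logging raw status codes; the Report* helper names are illustrative:

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/logging.h"

// Hypothetical helpers: turn raw cuBLAS / cuRAND status codes into readable logs.
void ReportCublas(cublasStatus_t status) {
  if (status != CUBLAS_STATUS_SUCCESS) {
    LOG(ERROR) << "cuBLAS error: " << caffe2::cublasGetErrorString(status);
  }
}

void ReportCurand(curandStatus_t status) {
  if (status != CURAND_STATUS_SUCCESS) {
    LOG(ERROR) << "cuRAND error: " << caffe2::curandGetErrorString(status);
  }
}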
bool TensorCoreAvailable()
Return the availability of TensorCores for math.
Definition: common_gpu.cc:254
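A brief sketch of acting on the result; the LogTensorCoreSupport helper is hypothetical, and the 7.0+ threshold simply mirrors the check in the definition above:

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/logging.h"

// Hypothetical helper: record whether the current device can use Tensor Cores.
void LogTensorCoreSupport() {
  if (caffe2::TensorCoreAvailable()) {
    LOG(INFO) << "Compute capability 7.0+ detected; Tensor Core math can be enabled.";
  } else {
    LOG(INFO) << "Tensor Cores are not available on the current device.";
  }
}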