Caffe2 - C++ API
A deep learning, cross platform ML framework
common_gpu.cc
1 #include "caffe2/core/common_gpu.h"
2 
3 #include <atomic>
4 #include <cstdlib>
5 #include <iostream>
6 #include <sstream>
7 
8 #include "caffe2/core/asan.h"
9 #include "caffe2/core/common.h"
10 #include "caffe2/core/init.h"
11 #include "caffe2/core/logging.h"
12 
13 namespace caffe2 {
14 
// Returns the number of CUDA devices visible to this process.
//
// The count is queried once and cached in a function-local static; benign
// failure modes (no device present, missing/old driver, driver init failure,
// a stale unknown error) are mapped to a count of 0 so CPU-only runs keep
// working, while unexpected errors are fatal.
//
// NOTE(review): the extracted source was missing the opening declaration
// line; it is restored here to match the declaration `int NumCudaDevices()`
// from common_gpu.h.
int NumCudaDevices() {
  // Optional one-shot debug trace to diagnose static-init ordering issues.
  if (getenv("CAFFE2_DEBUG_CUDA_INIT_ORDER")) {
    static bool first = true;
    if (first) {
      first = false;
      std::cerr << "DEBUG: caffe2::NumCudaDevices() invoked for the first time"
                << std::endl;
    }
  }
  // Cached device count; -1 means "not queried yet".
  static int count = -1;
  if (count < 0) {
    auto err = cudaGetDeviceCount(&count);
    switch (err) {
      case cudaSuccess:
        // Everything is good.
        break;
      case cudaErrorNoDevice:
        count = 0;
        break;
      case cudaErrorInsufficientDriver:
        LOG(WARNING) << "Insufficient cuda driver. Cannot use cuda.";
        count = 0;
        break;
      case cudaErrorInitializationError:
        LOG(WARNING) << "Cuda driver initialization failed, you might not "
                        "have a cuda gpu.";
        count = 0;
        break;
      case cudaErrorUnknown:
        LOG(ERROR) << "Found an unknown error - this may be due to an "
                      "incorrectly set up environment, e.g. changing env "
                      "variable CUDA_VISIBLE_DEVICES after program start. "
                      "I will set the available devices to be zero.";
        count = 0;
        break;
      case cudaErrorMemoryAllocation:
#if CAFFE2_ASAN_ENABLED
        // In ASAN mode, we know that a cudaErrorMemoryAllocation error will
        // pop up.
        LOG(ERROR) << "It is known that CUDA does not work well with ASAN. As "
                      "a result we will simply shut down CUDA support. If you "
                      "would like to use GPUs, turn off ASAN.";
        count = 0;
        break;
#else // CAFFE2_ASAN_ENABLED
        // If we are not in ASAN mode and we get cudaErrorMemoryAllocation,
        // this means that something is wrong before NumCudaDevices() call.
        LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
                      "some cuda functions before calling NumCudaDevices() "
                      "that might have already set an error? Error: "
                   << err;
        break;
#endif // CAFFE2_ASAN_ENABLED
      default:
        LOG(FATAL) << "Unexpected error from cudaGetDeviceCount(). Did you run "
                      "some cuda functions before calling NumCudaDevices() "
                      "that might have already set an error? Error: "
                   << err;
    }
  }
  return count;
}
77 
namespace {
// Process-wide default GPU id; written by SetDefaultGPUID() and read by
// GetDefaultGPUID(). Defaults to device 0.
int gDefaultGPUID = 0;
} // namespace
81 
// Sets the process-wide default GPU id. The id must be smaller than the
// number of visible CUDA devices; otherwise CAFFE_ENFORCE_LT throws.
//
// NOTE(review): the extracted source dropped the second (upper-bound)
// argument of CAFFE_ENFORCE_LT; it is restored as NumCudaDevices() to match
// the parallel check in GetDeviceProperty().
void SetDefaultGPUID(const int deviceid) {
  CAFFE_ENFORCE_LT(
      deviceid,
      NumCudaDevices(),
      "The default gpu id should be smaller than the number of gpus "
      "on this machine: ",
      deviceid,
      " vs ",
      NumCudaDevices());
  gDefaultGPUID = deviceid;
}
93 
// Returns the GPU id previously configured via SetDefaultGPUID()
// (device 0 if it was never called).
int GetDefaultGPUID() {
  return gDefaultGPUID;
}
95 
// Returns the id of the CUDA device currently active for the calling
// thread, as reported by cudaGetDevice(); CUDA_ENFORCE aborts on failure.
//
// NOTE(review): the extracted source was missing the opening declaration
// line; restored to match the declaration `int CaffeCudaGetDevice()`.
int CaffeCudaGetDevice() {
  int gpu_id = 0;
  CUDA_ENFORCE(cudaGetDevice(&gpu_id));
  return gpu_id;
}
101 
// Sets the active CUDA device for the calling thread; CUDA_ENFORCE aborts
// if the runtime rejects the id (e.g. out of range).
void CaffeCudaSetDevice(const int id) {
  CUDA_ENFORCE(cudaSetDevice(id));
}
105 
// Returns the GPU id on which `ptr` resides, or -1 when the pointer lives
// in host memory (either CUDA-registered host memory or plain CPU memory
// unmanaged by CUDA).
int GetGPUIDForPointer(const void* ptr) {
  cudaPointerAttributes attr;
  cudaError_t err = cudaPointerGetAttributes(&attr, ptr);

  if (err == cudaErrorInvalidValue) {
    // Occurs when the pointer is in the CPU address space that is
    // unmanaged by CUDA; make sure the last error state is cleared,
    // since it is persistent
    err = cudaGetLastError();
    CHECK(err == cudaErrorInvalidValue);
    return -1;
  }

  // Otherwise, there must be no error
  CUDA_ENFORCE(err);

  // CAFFE2_CUDA_PTRATTR_MEMTYPE presumably names the memory-type field of
  // cudaPointerAttributes (the field name changed across CUDA versions) —
  // confirm against common_gpu.h.
  if (attr.CAFFE2_CUDA_PTRATTR_MEMTYPE == cudaMemoryTypeHost) {
    return -1;
  }

  return attr.device;
}
128 
// Eagerly fetches and caches cudaDeviceProp for every visible device so
// that GetDeviceProperty() can hand out long-lived references without
// re-querying the CUDA runtime.
//
// NOTE(review): the extracted source dropped the class header and the
// constructor declaration; they are reconstructed here from the usage in
// GetDeviceProperty() (`static CudaDevicePropWrapper props;` and
// `props.props[deviceid]`) — confirm against the original common_gpu.cc.
class CudaDevicePropWrapper {
 public:
  // Sizes the cache to NumCudaDevices() and fills one entry per device.
  CudaDevicePropWrapper() : props(NumCudaDevices()) {
    for (int i = 0; i < NumCudaDevices(); ++i) {
      CUDA_ENFORCE(cudaGetDeviceProperties(&props[i], i));
    }
  }

  vector<cudaDeviceProp> props;
};
138 
// Returns a cached reference to the cudaDeviceProp of `deviceid`.
// Properties for all devices are fetched once, on first call, via the
// static CudaDevicePropWrapper; the returned reference remains valid for
// the lifetime of the process. Throws via CAFFE_ENFORCE_LT if the id is
// out of range.
const cudaDeviceProp& GetDeviceProperty(const int deviceid) {
  // According to C++11 standard section 6.7, static local variable init is
  // thread safe. See
  // https://stackoverflow.com/questions/8102125/is-local-static-variable-initialization-thread-safe-in-c11
  // for details.
  static CudaDevicePropWrapper props;
  CAFFE_ENFORCE_LT(
      deviceid,
      NumCudaDevices(),
      "The gpu id should be smaller than the number of gpus ",
      "on this machine: ",
      deviceid,
      " vs ",
      NumCudaDevices());
  return props.props[deviceid];
}
155 
// Writes a human-readable summary of the given device's properties to
// LOG(INFO). A few fields are unavailable under HIP and are compiled out.
void DeviceQuery(const int device) {
  const cudaDeviceProp& properties = GetDeviceProperty(device);
  std::stringstream report;
  report << std::endl
         << "Device id: " << device << std::endl
         << "Major revision number: " << properties.major << std::endl
         << "Minor revision number: " << properties.minor << std::endl
         << "Name: " << properties.name << std::endl
         << "Total global memory: " << properties.totalGlobalMem << std::endl
         << "Total shared memory per block: " << properties.sharedMemPerBlock
         << std::endl
         << "Total registers per block: " << properties.regsPerBlock
         << std::endl
         << "Warp size: " << properties.warpSize << std::endl;
#ifndef __HIP_PLATFORM_HCC__
  report << "Maximum memory pitch: " << properties.memPitch << std::endl;
#endif
  report << "Maximum threads per block: " << properties.maxThreadsPerBlock
         << std::endl
         << "Maximum dimension of block: " << properties.maxThreadsDim[0]
         << ", " << properties.maxThreadsDim[1] << ", "
         << properties.maxThreadsDim[2] << std::endl
         << "Maximum dimension of grid: " << properties.maxGridSize[0]
         << ", " << properties.maxGridSize[1] << ", "
         << properties.maxGridSize[2] << std::endl
         << "Clock rate: " << properties.clockRate << std::endl
         << "Total constant memory: " << properties.totalConstMem << std::endl;
#ifndef __HIP_PLATFORM_HCC__
  report << "Texture alignment: " << properties.textureAlignment << std::endl
         << "Concurrent copy and execution: "
         << (properties.deviceOverlap ? "Yes" : "No") << std::endl;
#endif
  report << "Number of multiprocessors: " << properties.multiProcessorCount
         << std::endl;
#ifndef __HIP_PLATFORM_HCC__
  report << "Kernel execution timeout: "
         << (properties.kernelExecTimeoutEnabled ? "Yes" : "No") << std::endl;
#endif
  LOG(INFO) << report.str();
}
196 
// Fills `pattern` with an NxN matrix (N = device count) where entry
// [src][dst] is true iff device `src` can access the memory of device
// `dst` peer-to-peer. Diagonal entries are always true. Returns false,
// leaving `pattern` partially filled, if any CUDA query fails.
bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern) {
  int device_count;
  if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
    return false;
  }
  pattern->clear();
  pattern->resize(device_count, vector<bool>(device_count, false));
  for (int src = 0; src < device_count; ++src) {
    for (int dst = 0; dst < device_count; ++dst) {
      // A device can always "access" itself, so only query distinct pairs.
      int can_access = 1;
      if (src != dst &&
          cudaDeviceCanAccessPeer(&can_access, src, dst) != cudaSuccess) {
        return false;
      }
      (*pattern)[src][dst] = (can_access != 0);
    }
  }
  return true;
}
216 
// Returns whether the currently active device offers TensorCore math,
// i.e. compute capability major version 7 (Volta) or newer. Always false
// when built against CUDA older than 9.0, which has no TensorCore support.
//
// NOTE(review): the extracted source was missing the opening declaration
// line; restored to match the declaration `bool TensorCoreAvailable()`.
bool TensorCoreAvailable() {
  // requires CUDA 9.0 and above
#if CUDA_VERSION < 9000
  return false;
#else
  int device = CaffeCudaGetDevice();
  auto& prop = GetDeviceProperty(device);

  return prop.major >= 7;
#endif
}
228 
// Returns the symbolic name of a cublasStatus_t value, for use in error
// messages. The available enumerators vary with the CUDA version and with
// HIP/rocBLAS builds, hence the preprocessor guards; values not covered by
// any case fall through to a generic string.
const char* cublasGetErrorString(cublasStatus_t error) {
  switch (error) {
    case CUBLAS_STATUS_SUCCESS:
      return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
      return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
      return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
      return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
      return "CUBLAS_STATUS_ARCH_MISMATCH";
#ifndef __HIP_PLATFORM_HCC__
    case CUBLAS_STATUS_MAPPING_ERROR:
      return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
      return "CUBLAS_STATUS_EXECUTION_FAILED";
#endif
    case CUBLAS_STATUS_INTERNAL_ERROR:
      return "CUBLAS_STATUS_INTERNAL_ERROR";
#if CUDA_VERSION >= 6000
    case CUBLAS_STATUS_NOT_SUPPORTED:
      return "CUBLAS_STATUS_NOT_SUPPORTED";
#if CUDA_VERSION >= 6050
    case CUBLAS_STATUS_LICENSE_ERROR:
      return "CUBLAS_STATUS_LICENSE_ERROR";
#endif // CUDA_VERSION >= 6050
#endif // CUDA_VERSION >= 6000
#ifdef __HIP_PLATFORM_HCC__
    case rocblas_status_invalid_size:
      return "rocblas_status_invalid_size";
#endif
  }
  // To suppress compiler warning.
  return "Unrecognized cublas error string";
}
265 
// Returns the symbolic name of a curandStatus_t value, for use in error
// messages. HIP builds add one extra enumerator; values not covered by any
// case fall through to a generic string.
const char* curandGetErrorString(curandStatus_t error) {
  switch (error) {
    case CURAND_STATUS_SUCCESS:
      return "CURAND_STATUS_SUCCESS";
    case CURAND_STATUS_VERSION_MISMATCH:
      return "CURAND_STATUS_VERSION_MISMATCH";
    case CURAND_STATUS_NOT_INITIALIZED:
      return "CURAND_STATUS_NOT_INITIALIZED";
    case CURAND_STATUS_ALLOCATION_FAILED:
      return "CURAND_STATUS_ALLOCATION_FAILED";
    case CURAND_STATUS_TYPE_ERROR:
      return "CURAND_STATUS_TYPE_ERROR";
    case CURAND_STATUS_OUT_OF_RANGE:
      return "CURAND_STATUS_OUT_OF_RANGE";
    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
    case CURAND_STATUS_LAUNCH_FAILURE:
      return "CURAND_STATUS_LAUNCH_FAILURE";
    case CURAND_STATUS_PREEXISTING_FAILURE:
      return "CURAND_STATUS_PREEXISTING_FAILURE";
    case CURAND_STATUS_INITIALIZATION_FAILED:
      return "CURAND_STATUS_INITIALIZATION_FAILED";
    case CURAND_STATUS_ARCH_MISMATCH:
      return "CURAND_STATUS_ARCH_MISMATCH";
    case CURAND_STATUS_INTERNAL_ERROR:
      return "CURAND_STATUS_INTERNAL_ERROR";
#ifdef __HIP_PLATFORM_HCC__
    case HIPRAND_STATUS_NOT_IMPLEMENTED:
      return "HIPRAND_STATUS_NOT_IMPLEMENTED";
#endif
  }
  // To suppress compiler warning.
  return "Unrecognized curand error string";
}
302 
// Turn on the flag g_caffe2_has_cuda_linked to true for HasCudaRuntime()
// function.
namespace {
// Static-initialization hook: constructing the file-scope g_flipper below
// runs internal::SetCudaRuntimeFlag() before main(), so that merely linking
// this translation unit flips the flag read by HasCudaRuntime().
class CudaRuntimeFlagFlipper {
 public:
  CudaRuntimeFlagFlipper() {
    internal::SetCudaRuntimeFlag();
  }
};
static CudaRuntimeFlagFlipper g_flipper;
} // namespace
314 
315 } // namespace caffe2
int CaffeCudaGetDevice()
Gets the current GPU id.
Definition: common_gpu.cc:96
bool TensorCoreAvailable()
Return the availability of TensorCores for math.
Definition: common_gpu.cc:217
const char * curandGetErrorString(curandStatus_t error)
Return a human readable curand error string.
Definition: common_gpu.cc:266
A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime.
Definition: blob.h:13
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:106
const char * cublasGetErrorString(cublasStatus_t error)
Return a human readable cublas error string.
Definition: common_gpu.cc:229
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.
Definition: common_gpu.cc:139
void CaffeCudaSetDevice(const int id)
Sets the active CUDA device for the calling thread.
Definition: common_gpu.cc:102
int NumCudaDevices()
Returns the number of devices.
Definition: common_gpu.cc:15
void DeviceQuery(const int device)
Runs a device query function and prints out the results to LOG(INFO).
Definition: common_gpu.cc:156