#ifndef CAFFE2_CORE_COMMON_GPU_H_
#define CAFFE2_CORE_COMMON_GPU_H_

#include <cuda_runtime.h>

// Disable strict-aliasing warnings from the CUDA 9+ headers on GCC >= 4.6.
#if CUDA_VERSION >= 9000
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC diagnostic push
#endif
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // CUDA_VERSION >= 9000

#include <cublas_v2.h>
#include <driver_types.h>

#include "caffe2/core/common.h"
#include "caffe2/core/logging.h"

#include "c10/cuda/CUDAMacros.h"
#include "c10/cuda/CUDAMathCompat.h"
#include <c10/cuda/CUDAGuard.h>

// Defines CAFFE2_CUDA_EXPORT and CAFFE2_CUDA_IMPORT for shared-library builds.
#if defined(_MSC_VER) || defined(__CYGWIN__)
#if defined(CAFFE2_BUILD_SHARED_LIBS)
#define CAFFE2_CUDA_EXPORT __declspec(dllexport)
#define CAFFE2_CUDA_IMPORT __declspec(dllimport)
#else
#define CAFFE2_CUDA_EXPORT
#define CAFFE2_CUDA_IMPORT
#endif
#else
#if defined(__GNUC__)
#define CAFFE2_CUDA_EXPORT __attribute__((__visibility__("default")))
#else
#define CAFFE2_CUDA_EXPORT
#endif
#define CAFFE2_CUDA_IMPORT CAFFE2_CUDA_EXPORT
#endif

// CAFFE2_CUDA_API resolves to export or import depending on whether the main
// Caffe2 CUDA library is being built.
#ifdef CAFFE2_CUDA_BUILD_MAIN_LIB
#define CAFFE2_CUDA_API CAFFE2_CUDA_EXPORT
#else
#define CAFFE2_CUDA_API CAFFE2_CUDA_IMPORT
#endif

// CUDA fp16 support is available from CUDA 7.5 onwards and on HIP.
#ifndef CAFFE_HAS_CUDA_FP16
#if CUDA_VERSION >= 7050 || defined(__HIP_PLATFORM_HCC__)
#define CAFFE_HAS_CUDA_FP16
#endif // CUDA_VERSION >= 7050
#endif // CAFFE_HAS_CUDA_FP16

#ifdef CAFFE_HAS_CUDA_FP16
#include <cuda_fp16.h>
#endif

// CUDA device major revision below which fp16 compute is not supported.
#ifndef __HIP_PLATFORM_HCC__
constexpr int kFp16CUDADevicePropMajor = 6;
#else
constexpr int kFp16CUDADevicePropMajor = 3;
#endif
// Re-enable the strict-aliasing diagnostic disabled above.
#if CUDA_VERSION >= 9000
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC diagnostic pop
#endif
#endif // CUDA_VERSION >= 9000

// The maximum number of peers that each gpu can have when doing p2p setup.
#define CAFFE2_CUDA_MAX_PEER_SIZE 8

#if CUDA_VERSION >= 9000
// Empty class to identify TensorCore-based math.
class TensorCoreEngine {};
#endif // CUDA_VERSION >= 9000

#if CUDA_VERSION >= 10000
#define CAFFE2_CUDA_PTRATTR_MEMTYPE type
#else
#define CAFFE2_CUDA_PTRATTR_MEMTYPE memoryType
#endif

// Runs a device query function and prints out the results to LOG(INFO).
CAFFE2_CUDA_API void DeviceQuery(const int deviceid);

// Returns a peer access pattern: a matrix (nested vector) of booleans
// indicating whether peer access is possible between each pair of devices.
CAFFE2_CUDA_API bool GetCudaPeerAccessPattern(vector<vector<bool>>* pattern);
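
// Usage sketch (illustrative, not part of the original header): log the peer
// access topology reported above. Assumes the caffe2 namespace is in scope;
// the helper name is hypothetical.
inline void LogPeerAccessPattern() {
  vector<vector<bool>> pattern;
  if (GetCudaPeerAccessPattern(&pattern)) {
    for (size_t i = 0; i < pattern.size(); ++i) {
      for (size_t j = 0; j < pattern[i].size(); ++j) {
        LOG(INFO) << "GPU " << i << " -> GPU " << j << ": "
                  << (pattern[i][j] ? "peer access" : "no peer access");
      }
    }
  }
}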
// CUDA: various checks for different function calls.
#define CUDA_ENFORCE(condition, ...)                                     \
  do {                                                                   \
    cudaError_t error = condition;                                       \
    CAFFE_ENFORCE_EQ(                                                    \
        error, cudaSuccess, "Error at: ", __FILE__, ":", __LINE__, ": ", \
        cudaGetErrorString(error), ##__VA_ARGS__);                       \
  } while (0)
#define CUDA_CHECK(condition)                                 \
  do {                                                        \
    cudaError_t error = condition;                            \
    CHECK(error == cudaSuccess) << cudaGetErrorString(error); \
  } while (0)

#define CUDA_DRIVERAPI_ENFORCE(condition)                            \
  do {                                                               \
    CUresult result = condition;                                     \
    if (result != CUDA_SUCCESS) {                                    \
      const char* msg;                                               \
      cuGetErrorName(result, &msg);                                  \
      CAFFE_THROW("Error at: ", __FILE__, ":", __LINE__, ": ", msg); \
    }                                                                \
  } while (0)
#define CUDA_DRIVERAPI_CHECK(condition)                                 \
  do {                                                                  \
    CUresult result = condition;                                        \
    if (result != CUDA_SUCCESS) {                                       \
      const char* msg;                                                  \
      cuGetErrorName(result, &msg);                                     \
      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
                 << msg;                                                \
    }                                                                   \
  } while (0)

#define CUBLAS_ENFORCE(condition)                                                 \
  do {                                                                            \
    cublasStatus_t status = condition;                                            \
    CAFFE_ENFORCE_EQ(                                                             \
        status, CUBLAS_STATUS_SUCCESS, "Error at: ", __FILE__, ":", __LINE__,     \
        ": ", ::caffe2::cublasGetErrorString(status));                            \
  } while (0)
#define CUBLAS_CHECK(condition)                    \
  do {                                             \
    cublasStatus_t status = condition;             \
    CHECK(status == CUBLAS_STATUS_SUCCESS)         \
        << ::caffe2::cublasGetErrorString(status); \
  } while (0)

#define CURAND_ENFORCE(condition)                                                 \
  do {                                                                            \
    curandStatus_t status = condition;                                            \
    CAFFE_ENFORCE_EQ(                                                             \
        status, CURAND_STATUS_SUCCESS, "Error at: ", __FILE__, ":", __LINE__,     \
        ": ", ::caffe2::curandGetErrorString(status));                            \
  } while (0)
#define CURAND_CHECK(condition)                    \
  do {                                             \
    curandStatus_t status = condition;             \
    CHECK(status == CURAND_STATUS_SUCCESS)         \
        << ::caffe2::curandGetErrorString(status); \
  } while (0)

#define CUDA_1D_KERNEL_LOOP(i, n)                                 \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                             \
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);   \
       i += blockDim.x * gridDim.x)                                 \
    for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
         j += blockDim.y * gridDim.y)

// CUDA_KERNEL_ASSERT wraps assert() inside CUDA kernels; device-side assert is
// not supported on Apple platforms or HIP, so it compiles to a no-op there.
#if defined(__APPLE__) || defined(__HIP_PLATFORM_HCC__)
#define CUDA_KERNEL_ASSERT(...)
#else
#define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
#endif

// Default launch configuration used by the helper functions below. These give
// a reasonable occupancy without per-kernel tuning.
constexpr int CAFFE_CUDA_NUM_THREADS = 128;
// 2D blocks
constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMX = 16;
constexpr int CAFFE_CUDA_NUM_THREADS_2D_DIMY = 16;

// The maximum number of blocks to use in the default kernel call.
constexpr int CAFFE_MAXIMUM_NUM_BLOCKS = 4096;
// 2D grids
constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMX = 128;
constexpr int CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMY = 128;

constexpr int kCUDAGridDimMaxX = 2147483647;
constexpr int kCUDAGridDimMaxY = 65535;
constexpr int kCUDAGridDimMaxZ = 65535;
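
// Usage sketch (illustrative, not part of the original header): typical use of
// the ENFORCE/CHECK macros above around runtime calls. The helper name, buffer
// arguments, and sizes are hypothetical.
inline void CopyAndScaleExample(
    cublasHandle_t handle,
    cudaStream_t stream,
    const float* host_src,
    float* device_dst,
    const int n) {
  // Throws (with file/line context) if the asynchronous copy fails.
  CUDA_ENFORCE(cudaMemcpyAsync(
      device_dst, host_src, n * sizeof(float), cudaMemcpyHostToDevice, stream));
  // cuBLAS calls return cublasStatus_t, so they go through the CUBLAS_* macros.
  CUBLAS_ENFORCE(cublasSetStream(handle, stream));
  const float alpha = 2.0f;
  CUBLAS_ENFORCE(cublasSscal(handle, n, &alpha, device_dst, 1));
  // CUDA_CHECK aborts via CHECK() instead of throwing.
  CUDA_CHECK(cudaStreamSynchronize(stream));
}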
// Compute the number of blocks needed to run N threads.
inline int CAFFE_GET_BLOCKS(const int N) {
  return std::max(
      std::min(
          (N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS,
          CAFFE_MAXIMUM_NUM_BLOCKS),
      // Use at least 1 block, since CUDA does not allow an empty grid.
      1);
}

// Compute the number of blocks needed to run N threads for a 2D grid.
inline dim3 CAFFE_GET_BLOCKS_2D(const int N, const int /* M */) {
  dim3 grid;
  grid.x = std::max(
      std::min(
          (N + CAFFE_CUDA_NUM_THREADS_2D_DIMX - 1) /
              CAFFE_CUDA_NUM_THREADS_2D_DIMX,
          CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMX),
      1);
  grid.y = std::max(
      std::min(
          (N + CAFFE_CUDA_NUM_THREADS_2D_DIMY - 1) /
              CAFFE_CUDA_NUM_THREADS_2D_DIMY,
          CAFFE_MAXIMUM_NUM_BLOCKS_2D_DIMY),
      1);
  return grid;
}
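
// Usage sketch (illustrative, not part of the original header): a grid-stride
// kernel written with CUDA_1D_KERNEL_LOOP and launched with the helpers above.
// The kernel name and its arguments are hypothetical.
__global__ void AxpyExample(const int n, const float a, const float* x, float* y) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    y[i] = a * x[i] + y[i];
  }
}

inline void LaunchAxpyExample(
    const int n, const float a, const float* x, float* y, cudaStream_t stream) {
  AxpyExample<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, 0, stream>>>(
      n, a, x, y);
  CUDA_ENFORCE(cudaGetLastError());
}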
template <typename T, int N>
struct SimpleArray {
  T data[N];
};

constexpr int kCUDATensorMaxDims = 8;
#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(val, Func, T, ...) \
  do {                                                            \
    CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims);                    \
    switch (val) {                                                \
      case 1: Func<T, 1>(__VA_ARGS__); break;                     \
      case 2: Func<T, 2>(__VA_ARGS__); break;                     \
      case 3: Func<T, 3>(__VA_ARGS__); break;                     \
      case 4: Func<T, 4>(__VA_ARGS__); break;                     \
      case 5: Func<T, 5>(__VA_ARGS__); break;                     \
      case 6: Func<T, 6>(__VA_ARGS__); break;                     \
      case 7: Func<T, 7>(__VA_ARGS__); break;                     \
      case 8: Func<T, 8>(__VA_ARGS__); break;                     \
      default: break;                                             \
    }                                                             \
  } while (false)

#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(val, Func, T1, T2, ...) \
  do {                                                                 \
    CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims);                         \
    switch (val) {                                                     \
      case 1: Func<T1, T2, 1>(__VA_ARGS__); break;                     \
      case 2: Func<T1, T2, 2>(__VA_ARGS__); break;                     \
      case 3: Func<T1, T2, 3>(__VA_ARGS__); break;                     \
      case 4: Func<T1, T2, 4>(__VA_ARGS__); break;                     \
      case 5: Func<T1, T2, 5>(__VA_ARGS__); break;                     \
      case 6: Func<T1, T2, 6>(__VA_ARGS__); break;                     \
      case 7: Func<T1, T2, 7>(__VA_ARGS__); break;                     \
      case 8: Func<T1, T2, 8>(__VA_ARGS__); break;                     \
      default: break;                                                  \
    }                                                                  \
  } while (false)

#define DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(val, Func, T1, T2, T3, ...) \
  do {                                                                     \
    CAFFE_ENFORCE_LE(val, kCUDATensorMaxDims);                             \
    switch (val) {                                                         \
      case 1: Func<T1, T2, T3, 1>(__VA_ARGS__); break;                     \
      case 2: Func<T1, T2, T3, 2>(__VA_ARGS__); break;                     \
      case 3: Func<T1, T2, T3, 3>(__VA_ARGS__); break;                     \
      case 4: Func<T1, T2, T3, 4>(__VA_ARGS__); break;                     \
      case 5: Func<T1, T2, T3, 5>(__VA_ARGS__); break;                     \
      case 6: Func<T1, T2, T3, 6>(__VA_ARGS__); break;                     \
      case 7: Func<T1, T2, T3, 7>(__VA_ARGS__); break;                     \
      case 8: Func<T1, T2, T3, 8>(__VA_ARGS__); break;                     \
      default: break;                                                      \
    }                                                                      \
  } while (false)

#endif // CAFFE2_CORE_COMMON_GPU_H_
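
The DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_* macros above pick a compile-time rank for a template from a runtime value in [1, kCUDATensorMaxDims]. A minimal usage sketch, assuming the caffe2 namespace is in scope; ReduceImpl and its parameters are hypothetical:

template <typename T, int D>
void ReduceImpl(const int* dims, const T* X, T* Y);

void Reduce(const int ndim, const int* dims, const float* X, float* Y) {
  // Expands to a switch that calls ReduceImpl<float, 1> ... ReduceImpl<float, 8>.
  DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(ndim, ReduceImpl, float, dims, X, Y);
}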
Function reference:

int CudaVersion()
    A runtime function to report the CUDA version that Caffe2 is built with.
bool HasCudaGPU()
    Checks whether the current running session has a CUDA GPU present.
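
A minimal guard sketch before taking a GPU code path; the surrounding helper and the CPU fallback are hypothetical:

void RunIfGpuAvailable() {
  if (!caffe2::HasCudaGPU()) {
    LOG(WARNING) << "No CUDA GPU present; skipping the GPU path.";
    return;
  }
  LOG(INFO) << "Found " << caffe2::NumCudaDevices() << " CUDA device(s).";
}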
int CaffeCudaGetDevice()
    Gets the current GPU id.
bool TensorCoreAvailable()
    Returns the availability of TensorCores for math.
const char* curandGetErrorString(curandStatus_t error)
    Returns a human-readable cuRAND error string.
int GetGPUIDForPointer(const void* ptr)
    Gets the GPU id that the given pointer is located on.
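
A brief sketch of the pointer-to-device lookup; the helper name and buffer size are hypothetical:

void LogBufferDevice() {
  float* buffer = nullptr;
  CUDA_ENFORCE(cudaMalloc(&buffer, 1024 * sizeof(float)));
  LOG(INFO) << "Buffer allocated on GPU " << caffe2::GetGPUIDForPointer(buffer);
  CUDA_ENFORCE(cudaFree(buffer));
}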
dim3 CAFFE_GET_BLOCKS_2D(const int N, const int)
    Computes the number of blocks needed to run N threads for a 2D grid.
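
An illustrative pairing of CAFFE_GET_BLOCKS_2D with the CUDA_2D_KERNEL_LOOP macro; the kernel and launcher names are hypothetical:

__global__ void FillExample(const int rows, const int cols, float* out) {
  CUDA_2D_KERNEL_LOOP(i, rows, j, cols) {
    out[i * cols + j] = 0.0f;
  }
}

void LaunchFillExample(const int rows, const int cols, float* out, cudaStream_t stream) {
  const dim3 threads(
      caffe2::CAFFE_CUDA_NUM_THREADS_2D_DIMX,
      caffe2::CAFFE_CUDA_NUM_THREADS_2D_DIMY);
  FillExample<<<caffe2::CAFFE_GET_BLOCKS_2D(rows, cols), threads, 0, stream>>>(
      rows, cols, out);
  CUDA_CHECK(cudaGetLastError());
}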
const char* cublasGetErrorString(cublasStatus_t error)
    Returns a human-readable cuBLAS error string.
const cudaDeviceProp& GetDeviceProperty(const int deviceid)
    Gets the device property for the given device.
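
A short sketch tying GetDeviceProperty to the kFp16CUDADevicePropMajor constant defined earlier in the header; the helper name is hypothetical:

inline bool DeviceSupportsFp16(const int device_id) {
  const cudaDeviceProp& prop = caffe2::GetDeviceProperty(device_id);
  return prop.major >= kFp16CUDADevicePropMajor;
}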
int CAFFE_GET_BLOCKS(const int N)
    Computes the number of blocks needed to run N threads.
c10::cuda::CUDAGuard
    A variant of DeviceGuard that is specialized for CUDA.
void CaffeCudaSetDevice(const int id)
    Sets the current GPU to the given id.
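
A small sketch of scoped device switching; the guard restores the previous device when it leaves scope, and the helper name is hypothetical:

void RunOnDevice(const int device_id) {
  c10::cuda::CUDAGuard guard(device_id);
  LOG(INFO) << "Now running on GPU " << caffe2::CaffeCudaGetDevice();
}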
int NumCudaDevices()
    Returns the number of devices.
void DeviceQuery(const int deviceid)
    Runs a device query function and prints out the results to LOG(INFO).
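
A closing sketch that enumerates and queries every visible device; the helper name is hypothetical:

void LogAllDevices() {
  const int count = caffe2::NumCudaDevices();
  for (int i = 0; i < count; ++i) {
    caffe2::DeviceQuery(i);  // prints device i's properties to LOG(INFO)
  }
}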