#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
#define CAFFE2_CORE_CONTEXT_GPU_H_

#include "caffe2/core/common.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_base.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/numa.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2_pb.h"

// cuDNN support is optional and gated on the CAFFE2_USE_CUDNN macro.
#ifdef CAFFE2_USE_CUDNN
#include "caffe2/core/common_cudnn.h"
#endif // CAFFE2_USE_CUDNN

#include <c10/core/Device.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>

// The memory pool implementation used for CUDA allocations.
enum class CudaMemoryPoolType {
  NONE = 0, // Do not use a memory pool.
  CUB = 1, // Use the cub memory pool.
  THC = 2, // Use the THC caching allocator.
};

// Gets the current memory pool type used by Caffe2. The memory pool is set
// up during Caffe2's global initialization.
CAFFE2_CUDA_API CudaMemoryPoolType GetCudaMemoryPoolType();
// A struct to host thread-local cuda objects: each thread keeps its own
// non-default CUDA streams plus the cuBLAS / cuDNN handles bound to them.
// This class is used internally by CUDAContext.
class CAFFE2_CUDA_API ThreadLocalCUDAObjects {
  friend class CUDAContext;

 private:
  ThreadLocalCUDAObjects() {
    for (DeviceIndex i = 0; i < C10_COMPILE_TIME_MAX_GPUS; ++i) {
      cuda_streams_[i] = vector<c10::cuda::CUDAStream>();
    }
  }
  // Records the current stream id for the current thread by binding the
  // thread-local c10 current stream to the requested logical stream.
  void SetCurrentStreamId(DeviceIndex gpu, StreamId stream_id) {
    c10::cuda::setCurrentCUDAStream(GetCUDAStream(gpu, stream_id));
  }
  // Retrieves the CUDAStream for a (gpu, logical stream id) pair, allocating
  // streams from the c10 pool on first use. Streams are handed out round
  // robin, so distinct logical ids are not guaranteed to be distinct streams.
  c10::cuda::CUDAStream GetCUDAStream(DeviceIndex gpu, StreamId stream_id) {
    vector<c10::cuda::CUDAStream>& gpu_streams = cuda_streams_[gpu];
    while (gpu_streams.size() <= static_cast<size_t>(stream_id)) {
      gpu_streams.emplace_back(
          c10::cuda::getStreamFromPool(/* isHighPriority */ false, gpu));
    }
    return gpu_streams[stream_id];
  }
  // Returns the raw cudaStream_t for the thread's current stream on `gpu`.
  cudaStream_t GetStream(DeviceIndex gpu) {
    return c10::cuda::getCurrentCUDAStream(gpu).stream();
  }

  cudaStream_t GetStream(DeviceIndex gpu, StreamId stream_id) {
    return GetCUDAStream(gpu, stream_id).stream();
  }
  // Returns the cuBLAS handle bound to the thread's current stream on `gpu`.
  cublasHandle_t GetHandle(DeviceIndex gpu) {
    return GetHandle(c10::cuda::getCurrentCUDAStream(gpu));
  }

  cublasHandle_t GetHandle(c10::cuda::CUDAStream cuda_stream) {
    CUDAGuard guard(cuda_stream.device_index());
    // Default-constructs a null handle in the map on first lookup.
    auto& r = cublas_handles_[cuda_stream];
    if (r == nullptr) {
      CUBLAS_ENFORCE(cublasCreate(&r));
      // The default is CUBLAS_POINTER_MODE_HOST; override it only with care.
      CUBLAS_ENFORCE(cublasSetPointerMode(r, CUBLAS_POINTER_MODE_HOST));
      CUBLAS_ENFORCE(cublasSetStream(r, cuda_stream));
    }
    return r;
  }
#ifdef CAFFE2_USE_CUDNN
  // Returns the cuDNN handle bound to the thread's current stream on `gpu`.
  cudnnHandle_t GetCudnnHandle(DeviceIndex gpu) {
    return GetCudnnHandle(c10::cuda::getCurrentCUDAStream(gpu));
  }

  cudnnHandle_t GetCudnnHandle(c10::cuda::CUDAStream cuda_stream) {
    CUDAGuard guard(cuda_stream.device_index());
    auto& r = cudnn_handles_[cuda_stream];
    if (r == nullptr) {
      CUDNN_ENFORCE(cudnnCreate(&r));
      CUDNN_ENFORCE(cudnnSetStream(r, cuda_stream));
    }
    return r;
  }
#endif // CAFFE2_USE_CUDNN

  ~ThreadLocalCUDAObjects() noexcept {
    for (auto element : cublas_handles_) {
      if (element.second) {
        CUBLAS_CHECK(cublasDestroy(element.second));
      }
    }
#ifdef CAFFE2_USE_CUDNN
    for (auto element : cudnn_handles_) {
      if (element.second) {
        CUDNN_CHECK(cudnnDestroy(element.second));
      }
    }
#endif // CAFFE2_USE_CUDNN
  }

  // WARNING: the mapping from logical stream id to CUDAStream is not
  // bijective; multiple logical ids may share the same underlying stream.
  vector<c10::cuda::CUDAStream> cuda_streams_[C10_COMPILE_TIME_MAX_GPUS];
  std::unordered_map<c10::cuda::CUDAStream, cublasHandle_t> cublas_handles_;
#ifdef CAFFE2_USE_CUDNN
  std::unordered_map<c10::cuda::CUDAStream, cudnnHandle_t> cudnn_handles_;
#endif // CAFFE2_USE_CUDNN
};

class CAFFE2_CUDA_API CUDAContext final : public BaseContext {
 public:
  // The default cuda context constructor.
  explicit CUDAContext(DeviceIndex gpu_id = -1);
  explicit CUDAContext(const DeviceOption& option);

  ~CUDAContext() override {
    if (curand_generator_) {
      CURAND_CHECK(curandDestroyGenerator(curand_generator_));
    }
    FinishDeviceComputation();
  }
  inline void SwitchToDevice(StreamId stream_id) override {
    getCudaObjects().SetCurrentStreamId(gpu_id_, stream_id);
    CaffeCudaSetDevice(gpu_id_);
  }

  using BaseContext::SwitchToDevice;
  inline void WaitEvent(const Event& ev) override {
    ev.Wait(CUDA, this);
  }
  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
    CAFFE_ENFORCE(ev, "Event must not be null.");
    ev->Record(CUDA, this, err_msg);
  }
  // Synchronizes the current stream and surfaces any pending CUDA error.
  void FinishDeviceComputation() override {
    CUDA_ENFORCE(cudaStreamSynchronize(getCudaObjects().GetStream(gpu_id_)));
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
      CAFFE_THROW("Encountered CUDA error: ", cudaGetErrorString(error));
    }
  }
  inline int device_id() const {
    return gpu_id_;
  }
  inline cudaStream_t cuda_stream() const {
    return getCudaObjects().GetStream(gpu_id_);
  }

  static cudaStream_t cuda_stream(DeviceIndex gpu_id, StreamId stream_id) {
    return getCudaObjects().GetStream(gpu_id, stream_id);
  }
  cublasHandle_t cublas_handle() {
    return getCudaObjects().GetHandle(gpu_id_);
  }
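  // Hypothetical usage sketch: the returned handle is already bound to this
  // context's current stream, so cuBLAS calls can be issued on it directly.
  // `n`, `alpha`, and `d_x` are illustrative assumptions.
  //
  //   CUBLAS_ENFORCE(cublasSscal(context.cublas_handle(), n, &alpha, d_x, 1));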
#ifdef CAFFE2_USE_CUDNN
  cudnnHandle_t cudnn_handle() {
    return getCudaObjects().GetCudnnHandle(gpu_id_);
  }
#endif // CAFFE2_USE_CUDNN

  curandGenerator_t& curand_generator() {
    if (!curand_generator_) {
      CUDAGuard guard(gpu_id_);
      CURAND_ENFORCE(
          curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
      CURAND_ENFORCE(
          curandSetPseudoRandomGeneratorSeed(curand_generator_, random_seed_));
      CHECK_NOTNULL(curand_generator_);
    }
    CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
    return curand_generator_;
  }
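  // Hypothetical usage sketch: the generator is created lazily on first use
  // and re-bound to the context's current stream on every call. `d_data` and
  // `n` are illustrative assumptions (a device buffer and its length).
  //
  //   CURAND_ENFORCE(curandGenerateUniform(
  //       context.curand_generator(), d_data, n));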
  inline static at::DataPtr New(size_t nbytes) {
    return GetAllocator(CUDA)->allocate(nbytes);
  }
  // A mutex used to serialize cudaMalloc / cudaFree against concurrently
  // launching kernels (e.g. NCCL) to avoid deadlocks.
  static std::mutex& mutex();

  // Per-GPU memory statistics; only populated when memory tracking is enabled.
  static std::vector<long> TotalMemoryByGpu();
  static std::vector<long> MaxMemoryByGpu();
  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
    CUDA_ENFORCE(cudaMemcpyAsync(
        dst,
        src,
        nbytes,
        cudaMemcpyDefault,
        getCudaObjects().GetStream(gpu_id_)));
  }
  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<CUDAContext, CUDAContext>(nbytes, src, dst);
  }
  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
  }
  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
  }
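  // Hypothetical usage sketch: the byte-wise copies above are asynchronous on
  // the context's stream, so a host-visible result requires a sync afterwards.
  // `nbytes`, `host_src`, and `device_dst` are illustrative assumptions.
  //
  //   context.CopyBytesFromCPU(nbytes, host_src, device_dst);
  //   context.FinishDeviceComputation(); // wait for the copy to complete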
  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T* src, T* dst) {
    CopyBytes<SrcContext, DstContext>(
        n * sizeof(T),
        static_cast<const void*>(src),
        static_cast<void*>(dst));
  }
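  // Hypothetical usage sketch: copying `n` floats from host to device through
  // the typed helper, which forwards to the asynchronous CopyBytes above.
  // `host_ptr` and `device_ptr` are illustrative assumptions.
  //
  //   context.Copy<float, CPUContext, CUDAContext>(n, host_ptr, device_ptr);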
  template <class SrcContext, class DstContext>
  inline void
  CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
    CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental types.");
    CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
  }
  static void CopyBytesAsync(
      size_t nbytes,
      const void* src,
      Device src_device,
      void* dst,
      Device dst_device);
  static void CopyBytesSync(
      size_t nbytes,
      const void* src,
      Device src_device,
      void* dst,
      Device dst_device);
  // CUDA operators have an asynchronous device part by default.
  static bool HasAsyncPartDefault() {
    return true;
  }

  static bool SupportsAsyncScheduling() {
    return true;
  }
  static bool IsStreamFree(const DeviceOption& option, StreamId stream_id) {
    auto stream = CUDAContext::cuda_stream(option.device_id(), stream_id);
    return cudaStreamQuery(stream) == cudaSuccess;
  }
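  // Hypothetical usage sketch: polling whether the work queued on a logical
  // stream has drained without blocking the caller. The DeviceOption setup
  // below (PROTO_CUDA, device id 0) is an illustrative assumption.
  //
  //   DeviceOption option;
  //   option.set_device_type(PROTO_CUDA);
  //   option.set_device_id(0);
  //   bool idle = CUDAContext::IsStreamFree(option, /* stream_id */ 0);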
  DeviceType device_type() const override {
    return CUDA;
  }

  static constexpr DeviceType GetDeviceType() {
    return CUDA;
  }
 protected:
  int gpu_id_;
  int random_seed_;
  curandGenerator_t curand_generator_{nullptr};
  static ThreadLocalCUDAObjects& getCudaObjects();
};
#endif // CAFFE2_CORE_CONTEXT_GPU_H_