1 #include "caffe2/core/context_gpu.h" 2 #include "caffe2/core/cudnn_wrappers.h" 3 #include "caffe2/operators/conv_op_cache_cudnn.h" 4 #include "caffe2/operators/conv_transpose_op.h" 5 #include "caffe2/operators/op_utils_cudnn.h" 11 template <
class... Args>
14 cudnn_wrapper_(&context_),
15 cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
17 kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
19 OperatorBase::GetSingleArgument<int>(
"exhaustive_search", 0)),
21 OperatorBase::GetSingleArgument<int>(
"deterministic", 0)),
22 cudnn_state_(OperatorBase::GetSingleArgument<int>(
"cudnn_state", 0)),
23 force_algo_(OperatorBase::GetRepeatedArgument<int>(
25 vector<int>{-1, -1, -1})),
27 OperatorBase::GetSingleArgument<bool>(
"enable_tensor_core", 1)) {
    CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
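
    // "force_algo" pins all three algorithms (fwd, dgrad, wgrad) at once;
    // the per-pass force_algo_fwd/dgrad/wgrad arguments pin them one at a
    // time. Mixing the two styles is rejected below.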
    bool individual_force_algo = OperatorBase::HasArgument("force_algo_fwd") ||
        OperatorBase::HasArgument("force_algo_dgrad") ||
        OperatorBase::HasArgument("force_algo_wgrad");
    if (OperatorBase::HasArgument("force_algo")) {
      CAFFE_ENFORCE(
          !individual_force_algo,
          "Cannot specify both force_algo and any of ",
          "force_algo_fwd, force_algo_dgrad, force_algo_wgrad");
    } else {
      force_algo_ = std::vector<int>{-1, -1, -1};
      force_algo_[ALGO_FWD] =
          OperatorBase::GetSingleArgument<int>("force_algo_fwd", -1);
      force_algo_[ALGO_DGRAD] =
          OperatorBase::GetSingleArgument<int>("force_algo_dgrad", -1);
      force_algo_[ALGO_WGRAD] =
          OperatorBase::GetSingleArgument<int>("force_algo_wgrad", -1);
    }
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_));
    CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_));
    if (InputSize() == 3) {
      CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_));
    }
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_));
    CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_));
  }
  ~CudnnConvTransposeOpBase() override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_));
    CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_));
    if (InputSize() == 3) {
      CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_));
    }
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_));
    CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_));
  }
 protected:
  vector<int64_t> cudnn_input_dims_;
  vector<int64_t> cudnn_filter_dims_;

  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t bottom_desc_;
  cudnnFilterDescriptor_t filter_desc_;
  cudnnTensorDescriptor_t bias_desc_;
  cudnnTensorDescriptor_t top_desc_;
  cudnnConvolutionDescriptor_t conv_desc_;
  const size_t cudnn_ws_nbytes_limit_;
  size_t cudnn_ws_nbytes_;
  bool exhaustive_search_;
  bool deterministic_;
  size_t cudnn_state_;
  vector<int> force_algo_;
  bool enable_tensor_core_;
};
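
// The ConvTranspose forward pass is exactly the data-gradient pass of an
// ordinary convolution with the same filter, so this op dispatches to
// cudnnConvolutionBackwardData.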
template <typename T>
class CudnnConvTransposeOp final : public CudnnConvTransposeOpBase {
 public:
  template <class... Args>
  explicit CudnnConvTransposeOp(Args&&... args)
      : CudnnConvTransposeOpBase(std::forward<Args>(args)...) {}

  ~CudnnConvTransposeOp() override {}

  bool RunOnDevice() override;

 protected:
  AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t> data_algo_cache_;
  cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
  // Input: X, filter, (bias); Output: Y.
  INPUT_TAGS(INPUT, FILTER, BIAS);
};
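
// Gradient of ConvTranspose: dfilter via cudnnConvolutionBackwardFilter,
// dbias via cudnnConvolutionBackwardBias, and (optionally) dX via an
// ordinary cudnnConvolutionForward.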
template <typename T>
class CudnnConvTransposeGradientOp final : public CudnnConvTransposeOpBase {
 public:
  template <class... Args>
  explicit CudnnConvTransposeGradientOp(Args&&... args)
      : CudnnConvTransposeOpBase(std::forward<Args>(args)...),
        no_bias_(OperatorBase::GetSingleArgument<bool>("no_bias", false)) {
    CAFFE_ENFORCE(
        !(no_bias_ && OutputSize() == 3),
        "If bias is not present, you should not have 3 grad outputs.");
  }

  ~CudnnConvTransposeGradientOp() override {}

  bool RunOnDevice() override;

 protected:
  cudnnConvolutionFwdAlgo_t algo_;
  cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
  AlgorithmsCache<cudnnConvolutionFwdAlgo_t> forward_algo_cache_;
  AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t> filter_algo_cache_;
  const bool no_bias_;
  // Input: X, filter, dY; Output: dfilter, dbias (or dX when no_bias), dX.
  INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
  OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
};
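
// Forward pass: Y = ConvTranspose(X, filter) (+ bias). Note the role
// reversal relative to a regular convolution: X is bound to bottom_desc_
// and passed as the "dy" argument of cudnnConvolutionBackwardData, while Y
// is bound to top_desc_ and produced as the "dx" result.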
template <typename T>
bool CudnnConvTransposeOp<T>::RunOnDevice() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  int C = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      C = filter.dim32(3);
      break;
    case StorageOrder::NCHW:
      C = filter.dim32(1);
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
  auto sizes = ConvTransposeUnpoolBase<CUDAContext>::GetOutputSize(X, C);
  auto* Y = Output(0, sizes, at::dtype<T>());
  int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0);
      H = X.dim32(1);
      W = X.dim32(2);
      M = X.dim32(3);
      H_out = Y->dim32(1);
      W_out = Y->dim32(2);
      CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
      CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
      CAFFE_ENFORCE_EQ(filter.dim32(3), C);
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0);
      M = X.dim32(1);
      H = X.dim32(2);
      W = X.dim32(3);
      H_out = Y->dim32(2);
      W_out = Y->dim32(3);
      CAFFE_ENFORCE_EQ(filter.dim32(1), C);
      CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h());
      CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w());
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE_EQ(bias.dim(), 1);
    CAFFE_ENFORCE_EQ(bias.dim32(0), C);
  }
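
  // Reconfigure descriptors and re-select algorithms only when the input or
  // filter shape changed since the last run.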
  bool input_changed = (X.sizes() != cudnn_input_dims_);
  bool filter_changed = (filter.sizes() != cudnn_filter_dims_);

  if (input_changed || filter_changed) {
    VLOG(1) << "Changing the cudnn descriptor configurations.";
    if (input_changed) {
      cudnn_input_dims_ = X.sizes().vec();
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          bottom_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T>::type,
          N,
          M,
          H,
          W));
    }
    if (filter_changed) {
      cudnn_filter_dims_ = filter.sizes().vec();
      CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
          filter_desc_,
          cudnnTypeWrapper<T>::type,
          GetCudnnTensorFormat(order_),
          M,
          C,
          kernel_h(),
          kernel_w()));
      if (InputSize() == 3) {
        CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
            bias_desc_,
            GetCudnnTensorFormat(order_),
            cudnnTypeWrapper<T>::type,
            1,
            C,
            1,
            1));
      }
    }
    CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
        top_desc_,
        GetCudnnTensorFormat(order_),
        cudnnTypeWrapper<T>::type,
        N,
        C,
        H_out,
        W_out));
241 "The current padding scheme leads to unequal padding on the top and " 242 "bottom, which is not supported by cudnn.");
246 "The current padding scheme leads to unequal padding on the left " 247 "and right, which is not supported by cudnn.");
249 #if CUDNN_VERSION_MIN(6,0,0) 250 CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
258 CUDNN_CROSS_CORRELATION,
261 CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
269 CUDNN_CROSS_CORRELATION));
#if CUDNN_VERSION_MIN(7, 0, 0)
    if (enable_tensor_core_) {
      CUDNN_ENFORCE(
          cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
    }
#endif
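
    // Select the backward-data algorithm: an explicitly forced algorithm
    // wins; deterministic mode pins ALGO_1; exhaustive search benchmarks all
    // candidates and caches the winner per shape; otherwise use cuDNN's
    // heuristic under the workspace limit.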
    if (force_algo_[ALGO_DGRAD] >= 0) {
      bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD];
    } else if (deterministic_) {
      bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    } else if (exhaustive_search_) {
      bwd_data_algo_ =
          data_algo_cache_.getAlgorithm(X.sizes(), filter.sizes(), 0, [&]() {
            int returned_algo_count;
            std::array<
                cudnnConvolutionBwdDataAlgoPerf_t,
                kNUM_CUDNN_BWD_DATA_ALGS>
                data_perf_stat;
            cudnn_wrapper_.with_cudnn_state(
                cudnn_state_, [&](CuDNNState* state) {
                  state->workspace().reset();
                  CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithm(
                      state->cudnn_handle(),
                      filter_desc_,
                      bottom_desc_,
                      conv_desc_,
                      top_desc_,
                      kNUM_CUDNN_BWD_DATA_ALGS,
                      &returned_algo_count,
                      data_perf_stat.data()));
                });
            LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
            return data_perf_stat[0].algo;
          });
    } else {
      CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          filter_desc_,
          bottom_desc_,
          conv_desc_,
          top_desc_,
          CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &bwd_data_algo_));
    }

    size_t bwd_data_ws_size;
    CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        filter_desc_,
        bottom_desc_,
        conv_desc_,
        top_desc_,
        bwd_data_algo_,
        &bwd_data_ws_size));
    cudnn_ws_nbytes_ = bwd_data_ws_size;
    VLOG(1) << "CuDNN algorithm: " << bwd_data_algo_;
    VLOG(1) << "CuDNN workspace size: " << bwd_data_ws_size;
  }
  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_ENFORCE(cudnnConvolutionBackwardData(
        state->cudnn_handle(),
        cudnnTypeWrapper<T>::kOne(),
        filter_desc_,
        filter.template data<T>(),
        bottom_desc_,
        X.template data<T>(),
        conv_desc_,
        bwd_data_algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        cudnnTypeWrapper<T>::kZero(),
        top_desc_,
        Y->template mutable_data<T>()));
  });
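
  // If a bias input is given, broadcast-add it over the channel dimension.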
  if (InputSize() == 3) {
    CUDNN_ENFORCE(cudnnAddTensor(
        cudnn_wrapper_.inline_cudnn_handle(),
        cudnnTypeWrapper<T>::kOne(),
        bias_desc_,
        Input(BIAS).template data<T>(),
        cudnnTypeWrapper<T>::kOne(),
        top_desc_,
        Y->template mutable_data<T>()));
  }
  return true;
}
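
// Gradient pass: consumes X, filter, and dY; produces dfilter, dbias
// (unless no_bias is set), and optionally dX.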
template <typename T>
bool CudnnConvTransposeGradientOp<T>::RunOnDevice() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  CAFFE_ENFORCE_EQ(X.dim(), 4);
  CAFFE_ENFORCE_EQ(filter.dim(), 4);
  int C = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      C = filter.dim32(3);
      break;
    case StorageOrder::NCHW:
      C = filter.dim32(1);
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
  int N = 0, M = 0, H = 0, W = 0, H_out = 0, W_out = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0);
      H = X.dim32(1);
      W = X.dim32(2);
      M = X.dim32(3);
      H_out = dY.dim32(1);
      W_out = dY.dim32(2);
      CAFFE_ENFORCE_EQ(filter.dim32(1), kernel_h());
      CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_w());
      CAFFE_ENFORCE_EQ(filter.dim32(3), C);
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0);
      M = X.dim32(1);
      H = X.dim32(2);
      W = X.dim32(3);
      H_out = dY.dim32(2);
      W_out = dY.dim32(3);
      CAFFE_ENFORCE_EQ(filter.dim32(1), C);
      CAFFE_ENFORCE_EQ(filter.dim32(2), kernel_h());
      CAFFE_ENFORCE_EQ(filter.dim32(3), kernel_w());
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }
  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());
  bool input_changed = (X.sizes() != cudnn_input_dims_);
  bool filter_changed = (filter.sizes() != cudnn_filter_dims_);
  if (input_changed || filter_changed) {
    VLOG(1) << "Changing the cudnn descriptor configurations.";
    if (input_changed) {
      cudnn_input_dims_ = X.sizes().vec();
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          bottom_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T>::type,
          N,
          M,
          H,
          W));
    }
    if (filter_changed) {
      cudnn_filter_dims_ = filter.sizes().vec();
      CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
          filter_desc_,
          cudnnTypeWrapper<T>::type,
          GetCudnnTensorFormat(order_),
          M,
          C,
          kernel_h(),
          kernel_w()));
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          bias_desc_,
          GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T>::type,
          1,
          C,
          1,
          1));
    }
    CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
        top_desc_,
        GetCudnnTensorFormat(order_),
        cudnnTypeWrapper<T>::type,
        N,
        C,
        H_out,
        W_out));
471 "The current padding scheme leads to unequal padding on the top and " 472 "bottom, which is not supported by cudnn.");
476 "The current padding scheme leads to unequal padding on the left " 477 "and right, which is not supported by cudnn.");
478 #if CUDNN_VERSION_MIN(6,0,0) 479 CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
487 CUDNN_CROSS_CORRELATION,
490 CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
498 CUDNN_CROSS_CORRELATION));
#if CUDNN_VERSION_MIN(7, 0, 0)
    if (enable_tensor_core_) {
      CUDNN_ENFORCE(
          cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
    }
#endif
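
    // Two algorithms are needed here: backward-filter for dfilter and
    // forward for dX (in the transposed scheme, dX = Conv(dY, filter)).
    // Selection order is the same as in the forward op: forced, then
    // deterministic, then exhaustive search, then heuristic.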
    if (force_algo_[ALGO_WGRAD] >= 0) {
      bwd_filter_algo_ =
          (cudnnConvolutionBwdFilterAlgo_t)force_algo_[ALGO_WGRAD];
    } else if (deterministic_) {
      algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    } else if (exhaustive_search_) {
      bwd_filter_algo_ =
          filter_algo_cache_.getAlgorithm(X.sizes(), filter.sizes(), 0, [&]() {
            LOG(INFO) << "CUDNN Convolution bwd: doing exhaustive search.";
            int returned_algo_count;
            std::array<
                cudnnConvolutionBwdFilterAlgoPerf_t,
                kNUM_CUDNN_BWD_FILTER_ALGS>
                filter_perf_stat;
            cudnn_wrapper_.with_cudnn_state(
                cudnn_state_, [&](CuDNNState* state) {
                  state->workspace().reset();
                  CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithm(
                      state->cudnn_handle(),
                      top_desc_,
                      bottom_desc_,
                      conv_desc_,
                      filter_desc_,
                      kNUM_CUDNN_BWD_FILTER_ALGS,
                      &returned_algo_count,
                      filter_perf_stat.data()));
                });
            LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
            return filter_perf_stat[0].algo;
          });
      algo_ = forward_algo_cache_.getAlgorithm(
          X.sizes(), filter.sizes(), 0, [&]() {
            int returned_algo_count;
            std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
                fwd_perf_stat;
            cudnn_wrapper_.with_cudnn_state(
                cudnn_state_, [&](CuDNNState* state) {
                  state->workspace().reset();
                  CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithm(
                      state->cudnn_handle(),
                      top_desc_,
                      filter_desc_,
                      conv_desc_,
                      bottom_desc_,
                      kNUM_CUDNN_FWD_ALGS,
                      &returned_algo_count,
                      fwd_perf_stat.data()));
                });
            LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
            return fwd_perf_stat[0].algo;
          });
    } else {
      CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          top_desc_,
          bottom_desc_,
          conv_desc_,
          filter_desc_,
          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &bwd_filter_algo_));
      CUDNN_ENFORCE(cudnnGetConvolutionForwardAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          top_desc_,
          filter_desc_,
          conv_desc_,
          bottom_desc_,
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &algo_));
    }
    size_t bwd_filter_ws_size, fwd_ws_size;
    CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        top_desc_,
        bottom_desc_,
        conv_desc_,
        filter_desc_,
        bwd_filter_algo_,
        &bwd_filter_ws_size));
    CUDNN_ENFORCE(cudnnGetConvolutionForwardWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        top_desc_,
        filter_desc_,
        conv_desc_,
        bottom_desc_,
        algo_,
        &fwd_ws_size));
    cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, fwd_ws_size);

    VLOG(1) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", " << algo_;
    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD, {C}, at::dtype<T>());
    CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
        cudnn_wrapper_.inline_cudnn_handle(),
        cudnnTypeWrapper<T>::kOne(),
        top_desc_,
        dY.template data<T>(),
        cudnnTypeWrapper<T>::kZero(),
        bias_desc_,
        dbias->template mutable_data<T>()));
  }
  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
        state->cudnn_handle(),
        cudnnTypeWrapper<T>::kOne(),
        top_desc_,
        dY.template data<T>(),
        bottom_desc_,
        X.template data<T>(),
        conv_desc_,
        bwd_filter_algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        cudnnTypeWrapper<T>::kZero(),
        filter_desc_,
        dfilter->template mutable_data<T>()));
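
    // dX is requested either as the explicit third output, or as the second
    // output when no_bias is set.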
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      auto* dX = Output(
          no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
          X.sizes(),
          at::dtype<T>());
      CUDNN_ENFORCE(cudnnConvolutionForward(
          state->cudnn_handle(),
          cudnnTypeWrapper<T>::kOne(),
          top_desc_,
          dY.template data<T>(),
          filter_desc_,
          filter.template data<T>(),
          conv_desc_,
          algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          cudnnTypeWrapper<T>::kZero(),
          bottom_desc_,
          dX->template mutable_data<T>()));
    }
  });
  return true;
}
REGISTER_CUDNN_OPERATOR(ConvTranspose, CudnnConvTransposeOp<float>);
REGISTER_CUDNN_OPERATOR(
    ConvTransposeGradient,
    CudnnConvTransposeGradientOp<float>);

} // namespace caffe2