#include "caffe2/operators/conv_pool_op_base.h"

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_op_cache_cudnn.h"
#include "caffe2/operators/op_utils_cudnn.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
 public:
  explicit CudnnConvOpBase(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
            "ws_nbytes_limit", kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
        exhaustive_search_(
            OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
        deterministic_(
            OperatorBase::GetSingleArgument<int>("deterministic", 0)),
        cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)),
        force_algo_(OperatorBase::GetRepeatedArgument<int>(
            "force_algo", vector<int>{-1, -1, -1})),
        enable_tensor_core_(
            OperatorBase::GetSingleArgument<bool>("enable_tensor_core", 1)) {
    CHECK(!deterministic_ || !exhaustive_search_);
    CAFFE_ENFORCE(group_ > 0);
    CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
    for (int i = 0; i < kernel_.size(); ++i) {
      OPERATOR_NEEDS_FEATURE(
          pads_[i] == pads_[kernel_.size() + i],
          "The current padding scheme leads to unequal padding on the left "
          "and right, which is not supported by cudnn.");
    }

#if !(CUDNN_VERSION_MIN(6, 0, 0))
    OPERATOR_NEEDS_FEATURE(
        dilation_h() == 1 && dilation_w() == 1,
        "The cudnn convolution does not support dilation yet.");
#endif

#if !(CUDNN_VERSION_MIN(7, 1, 0))
    if (group_ != 1) {
      for (int dim = 0; dim < kernel_.size(); ++dim) {
        OPERATOR_NEEDS_FEATURE(
            dilation_[dim] == 1,
            "When group is used, dilation should not be set at the same time.");
      }
    }
#endif
#if CUDNN_VERSION_MIN(7, 0, 0)
    // Only keep TensorCore math enabled if the device actually supports it.
    enable_tensor_core_ &= TensorCoreAvailable();
#else
    enable_tensor_core_ = false;
#endif

    bool individual_force_algo = OperatorBase::HasArgument("force_algo_fwd") ||
        OperatorBase::HasArgument("force_algo_dgrad") ||
        OperatorBase::HasArgument("force_algo_wgrad");
    if (OperatorBase::HasArgument("force_algo")) {
      CAFFE_ENFORCE(
          !individual_force_algo,
          "Cannot specify both force_algo and any of",
          "force_algo_fwd, force_algo_dgrad, force_algo_wgrad");
    } else {
      force_algo_ = std::vector<int>{-1, -1, -1};
      force_algo_[ALGO_FWD] =
          OperatorBase::GetSingleArgument<int>("force_algo_fwd", -1);
      force_algo_[ALGO_DGRAD] =
          OperatorBase::GetSingleArgument<int>("force_algo_dgrad", -1);
      force_algo_[ALGO_WGRAD] =
          OperatorBase::GetSingleArgument<int>("force_algo_wgrad", -1);
    }
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_));
    CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_for_bias_));
    CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_));
  }

  ~CudnnConvOpBase() override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_));
    CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_for_bias_));
    CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_));
  }
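
  // Illustrative only (not taken from this file): the cuDNN-specific knobs
  // read in the constructor above arrive as operator arguments, so a NetDef
  // entry along these lines could request an exhaustive search while pinning
  // the dgrad algorithm. The conv shape arguments (kernel, stride, pad, ...)
  // are consumed by the ConvPoolOpBase base class.
  //
  //   op {
  //     type: "Conv"
  //     engine: "CUDNN"
  //     input: "X" input: "W" input: "b"
  //     output: "Y"
  //     arg { name: "kernel" i: 3 }
  //     arg { name: "exhaustive_search" i: 1 }
  //     arg { name: "force_algo_dgrad" i: 1 }
  //   }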
 protected:
  // Sets up an Nd tensor descriptor, honoring the storage order; on
  // cuDNN < 7 only a single group's channel slice (CC = C / group_) is
  // described.
  template <typename T>
  void SetTensorNdDescriptorWithGroup(
      int size,
      cudnnTensorDescriptor_t tensorDesc,
      int N,
      int C,
      int H,
      int W,
      int D) {
#if CUDNN_VERSION_MIN(7, 0, 0)
    const int CC = C;
#else
    const int CC = C / group_;
#endif
    switch (order_) {
      case StorageOrder::NHWC:
        if (size == 4) {
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
              tensorDesc, cudnnTypeWrapper<T>::type, N, CC, H, W,
              H * W * CC, 1, W * CC, CC));
        } else {
          vector<int> dims = {N, H, W, D, CC};
          vector<int> strides = {H * W * D * CC, W * D * CC, D * CC, CC, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              tensorDesc, cudnnTypeWrapper<T>::type, size > 3 ? size : 4,
              dims.data(), strides.data()));
        }
        break;
      case StorageOrder::NCHW:
        if (size == 4) {
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
              tensorDesc, cudnnTypeWrapper<T>::type, N, CC, H, W,
              CC * H * W, H * W, W, 1));
        } else {
          vector<int> dims = {N, CC, H, W, D};
          vector<int> strides = {CC * H * W * D, H * W * D, W * D, D, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              tensorDesc, cudnnTypeWrapper<T>::type, size > 3 ? size : 4,
              dims.data(), strides.data()));
        }
        break;
      default:
        LOG(FATAL) << "Unknown storage order: " << order_;
    }
  }
  // Copies the settings of one convolution descriptor into another.
  void DuplicateConvDesc(
      cudnnConvolutionDescriptor_t input,
      size_t kernelDims,
      size_t dilationDims,
      cudnnConvolutionDescriptor_t copy) {
    if (kernelDims == 1 || kernelDims == 2) {
      cudnnConvolutionMode_t mode;
      cudnnDataType_t dataType;
      int pad_height = 0;
      int pad_width = 0;
      int stride_height = 0;
      int stride_width = 0;
      int dilation_height = 0;
      int dilation_width = 0;
#if CUDNN_VERSION_MIN(6, 0, 0)
      CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
          input, &pad_height, &pad_width, &stride_height, &stride_width,
          &dilation_height, &dilation_width, &mode, &dataType));
#else
      CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
          /* pre-6.0 overload (elided) */));
#endif
#if CUDNN_VERSION_MIN(6, 0, 0)
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          copy, pad_height, pad_width, stride_height, stride_width,
          dilation_height, dilation_width, mode, dataType));
#else
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          /* pre-6.0 overload (elided) */));
#endif
    } else {
      cudnnConvolutionMode_t mode;
      cudnnDataType_t dataType;
      int arrayLength = 0;
      vector<int> ones(dilationDims, 1);
      CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
          input, /* remaining arguments elided */));
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          copy, /* remaining arguments elided */));
    }
  }
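
  // Worked example (editorial, not from the original source) for
  // SetTensorNdDescriptorWithGroup above: with NCHW order, size == 5, N = 2,
  // CC = 4, H = 3, W = 3, D = 2 it produces
  //   dims    = {2, 4, 3, 3, 2}
  //   strides = {4 * 3 * 3 * 2, 3 * 3 * 2, 3 * 2, 2, 1} = {72, 18, 6, 2, 1},
  // i.e. a dense row-major layout, while the NHWC branch keeps the channel
  // stride at 1 and folds CC into the spatial strides instead.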
  // Picks the cuDNN compute type based on the input tensor's data type, the
  // float16_compute argument, and the device's fp16 arithmetic support.
  template <typename T>
  cudnnDataType_t DetermineComputeTypeFromInput(const T& X) {
    const cudaDeviceProp& prop = GetDeviceProperty(/* current device (elided) */);
    cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
    if (X.template IsType<at::Half>()) {
      if (float16_compute_ && prop.major >= 6) {
        VLOG(1) << "CUDNN Convolution: float16_compute specified and "
                << "supported, input data is Half - using Half compute.";
        computeType = CUDNN_DATA_HALF;
      } else if (float16_compute_) {
        VLOG(1) << "CUDNN Convolution: float16_compute specified but "
                << "not supported, input data is Half - using float32 compute.";
      } else {
        VLOG(1) << "CUDNN Convolution: float16_compute not specified but "
                << "input data is Half - using float32 compute.";
      }
    } else {
      VLOG(1) << "CUDNN Convolution: using float32 compute.";
    }
    return computeType;
  }
  // Configures conv_desc_ from the operator's pad/stride/dilation arguments.
  void SetConvDescFromArguments() {
#if CUDNN_VERSION_MIN(6, 0, 0)
    if (kernel_.size() == 1 || kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc_,
          pad_t(),
          kernel_.size() == 1 ? 0 : pad_l(),
          stride_h(),
          kernel_.size() == 1 ? 1 : stride_w(),
          dilation_h(),
          kernel_.size() == 1 ? 1 : dilation_w(),
          CUDNN_CROSS_CORRELATION,
          compute_type_));
    } else {
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc_, kernel_.size(), pads_.data(), stride_.data(),
          dilation_.data(), CUDNN_CROSS_CORRELATION, compute_type_));
    }
#else
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc_, pad_t(), pad_l(), stride_h(), stride_w(),
          /* pre-6.0 extra parameters (elided) */
          CUDNN_CROSS_CORRELATION));
    } else {
      vector<int> ones(dilation_.size(), 1);
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc_, kernel_.size(), pads_.data(), stride_.data(),
          ones.data(), CUDNN_CROSS_CORRELATION,
          /* data type (elided) */));
    }
#endif
  }
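
  // Summary of DetermineComputeTypeFromInput above (the table is editorial):
  //   input dtype | float16_compute | device major >= 6 | compute type
  //   ------------+-----------------+-------------------+-----------------
  //   Half        | true            | true              | CUDNN_DATA_HALF
  //   Half        | true            | false             | CUDNN_DATA_FLOAT
  //   Half        | false           | (any)             | CUDNN_DATA_FLOAT
  //   float       | (any)           | (any)             | CUDNN_DATA_FLOAT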
  // Rewrites an existing convolution descriptor with a different compute
  // type, preserving its pads, strides, dilations and mode.
  void SetConvDescComputeType(
      cudnnConvolutionDescriptor_t conv_desc,
      cudnnDataType_t math) {
    if (kernel_.size() == 2) {
      cudnnConvolutionMode_t mode;
      cudnnDataType_t dataType;
      int pad_height = 0;
      int pad_width = 0;
      int stride_height = 0;
      int stride_width = 0;
      int dilation_height = 0;
      int dilation_width = 0;
#if CUDNN_VERSION_MIN(6, 0, 0)
      CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
          conv_desc, &pad_height, &pad_width, &stride_height, &stride_width,
          &dilation_height, &dilation_width, &mode, &dataType));
#else
      CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
          /* pre-6.0 overload (elided) */));
#endif
#if CUDNN_VERSION_MIN(6, 0, 0)
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc, pad_height, pad_width, stride_height, stride_width,
          dilation_height, dilation_width, mode, math));
#else
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          /* pre-6.0 overload (elided) */));
#endif
    } else {
      cudnnConvolutionMode_t mode;
      cudnnDataType_t dataType;
      int arrayLength = 0;
      vector<int> ones(dilation_.size(), 1);
      CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
          conv_desc, /* remaining arguments elided */));
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc, /* same settings with data type = math (elided) */));
    }
  }
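
  // Why SetConvDescComputeType does a get/set round trip (editorial reading):
  // cuDNN takes the compute type together with the full pad/stride/dilation
  // configuration, so switching between the fp32 and fp16 passes during the
  // exhaustive searches below re-reads the current settings and writes them
  // back unchanged except for the data type.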
  vector<int64_t> cudnn_input_dims_;
  vector<int64_t> cudnn_filter_dims_;

  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t bottom_desc_;
  cudnnFilterDescriptor_t filter_desc_;
  cudnnTensorDescriptor_t bias_desc_;
  cudnnTensorDescriptor_t top_desc_;
  // Describes the full output (all groups) and is only used for bias addition.
  cudnnTensorDescriptor_t top_desc_for_bias_;
  cudnnConvolutionDescriptor_t conv_desc_;
  const size_t cudnn_ws_nbytes_limit_;
  size_t cudnn_ws_nbytes_;
  bool exhaustive_search_;
  bool deterministic_;
  size_t cudnn_state_;
  vector<int> force_algo_; // stored as {ALGO_FWD, ALGO_DGRAD, ALGO_WGRAD}
  bool enable_tensor_core_;
  cudnnDataType_t compute_type_;
};
class CudnnConvOp final : public CudnnConvOpBase {
 public:
  explicit CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
      : CudnnConvOpBase(operator_def, ws) {}

  ~CudnnConvOp() override {}

  template <typename T_X, typename T_W, typename T_B, typename T_Y>
  bool DoRunWithType();

  bool RunOnDevice() override;

 private:
  cudnnConvolutionFwdAlgo_t algo_;
  using ConvFwdAlgorithmWithCost = std::tuple<cudnnConvolutionFwdAlgo_t, float>;
  AlgorithmsCache<ConvFwdAlgorithmWithCost> algo_cache_;
  // Input: X, W, b;  Output: Y.
  INPUT_TAGS(INPUT, FILTER, BIAS);
};
class CudnnConvGradientOp final : public CudnnConvOpBase {
 public:
  explicit CudnnConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : CudnnConvOpBase(operator_def, ws),
        no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) {
    CAFFE_ENFORCE(
        !(no_bias_ && OutputSize() == 3),
        "If bias is not present, you should not have 3 grad output.");

    CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_data_conv_desc_));
    CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_filter_conv_desc_));
  }

  ~CudnnConvGradientOp() override {
    CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_data_conv_desc_));
    CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_filter_conv_desc_));
  }

  template <
      typename T_X, typename T_DY, typename T_W, typename T_B,
      typename T_DX, typename T_DW, typename T_DB>
  bool DoRunWithType();

  bool RunOnDevice() override;

 private:
  cudnnConvolutionDescriptor_t bwd_filter_conv_desc_;
  cudnnConvolutionDescriptor_t bwd_data_conv_desc_;
  cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
  cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
  using ConvBwdFilterAlgorithmWithCost =
      std::tuple<cudnnConvolutionBwdFilterAlgo_t, float>;
  using ConvBwdDataAlgorithmWithCost =
      std::tuple<cudnnConvolutionBwdDataAlgo_t, float>;
  AlgorithmsCache<ConvBwdFilterAlgorithmWithCost> filter_algo_cache_;
  AlgorithmsCache<ConvBwdDataAlgorithmWithCost> data_algo_cache_;
  bool no_bias_;
  // Input: X, W, dY;  Output: dW, db (or dX), and optionally dX.
  INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
  OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
};
// The implementations below may time two algorithm searches: one with fp32
// compute and one with fp16 compute.
static constexpr std::array<cudnnDataType_t, 2> kComputeTypesToTry = {
    CUDNN_DATA_FLOAT,
    CUDNN_DATA_HALF};
static constexpr std::array<const char*, 2> kComputePassNames = {
    "fp32 compute",
    "fp16 compute"};
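
// Editorial note on the two-pass search driven by the arrays above: when
// compute_type_ resolves to CUDNN_DATA_FLOAT only index 0 (the fp32 pass) is
// timed; with Half inputs and fp16 compute enabled both passes are
// benchmarked and the cheaper one, as reported by cudnnFind*AlgorithmEx,
// wins and also fixes the descriptor's compute type.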
template <typename T_X, typename T_W, typename T_B, typename T_Y>
bool CudnnConvOp::DoRunWithType() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);

  // Figure out the output shape.
  CAFFE_ENFORCE(X.dim() >= 3 && X.dim() <= 5);
  CAFFE_ENFORCE(filter.dim() >= 3 && filter.dim() <= 5);
  const int M = filter.dim32(0);
  auto output_sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, M);
  auto* Y = Output(0, output_sizes, at::dtype<T_Y>());
  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
  int group_offset_X = 0, group_offset_Y = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0);
      H = X.dim32(1);
      W = X.dim() > 3 ? X.dim32(2) : 1;
      D = X.dim() > 4 ? X.dim32(3) : 1;
      C = X.dim32(X.dim() - 1);
      H_out = Y->dim32(1);
      W_out = Y->dim() > 3 ? Y->dim32(2) : 1;
      D_out = Y->dim() > 4 ? Y->dim32(3) : 1;
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
      }
      CAFFE_ENFORCE_EQ(filter.dim32(filter.dim() - 1), C / group_);
      group_offset_X = C / group_;
      group_offset_Y = M / group_;
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0);
      C = X.dim32(1);
      H = X.dim32(2);
      W = X.dim() > 3 ? X.dim32(3) : 1;
      D = X.dim() > 4 ? X.dim32(4) : 1;
      H_out = Y->dim32(2);
      W_out = Y->dim() > 3 ? Y->dim32(3) : 1;
      D_out = Y->dim() > 4 ? Y->dim32(4) : 1;
      CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
      }
      group_offset_X = C / group_ * H * W * D;
      group_offset_Y = M / group_ * H_out * W_out * D_out;
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }

  CAFFE_ENFORCE(
      C % group_ == 0,
      "If you set group, the number of input channels should be divisible "
      "by group.");
  CAFFE_ENFORCE(
      M % group_ == 0,
      "If you set group, the number of output channels should be divisible "
      "by group.");

  if (N == 0) {
    Y->template mutable_data<T_Y>();
    return true;
  }
  int group_offset_filter = filter.numel() / group_;

  // Re-create the cudnn descriptors only when the input or filter changes.
  bool input_changed = (X.sizes() != cudnn_input_dims_);
  bool filter_changed = (filter.sizes() != cudnn_filter_dims_);
  if (input_changed || filter_changed) {
    VLOG(1) << "Changing the cudnn descriptor configurations.";
    if (input_changed) {
      cudnn_input_dims_ = X.sizes().vec();
      SetTensorNdDescriptorWithGroup<T_X>(X.dim(), bottom_desc_, N, C, H, W, D);
    }
    if (filter_changed) {
      cudnn_filter_dims_ = filter.sizes().vec();
      if (kernel_.size() == 1 || kernel_.size() == 2) {
#if CUDNN_VERSION_MIN(7, 0, 0)
        const int MM = M;
#else
        const int MM = M / group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
            filter_desc_,
            cudnnTypeWrapper<T_W>::type,
            GetCudnnTensorFormat(order_),
            MM,
            C / group_,
            kernel_h(),
            kernel_.size() == 1 ? 1 : kernel_w()));
      } else {
        vector<int> dims(filter.sizes().begin(), filter.sizes().end());
#if !CUDNN_VERSION_MIN(7, 0, 0)
        // Pre-7.0 cuDNN describes only a single group's filters.
        order_ == StorageOrder::NCHW ? dims[1] /= group_
                                     : dims[filter.ndim() - 1] /= group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
            filter_desc_,
            cudnnTypeWrapper<T_W>::type,
            GetCudnnTensorFormat(order_),
            dims.size(),
            dims.data()));
      }
      if (InputSize() == 3) {
        if (kernel_.size() == 1 || kernel_.size() == 2) {
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
              bias_desc_, GetCudnnTensorFormat(order_),
              cudnnTypeWrapper<T_B>::type, 1, M, 1, 1));
        } else {
          std::vector<int> bias_dims(X.dim(), 1);
          bias_dims[1] = M;
          std::vector<int> strides = {M, 1, 1, 1, 1, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              bias_desc_, cudnnTypeWrapper<T_B>::type,
              X.dim() > 3 ? X.dim() : 4, bias_dims.data(), strides.data()));
        }
      }
    }
    // Output descriptor (per group), plus a full-tensor variant used for the
    // bias addition.
    SetTensorNdDescriptorWithGroup<T_Y>(
        X.dim(), top_desc_, N, M, H_out, W_out, D_out);
    if (kernel_.size() == 1 || kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          top_desc_for_bias_, GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T_B>::type, N, M, H_out, W_out));
    } else {
      vector<int> dims = {N, M, H_out, W_out, D_out};
      vector<int> strides = {M * H_out * W_out * D_out,
                             H_out * W_out * D_out, W_out * D_out, D_out, 1};
      CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
          top_desc_for_bias_, cudnnTypeWrapper<T_B>::type,
          X.dim() > 3 ? X.dim() : 4, dims.data(), strides.data()));
    }
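
    // Note (editorial): top_desc_ goes through SetTensorNdDescriptorWithGroup
    // and so describes a single group's channels on cuDNN < 7, whereas
    // top_desc_for_bias_ always spans all M output channels so the bias can
    // be added to the whole output with a single cudnnAddTensor call.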
    compute_type_ = DetermineComputeTypeFromInput(X);
    SetConvDescFromArguments();

#if CUDNN_VERSION_MIN(7, 0, 0)
    if (enable_tensor_core_) {
      CUDNN_ENFORCE(
          cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
    }

    // Use cuDNN's native group-convolution support.
    CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
#endif
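
    // Algorithm selection below follows this precedence (editorial summary):
    // an explicit force_algo_* wins, then deterministic_ pins
    // IMPLICIT_PRECOMP_GEMM, then exhaustive_search_ benchmarks candidates
    // through algo_cache_, and otherwise cuDNN's workspace-limited heuristic
    // decides; if the chosen algorithm cannot report a workspace size, a
    // default algorithm is retried once before failing.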
    if (force_algo_[ALGO_FWD] >= 0) {
      algo_ = (cudnnConvolutionFwdAlgo_t)force_algo_[ALGO_FWD];
    } else if (deterministic_) {
      algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    } else if (exhaustive_search_) {
      std::array<ConvFwdAlgorithmWithCost, 2> algosToCompare;
      for (int i = 0; i < 2; i++) {
        SetConvDescComputeType(conv_desc_, kComputeTypesToTry[i]);
        algosToCompare[i] = algo_cache_.getAlgorithm(
            X.sizes(), filter.sizes(), kComputeTypesToTry[i], [&]() {
              VLOG(1) << "CUDNN Convolution fwd: doing exhaustive "
                      << "search for " << kComputePassNames[i];
              int returned_algo_count;
              std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
                  fwd_perf_stat;
              cudnn_wrapper_.with_cudnn_state(
                  cudnn_state_, [&](CuDNNState* state) {
                    CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(
                        state->cudnn_handle(),
                        bottom_desc_,
                        X.template data<T_X>(),
                        filter_desc_,
                        filter.template data<T_W>(),
                        conv_desc_,
                        top_desc_,
                        Y->template mutable_data<T_Y>(),
                        kNUM_CUDNN_FWD_ALGS,
                        &returned_algo_count,
                        fwd_perf_stat.data(),
                        state->workspace().get(cudnn_ws_nbytes_limit_),
                        cudnn_ws_nbytes_limit_));
                  });
              LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
              float algo_time = fwd_perf_stat[0].status == CUDNN_STATUS_SUCCESS
                  ? fwd_perf_stat[0].time
                  : 1e10;
              return ConvFwdAlgorithmWithCost(fwd_perf_stat[0].algo, algo_time);
            });

        // With fp32 compute there is no fp16 pass to compare against.
        if (compute_type_ == CUDNN_DATA_FLOAT) {
          break;
        }
      }

      if (compute_type_ == CUDNN_DATA_FLOAT) {
        // For fp32 compute, simply use the best fp32 algorithm.
        algo_ = std::get<0>(algosToCompare[0]);
      } else {
        // For fp16 compute, pick whichever pass timed faster and set the
        // descriptor's compute type accordingly.
        int bestAlgoIndex =
            (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
            ? 0
            : 1;
        algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
        SetConvDescComputeType(conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
      }
    } else {
      // Choose the algorithm based on the workspace limit.
      CUDNN_ENFORCE(cudnnGetConvolutionForwardAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          bottom_desc_,
          filter_desc_,
          conv_desc_,
          top_desc_,
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &algo_));
    }

    for (int step = 0; step < 2; ++step) {
      cudnnStatus_t _status = cudnnGetConvolutionForwardWorkspaceSize(
          cudnn_wrapper_.inline_cudnn_handle(),
          bottom_desc_,
          filter_desc_,
          conv_desc_,
          top_desc_,
          algo_,
          &cudnn_ws_nbytes_);
      if (_status == CUDNN_STATUS_SUCCESS) {
        break;
      }
      if (_status == CUDNN_STATUS_NOT_SUPPORTED) {
        cudnnConvolutionFwdAlgo_t new_algo = deterministic_
            ? CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
            : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        VLOG(1) << "Forward algorithm " << (int)algo_
                << " is not currently supported for given parameters."
                << " Trying the default algorithm " << (int)new_algo;
        algo_ = new_algo;
        continue;
      }
      CUDNN_ENFORCE(_status);
    }
    VLOG(1) << "CuDNN algorithm: " << algo_;
    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }
  // Now, actually run the computation.
#if CUDNN_VERSION_MIN(7, 0, 0)
  // cuDNN >= 7 handles groups natively, so one call covers all groups.
  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_ENFORCE(cudnnConvolutionForward(
        state->cudnn_handle(),
        cudnnTypeWrapper<T_X>::kOne(),
        bottom_desc_,
        X.template data<T_X>(),
        filter_desc_,
        filter.template data<T_W>(),
        conv_desc_,
        algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        cudnnTypeWrapper<T_Y>::kZero(),
        top_desc_,
        Y->template mutable_data<T_Y>()));
  });
#else
  // Otherwise, run the convolution once per group with pointer offsets.
  for (int i = 0; i < group_; ++i) {
    cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
      CUDNN_ENFORCE(cudnnConvolutionForward(
          state->cudnn_handle(),
          cudnnTypeWrapper<T_X>::kOne(),
          bottom_desc_,
          X.template data<T_X>() + i * group_offset_X,
          filter_desc_,
          filter.template data<T_W>() + i * group_offset_filter,
          conv_desc_,
          algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          cudnnTypeWrapper<T_Y>::kZero(),
          top_desc_,
          Y->template mutable_data<T_Y>() + i * group_offset_Y));
    });
  }
#endif
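
  // Offset arithmetic used in the per-group path above (editorial): for NCHW,
  // group_offset_X = C / group_ * H * W * D and
  // group_offset_Y = M / group_ * H_out * W_out * D_out, so group i reads its
  // slice of input channels and writes the matching slice of the output;
  // group_offset_filter is simply filter.numel() / group_.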
  // If a bias was provided, add it.
  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE_EQ(bias.dim(), 1);
    CAFFE_ENFORCE_EQ(bias.dim32(0), M);
    CUDNN_ENFORCE(cudnnAddTensor(
        cudnn_wrapper_.inline_cudnn_handle(),
        cudnnTypeWrapper<T_B>::kOne(),
        bias_desc_,
        bias.template data<T_B>(),
        cudnnTypeWrapper<T_Y>::kOne(),
        top_desc_for_bias_,
        Y->template mutable_data<T_Y>()));
  }

  return true;
}
bool CudnnConvOp::RunOnDevice() {
  if (Input(0).IsType<float>()) {
    return DoRunWithType<
        float, float, // X, W
        float, float>(); // B, Y
  } else if (Input(0).IsType<at::Half>()) {
    return DoRunWithType<
        at::Half, at::Half, // X, W
        at::Half, at::Half>(); // B, Y
  } else {
    LOG(FATAL) << "Only float (32bit) and Half are supported by "
               << "cudnn convolution, but input " << debug_def().input(0)
               << " has [" << Input(0).dtype().name() << "]";
  }
  return true;
}
template <
    typename T_X, typename T_DY, typename T_W, typename T_B,
    typename T_DX, typename T_DW, typename T_DB>
bool CudnnConvGradientOp::DoRunWithType() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);

  CAFFE_ENFORCE(X.dim() >= 3 && X.dim() <= 5);
  CAFFE_ENFORCE(filter.dim() >= 3 && filter.dim() <= 5);

  const int M = filter.dim32(0);
  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
  int group_offset_X = 0, group_offset_Y = 0;
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0);
      H = X.dim32(1);
      W = X.dim() > 3 ? X.dim32(2) : 1;
      D = X.dim() > 4 ? X.dim32(3) : 1;
      C = X.dim32(X.dim() - 1);
      H_out = dY.dim32(1);
      W_out = dY.dim() > 3 ? dY.dim32(2) : 1;
      D_out = dY.dim() > 4 ? dY.dim32(3) : 1;
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
      }
      CAFFE_ENFORCE_EQ(filter.dim32(filter.dim() - 1), C / group_);
      group_offset_X = C / group_;
      group_offset_Y = M / group_;
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0);
      C = X.dim32(1);
      H = X.dim32(2);
      W = X.dim() > 3 ? X.dim32(3) : 1;
      D = X.dim() > 4 ? X.dim32(4) : 1;
      H_out = dY.dim32(2);
      W_out = dY.dim() > 3 ? dY.dim32(3) : 1;
      D_out = dY.dim() > 4 ? dY.dim32(4) : 1;
      CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
      }
      group_offset_X = C / group_ * H * W * D;
      group_offset_Y = M / group_ * H_out * W_out * D_out;
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }

  CAFFE_ENFORCE(
      C % group_ == 0,
      "If you set group, the number of input channels should be divisible "
      "by group.");
  CAFFE_ENFORCE(
      M % group_ == 0,
      "If you set group, the number of output channels should be divisible "
      "by group.");

  int group_offset_filter = filter.numel() / group_;
  if (kernel_.size() == 1) {
    ConvPoolOpBase<CUDAContext>::ComputePads({H});
  } else if (kernel_.size() == 2) {
    ConvPoolOpBase<CUDAContext>::ComputePads({H, W});
  } else if (kernel_.size() == 3) {
    ConvPoolOpBase<CUDAContext>::ComputePads({H, W, D});
  } else {
    CAFFE_THROW("Unsupported kernel size:", kernel_.size());
  }

  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T_DW>());
  if (N == 0) {
    math::Set<T_DW, CUDAContext>(
        dfilter->numel(), T_DW(0),
        dfilter->template mutable_data<T_DW>(), &context_);
    if (!no_bias_) {
      auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T_DB>());
      math::Set<T_DB, CUDAContext>(
          dbias->numel(), T_DB(0),
          dbias->template mutable_data<T_DB>(), &context_);
    }
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      auto* dX = Output(
          no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
          X.sizes(),
          at::dtype<T_DX>());
      dX->template mutable_data<T_DX>();
    }
    return true;
  }
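
  // Note (editorial): even for an empty batch the gradient blobs are still
  // allocated (and the weight/bias gradients zero-filled) above, presumably
  // so downstream consumers always see outputs with well-defined contents.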
  // Re-create the cudnn descriptors only when the input or filter changes.
  bool input_changed = (X.sizes() != cudnn_input_dims_);
  bool filter_changed = (filter.sizes() != cudnn_filter_dims_);
  if (input_changed || filter_changed) {
    VLOG(1) << "Changing the cudnn descriptor configurations.";
    if (input_changed) {
      cudnn_input_dims_ = X.sizes().vec();
      SetTensorNdDescriptorWithGroup<T_X>(X.dim(), bottom_desc_, N, C, H, W, D);
    }
    if (filter_changed) {
      cudnn_filter_dims_ = filter.sizes().vec();
      if (kernel_.size() == 1 || kernel_.size() == 2) {
#if CUDNN_VERSION_MIN(7, 0, 0)
        const int MM = M;
#else
        const int MM = M / group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
            filter_desc_,
            cudnnTypeWrapper<T_W>::type,
            GetCudnnTensorFormat(order_),
            MM,
            C / group_,
            kernel_h(),
            kernel_.size() == 1 ? 1 : kernel_w()));
      } else {
        vector<int> dims(filter.sizes().begin(), filter.sizes().end());
#if !CUDNN_VERSION_MIN(7, 0, 0)
        // Pre-7.0 cuDNN describes only a single group's filters.
        order_ == StorageOrder::NCHW ? dims[1] /= group_
                                     : dims[filter.ndim() - 1] /= group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
            filter_desc_,
            cudnnTypeWrapper<T_W>::type,
            GetCudnnTensorFormat(order_),
            dims.size(),
            dims.data()));
      }
      if (!no_bias_) {
        if (kernel_.size() == 1 || kernel_.size() == 2) {
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
              bias_desc_, GetCudnnTensorFormat(order_),
              cudnnTypeWrapper<T_B>::type, 1, M, 1, 1));
        } else {
          std::vector<int> bias_dims(X.dim(), 1);
          bias_dims[1] = M;
          std::vector<int> strides = {M, 1, 1, 1, 1, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              bias_desc_, cudnnTypeWrapper<T_B>::type,
              X.dim() > 3 ? X.dim() : 4, bias_dims.data(), strides.data()));
        }
      }
    }
    // Output-gradient descriptor (per group), plus a full-tensor variant used
    // for the bias gradient.
    SetTensorNdDescriptorWithGroup<T_DX>(
        X.dim(), top_desc_, N, M, H_out, W_out, D_out);
    if (kernel_.size() == 1 || kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          top_desc_for_bias_, GetCudnnTensorFormat(order_),
          cudnnTypeWrapper<T_B>::type, N, M, H_out, W_out));
    } else {
      vector<int> dims = {N, M, H_out, W_out, D_out};
      vector<int> strides = {M * H_out * W_out * D_out,
                             H_out * W_out * D_out, W_out * D_out, D_out, 1};
      CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
          top_desc_for_bias_, cudnnTypeWrapper<T_B>::type,
          X.dim() > 3 ? X.dim() : 4, dims.data(), strides.data()));
    }
    compute_type_ = DetermineComputeTypeFromInput(X);
    SetConvDescFromArguments();
    DuplicateConvDesc(
        conv_desc_, kernel_.size(), dilation_.size(), bwd_filter_conv_desc_);
    DuplicateConvDesc(
        conv_desc_, kernel_.size(), dilation_.size(), bwd_data_conv_desc_);

#if CUDNN_VERSION_MIN(7, 0, 0)
    if (enable_tensor_core_) {
      CUDNN_ENFORCE(cudnnSetConvolutionMathType(
          bwd_filter_conv_desc_, CUDNN_TENSOR_OP_MATH));
      CUDNN_ENFORCE(cudnnSetConvolutionMathType(
          bwd_data_conv_desc_, CUDNN_TENSOR_OP_MATH));
    }

    // Use cuDNN's native group-convolution support.
    CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_filter_conv_desc_, group_));
    CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_data_conv_desc_, group_));
#endif
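
    // Note (editorial): the data- and filter-gradient passes get their own
    // copies of conv_desc_ because the searches below may settle on different
    // compute types for each pass (SetConvDescComputeType is applied
    // independently to bwd_filter_conv_desc_ and bwd_data_conv_desc_).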
    // Choose the backward-filter algorithm.
    if (force_algo_[ALGO_WGRAD] >= 0) {
      bwd_filter_algo_ =
          (cudnnConvolutionBwdFilterAlgo_t)force_algo_[ALGO_WGRAD];
    } else if (deterministic_) {
      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    } else if (exhaustive_search_) {
      std::array<ConvBwdFilterAlgorithmWithCost, 2> algosToCompare;
      for (int i = 0; i < 2; i++) {
        SetConvDescComputeType(bwd_filter_conv_desc_, kComputeTypesToTry[i]);
        algosToCompare[i] = filter_algo_cache_.getAlgorithm(
            X.sizes(), filter.sizes(), kComputeTypesToTry[i], [&]() {
              VLOG(1) << "CUDNN Convolution bwd: doing filter exhaustive "
                      << "search for " << kComputePassNames[i];
              int returned_algo_count;
              std::array<
                  cudnnConvolutionBwdFilterAlgoPerf_t,
                  kNUM_CUDNN_BWD_FILTER_ALGS>
                  filter_perf_stat;
              cudnn_wrapper_.with_cudnn_state(
                  cudnn_state_, [&](CuDNNState* state) {
                    CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithmEx(
                        state->cudnn_handle(),
                        bottom_desc_,
                        X.template data<T_X>(),
                        top_desc_,
                        dY.template data<T_DY>(),
                        bwd_filter_conv_desc_,
                        filter_desc_,
                        dfilter->template mutable_data<T_DW>(),
                        kNUM_CUDNN_BWD_FILTER_ALGS,
                        &returned_algo_count,
                        filter_perf_stat.data(),
                        state->workspace().get(cudnn_ws_nbytes_limit_),
                        cudnn_ws_nbytes_limit_));
                  });
              LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
              float algo_time =
                  filter_perf_stat[0].status == CUDNN_STATUS_SUCCESS
                  ? filter_perf_stat[0].time
                  : 1e10;
              return ConvBwdFilterAlgorithmWithCost(
                  filter_perf_stat[0].algo, algo_time);
            });

        // With fp32 compute there is no fp16 pass to compare against.
        if (compute_type_ == CUDNN_DATA_FLOAT) {
          break;
        }
      }

      if (compute_type_ == CUDNN_DATA_FLOAT) {
        // For fp32 compute, simply use the best fp32 algorithm.
        bwd_filter_algo_ = std::get<0>(algosToCompare[0]);
      } else {
        // For fp16 compute, pick whichever pass timed faster.
        int bestAlgoIndex =
            (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
            ? 0
            : 1;
        bwd_filter_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
        SetConvDescComputeType(
            bwd_filter_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
      }
    } else {
      // Choose the algorithm based on the workspace limit.
      CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          bottom_desc_,
          top_desc_,
          bwd_filter_conv_desc_,
          filter_desc_,
          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &bwd_filter_algo_));
    }
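
    // Note (editorial): the filter-gradient and data-gradient algorithms are
    // selected and cached independently, via filter_algo_cache_ and
    // data_algo_cache_, keyed on the input/filter shapes and the compute
    // type being tried.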
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      // Choose the backward-data algorithm only if dX is requested.
      if (force_algo_[ALGO_DGRAD] >= 0) {
        bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD];
      } else if (deterministic_) {
        bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
      } else if (exhaustive_search_) {
        std::array<ConvBwdDataAlgorithmWithCost, 2> algosToCompare;
        for (int i = 0; i < 2; i++) {
          SetConvDescComputeType(bwd_data_conv_desc_, kComputeTypesToTry[i]);
          algosToCompare[i] = data_algo_cache_.getAlgorithm(
              X.sizes(), filter.sizes(), kComputeTypesToTry[i], [&]() {
                VLOG(1) << "CUDNN Convolution bwd: doing data exhaustive "
                        << "search for " << kComputePassNames[i];
                int returned_algo_count;
                std::array<
                    cudnnConvolutionBwdDataAlgoPerf_t,
                    kNUM_CUDNN_BWD_DATA_ALGS>
                    data_perf_stat;
                cudnn_wrapper_.with_cudnn_state(
                    cudnn_state_, [&](CuDNNState* state) {
                      auto* dX = Output(
                          no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
                          X.sizes(),
                          at::dtype<T_DX>());
                      const T_W* filter_data = filter.template data<T_W>();
                      const T_DY* dYdata = dY.template data<T_DY>();
                      T_DX* dXdata = dX->template mutable_data<T_DX>();
                      CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(
                          state->cudnn_handle(),
                          filter_desc_,
                          filter_data,
                          top_desc_,
                          dYdata,
                          bwd_data_conv_desc_,
                          bottom_desc_,
                          dXdata,
                          kNUM_CUDNN_BWD_DATA_ALGS,
                          &returned_algo_count,
                          data_perf_stat.data(),
                          state->workspace().get(cudnn_ws_nbytes_limit_),
                          cudnn_ws_nbytes_limit_));
                    });
                LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
                float algo_time =
                    data_perf_stat[0].status == CUDNN_STATUS_SUCCESS
                    ? data_perf_stat[0].time
                    : 1e10;
                return ConvBwdDataAlgorithmWithCost(
                    data_perf_stat[0].algo, algo_time);
              });

          // With fp32 compute there is no fp16 pass to compare against.
          if (compute_type_ == CUDNN_DATA_FLOAT) {
            break;
          }
        }

        if (compute_type_ == CUDNN_DATA_FLOAT) {
          // For fp32 compute, simply use the best fp32 algorithm.
          bwd_data_algo_ = std::get<0>(algosToCompare[0]);
        } else {
          // For fp16 compute, pick whichever pass timed faster.
          int bestAlgoIndex =
              (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
              ? 0
              : 1;
          bwd_data_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
          SetConvDescComputeType(
              bwd_data_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
        }
      } else {
        CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataAlgorithm(
            cudnn_wrapper_.inline_cudnn_handle(),
            filter_desc_,
            top_desc_,
            bwd_data_conv_desc_,
            bottom_desc_,
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
            cudnn_ws_nbytes_limit_,
            &bwd_data_algo_));
      }
    }
    // Query workspace sizes for both passes, falling back to default
    // algorithms when the chosen one is not supported.
    size_t bwd_filter_ws_size, bwd_data_ws_size;

    for (int step = 0; step < 2; ++step) {
      cudnnStatus_t _status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
          cudnn_wrapper_.inline_cudnn_handle(),
          bottom_desc_,
          top_desc_,
          bwd_filter_conv_desc_,
          filter_desc_,
          bwd_filter_algo_,
          &bwd_filter_ws_size);
      if (_status == CUDNN_STATUS_SUCCESS) {
        break;
      }
      if (_status == CUDNN_STATUS_NOT_SUPPORTED) {
        cudnnConvolutionBwdFilterAlgo_t new_algo = deterministic_
            ? CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
            : CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
        VLOG(1) << "Backward Filter algorithm " << (int)bwd_filter_algo_
                << " is not currently supported for given parameters."
                << " Trying the default algorithm " << (int)new_algo;
        bwd_filter_algo_ = new_algo;
        continue;
      }
      CUDNN_ENFORCE(_status);
    }

    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      for (int step = 0; step < 2; ++step) {
        cudnnStatus_t _status = cudnnGetConvolutionBackwardDataWorkspaceSize(
            cudnn_wrapper_.inline_cudnn_handle(),
            filter_desc_,
            top_desc_,
            bwd_data_conv_desc_,
            bottom_desc_,
            bwd_data_algo_,
            &bwd_data_ws_size);
        if (_status == CUDNN_STATUS_SUCCESS) {
          break;
        }
        if (_status == CUDNN_STATUS_NOT_SUPPORTED) {
          cudnnConvolutionBwdDataAlgo_t new_algo = deterministic_
              ? CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
              : CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
          VLOG(1) << "Backward Data algorithm " << (int)bwd_data_algo_
                  << " is not currently supported for given parameters."
                  << " Trying the default algorithm " << (int)new_algo;
          bwd_data_algo_ = new_algo;
          continue;
        }
        CUDNN_ENFORCE(_status);
      }
    } else {
      bwd_data_ws_size = 0;
    }
    // Both passes share one workspace, so size it for the larger of the two.
    cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);

    VLOG(1) << "CuDNN bwd data & filter algorithm: " << bwd_data_algo_ << ", "
            << bwd_filter_algo_;
    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }
  // Now, actually run the computation.
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T_DB>());
    CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
        cudnn_wrapper_.inline_cudnn_handle(),
        cudnnTypeWrapper<T_DY>::kOne(),
        top_desc_for_bias_,
        dY.template data<T_DY>(),
        cudnnTypeWrapper<T_DB>::kZero(),
        bias_desc_,
        dbias->template mutable_data<T_DB>()));
  }
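
  // Note (editorial): the bias gradient is a reduction of dY over every
  // dimension except the channel one, expressed here as
  // cudnnConvolutionBackwardBias from the full-tensor top_desc_for_bias_
  // into the {M}-shaped bias_desc_.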
#if CUDNN_VERSION_MIN(7, 0, 0)
  // cuDNN >= 7 handles groups natively, so each pass is a single call.
  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
        state->cudnn_handle(),
        cudnnTypeWrapper<T_X>::kOne(),
        bottom_desc_,
        X.template data<T_X>(),
        top_desc_,
        dY.template data<T_DY>(),
        bwd_filter_conv_desc_,
        bwd_filter_algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        cudnnTypeWrapper<T_DW>::kZero(),
        filter_desc_,
        dfilter->template mutable_data<T_DW>()));
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      // Compute the gradient w.r.t. the input.
      auto* dX = Output(
          no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
          X.sizes(),
          at::dtype<T_DX>());
      CUDNN_ENFORCE(cudnnConvolutionBackwardData(
          state->cudnn_handle(),
          cudnnTypeWrapper<T_W>::kOne(),
          filter_desc_,
          filter.template data<T_W>(),
          top_desc_,
          dY.template data<T_DY>(),
          bwd_data_conv_desc_,
          bwd_data_algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          cudnnTypeWrapper<T_DX>::kZero(),
          bottom_desc_,
          dX->template mutable_data<T_DX>()));
    }
  });
#else
  // Otherwise, run each pass once per group with pointer offsets.
  for (int i = 0; i < group_; ++i) {
    cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
      CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
          state->cudnn_handle(),
          cudnnTypeWrapper<T_X>::kOne(),
          bottom_desc_,
          X.template data<T_X>() + i * group_offset_X,
          top_desc_,
          dY.template data<T_DY>() + i * group_offset_Y,
          bwd_filter_conv_desc_,
          bwd_filter_algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          cudnnTypeWrapper<T_DW>::kZero(),
          filter_desc_,
          dfilter->template mutable_data<T_DW>() + i * group_offset_filter));
      if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
        // Compute the gradient w.r.t. the input.
        auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
        dX->ResizeLike(X);
        CUDNN_ENFORCE(cudnnConvolutionBackwardData(
            state->cudnn_handle(),
            cudnnTypeWrapper<T_W>::kOne(),
            filter_desc_,
            filter.template data<T_W>() + i * group_offset_filter,
            top_desc_,
            dY.template data<T_DY>() + i * group_offset_Y,
            bwd_data_conv_desc_,
            bwd_data_algo_,
            state->workspace().get(cudnn_ws_nbytes_),
            cudnn_ws_nbytes_,
            cudnnTypeWrapper<T_DX>::kZero(),
            bottom_desc_,
            dX->template mutable_data<T_DX>() + i * group_offset_X));
      }
    });
  }
#endif
  return true;
}
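
// Note (editorial): the two preprocessor branches above compute the same
// gradients; the pre-7.0 path loops over groups manually using the
// group_offset_* values, while the 7.0+ path relies on
// cudnnSetConvolutionGroupCount and issues one cuDNN call per pass.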
bool CudnnConvGradientOp::RunOnDevice() {
  if (Input(0).IsType<float>()) {
    return DoRunWithType<
        float, float, float, float, // X, dY, W, b
        float, float, float>(); // dX, dW, db
  } else if (Input(0).IsType<at::Half>()) {
    return DoRunWithType<
        at::Half, at::Half, at::Half, at::Half, // X, dY, W, b
        at::Half, at::Half, at::Half>(); // dX, dW, db
  } else {
    LOG(FATAL) << "Unsupported input types";
  }
  return true;
}

REGISTER_CUDNN_OPERATOR(Conv, CudnnConvOp);
REGISTER_CUDNN_OPERATOR(ConvGradient, CudnnConvGradientOp);

} // namespace caffe2