Caffe2 - C++ API
A deep learning, cross-platform ML framework.
Source file: conv_op_cudnn.cc
17 #include "caffe2/core/context_gpu.h"
18 #include "caffe2/core/cudnn_wrappers.h"
19 #include "caffe2/operators/conv_op.h"
20 #include "caffe2/operators/conv_op_cache_cudnn.h"
21 #include "caffe2/operators/conv_pool_op_base.h"
22 
23 namespace caffe2 {
24 
// Earlier in the days Caffe sets the default cudnn workspace to 8MB. We bump
// it up to 64MB in Caffe2, as this enables the use of Winograd in many cases,
// something very beneficial to more recent CNN models.
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 64 * 1024 * 1024;

// Manually specified number of algorithms implemented in CuDNN.
// This does not have any performance implications, as we will always find the
// fastest algorithm; setting them to the right number of algorithms will enable
// us to best report the statistics when doing an exhaustive search, though.
#if CUDNN_VERSION_MIN(7,0,0)
// Note: Double each of these due to potential
// tensor-core + non-tensor-core versions
// which are treated as separate returned algos
static constexpr size_t kNUM_CUDNN_FWD_ALGS =
    2*CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
    2*CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
    2*CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
#else
// Pre-cuDNN-7 algorithm counts (no tensor-core variants to double).
static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7;
static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4;
static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5;
#endif
49 
50 namespace {
51 template <typename ArrayOfcudnnConvolutionAlgoPerf_t>
52 inline void LogCuDNNPerfStats(
53  const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat,
54  int returned_algo_count) {
55  VLOG(1) << "Perf result: (algo: stat, time, memory)";
56  for (int i = 0; i < returned_algo_count; ++i) {
57  const auto& stat = perf_stat[i];
58  VLOG(1) << stat.algo << ": " << stat.status << " " << stat.time << " "
59  << stat.memory;
60  }
61 }
62 
// Easier indexing into the force_algo_ vector (stored as FWD, dFILTER,
// dDATA). The original code declared an *unnamed* enum followed by a stray,
// unused namespace-scope variable named `algoIndex_t`; naming the enum type
// removes that accidental object while keeping the enumerator values that
// the constructors index with.
enum algoIndex_t {
  ALGO_FWD = 0,
  ALGO_WGRAD = 1,
  ALGO_DGRAD = 2
};
69 
70 } // namespace
71 
// Common base for the cuDNN convolution forward and gradient operators.
// Owns all cuDNN descriptors and the shared knobs: workspace byte limit,
// exhaustive search, deterministic mode, forced algorithm indices, and
// tensor-core enablement.
//
// NOTE(review): some argument lines inside the cuDNN calls below (e.g. the
// cudnnTypeWrapper<T>::type data-type arguments) appear to have been dropped
// by the documentation extraction — compare against the upstream
// conv_op_cudnn.cc before building.
class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
 public:
  CudnnConvOpBase(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        // Cap used when *selecting* an algorithm; ignored by exhaustive search.
        cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
            "ws_nbytes_limit",
            kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
        exhaustive_search_(
            OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
        deterministic_(
            OperatorBase::GetSingleArgument<int>("deterministic", 0)),
        cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)),
        force_algo_(OperatorBase::GetRepeatedArgument<int>("force_algo", vector<int>{-1,-1,-1})),
        enable_tensor_core_(OperatorBase::GetSingleArgument<bool>("enable_tensor_core", 1)) {
    // Deterministic mode and exhaustive search are mutually exclusive.
    // NOTE(review): this CHECK duplicates the CAFFE_ENFORCE two lines below.
    CHECK(!deterministic_ || !exhaustive_search_);
    CAFFE_ENFORCE(group_ > 0);
    CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
    for (int i = 0; i < kernel_.size(); ++i) {
      // pads_ stores [left..., right...]; cuDNN supports only symmetric pads.
      OPERATOR_NEEDS_FEATURE(
          pads_[i] == pads_[kernel_.size() + i],
          "The current padding scheme leads to unequal padding on the left "
          "and right, which is not supported by cudnn.");
    }
    // dilated convolution supported by some algorithms in cuDNN v6
#if !(CUDNN_VERSION_MIN(6,0,0))
    OPERATOR_NEEDS_FEATURE(
        dilation_h() == 1 && dilation_w() == 1,
        "The cudnn convolution does not support dilation yet.");
#endif

    // "force_algo" (3 ints: FWD, dFILTER, dDATA) and the per-direction
    // "force_algo_*" arguments are mutually exclusive ways of pinning algos.
    bool individual_force_algo = OperatorBase::HasArgument("force_algo_fwd") ||
        OperatorBase::HasArgument("force_algo_dgrad") ||
        OperatorBase::HasArgument("force_algo_wgrad");
    if (OperatorBase::HasArgument("force_algo")) {
      CAFFE_ENFORCE(!individual_force_algo,
          "Cannot specify both force_algo and any of",
          "force_algo_fwd, force_algo_dgrad, force_algo_wgrad");
    } else {
      force_algo_ = std::vector<int>{-1,-1,-1};
      force_algo_[ALGO_FWD] =
          OperatorBase::GetSingleArgument<int>("force_algo_fwd", -1);
      force_algo_[ALGO_DGRAD] =
          OperatorBase::GetSingleArgument<int>("force_algo_dgrad", -1);
      force_algo_[ALGO_WGRAD] =
          OperatorBase::GetSingleArgument<int>("force_algo_wgrad", -1);
    }

    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_));
    CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_for_bias_));
    CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_));
  }

  // Destroys every descriptor created in the constructor (RAII pairing).
  ~CudnnConvOpBase() {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_));
    CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_for_bias_));
    CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_));
  }

 protected:
  // A helper function to set up the tensor Nd desriptor, depending on the order
  // the group and the type given.
  // Pre-cuDNN-7, grouped convolution is emulated by looping over groups, so
  // the channel count written into the descriptor is divided by group_;
  // cuDNN 7+ handles groups natively (cudnnSetConvolutionGroupCount) and
  // uses the full channel count.
  template <typename T>
  void SetTensorNdDescriptorWithGroup(
      int size,
      cudnnTensorDescriptor_t desc_,
      int N,
      int C,
      int H,
      int W,
      int D) {
#if CUDNN_VERSION_MIN(7, 0, 0)
    const int CC = C;
#else
    const int CC = C / group_;
#endif
    switch (order_) {
      case StorageOrder::NHWC:
        if (size == 4) {
          // Explicit strides express the NHWC layout over the full C extent.
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
              desc_,
              N,
              CC,
              H,
              W,
              H * W * C,
              1,
              W * C,
              C));
        } else {
          vector<int> dims = {N, H, W, D, CC};
          vector<int> strides = {H * W * D * CC, W * D * CC, D * CC, CC, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              desc_,
              size > 3 ? size : 4,
              dims.data(),
              strides.data()));
        }
        break;
      case StorageOrder::NCHW:
        if (size == 4) {
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
              desc_,
              N,
              CC,
              H,
              W,
              C * H * W,
              H * W,
              W,
              1));
        } else {
          vector<int> dims = {N, CC, H, W, D};
          vector<int> strides = {CC * H * W * D, H * W * D, W * D, D, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              desc_,
              size > 3 ? size : 4,
              dims.data(),
              strides.data()));
        }
        break;
      default:
        LOG(FATAL) << "Unknown storage order: " << order_;
    }
  }

  // Cached shapes from the previous run; descriptors are only rebuilt when
  // the input or filter shape changes.
  vector<TIndex> cudnn_input_dims_;
  vector<TIndex> cudnn_filter_dims_;

  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t bottom_desc_;
  cudnnFilterDescriptor_t filter_desc_;
  cudnnTensorDescriptor_t bias_desc_;
  cudnnTensorDescriptor_t top_desc_;
  // top desc for bias add in case we do group convolution
  cudnnTensorDescriptor_t top_desc_for_bias_;
  cudnnConvolutionDescriptor_t conv_desc_;
  const size_t cudnn_ws_nbytes_limit_;
  // Actual workspace bytes required by the chosen algorithm(s).
  size_t cudnn_ws_nbytes_;
  bool exhaustive_search_;
  bool deterministic_;
  size_t cudnn_state_;
  vector<int> force_algo_; // stored as FWD, dFILTER, dDATA
  bool enable_tensor_core_;
};
227 
228 
// cuDNN-backed forward convolution operator (inputs X, W and optional bias;
// output Y). Type dispatch happens in RunOnDevice.
class CudnnConvOp final : public CudnnConvOpBase {
 public:
  CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
      : CudnnConvOpBase(operator_def, ws) {}

  ~CudnnConvOp() {}

  // Runs the convolution with input type T_X, filter T_W, bias T_B,
  // accumulation math MATH and output T_Y.
  template <typename T_X, typename T_W, typename T_B, typename MATH, typename T_Y>
  bool DoRunWithType();

  bool RunOnDevice() override;

 private:
  cudnnConvolutionFwdAlgo_t algo_;
  // NOTE(review): DoRunWithType references an algorithm cache (algo_cache_)
  // whose member declaration appears to have been dropped by the doc
  // extraction — verify against the upstream source.
  // Input: X, W, b
  // Output: Y
  INPUT_TAGS(INPUT, FILTER, BIAS);
};
248 
// cuDNN-backed convolution gradient operator. Computes dW always, db unless
// no_bias is set, and dX when a third output (or second output with no_bias)
// is requested.
class CudnnConvGradientOp final : public CudnnConvOpBase {
 public:
  CudnnConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : CudnnConvOpBase(operator_def, ws),
        no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) {
    CAFFE_ENFORCE(
        !(no_bias_ && OutputSize() == 3),
        "If bias is not present, you should not have 3 grad output.");
  }

  ~CudnnConvGradientOp() {}

  // Typed backward pass; see RunOnDevice for the supported type combinations.
  template <typename T_X, typename T_DY, typename T_W, typename T_B,
      typename MATH,
      typename T_DX, typename T_DW, typename T_DB>
  bool DoRunWithType();

  bool RunOnDevice() override;

 private:
  cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
  cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
  // NOTE(review): the filter/data algorithm cache members referenced by
  // DoRunWithType appear to have been dropped by the doc extraction.
  bool no_bias_;
  // input: X, W, dY
  // output: dW, db, and optionally dX
  INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
  OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
};
279 
281 // Implementations
283 
// Typed forward pass. Steps: decode layout into N/C/H/W/D and per-group
// offsets; (re)build cuDNN descriptors only when the input or filter shape
// changed; choose a forward algorithm with priority forced > deterministic >
// exhaustive search (cached per shape) > workspace-limited heuristic; run
// cudnnConvolutionForward (natively grouped on cuDNN 7+, otherwise looping
// over groups); finally add the bias if present.
//
// NOTE(review): several argument lines inside the cuDNN calls (data-type
// arguments such as cudnnTypeWrapper<...>::type, alpha/beta scaling
// pointers, and a few closing parentheses), as well as the SetOutputSize
// call that shapes Y, appear to have been dropped by the doc extraction —
// compare with the upstream conv_op_cudnn.cc before building.
template <typename T_X, typename T_W, typename T_B, typename MATH, typename T_Y>
bool CudnnConvOp::DoRunWithType() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto* Y = Output(0);

  // Figure out the output shape
  CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
  CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5);
  const int M = filter.dim32(0);  // number of output channels
  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
  int group_offset_X = 0, group_offset_Y = 0;

  // Decode dimensions and per-group element offsets from the storage order.
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0);
      H = X.dim32(1);
      W = X.ndim() > 3 ? X.dim32(2) : 1;
      D = X.ndim() > 4 ? X.dim32(3) : 1;
      C = X.dim32(X.ndim() - 1);
      H_out = Y->dim32(1);
      W_out = Y->ndim() > 3 ? Y->dim32(2) : 1;
      D_out = Y->ndim() > 4 ? Y->dim32(3) : 1;
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
      }
      CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
      group_offset_X = C / group_;
      group_offset_Y = M / group_;
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0);
      C = X.dim32(1);
      H = X.dim32(2);
      W = X.ndim() > 3 ? X.dim32(3) : 1;
      D = X.ndim() > 4 ? X.dim32(4) : 1;
      H_out = Y->dim32(2);
      W_out = Y->ndim() > 3 ? Y->dim32(3) : 1;
      D_out = Y->ndim() > 4 ? Y->dim32(4) : 1;
      CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
      }
      group_offset_X = C / group_ * H * W * D;
      group_offset_Y = M / group_ * H_out * W_out * D_out;
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }

  CAFFE_ENFORCE(
      C % group_ == 0,
      "If you set group, the number of input channels should be divisible "
      "by group.");
  CAFFE_ENFORCE(
      M % group_ == 0,
      "If you set group, the number of output channels should be divisible "
      "by group.");

  int group_offset_filter = filter.size() / group_;

  // Set up the cudnn algorithms & workspace if necessary
  bool input_changed = (X.dims() != cudnn_input_dims_);
  bool filter_changed = (filter.dims() != cudnn_filter_dims_);
  if (input_changed || filter_changed) {
    VLOG(1) << "Changing the cudnn descriptor configurations.";
    if (input_changed) {
      cudnn_input_dims_ = X.dims();
      SetTensorNdDescriptorWithGroup<T_X>(
          X.ndim(), bottom_desc_, N, C, H, W, D);
    }
    if (filter_changed) {
      cudnn_filter_dims_ = filter.dims();
      if (kernel_.size() == 2) {
        // cuDNN 7+ handles groups natively, so the full output-channel count
        // goes into the descriptor; earlier versions loop over groups.
#if CUDNN_VERSION_MIN(7, 0, 0)
        const int MM = M;
#else
        const int MM = M / group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
            filter_desc_,
            GetCudnnTensorFormat(order_),
            MM,
            C / group_,
            kernel_h(),
            kernel_w()));
      } else {
        vector<int> dims(filter.dims().begin(), filter.dims().end());
        dims[0] /= group_;
#if !CUDNN_VERSION_MIN(7,0,0)
        order_ == StorageOrder::NCHW ? dims[1] /= group_
                                     : dims[filter.ndim() - 1] /= group_;
#endif
        // NOTE(review): this unconditional divide overlaps the
        // version-guarded ternary above and differs from the gradient op's
        // equivalent code — looks like an extraction artifact; verify
        // against upstream.
        dims[filter.ndim() - 1] /= group_;
        CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
            filter_desc_,
            GetCudnnTensorFormat(order_),
            dims.size(),
            dims.data()));
      }
      if (InputSize() == 3) {
        // Bias is logically 1 x M x 1 x 1, broadcast over N/H/W/D.
        if (kernel_.size() == 2) {
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
              bias_desc_,
              GetCudnnTensorFormat(order_),
              1,
              M,
              1,
              1));
        } else {
          std::vector<int> bias_dims(X.ndim(), 1);
          bias_dims[1] = M;
          std::vector<int> strides = {M, 1, 1, 1, 1, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              bias_desc_,
              X.ndim() > 3 ? X.ndim() : 4,
              bias_dims.data(),
              strides.data()));
        }
      }
    }
    // Set the output
    SetTensorNdDescriptorWithGroup<T_Y>(
        X.ndim(), top_desc_, N, M, H_out, W_out, D_out);
    // Set the output with descriptor useful for bias addition in one run.
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          top_desc_for_bias_,
          GetCudnnTensorFormat(order_),
          N,
          M,
          H_out,
          W_out));
    } else {
      vector<int> dims = {N, M, H_out, W_out, D_out};
      vector<int> strides = {M * H_out * W_out * D_out,
                             H_out * W_out * D_out,
                             W_out * D_out,
                             D_out,
                             1};
      CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
          top_desc_for_bias_,
          X.ndim() > 3 ? X.ndim() : 4,
          dims.data(),
          strides.data()));
    }
    // Set the convolution descriptor
#if CUDNN_VERSION_MIN(6,0,0)
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc_,
          pad_t(),
          pad_l(),
          stride_h(),
          stride_w(),
          dilation_h(),
          dilation_w(),
          CUDNN_CROSS_CORRELATION,
    } else {
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc_,
          kernel_.size(),
          pads_.data(),
          stride_.data(),
          dilation_.data(),
          CUDNN_CROSS_CORRELATION,
    }
#else
    // Pre-v6 cuDNN has no dilation support, so dilation factors are 1.
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc_,
          pad_t(),
          pad_l(),
          stride_h(),
          stride_w(),
          1,
          1,
          CUDNN_CROSS_CORRELATION));
    } else {
      vector<int> ones(dilation_.size(), 1);
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc_,
          kernel_.size(),
          pads_.data(),
          stride_.data(),
          ones.data(),
          CUDNN_CROSS_CORRELATION,
    }
#endif

#if CUDNN_VERSION_MIN(7,0,0)
    // enable TensorCore math if desired
    enable_tensor_core_ &= TensorCoreAvailable();
    if (enable_tensor_core_) {
      CUDNN_ENFORCE(cudnnSetConvolutionMathType(
          conv_desc_, CUDNN_TENSOR_OP_MATH));
    }

    // enable cuDNN conv groups
    CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
#endif

    // Algorithm priority: forced > deterministic > exhaustive > heuristic.
    if (force_algo_[ALGO_FWD] >= 0) {
      algo_ = (cudnnConvolutionFwdAlgo_t)force_algo_[ALGO_FWD];
    } else if (deterministic_) {
      algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
    } else if (exhaustive_search_) {
      algo_ = algo_cache_.getAlgorithm(X.dims(), filter.dims(), [&]() {
        VLOG(1) << "CUDNN Convolution: doing exhaustive search.";
        // When we do an exhaustive search, we will ignore the workspace size
        // limit and simply go for the fastest algorithm. If you happen to run
        // out of memory later, you will be on your own...
        int returned_algo_count;
        std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
            perf_stat;

        // no need to clean up workspace,
        cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
          // Actually run the search.
          CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(
              state->cudnn_handle(),
              bottom_desc_,
              X.template data<T_X>(),
              filter_desc_,
              filter.template data<T_W>(),
              conv_desc_,
              top_desc_,
              Y->template mutable_data<T_Y>(),
              kNUM_CUDNN_FWD_ALGS,
              &returned_algo_count,
              perf_stat.data(),
              state->workspace().get(cudnn_ws_nbytes_limit_),
              cudnn_ws_nbytes_limit_));
        });
        LogCuDNNPerfStats(perf_stat, returned_algo_count);
        // Find* returns results sorted fastest-first.
        return perf_stat[0].algo;
      });
    } else {
      // Get the convolution algorithm based on the workspace limit.
      CUDNN_ENFORCE(cudnnGetConvolutionForwardAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          bottom_desc_,
          filter_desc_,
          conv_desc_,
          top_desc_,
          CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &algo_));
    }
    CUDNN_ENFORCE(cudnnGetConvolutionForwardWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        bottom_desc_,
        filter_desc_,
        conv_desc_,
        top_desc_,
        algo_,
        &cudnn_ws_nbytes_));
    VLOG(1) << "CuDNN algorithm: " << algo_;
    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }

  // Now, actually run the computation.
  // Run directly through cuDNN if possible
#if CUDNN_VERSION_MIN(7,0,0)
  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_ENFORCE(cudnnConvolutionForward(
        state->cudnn_handle(),
        bottom_desc_,
        X.template data<T_X>(),
        filter_desc_,
        filter.template data<T_W>(),
        conv_desc_,
        algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        top_desc_,
        Y->template mutable_data<T_Y>()));
  });
#else
  // otherwise manually run through groups
  for (int i = 0; i < group_; ++i) {
    cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
      CUDNN_ENFORCE(cudnnConvolutionForward(
          state->cudnn_handle(),
          bottom_desc_,
          X.template data<T_X>() + i * group_offset_X,
          filter_desc_,
          filter.template data<T_W>() + i * group_offset_filter,
          conv_desc_,
          algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          top_desc_,
          Y->template mutable_data<T_Y>() + i * group_offset_Y));
    });
  }
#endif
  // Bias
  if (InputSize() == 3) {
    auto& bias = Input(BIAS);

    CAFFE_ENFORCE_EQ(bias.ndim(), 1);
    CAFFE_ENFORCE_EQ(bias.dim32(0), M);

    CUDNN_ENFORCE(cudnnAddTensor(
        cudnn_wrapper_.inline_cudnn_handle(),
        bias_desc_,
        bias.template data<T_B>(),
        top_desc_for_bias_,
        Y->template mutable_data<T_Y>()));
  }
  // Done.
  return true;
}
614 
615 bool CudnnConvOp::RunOnDevice() {
616 
617  if (Input(0).IsType<float>()) {
618  return DoRunWithType<float, // X
619  float, // W
620  float, // B
621  float, // Math
622  float>(); // Y
623  } else if (Input(0).IsType<float16>()) {
624  return DoRunWithType<float16, // X
625  float16, // W
626  float16, // B
627  float, // Math
628  float16>(); // Y
629  } else {
630  LOG(FATAL) << "Only float (32bit) and float16 are supported by "
631  << "cudnn convolution, but input " << debug_def().input(0)
632  << " has [" << Input(0).meta().name() << "]";
633  }
634  return true;
635 }
636 
// Typed backward pass. Mirrors the forward op: decode layout, rebuild cuDNN
// descriptors only on shape change, pick backward-filter and (when dX is
// requested) backward-data algorithms with priority forced > deterministic >
// exhaustive search > heuristic, size the shared workspace as the max of the
// two requirements, then compute db (unless no_bias), dW, and optionally dX.
//
// NOTE(review): as in the forward pass, data-type arguments
// (cudnnTypeWrapper<...>::type), alpha/beta pointers, some closing
// parentheses, and the per-rank ComputePads calls appear to have been
// dropped by the doc extraction — compare with upstream before building.
template <typename T_X, typename T_DY, typename T_W, typename T_B,
    typename MATH,
    typename T_DX, typename T_DW, typename T_DB>
bool CudnnConvGradientOp::DoRunWithType() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  auto* dfilter = Output(FILTER_GRAD);

  CAFFE_ENFORCE(X.ndim() >= 3 && X.ndim() <= 5);
  CAFFE_ENFORCE(filter.ndim() >= 3 && filter.ndim() <= 5);

  const int M = filter.dim32(0);  // number of output channels
  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
  int group_offset_X = 0, group_offset_Y = 0;

  // Decode dimensions and per-group element offsets from the storage order.
  switch (order_) {
    case StorageOrder::NHWC:
      N = X.dim32(0);
      H = X.dim32(1);
      W = X.ndim() > 3 ? X.dim32(2) : 1;
      D = X.ndim() > 4 ? X.dim32(3) : 1;
      C = X.dim32(X.ndim() - 1);
      H_out = dY.dim32(1);
      W_out = dY.ndim() > 3 ? dY.dim32(2) : 1;
      D_out = dY.ndim() > 4 ? dY.dim32(3) : 1;
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
      }
      CAFFE_ENFORCE_EQ(filter.dim32(filter.ndim() - 1), C / group_);
      group_offset_X = C / group_;
      group_offset_Y = M / group_;
      break;
    case StorageOrder::NCHW:
      N = X.dim32(0);
      C = X.dim32(1);
      H = X.dim32(2);
      W = X.ndim() > 3 ? X.dim32(3) : 1;
      D = X.ndim() > 4 ? X.dim32(4) : 1;
      H_out = dY.dim32(2);
      W_out = dY.ndim() > 3 ? dY.dim32(3) : 1;
      D_out = dY.ndim() > 4 ? dY.dim32(4) : 1;
      CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
      for (int i = 0; i < kernel_.size(); ++i) {
        CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
      }
      group_offset_X = C / group_ * H * W * D;
      group_offset_Y = M / group_ * H_out * W_out * D_out;
      break;
    default:
      LOG(FATAL) << "Unknown storage order: " << order_;
  }

  CAFFE_ENFORCE(
      C % group_ == 0,
      "If you set group, the number of input channels should be divisible "
      "by group.");
  CAFFE_ENFORCE(
      M % group_ == 0,
      "If you set group, the number of output channels should be divisible "
      "by group.");

  int group_offset_filter = filter.size() / group_;
  if (kernel_.size() == 1) {
    // NOTE(review): branch bodies dropped by the doc extraction — upstream
    // calls the rank-specific ConvPoolOpBase<CUDAContext>::ComputePads here.
  } else if (kernel_.size() == 2) {
  } else if (kernel_.size() == 3) {
  } else {
    CAFFE_THROW("Unsupported kernel size:", kernel_.size());
  }
  dfilter->ResizeLike(filter);

  // Set up the cudnn algorithms & workspace if necessary
  bool input_changed = (X.dims() != cudnn_input_dims_);
  bool filter_changed = (filter.dims() != cudnn_filter_dims_);
  if (input_changed || filter_changed) {
    VLOG(1) << "Changing the cudnn descriptor configurations.";
    if (input_changed) {
      cudnn_input_dims_ = X.dims();
      SetTensorNdDescriptorWithGroup<T_X>(
          X.ndim(), bottom_desc_, N, C, H, W, D);
    }
    if (filter_changed) {
      cudnn_filter_dims_ = filter.dims();
      if (kernel_.size() == 2) {
        // cuDNN 7+ handles groups natively; earlier versions loop per group.
#if CUDNN_VERSION_MIN(7, 0, 0)
        const int MM = M;
#else
        const int MM = M / group_;
#endif
        CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
            filter_desc_,
            GetCudnnTensorFormat(order_),
            MM,
            C / group_,
            kernel_h(),
            kernel_w()));
      } else {
        vector<int> dims(filter.dims().begin(), filter.dims().end());
#if !CUDNN_VERSION_MIN(7,0,0)
        dims[0] /= group_;
#endif
        // Divide the input-channel dimension of the filter by group count;
        // its position depends on the storage order.
        order_ == StorageOrder::NCHW ? dims[1] /= group_
                                     : dims[filter.ndim() - 1] /= group_;
        CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
            filter_desc_,
            GetCudnnTensorFormat(order_),
            dims.size(),
            dims.data()));
      }
      if (!no_bias_) {
        // Bias gradient is logically 1 x M x 1 x 1.
        if (kernel_.size() == 2) {
          CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
              bias_desc_,
              GetCudnnTensorFormat(order_),
              1,
              M,
              1,
              1));
        } else {
          std::vector<int> bias_dims(X.ndim(), 1);
          bias_dims[1] = M;
          std::vector<int> strides = {M, 1, 1, 1, 1, 1};
          CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
              bias_desc_,
              X.ndim() > 3 ? X.ndim() : 4,
              bias_dims.data(),
              strides.data()));
        }
      }
    }
    // Set the output
    SetTensorNdDescriptorWithGroup<T_DX>(
        X.ndim(), top_desc_, N, M, H_out, W_out, D_out);
    // Set the output with descriptor useful for bias addition in one run.
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          top_desc_for_bias_,
          GetCudnnTensorFormat(order_),
          N,
          M,
          H_out,
          W_out));
    } else {
      vector<int> dims = {N, M, H_out, W_out, D_out};
      vector<int> strides = {M * H_out * W_out * D_out,
                             H_out * W_out * D_out,
                             W_out * D_out,
                             D_out,
                             1};
      CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
          top_desc_for_bias_,
          X.ndim() > 3 ? X.ndim() : 4,
          dims.data(),
          strides.data()));
    }
    // Set the convolution descriptor
#if CUDNN_VERSION_MIN(6,0,0)
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc_,
          pad_t(),
          pad_l(),
          stride_h(),
          stride_w(),
          dilation_h(),
          dilation_w(),
          CUDNN_CROSS_CORRELATION,
    } else {
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc_,
          kernel_.size(),
          pads_.data(),
          stride_.data(),
          dilation_.data(),
          CUDNN_CROSS_CORRELATION,
    }
#else
    // Pre-v6 cuDNN has no dilation support, so dilation factors are 1.
    if (kernel_.size() == 2) {
      CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
          conv_desc_,
          pad_t(),
          pad_l(),
          stride_h(),
          stride_w(),
          1,
          1,
          CUDNN_CROSS_CORRELATION));
    } else {
      vector<int> ones(dilation_.size(), 1);
      CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
          conv_desc_,
          kernel_.size(),
          pads_.data(),
          stride_.data(),
          ones.data(),
          CUDNN_CROSS_CORRELATION,
    }
#endif

#if CUDNN_VERSION_MIN(7,0,0)
    // enable TensorCore math if desired
    enable_tensor_core_ &= TensorCoreAvailable();
    if (enable_tensor_core_) {
      CUDNN_ENFORCE(cudnnSetConvolutionMathType(
          conv_desc_, CUDNN_TENSOR_OP_MATH));
    }

    // set cuDNN groups if appropriate
    CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
#endif

    // Set the workspace
    size_t bwd_filter_ws_size, bwd_data_ws_size;

    // Choose dW algorithm
    if (force_algo_[ALGO_WGRAD] >= 0) {
      bwd_filter_algo_ = (cudnnConvolutionBwdFilterAlgo_t)force_algo_[ALGO_WGRAD];
    } else if (deterministic_) {
      bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
    } else if (exhaustive_search_) {
      bwd_filter_algo_ =
          filter_algo_cache_.getAlgorithm(X.dims(), filter.dims(), [&]() {
            VLOG(1) << "CUDNN Convolution bwd: doing filter exhaustive search.";
            // When we do an exhaustive search, we will ignore the workspace
            // size
            // limit and simply go for the fastest algorithm. If you happen to
            // run
            // out of memory later, you will be on your own...
            int returned_algo_count;
            // We clean up the current workspace memory so that the forward
            // algorithm is free to allocate memory.
            // Actually run the search.
            std::array<
                cudnnConvolutionBwdFilterAlgoPerf_t,
                kNUM_CUDNN_BWD_FILTER_ALGS>
                filter_perf_stat;

            cudnn_wrapper_.with_cudnn_state(
                cudnn_state_, [&](CuDNNState* state) {
                  CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithmEx(
                      state->cudnn_handle(),
                      bottom_desc_,
                      X.template data<T_X>(),
                      top_desc_,
                      dY.template data<T_DY>(),
                      conv_desc_,
                      filter_desc_,
                      dfilter->template mutable_data<T_DW>(),
                      kNUM_CUDNN_BWD_FILTER_ALGS,
                      &returned_algo_count,
                      filter_perf_stat.data(),
                      state->workspace().get(cudnn_ws_nbytes_limit_),
                      cudnn_ws_nbytes_limit_));
                });
            LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
            // Entries are sorted fastest-first.
            return filter_perf_stat[0].algo;
          });
    } else {
      // choose backward algorithm for filter
      CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterAlgorithm(
          cudnn_wrapper_.inline_cudnn_handle(),
          bottom_desc_,
          top_desc_,
          conv_desc_,
          filter_desc_,
          CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
          cudnn_ws_nbytes_limit_,
          &bwd_filter_algo_));
    }
    // Pick dX algo if needed
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      if (force_algo_[ALGO_DGRAD] >= 0) {
        bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD];
      } else if (deterministic_) {
        bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
      } else if (exhaustive_search_) {
        bwd_data_algo_ =
            data_algo_cache_.getAlgorithm(X.dims(), filter.dims(), [&]() {
              VLOG(1) << "CUDNN Convolution bwd: doing data exhaustive search.";
              int returned_algo_count;

              std::array<
                  cudnnConvolutionBwdDataAlgoPerf_t,
                  kNUM_CUDNN_BWD_DATA_ALGS>
                  data_perf_stat;
              cudnn_wrapper_.with_cudnn_state(
                  cudnn_state_, [&](CuDNNState* state) {
                    // dX shares the BIAS_OR_INPUT_GRAD slot when no_bias.
                    auto* dX =
                        Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
                    dX->ResizeLike(X);
                    const T_W* filter_data = filter.template data<T_W>();
                    const T_DY* dYdata = dY.template data<T_DY>();
                    T_DX* dXdata = dX->template mutable_data<T_DX>();
                    CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(
                        state->cudnn_handle(),
                        filter_desc_,
                        filter_data,
                        top_desc_,
                        dYdata,
                        conv_desc_,
                        bottom_desc_,
                        dXdata,
                        kNUM_CUDNN_BWD_DATA_ALGS,
                        &returned_algo_count,
                        data_perf_stat.data(),
                        state->workspace().get(cudnn_ws_nbytes_limit_),
                        cudnn_ws_nbytes_limit_));
                  });

              LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
              return data_perf_stat[0].algo;
            });
      } else {
        CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataAlgorithm(
            cudnn_wrapper_.inline_cudnn_handle(),
            filter_desc_,
            top_desc_,
            conv_desc_,
            bottom_desc_,
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
            cudnn_ws_nbytes_limit_,
            &bwd_data_algo_));
      }
    }

    // get workspace for backwards filter algorithm
    CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterWorkspaceSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        bottom_desc_,
        top_desc_,
        conv_desc_,
        filter_desc_,
        bwd_filter_algo_,
        &bwd_filter_ws_size));
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      // get workspace for backwards data algorithm
      CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataWorkspaceSize(
          cudnn_wrapper_.inline_cudnn_handle(),
          filter_desc_,
          top_desc_,
          conv_desc_,
          bottom_desc_,
          bwd_data_algo_,
          &bwd_data_ws_size));
    } else {
      bwd_data_ws_size = 0;
    }
    // One shared workspace sized for the larger of the two passes.
    cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);

    VLOG(1) << "CuDNN bwd algorithm: " << bwd_filter_algo_ << ", "
            << bwd_data_algo_;
    VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
  }

  // Now, actually run the computation.
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    dbias->Resize(M);
    CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
        cudnn_wrapper_.inline_cudnn_handle(),
        top_desc_for_bias_,
        dY.template data<T_DY>(),
        bias_desc_,
        dbias->template mutable_data<T_DB>()));
  }

#if CUDNN_VERSION_MIN(7,0,0)
  // cuDNN 7+ handles groups natively: single call per pass.
  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
    CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
        state->cudnn_handle(),
        bottom_desc_,
        X.template data<T_X>(),
        top_desc_,
        dY.template data<T_DY>(),
        conv_desc_,
        bwd_filter_algo_,
        state->workspace().get(cudnn_ws_nbytes_),
        cudnn_ws_nbytes_,
        filter_desc_,
        dfilter->template mutable_data<T_DW>()));
    if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
      // Compute the gradient w.r.t. the input.
      auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
      dX->ResizeLike(X);
      CUDNN_ENFORCE(cudnnConvolutionBackwardData(
          state->cudnn_handle(),
          filter_desc_,
          filter.template data<T_W>(),
          top_desc_,
          dY.template data<T_DY>(),
          conv_desc_,
          bwd_data_algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          bottom_desc_,
          dX->template mutable_data<T_DX>()));
    }
  });
#else
  // Pre-cuDNN-7: emulate grouped convolution by offsetting into each group.
  for (int i = 0; i < group_; ++i) {
    cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
      CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
          state->cudnn_handle(),
          bottom_desc_,
          X.template data<T_X>() + i * group_offset_X,
          top_desc_,
          dY.template data<T_DY>() + i * group_offset_Y,
          conv_desc_,
          bwd_filter_algo_,
          state->workspace().get(cudnn_ws_nbytes_),
          cudnn_ws_nbytes_,
          filter_desc_,
          dfilter->template mutable_data<T_DW>() + i * group_offset_filter));
      if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
        // Compute the gradient w.r.t. the input.
        auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
        dX->ResizeLike(X);
        CUDNN_ENFORCE(cudnnConvolutionBackwardData(
            state->cudnn_handle(),
            filter_desc_,
            filter.template data<T_W>() + i * group_offset_filter,
            top_desc_,
            dY.template data<T_DY>() + i * group_offset_Y,
            conv_desc_,
            bwd_data_algo_,
            state->workspace().get(cudnn_ws_nbytes_),
            cudnn_ws_nbytes_,
            bottom_desc_,
            dX->template mutable_data<T_DX>() + i * group_offset_X));
      }
    });
  }
#endif
  return true;
}
1094 
1095 // TODO(Yangqing): a lot of the function contents are very similar. Consider
1096 // consolidating them.
1097 bool CudnnConvGradientOp::RunOnDevice() {
1098  if (Input(0).IsType<float>()) {
1099  return DoRunWithType<float, // X
1100  float, // dY
1101  float, // W
1102  float, // b
1103  float, // Math
1104  float, // dX
1105  float, // dW
1106  float>(); // db
1107  }
1108  else if (Input(0).IsType<float16>()) {
1109  return DoRunWithType<float16, // X
1110  float16, // dY
1111  float16, // W
1112  float16, // b
1113  float, // Math
1114  float16, // dX
1115  float16, // dW
1116  float16>(); // db
1117  } else {
1118  LOG(FATAL) << "Unsupported input types";
1119  }
1120  return true;
1121 }
1122 
// Register the cuDNN engine implementations for Conv and ConvGradient. The
// rank-suffixed names (Conv1D/2D/3D) are aliases that reuse the same
// rank-agnostic operator classes.
REGISTER_CUDNN_OPERATOR(Conv, CudnnConvOp);
REGISTER_CUDNN_OPERATOR(ConvGradient, CudnnConvGradientOp);

REGISTER_CUDNN_OPERATOR(Conv1D, CudnnConvOp);
REGISTER_CUDNN_OPERATOR(Conv1DGradient, CudnnConvGradientOp);

REGISTER_CUDNN_OPERATOR(Conv2D, CudnnConvOp);
REGISTER_CUDNN_OPERATOR(Conv2DGradient, CudnnConvGradientOp);

REGISTER_CUDNN_OPERATOR(Conv3D, CudnnConvOp);
REGISTER_CUDNN_OPERATOR(Conv3DGradient, CudnnConvGradientOp);
1134 
1135 } // namespace caffe2
cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder &order)
A wrapper function to convert the Caffe storage order to cudnn storage order enum values...
Definition: common_cudnn.h:199
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:63
Copyright (c) 2016-present, Facebook, Inc.
bool HasArgument(const string &name) const
Checks if the operator has an argument of the given name.
Definition: operator.h:52
CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
bool TensorCoreAvailable()
Return the availability of TensorCores for math.
Definition: common_gpu.cc:254
cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type in a template function...
Definition: common_cudnn.h:127