Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_op_cudnn.cc
1 #include "caffe2/operators/conv_pool_op_base.h"
2 
3 #include "caffe2/core/common_gpu.h"
4 #include "caffe2/core/context_gpu.h"
5 #include "caffe2/core/cudnn_wrappers.h"
6 #include "caffe2/operators/conv_op.h"
7 #include "caffe2/operators/conv_op_cache_cudnn.h"
8 #include "caffe2/operators/op_utils_cudnn.h"
9 #include "caffe2/utils/math.h"
10 
11 namespace caffe2 {
12 
13 class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
14  public:
15  explicit CudnnConvOpBase(const OperatorDef& operator_def, Workspace* ws)
16  : ConvPoolOpBase<CUDAContext>(operator_def, ws),
17  cudnn_wrapper_(&context_),
18  cudnn_ws_nbytes_limit_(OperatorBase::GetSingleArgument<size_t>(
19  "ws_nbytes_limit",
20  kCONV_CUDNN_WORKSPACE_LIMIT_BYTES)),
21  exhaustive_search_(
22  OperatorBase::GetSingleArgument<int>("exhaustive_search", 0)),
23  deterministic_(
24  OperatorBase::GetSingleArgument<int>("deterministic", 0)),
25  cudnn_state_(OperatorBase::GetSingleArgument<int>("cudnn_state", 0)),
26  force_algo_(OperatorBase::GetRepeatedArgument<int>(
27  "force_algo",
28  vector<int>{-1, -1, -1})),
29  enable_tensor_core_(
30  OperatorBase::GetSingleArgument<bool>("enable_tensor_core", 1)) {
31  CHECK(!deterministic_ || !exhaustive_search_);
32  CAFFE_ENFORCE(group_ > 0);
33  CAFFE_ENFORCE(!deterministic_ || !exhaustive_search_);
34  for (int i = 0; i < kernel_.size(); ++i) {
35  OPERATOR_NEEDS_FEATURE(
36  pads_[i] == pads_[kernel_.size() + i],
37  "The current padding scheme leads to unequal padding on the left "
38  "and right, which is not supported by cudnn.");
39  }
40  // dilated convolution supported by some algorithms in cuDNN v6
41 #if !(CUDNN_VERSION_MIN(6, 0, 0))
42  OPERATOR_NEEDS_FEATURE(
43  dilation_h() == 1 && dilation_w() == 1,
44  "The cudnn convolution does not support dilation yet.");
45 #endif
46  // dilated grouped convolution supported in cuDNN v7.1
47 #if !(CUDNN_VERSION_MIN(7, 1, 0))
48  if (group_ != 1) {
49  for (int dim = 0; dim < kernel_.size(); ++dim) {
50  OPERATOR_NEEDS_FEATURE(
51  dilation_[dim] == 1,
52  "When group is used, dilation should not be set at the same time.");
53  }
54  }
55 #endif
56 
57 #if CUDNN_VERSION_MIN(7, 0, 0)
58  // verify TensorCore math is supported
59  enable_tensor_core_ &= TensorCoreAvailable();
60 #else
61  enable_tensor_core_ = false;
62 #endif
63 
64  bool individual_force_algo = OperatorBase::HasArgument("force_algo_fwd") ||
65  OperatorBase::HasArgument("force_algo_dgrad") ||
66  OperatorBase::HasArgument("force_algo_wgrad");
67  if (OperatorBase::HasArgument("force_algo")) {
68  CAFFE_ENFORCE(
69  !individual_force_algo,
70  "Cannot specify both force_algo and any of ",
71  "force_algo_fwd, force_algo_dgrad, force_algo_wgrad");
72  } else {
73  force_algo_ = std::vector<int>{-1, -1, -1};
74  force_algo_[ALGO_FWD] =
75  OperatorBase::GetSingleArgument<int>("force_algo_fwd", -1);
76  force_algo_[ALGO_DGRAD] =
77  OperatorBase::GetSingleArgument<int>("force_algo_dgrad", -1);
78  force_algo_[ALGO_WGRAD] =
79  OperatorBase::GetSingleArgument<int>("force_algo_wgrad", -1);
80  }
81 
82  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bottom_desc_));
83  CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&filter_desc_));
84  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bias_desc_));
85  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_));
86  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&top_desc_for_bias_));
87  CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&conv_desc_));
88  }
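 // Summary of the operator arguments consumed above (all optional):
 // ws_nbytes_limit caps the cuDNN workspace in bytes (default
 // kCONV_CUDNN_WORKSPACE_LIMIT_BYTES); exhaustive_search benchmarks cuDNN
 // algorithms instead of honoring that cap; deterministic pins the algorithm
 // choice (and is mutually exclusive with exhaustive_search); cudnn_state
 // selects which shared cuDNN state/workspace slot is used; enable_tensor_core
 // opts into TensorCore math where TensorCoreAvailable() reports support; and
 // force_algo (or the per-pass force_algo_fwd / force_algo_dgrad /
 // force_algo_wgrad overrides) hard-codes the cuDNN algorithm enum to use.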
89 
90  ~CudnnConvOpBase() override {
91  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bottom_desc_));
92  CUDNN_ENFORCE(cudnnDestroyFilterDescriptor(filter_desc_));
93  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bias_desc_));
94  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_));
95  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(top_desc_for_bias_));
96  CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(conv_desc_));
97  }
98 
99  protected:
100  // A helper function to set up the tensor Nd descriptor, depending on the
101  // order, the group, and the type given.
102  template <typename T>
103  void SetTensorNdDescriptorWithGroup(
104  int size,
105  cudnnTensorDescriptor_t tensorDesc,
106  int N,
107  int C,
108  int H,
109  int W,
110  int D) {
111 #if CUDNN_VERSION_MIN(7, 0, 0)
112  const int CC = C;
113 #else
114  const int CC = C / group_;
115 #endif
116  switch (order_) {
117  case StorageOrder::NHWC:
118  if (size == 4) {
119  CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
120  tensorDesc,
121  cudnnTypeWrapper<T>::type,
122  N,
123  CC,
124  H,
125  W,
126  H * W * C,
127  1,
128  W * C,
129  C));
130  } else {
131  vector<int> dims = {N, H, W, D, CC};
132  vector<int> strides = {H * W * D * CC, W * D * CC, D * CC, CC, 1};
133  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
134  tensorDesc,
135  cudnnTypeWrapper<T>::type,
136  size > 3 ? size : 4,
137  dims.data(),
138  strides.data()));
139  }
140  break;
141  case StorageOrder::NCHW:
142  if (size == 4) {
143  CUDNN_ENFORCE(cudnnSetTensor4dDescriptorEx(
144  tensorDesc,
145  cudnnTypeWrapper<T>::type,
146  N,
147  CC,
148  H,
149  W,
150  C * H * W,
151  H * W,
152  W,
153  1));
154  } else {
155  vector<int> dims = {N, CC, H, W, D};
156  vector<int> strides = {CC * H * W * D, H * W * D, W * D, D, 1};
157  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
158  tensorDesc,
159  cudnnTypeWrapper<T>::type,
160  size > 3 ? size : 4,
161  dims.data(),
162  strides.data()));
163  }
164  break;
165  default:
166  LOG(FATAL) << "Unknown storage order: " << order_;
167  }
168  }
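 // Worked example for the 4d case above, assuming N = 2, C = 64, H = W = 32
 // and cuDNN >= 7 (so CC == C): the NCHW branch describes dims {2, 64, 32, 32}
 // with strides {C * H * W, H * W, W, 1} = {65536, 1024, 32, 1}, while the
 // NHWC branch describes the same logical tensor with strides
 // {H * W * C, 1, W * C, C} = {65536, 1, 2048, 64}.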
169 
170  void DuplicateConvDesc(
171  cudnnConvolutionDescriptor_t input,
172  size_t kernelDims,
173  size_t dilationDims,
174  cudnnConvolutionDescriptor_t copy) {
175  if (kernelDims == 1 || kernelDims == 2) {
176  cudnnConvolutionMode_t mode;
177  cudnnDataType_t dataType;
178  int pad_height = 0;
179  int pad_width = 0;
180  int stride_height = 0;
181  int stride_width = 0;
182  int dilation_height = 0;
183  int dilation_width = 0;
184 
185 #if CUDNN_VERSION_MIN(6, 0, 0)
186  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
187  input,
188  &pad_height,
189  &pad_width,
190  &stride_height,
191  &stride_width,
192  &dilation_height,
193  &dilation_width,
194  &mode,
195  &dataType));
196 #else
197  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
198  input,
199  &pad_height,
200  &pad_width,
201  &stride_height,
202  &stride_width,
203  &dilation_height,
204  &dilation_width,
205  &mode));
206 #endif
207 
208 #if CUDNN_VERSION_MIN(6, 0, 0)
209  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
210  copy,
211  pad_height,
212  pad_width,
213  stride_height,
214  stride_width,
215  dilation_height,
216  dilation_width,
217  mode,
218  dataType));
219 #else
220  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
221  copy,
222  pad_height,
223  pad_width,
224  stride_height,
225  stride_width,
226  dilation_height,
227  dilation_width,
228  mode));
229 #endif
230  } else {
231  cudnnConvolutionMode_t mode;
232  cudnnDataType_t dataType;
233  int arrayLength = 0;
234  vector<int> ones(dilationDims, 1);
235  CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
236  input,
237  kernel_.size(),
238  &arrayLength,
239  pads_.data(),
240  stride_.data(),
241  ones.data(),
242  &mode,
243  &dataType));
244 
245  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
246  copy,
247  kernel_.size(),
248  pads_.data(),
249  stride_.data(),
250  ones.data(),
251  mode,
252  dataType));
253  }
254  }
255 
256  template <typename T>
257  cudnnDataType_t DetermineComputeTypeFromInput(const T& X) {
258  const cudaDeviceProp& prop = GetDeviceProperty(0);
259  cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
260  if (X.template IsType<at::Half>()) {
261  if (float16_compute_ && prop.major >= 6) {
262  VLOG(1) << "CUDNN Convolution: float16_compute specified and "
263  << "supported, input data is Half - using Half "
264  << "compute.";
265  computeType = CUDNN_DATA_HALF;
266  } else if (float16_compute_) {
267  VLOG(1) << "CUDNN Convolution: float16_compute specified but "
268  << "not supported, input data is Half - using float32 "
269  << "compute.";
270  } else {
271  VLOG(1) << "CUDNN Convolution: float16_compute not specified but "
272  << "input data is Half - using float32 compute.";
273  }
274  } else {
275  VLOG(1) << "CUDNN Convolution: using float32 compute.";
276  }
277  return computeType;
278  }
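 // In short: the compute type is CUDNN_DATA_HALF only when the input tensor is
 // at::Half, the float16_compute argument is set, and the device has compute
 // capability >= 6.0; in every other case (including Half inputs without
 // float16_compute) the convolution accumulates in float32.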
279 
280  void SetConvDescFromArguments() {
281 #if CUDNN_VERSION_MIN(6, 0, 0)
282  if (kernel_.size() == 1 || kernel_.size() == 2) {
283  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
284  conv_desc_,
285  pad_t(),
286  kernel_.size() == 1 ? 0 : pad_l(),
287  stride_h(),
288  kernel_.size() == 1 ? 1 : stride_w(),
289  dilation_h(),
290  kernel_.size() == 1 ? 1 : dilation_w(),
291  CUDNN_CROSS_CORRELATION,
292  compute_type_));
293  } else {
294  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
295  conv_desc_,
296  kernel_.size(),
297  pads_.data(),
298  stride_.data(),
299  dilation_.data(),
300  CUDNN_CROSS_CORRELATION,
301  compute_type_));
302  }
303 #else
304  if (kernel_.size() == 2) {
305  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
306  conv_desc_,
307  pad_t(),
308  pad_l(),
309  stride_h(),
310  stride_w(),
311  1,
312  1,
313  CUDNN_CROSS_CORRELATION));
314  } else {
315  vector<int> ones(dilation_.size(), 1);
316  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
317  conv_desc_,
318  kernel_.size(),
319  pads_.data(),
320  stride_.data(),
321  ones.data(),
322  CUDNN_CROSS_CORRELATION,
323  compute_type_));
324  }
325 #endif
326  }
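 // Note that the descriptor is always created in CUDNN_CROSS_CORRELATION mode
 // (the filter is not flipped), and that the pre-v6.0 branch passes a dilation
 // of 1 in every dimension, matching the dilation check in the constructor.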
327 
328  void SetConvDescComputeType(
329  cudnnConvolutionDescriptor_t conv_desc,
330  cudnnDataType_t math) {
331  if (kernel_.size() == 2) {
332  cudnnConvolutionMode_t mode;
333  cudnnDataType_t dataType;
334  int pad_height = 0;
335  int pad_width = 0;
336  int stride_height = 0;
337  int stride_width = 0;
338  int dilation_height = 0;
339  int dilation_width = 0;
340 
341 #if CUDNN_VERSION_MIN(6, 0, 0)
342  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
343  conv_desc,
344  &pad_height,
345  &pad_width,
346  &stride_height,
347  &stride_width,
348  &dilation_height,
349  &dilation_width,
350  &mode,
351  &dataType));
352 #else
353  CUDNN_ENFORCE(cudnnGetConvolution2dDescriptor(
354  conv_desc,
355  &pad_height,
356  &pad_width,
357  &stride_height,
358  &stride_width,
359  &dilation_height,
360  &dilation_width,
361  &mode));
362 #endif
363 
364 #if CUDNN_VERSION_MIN(6, 0, 0)
365  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
366  conv_desc,
367  pad_height,
368  pad_width,
369  stride_height,
370  stride_width,
371  dilation_height,
372  dilation_width,
373  mode,
374  math));
375 #else
376  CUDNN_ENFORCE(cudnnSetConvolution2dDescriptor(
377  conv_desc,
378  pad_height,
379  pad_width,
380  stride_height,
381  stride_width,
382  dilation_height,
383  dilation_width,
384  mode));
385 #endif
386  } else {
387  cudnnConvolutionMode_t mode;
388  cudnnDataType_t dataType;
389  int arrayLength = 0;
390  vector<int> ones(dilation_.size(), 1);
391  CUDNN_ENFORCE(cudnnGetConvolutionNdDescriptor(
392  conv_desc,
393  kernel_.size(),
394  &arrayLength,
395  pads_.data(),
396  stride_.data(),
397  ones.data(),
398  &mode,
399  &dataType));
400 
401  CUDNN_ENFORCE(cudnnSetConvolutionNdDescriptor(
402  conv_desc,
403  kernel_.size(),
404  pads_.data(),
405  stride_.data(),
406  ones.data(),
407  mode,
408  math));
409  }
410  }
411 
412  vector<int64_t> cudnn_input_dims_;
413  vector<int64_t> cudnn_filter_dims_;
414 
415  CuDNNWrapper cudnn_wrapper_;
416  cudnnTensorDescriptor_t bottom_desc_;
417  cudnnFilterDescriptor_t filter_desc_;
418  cudnnTensorDescriptor_t bias_desc_;
419  cudnnTensorDescriptor_t top_desc_;
420  // top desc for bias add in case we do group convolution
421  cudnnTensorDescriptor_t top_desc_for_bias_;
422  cudnnConvolutionDescriptor_t conv_desc_;
423  const size_t cudnn_ws_nbytes_limit_;
424  size_t cudnn_ws_nbytes_;
425  bool exhaustive_search_;
426  bool deterministic_;
427  size_t cudnn_state_;
428  vector<int> force_algo_; // stored as FWD, dFILTER, dDATA
429  bool enable_tensor_core_;
430  cudnnDataType_t compute_type_;
431 };
432 
433 class CudnnConvOp final : public CudnnConvOpBase {
434  public:
435  explicit CudnnConvOp(const OperatorDef& operator_def, Workspace* ws)
436  : CudnnConvOpBase(operator_def, ws) {}
437 
438  ~CudnnConvOp() override {}
439 
440  template <typename T_X, typename T_W, typename T_B, typename T_Y>
441  bool DoRunWithType();
442 
443  bool RunOnDevice() override;
444 
445  private:
446  cudnnConvolutionFwdAlgo_t algo_;
447  using ConvFwdAlgorithmWithCost = std::tuple<cudnnConvolutionFwdAlgo_t, float>;
448  AlgorithmsCache<ConvFwdAlgorithmWithCost> algo_cache_;
449  // Input: X, W, b
450  // Output: Y
451  INPUT_TAGS(INPUT, FILTER, BIAS);
452 };
453 
454 class CudnnConvGradientOp final : public CudnnConvOpBase {
455  public:
456  explicit CudnnConvGradientOp(const OperatorDef& operator_def, Workspace* ws)
457  : CudnnConvOpBase(operator_def, ws),
458  no_bias_(OperatorBase::GetSingleArgument<int>("no_bias", 0)) {
459  CAFFE_ENFORCE(
460  !(no_bias_ && OutputSize() == 3),
461  "If bias is not present, you should not have 3 grad outputs.");
462 
463  CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_data_conv_desc_));
464  CUDNN_ENFORCE(cudnnCreateConvolutionDescriptor(&bwd_filter_conv_desc_));
465  }
466 
467  ~CudnnConvGradientOp() override {
468  CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_data_conv_desc_));
469  CUDNN_ENFORCE(cudnnDestroyConvolutionDescriptor(bwd_filter_conv_desc_));
470  }
471 
472  template <
473  typename T_X,
474  typename T_DY,
475  typename T_W,
476  typename T_B,
477  typename T_DX,
478  typename T_DW,
479  typename T_DB>
480  bool DoRunWithType();
481 
482  bool RunOnDevice() override;
483 
484  private:
485  cudnnConvolutionDescriptor_t bwd_filter_conv_desc_;
486  cudnnConvolutionDescriptor_t bwd_data_conv_desc_;
487  cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo_;
488  cudnnConvolutionBwdDataAlgo_t bwd_data_algo_;
489  using ConvBwdFilterAlgorithmWithCost =
490  std::tuple<cudnnConvolutionBwdFilterAlgo_t, float>;
491  using ConvBwdDataAlgorithmWithCost =
492  std::tuple<cudnnConvolutionBwdDataAlgo_t, float>;
493  AlgorithmsCache<ConvBwdFilterAlgorithmWithCost> filter_algo_cache_;
494  AlgorithmsCache<ConvBwdDataAlgorithmWithCost> data_algo_cache_;
495  bool no_bias_;
496  // input: X, W, dY
497  // output: dW, db, and optionally dX
498  INPUT_TAGS(INPUT, FILTER, OUTPUT_GRAD);
499  OUTPUT_TAGS(FILTER_GRAD, BIAS_OR_INPUT_GRAD, INPUT_GRAD);
500 };
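// Output layout for CudnnConvGradientOp: output 0 is always the filter
// gradient (FILTER_GRAD). With a bias, output 1 is the bias gradient and the
// optional output 2 is the input gradient; with no_bias set, output 1 is the
// input gradient itself, which is why DoRunWithType below writes dX through
// the BIAS_OR_INPUT_GRAD slot in that case.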
501 
503 // Implementations
505 
506 static constexpr std::array<cudnnDataType_t, 2> kComputeTypesToTry = {
507  CUDNN_DATA_FLOAT,
508  CUDNN_DATA_HALF};
509 static constexpr std::array<const char*, 2> kComputePassNames = {
510  "fp32 compute",
511  "fp16 compute"};
512 
513 template <typename T_X, typename T_W, typename T_B, typename T_Y>
514 bool CudnnConvOp::DoRunWithType() {
515  auto& X = Input(INPUT);
516  auto& filter = Input(FILTER);
517 
518  // Figure out the output shape
519  CAFFE_ENFORCE(X.dim() >= 3 && X.dim() <= 5);
520  CAFFE_ENFORCE(filter.dim() >= 3 && filter.dim() <= 5);
521  const int M = filter.dim32(0);
522  auto output_sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, M);
523  auto* Y = Output(0, output_sizes, at::dtype<T_Y>());
524 
525  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
526  int group_offset_X = 0, group_offset_Y = 0;
527 
528  switch (order_) {
529  case StorageOrder::NHWC:
530  N = X.dim32(0);
531  H = X.dim32(1);
532  W = X.dim() > 3 ? X.dim32(2) : 1;
533  D = X.dim() > 4 ? X.dim32(3) : 1;
534  C = X.dim32(X.dim() - 1);
535  H_out = Y->dim32(1);
536  W_out = Y->dim() > 3 ? Y->dim32(2) : 1;
537  D_out = Y->dim() > 4 ? Y->dim32(3) : 1;
538  for (int i = 0; i < kernel_.size(); ++i) {
539  CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
540  }
541  CAFFE_ENFORCE_EQ(filter.dim32(filter.dim() - 1), C / group_);
542  group_offset_X = C / group_;
543  group_offset_Y = M / group_;
544  break;
545  case StorageOrder::NCHW:
546  N = X.dim32(0);
547  C = X.dim32(1);
548  H = X.dim32(2);
549  W = X.dim() > 3 ? X.dim32(3) : 1;
550  D = X.dim() > 4 ? X.dim32(4) : 1;
551  H_out = Y->dim32(2);
552  W_out = Y->dim() > 3 ? Y->dim32(3) : 1;
553  D_out = Y->dim() > 4 ? Y->dim32(4) : 1;
554  CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
555  for (int i = 0; i < kernel_.size(); ++i) {
556  CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
557  }
558  group_offset_X = C / group_ * H * W * D;
559  group_offset_Y = M / group_ * H_out * W_out * D_out;
560  break;
561  default:
562  LOG(FATAL) << "Unknown storage order: " << order_;
563  }
564 
565  CAFFE_ENFORCE(
566  C % group_ == 0,
567  "If you set group, the number of input channels should be divisible "
568  "by group.");
569  CAFFE_ENFORCE(
570  M % group_ == 0,
571  "If you set group, the number of output channels should be divisible "
572  "by group.");
573 
574  if (N == 0) {
575  Y->template mutable_data<T_Y>();
576  return true;
577  }
578 
579  int group_offset_filter = filter.numel() / group_;
580 
581  // Set up the cudnn algorithms & workspace if necessary
582  bool input_changed = (X.sizes() != cudnn_input_dims_);
583  bool filter_changed = (filter.sizes() != cudnn_filter_dims_);
584  if (input_changed || filter_changed) {
585  VLOG(1) << "Changing the cudnn descriptor configurations.";
586  if (input_changed) {
587  cudnn_input_dims_ = X.sizes().vec();
588  SetTensorNdDescriptorWithGroup<T_X>(X.dim(), bottom_desc_, N, C, H, W, D);
589  }
590  if (filter_changed) {
591  cudnn_filter_dims_ = filter.sizes().vec();
592  if (kernel_.size() == 1 || kernel_.size() == 2) {
593 #if CUDNN_VERSION_MIN(7, 0, 0)
594  const int MM = M;
595 #else
596  const int MM = M / group_;
597 #endif
598  CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
599  filter_desc_,
600  cudnnTypeWrapper<T_W>::type,
601  GetCudnnTensorFormat(order_),
602  MM,
603  C / group_,
604  kernel_h(),
605  kernel_.size() == 1 ? 1 : kernel_w()));
606  } else {
607  vector<int> dims(filter.sizes().begin(), filter.sizes().end());
608 #if !CUDNN_VERSION_MIN(7, 0, 0)
609  // We only need to divide dims by group_ when CUDNN version < 7.0
610  // see CUDA group convolution doc: https://fburl.com/dgj6dvpd
611  order_ == StorageOrder::NCHW ? dims[1] /= group_
612  : dims[filter.ndim() - 1] /= group_;
613 #endif
614  CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
615  filter_desc_,
616  cudnnTypeWrapper<T_W>::type,
617  GetCudnnTensorFormat(order_),
618  dims.size(),
619  dims.data()));
620  }
621  if (InputSize() == 3) {
622  if (kernel_.size() == 1 || kernel_.size() == 2) {
623  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
624  bias_desc_,
625  GetCudnnTensorFormat(order_),
626  cudnnTypeWrapper<T_B>::type,
627  1,
628  M,
629  1,
630  1));
631  } else {
632  std::vector<int> bias_dims(X.dim(), 1);
633  bias_dims[1] = M;
634  std::vector<int> strides = {M, 1, 1, 1, 1, 1};
635  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
636  bias_desc_,
637  cudnnTypeWrapper<T_B>::type,
638  X.dim() > 3 ? X.dim() : 4,
639  bias_dims.data(),
640  strides.data()));
641  }
642  }
643  }
644  // Set the output
645  SetTensorNdDescriptorWithGroup<T_Y>(
646  X.dim(), top_desc_, N, M, H_out, W_out, D_out);
647  // Set the output with descriptor useful for bias addition in one run.
648  if (kernel_.size() == 1 || kernel_.size() == 2) {
649  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
650  top_desc_for_bias_,
651  GetCudnnTensorFormat(order_),
652  cudnnTypeWrapper<T_B>::type,
653  N,
654  M,
655  H_out,
656  W_out));
657  } else {
658  vector<int> dims = {N, M, H_out, W_out, D_out};
659  vector<int> strides = {M * H_out * W_out * D_out,
660  H_out * W_out * D_out,
661  W_out * D_out,
662  D_out,
663  1};
664  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
665  top_desc_for_bias_,
666  cudnnTypeWrapper<T_B>::type,
667  X.dim() > 3 ? X.dim() : 4,
668  dims.data(),
669  strides.data()));
670  }
671 
672  compute_type_ = DetermineComputeTypeFromInput(X);
673  SetConvDescFromArguments();
674 
675 #if CUDNN_VERSION_MIN(7, 0, 0)
676  if (enable_tensor_core_) {
677  CUDNN_ENFORCE(
678  cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
679  }
680 
681  // enable cuDNN conv groups
682  CUDNN_CHECK(cudnnSetConvolutionGroupCount(conv_desc_, group_));
683 #endif
684 
685  if (force_algo_[ALGO_FWD] >= 0) {
686  algo_ = (cudnnConvolutionFwdAlgo_t)force_algo_[ALGO_FWD];
687  } else if (deterministic_) {
688  algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
689  } else if (exhaustive_search_) {
690  // Even when FP16 compute is supported and requested, try FP32
691  // because it may be faster. However, if FP32 compute is specified,
692  // FP16 is not a suitable alternative - early out from the loop.
693  std::array<ConvFwdAlgorithmWithCost, 2> algosToCompare;
694  for (int i = 0; i < 2; i++) {
695  SetConvDescComputeType(conv_desc_, kComputeTypesToTry[i]);
696 
697  algosToCompare[i] = algo_cache_.getAlgorithm(
698  X.sizes(), filter.sizes(), kComputeTypesToTry[i], [&]() {
699  VLOG(1) << "CUDNN Convolution fwd: doing exhaustive "
700  << "search for " << kComputePassNames[i];
701  // When we do an exhaustive search, we will ignore the workspace
702  // size limit and simply go for the fastest algorithm. If you
703  // happen to run out of memory later, you will be on your own...
704  int returned_algo_count;
705  std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
706  fwd_perf_stat;
707 
708  // No need to clean up the workspace here.
709  cudnn_wrapper_.with_cudnn_state(
710  cudnn_state_, [&](CuDNNState* state) {
711  // Actually run the search.
712  CUDNN_ENFORCE(cudnnFindConvolutionForwardAlgorithmEx(
713  state->cudnn_handle(),
714  bottom_desc_,
715  X.template data<T_X>(),
716  filter_desc_,
717  filter.template data<T_W>(),
718  conv_desc_,
719  top_desc_,
720  Y->template mutable_data<T_Y>(),
721  kNUM_CUDNN_FWD_ALGS,
722  &returned_algo_count,
723  fwd_perf_stat.data(),
724  state->workspace().get(cudnn_ws_nbytes_limit_),
725  cudnn_ws_nbytes_limit_));
726  });
727  LogCuDNNPerfStats(fwd_perf_stat, returned_algo_count);
728  float algo_time = fwd_perf_stat[0].status == CUDNN_STATUS_SUCCESS
729  ? fwd_perf_stat[0].time
730  : 1e10;
731  return ConvFwdAlgorithmWithCost(fwd_perf_stat[0].algo, algo_time);
732  });
733 
734  // When set to fp32 compute, don't try fp16
735  if (compute_type_ == CUDNN_DATA_FLOAT) {
736  break;
737  }
738  }
739 
740  if (compute_type_ == CUDNN_DATA_FLOAT) {
741  // For FP32 compute, just use the best FP32 algorithm
742  algo_ = std::get<0>(algosToCompare[0]);
743  } else {
744  // For FP16 compute, choose algo with fastest execution
745  int bestAlgoIndex =
746  (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
747  ? 0
748  : 1;
749  algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
750  SetConvDescComputeType(conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
751  }
752  } else {
753  // Get the convolution algorithm based on the workspace limit.
754  CUDNN_ENFORCE(cudnnGetConvolutionForwardAlgorithm(
755  cudnn_wrapper_.inline_cudnn_handle(),
756  bottom_desc_,
757  filter_desc_,
758  conv_desc_,
759  top_desc_,
760  CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
761  cudnn_ws_nbytes_limit_,
762  &algo_));
763  }
764  for (int step = 0; step < 2; ++step) {
765  cudnnStatus_t _status = cudnnGetConvolutionForwardWorkspaceSize(
766  cudnn_wrapper_.inline_cudnn_handle(),
767  bottom_desc_,
768  filter_desc_,
769  conv_desc_,
770  top_desc_,
771  algo_,
772  &cudnn_ws_nbytes_);
773  if (step == 0) {
774  if (_status == CUDNN_STATUS_SUCCESS) {
775  break;
776  }
777  if (_status == CUDNN_STATUS_NOT_SUPPORTED) {
778  cudnnConvolutionFwdAlgo_t new_algo = deterministic_
779  ? CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
780  : CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
781  VLOG(1) << "Forward algorithm " << (int)algo_
782  << " is not currently supported for given parameters."
783  << " Trying the default algorithm " << (int)new_algo;
784  algo_ = new_algo;
785  continue;
786  }
787  }
788  CUDNN_ENFORCE(_status);
789  }
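 // The two-step loop above first queries the workspace size for the chosen
 // algorithm and, if cuDNN reports CUDNN_STATUS_NOT_SUPPORTED for this
 // configuration, retries once with an IMPLICIT_GEMM-family fallback before
 // failing.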
790  VLOG(1) << "CuDNN algorithm: " << algo_;
791  VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
792  }
793 
794  // Now, actually run the computation.
795  // Run directly through cuDNN if possible
796 #if CUDNN_VERSION_MIN(7, 0, 0)
797  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
798  CUDNN_ENFORCE(cudnnConvolutionForward(
799  state->cudnn_handle(),
800  cudnnTypeWrapper<T_X>::kOne(),
801  bottom_desc_,
802  X.template data<T_X>(),
803  filter_desc_,
804  filter.template data<T_W>(),
805  conv_desc_,
806  algo_,
807  state->workspace().get(cudnn_ws_nbytes_),
808  cudnn_ws_nbytes_,
809  cudnnTypeWrapper<T_Y>::kZero(),
810  top_desc_,
811  Y->template mutable_data<T_Y>()));
812  });
813 #else
814  // otherwise manually run through groups
815  for (int i = 0; i < group_; ++i) {
816  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
817  CUDNN_ENFORCE(cudnnConvolutionForward(
818  state->cudnn_handle(),
819  cudnnTypeWrapper<T_X>::kOne(),
820  bottom_desc_,
821  X.template data<T_X>() + i * group_offset_X,
822  filter_desc_,
823  filter.template data<T_W>() + i * group_offset_filter,
824  conv_desc_,
825  algo_,
826  state->workspace().get(cudnn_ws_nbytes_),
827  cudnn_ws_nbytes_,
828  cudnnTypeWrapper<T_Y>::kZero(),
829  top_desc_,
830  Y->template mutable_data<T_Y>() + i * group_offset_Y));
831  });
832  }
833 #endif
834  // Bias
835  if (InputSize() == 3) {
836  auto& bias = Input(BIAS);
837 
838  CAFFE_ENFORCE_EQ(bias.dim(), 1);
839  CAFFE_ENFORCE_EQ(bias.dim32(0), M);
840 
841  CUDNN_ENFORCE(cudnnAddTensor(
842  cudnn_wrapper_.inline_cudnn_handle(),
843  cudnnTypeWrapper<T_B>::kOne(),
844  bias_desc_,
845  bias.template data<T_B>(),
846  cudnnTypeWrapper<T_Y>::kOne(),
847  top_desc_for_bias_,
848  Y->template mutable_data<T_Y>()));
849  }
850  // Done.
851  return true;
852 }
853 
854 bool CudnnConvOp::RunOnDevice() {
855  if (Input(0).IsType<float>()) {
856  return DoRunWithType<
857  float, // X
858  float, // W
859  float, // B
860  float>(); // Y
861  } else if (Input(0).IsType<at::Half>()) {
862  return DoRunWithType<
863  at::Half, // X
864  at::Half, // W
865  at::Half, // B
866  at::Half>(); // Y
867  } else {
868  LOG(FATAL) << "Only float (32bit) and Half are supported by "
869  << "cudnn convolution, but input " << debug_def().input(0)
870  << " has [" << Input(0).dtype().name() << "]";
871  }
872  return true;
873 }
874 
875 template <
876  typename T_X,
877  typename T_DY,
878  typename T_W,
879  typename T_B,
880  typename T_DX,
881  typename T_DW,
882  typename T_DB>
883 bool CudnnConvGradientOp::DoRunWithType() {
884  auto& X = Input(INPUT);
885  auto& filter = Input(FILTER);
886  auto& dY = Input(OUTPUT_GRAD);
887 
888  CAFFE_ENFORCE(X.dim() >= 3 && X.dim() <= 5);
889  CAFFE_ENFORCE(filter.dim() >= 3 && filter.dim() <= 5);
890 
891  const int M = filter.dim32(0);
892  int N = 0, C = 0, H = 0, W = 0, D = 0, H_out = 0, W_out = 0, D_out = 0;
893  int group_offset_X = 0, group_offset_Y = 0;
894 
895  switch (order_) {
896  case StorageOrder::NHWC:
897  N = X.dim32(0);
898  H = X.dim32(1);
899  W = X.dim() > 3 ? X.dim32(2) : 1;
900  D = X.dim() > 4 ? X.dim32(3) : 1;
901  C = X.dim32(X.dim() - 1);
902  H_out = dY.dim32(1);
903  W_out = dY.dim() > 3 ? dY.dim32(2) : 1;
904  D_out = dY.dim() > 4 ? dY.dim32(3) : 1;
905  for (int i = 0; i < kernel_.size(); ++i) {
906  CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
907  }
908  CAFFE_ENFORCE_EQ(filter.dim32(filter.dim() - 1), C / group_);
909  group_offset_X = C / group_;
910  group_offset_Y = M / group_;
911  break;
912  case StorageOrder::NCHW:
913  N = X.dim32(0);
914  C = X.dim32(1);
915  H = X.dim32(2);
916  W = X.dim() > 3 ? X.dim32(3) : 1;
917  D = X.dim() > 4 ? X.dim32(4) : 1;
918  H_out = dY.dim32(2);
919  W_out = dY.dim() > 3 ? dY.dim32(3) : 1;
920  D_out = dY.dim() > 4 ? dY.dim32(4) : 1;
921  CAFFE_ENFORCE_EQ(filter.dim32(1), C / group_);
922  for (int i = 0; i < kernel_.size(); ++i) {
923  CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
924  }
925  group_offset_X = C / group_ * H * W * D;
926  group_offset_Y = M / group_ * H_out * W_out * D_out;
927  break;
928  default:
929  LOG(FATAL) << "Unknown storage order: " << order_;
930  }
931 
932  CAFFE_ENFORCE(
933  C % group_ == 0,
934  "If you set group, the number of input channels should be divisible "
935  "by group.");
936  CAFFE_ENFORCE(
937  M % group_ == 0,
938  "If you set group, the number of output channels should be divisible "
939  "by group.");
940 
941  int group_offset_filter = filter.numel() / group_;
942  if (kernel_.size() == 1) {
943  ConvPoolOpBase<CUDAContext>::ComputePads({H});
944  } else if (kernel_.size() == 2) {
945  ConvPoolOpBase<CUDAContext>::ComputePads({H, W});
946  } else if (kernel_.size() == 3) {
947  ConvPoolOpBase<CUDAContext>::ComputePads({H, W, D});
948  } else {
949  CAFFE_THROW("Unsupported kernel size:", kernel_.size());
950  }
951  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T_DW>());
952 
953  if (N == 0) {
954  math::Set<T_DW, CUDAContext>(
955  dfilter->numel(),
956  T_DW(0),
957  dfilter->template mutable_data<T_DW>(),
958  &context_);
959  if (!no_bias_) {
960  auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T_DB>());
961  math::Set<T_DB, CUDAContext>(
962  dbias->numel(),
963  T_DB(0),
964  dbias->template mutable_data<T_DB>(),
965  &context_);
966  }
967  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
968  auto* dX = Output(
969  no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
970  X.sizes(),
971  at::dtype<T_DX>());
972  dX->template mutable_data<T_DX>();
973  }
974  return true;
975  }
976 
977  // Set up the cudnn algorithms & workspace if necessary
978  bool input_changed = (X.sizes() != cudnn_input_dims_);
979  bool filter_changed = (filter.sizes() != cudnn_filter_dims_);
980  if (input_changed || filter_changed) {
981  VLOG(1) << "Changing the cudnn descriptor configurations.";
982  if (input_changed) {
983  cudnn_input_dims_ = X.sizes().vec();
984  SetTensorNdDescriptorWithGroup<T_X>(X.dim(), bottom_desc_, N, C, H, W, D);
985  }
986  if (filter_changed) {
987  cudnn_filter_dims_ = filter.sizes().vec();
988  if (kernel_.size() == 1 || kernel_.size() == 2) {
989 #if CUDNN_VERSION_MIN(7, 0, 0)
990  const int MM = M;
991 #else
992  const int MM = M / group_;
993 #endif
994  CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
995  filter_desc_,
996  cudnnTypeWrapper<T_W>::type,
997  GetCudnnTensorFormat(order_),
998  MM,
999  C / group_,
1000  kernel_h(),
1001  kernel_.size() == 1 ? 1 : kernel_w()));
1002  } else {
1003  vector<int> dims(filter.sizes().begin(), filter.sizes().end());
1004 #if !CUDNN_VERSION_MIN(7, 0, 0)
1005  // We only need to divide dims by group_ when CUDNN version < 7.0
1006  // see CUDA group convolution doc: https://fburl.com/dgj6dvpd
1007  order_ == StorageOrder::NCHW ? dims[1] /= group_
1008  : dims[filter.ndim() - 1] /= group_;
1009 #endif
1010 
1011  CUDNN_ENFORCE(cudnnSetFilterNdDescriptor(
1012  filter_desc_,
1013  cudnnTypeWrapper<T_W>::type,
1014  GetCudnnTensorFormat(order_),
1015  dims.size(),
1016  dims.data()));
1017  }
1018  if (!no_bias_) {
1019  if (kernel_.size() == 1 || kernel_.size() == 2) {
1020  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
1021  bias_desc_,
1022  GetCudnnTensorFormat(order_),
1023  cudnnTypeWrapper<T_B>::type,
1024  1,
1025  M,
1026  1,
1027  1));
1028  } else {
1029  std::vector<int> bias_dims(X.dim(), 1);
1030  bias_dims[1] = M;
1031  std::vector<int> strides = {M, 1, 1, 1, 1, 1};
1032  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
1033  bias_desc_,
1034  cudnnTypeWrapper<T_B>::type,
1035  X.dim() > 3 ? X.dim() : 4,
1036  bias_dims.data(),
1037  strides.data()));
1038  }
1039  }
1040  }
1041  // Set the output
1042  SetTensorNdDescriptorWithGroup<T_DX>(
1043  X.dim(), top_desc_, N, M, H_out, W_out, D_out);
1044  // Set the output with descriptor useful for bias addition in one run.
1045  if (kernel_.size() == 1 || kernel_.size() == 2) {
1046  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
1047  top_desc_for_bias_,
1048  GetCudnnTensorFormat(order_),
1049  cudnnTypeWrapper<T_B>::type,
1050  N,
1051  M,
1052  H_out,
1053  W_out));
1054  } else {
1055  vector<int> dims = {N, M, H_out, W_out, D_out};
1056  vector<int> strides = {M * H_out * W_out * D_out,
1057  H_out * W_out * D_out,
1058  W_out * D_out,
1059  D_out,
1060  1};
1061  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
1062  top_desc_for_bias_,
1063  cudnnTypeWrapper<T_B>::type,
1064  X.dim() > 3 ? X.dim() : 4,
1065  dims.data(),
1066  strides.data()));
1067  }
1068 
1069  compute_type_ = DetermineComputeTypeFromInput(X);
1070  SetConvDescFromArguments();
1071 
1072  DuplicateConvDesc(
1073  conv_desc_, kernel_.size(), dilation_.size(), bwd_filter_conv_desc_);
1074  DuplicateConvDesc(
1075  conv_desc_, kernel_.size(), dilation_.size(), bwd_data_conv_desc_);
1076 
1077 #if CUDNN_VERSION_MIN(7, 0, 0)
1078  if (enable_tensor_core_) {
1079  CUDNN_ENFORCE(cudnnSetConvolutionMathType(
1080  bwd_filter_conv_desc_, CUDNN_TENSOR_OP_MATH));
1081  CUDNN_ENFORCE(cudnnSetConvolutionMathType(
1082  bwd_data_conv_desc_, CUDNN_TENSOR_OP_MATH));
1083  }
1084 
1085  // set cuDNN groups if appropriate
1086  CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_filter_conv_desc_, group_));
1087  CUDNN_CHECK(cudnnSetConvolutionGroupCount(bwd_data_conv_desc_, group_));
1088 #endif
1089 
1090  // Choose dW algorithm
1091  if (force_algo_[ALGO_WGRAD] >= 0) {
1092  bwd_filter_algo_ =
1093  (cudnnConvolutionBwdFilterAlgo_t)force_algo_[ALGO_WGRAD];
1094  } else if (deterministic_) {
1095  bwd_filter_algo_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
1096  } else if (exhaustive_search_) {
1097  // Even when FP16 compute is supported and requested, try FP32
1098  // because it may be faster. However, if FP32 compute is specified,
1099  // FP16 is not a suitable alternative - early out from the loop.
1100  std::array<ConvBwdFilterAlgorithmWithCost, 2> algosToCompare;
1101  for (int i = 0; i < 2; i++) {
1102  SetConvDescComputeType(bwd_filter_conv_desc_, kComputeTypesToTry[i]);
1103 
1104  algosToCompare[i] = filter_algo_cache_.getAlgorithm(
1105  X.sizes(), filter.sizes(), kComputeTypesToTry[i], [&]() {
1106  VLOG(1) << "CUDNN Convolution bwd: doing filter exhaustive "
1107  << "search for " << kComputePassNames[i];
1108  // When we do an exhaustive search, we will ignore the workspace
1109  // size limit and simply go for the fastest algorithm. If you
1110  // happen to run out of memory later, you will be on your own...
1111  int returned_algo_count;
1112  // We clean up the current workspace memory so that the forward
1113  // algorithm is free to allocate memory.
1114  // Actually run the search.
1115  std::array<
1116  cudnnConvolutionBwdFilterAlgoPerf_t,
1117  kNUM_CUDNN_BWD_FILTER_ALGS>
1118  filter_perf_stat;
1119 
1120  cudnn_wrapper_.with_cudnn_state(
1121  cudnn_state_, [&](CuDNNState* state) {
1122  CUDNN_ENFORCE(cudnnFindConvolutionBackwardFilterAlgorithmEx(
1123  state->cudnn_handle(),
1124  bottom_desc_,
1125  X.template data<T_X>(),
1126  top_desc_,
1127  dY.template data<T_DY>(),
1128  bwd_filter_conv_desc_,
1129  filter_desc_,
1130  dfilter->template mutable_data<T_DW>(),
1131  kNUM_CUDNN_BWD_FILTER_ALGS,
1132  &returned_algo_count,
1133  filter_perf_stat.data(),
1134  state->workspace().get(cudnn_ws_nbytes_limit_),
1135  cudnn_ws_nbytes_limit_));
1136  });
1137  LogCuDNNPerfStats(filter_perf_stat, returned_algo_count);
1138  float algo_time =
1139  filter_perf_stat[0].status == CUDNN_STATUS_SUCCESS
1140  ? filter_perf_stat[0].time
1141  : 1e10;
1142  return ConvBwdFilterAlgorithmWithCost(
1143  filter_perf_stat[0].algo, algo_time);
1144  });
1145 
1146  // When set to fp32 compute, don't try fp16
1147  if (compute_type_ == CUDNN_DATA_FLOAT) {
1148  break;
1149  }
1150  }
1151 
1152  if (compute_type_ == CUDNN_DATA_FLOAT) {
1153  // For FP32 compute, just use the best FP32 algorithm
1154  bwd_filter_algo_ = std::get<0>(algosToCompare[0]);
1155  } else {
1156  // For FP16 compute, choose algo with fastest execution
1157  int bestAlgoIndex =
1158  (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
1159  ? 0
1160  : 1;
1161  bwd_filter_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
1162  SetConvDescComputeType(
1163  bwd_filter_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
1164  }
1165  } else {
1166  // choose backward algorithm for filter
1167  CUDNN_ENFORCE(cudnnGetConvolutionBackwardFilterAlgorithm(
1168  cudnn_wrapper_.inline_cudnn_handle(),
1169  bottom_desc_,
1170  top_desc_,
1171  bwd_filter_conv_desc_,
1172  filter_desc_,
1173  CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
1174  cudnn_ws_nbytes_limit_,
1175  &bwd_filter_algo_));
1176  }
1177  // Pick dX algo if needed
1178  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1179  if (force_algo_[ALGO_DGRAD] >= 0) {
1180  bwd_data_algo_ = (cudnnConvolutionBwdDataAlgo_t)force_algo_[ALGO_DGRAD];
1181  } else if (deterministic_) {
1182  bwd_data_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
1183  } else if (exhaustive_search_) {
1184  // Even when FP16 compute is supported and requested, try FP32
1185  // because it may be faster. However, if FP32 compute is specified,
1186  // FP16 is not a suitable alternative - early out from the loop.
1187  std::array<ConvBwdDataAlgorithmWithCost, 2> algosToCompare;
1188  for (int i = 0; i < 2; i++) {
1189  SetConvDescComputeType(bwd_data_conv_desc_, kComputeTypesToTry[i]);
1190 
1191  algosToCompare[i] = data_algo_cache_.getAlgorithm(
1192  X.sizes(), filter.sizes(), kComputeTypesToTry[i], [&]() {
1193  VLOG(1) << "CUDNN Convolution bwd: doing data exhaustive "
1194  << "search for " << kComputePassNames[i];
1195  int returned_algo_count;
1196 
1197  std::array<
1198  cudnnConvolutionBwdDataAlgoPerf_t,
1199  kNUM_CUDNN_BWD_DATA_ALGS>
1200  data_perf_stat;
1201  cudnn_wrapper_.with_cudnn_state(
1202  cudnn_state_, [&](CuDNNState* state) {
1203  auto* dX = Output(
1204  no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
1205  X.sizes(),
1206  at::dtype<T_DX>());
1207  const T_W* filter_data = filter.template data<T_W>();
1208  const T_DY* dYdata = dY.template data<T_DY>();
1209  T_DX* dXdata = dX->template mutable_data<T_DX>();
1210  CUDNN_ENFORCE(cudnnFindConvolutionBackwardDataAlgorithmEx(
1211  state->cudnn_handle(),
1212  filter_desc_,
1213  filter_data,
1214  top_desc_,
1215  dYdata,
1216  bwd_data_conv_desc_,
1217  bottom_desc_,
1218  dXdata,
1219  kNUM_CUDNN_BWD_DATA_ALGS,
1220  &returned_algo_count,
1221  data_perf_stat.data(),
1222  state->workspace().get(cudnn_ws_nbytes_limit_),
1223  cudnn_ws_nbytes_limit_));
1224  });
1225 
1226  LogCuDNNPerfStats(data_perf_stat, returned_algo_count);
1227  float algo_time =
1228  data_perf_stat[0].status == CUDNN_STATUS_SUCCESS
1229  ? data_perf_stat[0].time
1230  : 1e10;
1231  return ConvBwdDataAlgorithmWithCost(
1232  data_perf_stat[0].algo, algo_time);
1233  });
1234 
1235  // When set to fp32 compute, don't try fp16
1236  if (compute_type_ == CUDNN_DATA_FLOAT) {
1237  break;
1238  }
1239  }
1240 
1241  if (compute_type_ == CUDNN_DATA_FLOAT) {
1242  // For FP32 compute, just use the best FP32 algorithm
1243  bwd_data_algo_ = std::get<0>(algosToCompare[0]);
1244  } else {
1245  // For FP16 compute, choose algo with fastest execution
1246  int bestAlgoIndex =
1247  (std::get<1>(algosToCompare[0]) < std::get<1>(algosToCompare[1]))
1248  ? 0
1249  : 1;
1250  bwd_data_algo_ = std::get<0>(algosToCompare[bestAlgoIndex]);
1251  SetConvDescComputeType(
1252  bwd_data_conv_desc_, kComputeTypesToTry[bestAlgoIndex]);
1253  }
1254  } else {
1255  CUDNN_ENFORCE(cudnnGetConvolutionBackwardDataAlgorithm(
1256  cudnn_wrapper_.inline_cudnn_handle(),
1257  filter_desc_,
1258  top_desc_,
1259  bwd_data_conv_desc_,
1260  bottom_desc_,
1261  CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
1262  cudnn_ws_nbytes_limit_,
1263  &bwd_data_algo_));
1264  }
1265  }
1266 
1267  // get workspace size for backwards filter algorithm
1268  size_t bwd_filter_ws_size, bwd_data_ws_size;
1269 
1270  for (int step = 0; step < 2; ++step) {
1271  cudnnStatus_t _status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
1272  cudnn_wrapper_.inline_cudnn_handle(),
1273  bottom_desc_,
1274  top_desc_,
1275  bwd_filter_conv_desc_,
1276  filter_desc_,
1277  bwd_filter_algo_,
1278  &bwd_filter_ws_size);
1279  if (step == 0) {
1280  if (_status == CUDNN_STATUS_SUCCESS) {
1281  break;
1282  }
1283  if (_status == CUDNN_STATUS_NOT_SUPPORTED) {
1284  cudnnConvolutionBwdFilterAlgo_t new_algo = deterministic_
1285  ? CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
1286  : CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
1287  VLOG(1) << "Backward Filter algorithm " << (int)bwd_filter_algo_
1288  << " is not currently supported for given parameters."
1289  << " Trying the default algorithm " << (int)new_algo;
1290  bwd_filter_algo_ = new_algo;
1291  continue;
1292  }
1293  }
1294  CUDNN_ENFORCE(_status);
1295  }
1296 
1297  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1298  // get workspace size for backwards data algorithm
1299  for (int step = 0; step < 2; ++step) {
1300  cudnnStatus_t _status = cudnnGetConvolutionBackwardDataWorkspaceSize(
1301  cudnn_wrapper_.inline_cudnn_handle(),
1302  filter_desc_,
1303  top_desc_,
1304  bwd_data_conv_desc_,
1305  bottom_desc_,
1306  bwd_data_algo_,
1307  &bwd_data_ws_size);
1308  if (step == 0) {
1309  if (_status == CUDNN_STATUS_SUCCESS) {
1310  break;
1311  }
1312  if (_status == CUDNN_STATUS_NOT_SUPPORTED) {
1313  cudnnConvolutionBwdDataAlgo_t new_algo = deterministic_
1314  ? CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
1315  : CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
1316  VLOG(1) << "Backward Data algorithm " << (int)bwd_data_algo_
1317  << " is not currently supported for given parameters."
1318  << " Trying the default algorithm " << (int)new_algo;
1319  bwd_data_algo_ = new_algo;
1320  continue;
1321  }
1322  }
1323  CUDNN_ENFORCE(_status);
1324  }
1325  } else {
1326  bwd_data_ws_size = 0;
1327  }
1328  cudnn_ws_nbytes_ = std::max(bwd_filter_ws_size, bwd_data_ws_size);
1329 
1330  VLOG(1) << "CuDNN bwd data & filter algorithm: " << bwd_data_algo_ << ", "
1331  << bwd_filter_algo_;
1332  VLOG(1) << "CuDNN workspace size: " << cudnn_ws_nbytes_;
1333  }
1334 
1335  // Now, actually run the computation.
1336  if (!no_bias_) {
1337  auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T_DB>());
1338  CUDNN_ENFORCE(cudnnConvolutionBackwardBias(
1339  cudnn_wrapper_.inline_cudnn_handle(),
1340  cudnnTypeWrapper<T_DY>::kOne(),
1341  top_desc_for_bias_,
1342  dY.template data<T_DY>(),
1343  cudnnTypeWrapper<T_DB>::kZero(),
1344  bias_desc_,
1345  dbias->template mutable_data<T_DB>()));
1346  }
1347 
1348 #if CUDNN_VERSION_MIN(7, 0, 0)
1349  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
1350  CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
1351  state->cudnn_handle(),
1352  cudnnTypeWrapper<T_X>::kOne(),
1353  bottom_desc_,
1354  X.template data<T_X>(),
1355  top_desc_,
1356  dY.template data<T_DY>(),
1357  bwd_filter_conv_desc_,
1358  bwd_filter_algo_,
1359  state->workspace().get(cudnn_ws_nbytes_),
1360  cudnn_ws_nbytes_,
1361  cudnnTypeWrapper<T_DW>::kZero(),
1362  filter_desc_,
1363  dfilter->template mutable_data<T_DW>()));
1364  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1365  // Compute the gradient w.r.t. the input.
1366 
1367  auto* dX = Output(
1368  no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
1369  X.sizes(),
1370  at::dtype<T_DX>());
1371  CUDNN_ENFORCE(cudnnConvolutionBackwardData(
1372  state->cudnn_handle(),
1373  cudnnTypeWrapper<T_W>::kOne(),
1374  filter_desc_,
1375  filter.template data<T_W>(),
1376  top_desc_,
1377  dY.template data<T_DY>(),
1378  bwd_data_conv_desc_,
1379  bwd_data_algo_,
1380  state->workspace().get(cudnn_ws_nbytes_),
1381  cudnn_ws_nbytes_,
1382  cudnnTypeWrapper<T_DX>::kZero(),
1383  bottom_desc_,
1384  dX->template mutable_data<T_DX>()));
1385  }
1386  });
1387 #else
1388  for (int i = 0; i < group_; ++i) {
1389  cudnn_wrapper_.with_cudnn_state(cudnn_state_, [&](CuDNNState* state) {
1390  CUDNN_ENFORCE(cudnnConvolutionBackwardFilter(
1391  state->cudnn_handle(),
1392  cudnnTypeWrapper<T_X>::kOne(),
1393  bottom_desc_,
1394  X.template data<T_X>() + i * group_offset_X,
1395  top_desc_,
1396  dY.template data<T_DY>() + i * group_offset_Y,
1397  bwd_filter_conv_desc_,
1398  bwd_filter_algo_,
1399  state->workspace().get(cudnn_ws_nbytes_),
1400  cudnn_ws_nbytes_,
1401  cudnnTypeWrapper<T_DW>::kZero(),
1402  filter_desc_,
1403  dfilter->template mutable_data<T_DW>() + i * group_offset_filter));
1404  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
1405  // Compute the gradient w.r.t. the input.
1406  auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
1407  dX->ResizeLike(X);
1408  CUDNN_ENFORCE(cudnnConvolutionBackwardData(
1409  state->cudnn_handle(),
1410  cudnnTypeWrapper<T_W>::kOne(),
1411  filter_desc_,
1412  filter.template data<T_W>() + i * group_offset_filter,
1413  top_desc_,
1414  dY.template data<T_DY>() + i * group_offset_Y,
1415  bwd_data_conv_desc_,
1416  bwd_data_algo_,
1417  state->workspace().get(cudnn_ws_nbytes_),
1418  cudnn_ws_nbytes_,
1419  cudnnTypeWrapper<T_DX>::kZero(),
1420  bottom_desc_,
1421  dX->template mutable_data<T_DX>() + i * group_offset_X));
1422  }
1423  });
1424  }
1425 #endif
1426  return true;
1427 }
1428 
1429 // TODO(Yangqing): a lot of the function contents are very similar. Consider
1430 // consolidating them.
1431 bool CudnnConvGradientOp::RunOnDevice() {
1432  if (Input(0).IsType<float>()) {
1433  return DoRunWithType<
1434  float, // X
1435  float, // dY
1436  float, // W
1437  float, // b
1438  float, // dX
1439  float, // dW
1440  float>(); // db
1441  } else if (Input(0).IsType<at::Half>()) {
1442  return DoRunWithType<
1443  at::Half, // X
1444  at::Half, // dY
1445  at::Half, // W
1446  at::Half, // b
1447  at::Half, // dX
1448  at::Half, // dW
1449  at::Half>(); // db
1450  } else {
1451  LOG(FATAL) << "Unsupported input types";
1452  }
1453  return true;
1454 }
1455 
1456 REGISTER_CUDNN_OPERATOR(Conv, CudnnConvOp);
1457 REGISTER_CUDNN_OPERATOR(ConvGradient, CudnnConvGradientOp);
1458 
1459 REGISTER_CUDNN_OPERATOR(Conv1D, CudnnConvOp);
1460 REGISTER_CUDNN_OPERATOR(Conv1DGradient, CudnnConvGradientOp);
1461 
1462 REGISTER_CUDNN_OPERATOR(Conv2D, CudnnConvOp);
1463 REGISTER_CUDNN_OPERATOR(Conv2DGradient, CudnnConvGradientOp);
1464 
1465 REGISTER_CUDNN_OPERATOR(Conv3D, CudnnConvOp);
1466 REGISTER_CUDNN_OPERATOR(Conv3DGradient, CudnnConvGradientOp);
1467 
1468 } // namespace caffe2
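The REGISTER_CUDNN_OPERATOR lines above bind these classes to the "CUDNN" engine, so a Conv / Conv1D / Conv2D / Conv3D operator whose definition names that engine (and runs with a CUDA device option) is dispatched to CudnnConvOp, and its gradient to CudnnConvGradientOp. Below is a minimal sketch of such an operator definition, written directly against the generated caffe2.proto classes; the blob names and argument values are placeholders, and the header path assumes the usual caffe2/proto layout.

#include "caffe2/proto/caffe2_pb.h"

// Sketch: an OperatorDef that the registrations above would route to
// CudnnConvOp. Blob names ("X", "W", "b", "Y") are placeholders.
caffe2::OperatorDef MakeCudnnConvDef() {
  caffe2::OperatorDef def;
  def.set_type("Conv");
  def.set_engine("CUDNN");  // selects the REGISTER_CUDNN_OPERATOR(Conv, ...) entry
  def.add_input("X");
  def.add_input("W");
  def.add_input("b");       // optional third input; omit for a bias-free conv
  def.add_output("Y");
  def.mutable_device_option()->set_device_type(1);  // 1 == CUDA in caffe2.proto

  auto* kernel = def.add_arg();    // spatial kernel size read by ConvPoolOpBase
  kernel->set_name("kernel");
  kernel->set_i(3);

  auto* search = def.add_arg();    // benchmark cuDNN algorithms (see constructor)
  search->set_name("exhaustive_search");
  search->set_i(1);

  auto* ws_limit = def.add_arg();  // workspace cap used when not searching
  ws_limit->set_name("ws_nbytes_limit");
  ws_limit->set_i(64LL * 1024 * 1024);
  return def;
}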
Referenced symbols:
cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder &order) - a wrapper function to convert the Caffe2 storage order to the cudnn storage order enum values (common_cudnn.h:192).
bool TensorCoreAvailable() - returns the availability of TensorCores for math (common_gpu.cc:217).
const cudaDeviceProp &GetDeviceProperty(const int deviceid) - gets the device property for the given device (common_gpu.cc:139).
bool OperatorBase::HasArgument(const string &name) const - checks if the operator has an argument of the given name (operator.h:70).
const Tensor &Input(int idx, DeviceType type=CUDAContext::GetDeviceType()) - retrieves a non-owning reference to the input at position 'idx' for this operator (operator.h:702).
Workspace - a class that holds all the related objects created during runtime: (1) all blobs... (workspace.h:47).
CuDNNWrapper - a class that wraps the cudnn handles and cudnn workspaces.
cudnnTypeWrapper - a wrapper class that allows us to refer to the cudnn type in a template function (common_cudnn.h:120).