Caffe2 - C++ API
A deep learning, cross platform ML framework
Conv.cpp
1 #include <ATen/ATen.h>
2 #include <ATen/NativeFunctions.h>
3 #include <ATen/Config.h>
4 #include <ATen/cuda/CUDAConfig.h>
5 #include <ATen/cuda/Exceptions.h>
6 
7 #if !AT_CUDNN_ENABLED()
8 
9 namespace at { namespace native {
10 
11 // See Note [ATen preprocessor philosophy]
12 
13 at::Tensor cudnn_convolution(
14  const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */,
15  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
16  int64_t groups, bool benchmark, bool deterministic) {
17  AT_ERROR("cudnn_convolution: ATen not compiled with cuDNN support");
18 }
19 
20 at::Tensor cudnn_convolution_backward_input(
21  IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight,
22  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
23  bool benchmark, bool deterministic) {
24  AT_ERROR("cudnn_convolution_backward_input: ATen not compiled with cuDNN support");
25 }
26 
27 at::Tensor cudnn_convolution_backward_weight(
28  IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input,
29  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
30  bool benchmark, bool deterministic) {
31  AT_ERROR("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support");
32 }
33 
34 at::Tensor cudnn_convolution_backward_bias(
35  const at::Tensor& grad_output) {
36  AT_ERROR("cudnn_convolution_backward_bias: ATen not compiled with cuDNN support");
37 }
38 
39 std::tuple<at::Tensor,at::Tensor,at::Tensor> cudnn_convolution_backward(
40  const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight,
41  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
42  bool benchmark, bool deterministic, std::array<bool,3> output_mask) {
43  AT_ERROR("cudnn_convolution_backward: ATen not compiled with cuDNN support");
44 }
45 
46 at::Tensor cudnn_convolution_transpose(
47  const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias /* optional */,
48  IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
49  int64_t groups, bool benchmark, bool deterministic) {
50  AT_ERROR("cudnn_convolution_transpose: ATen not compiled with cuDNN support");
51 }
52 
53 at::Tensor cudnn_convolution_transpose_backward_input(
54  const at::Tensor& grad_output, const at::Tensor& weight,
55  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
56  int64_t groups, bool benchmark, bool deterministic) {
57  AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support");
58 }
59 
60 at::Tensor cudnn_convolution_transpose_backward_weight(
61  IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input,
62  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
63  bool benchmark, bool deterministic) {
64  AT_ERROR("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support");
65 }
66 
67 std::tuple<at::Tensor,at::Tensor,at::Tensor> cudnn_convolution_transpose_backward(
68  const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight,
69  IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
70  bool benchmark, bool deterministic, std::array<bool,3> output_mask) {
71  AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support");
72 }
73 
74 }}
75 
76 #else // AT_CUDNN_ENABLED
77 
78 #include <THC/THC.h>
79 
80 #include <ATen/cudnn/cudnn-wrapper.h>
81 #include <ATen/cudnn/Descriptors.h>
82 #include <ATen/cudnn/Types.h>
83 #include <ATen/cudnn/Utils.h>
84 #include <ATen/native/utils/ParamsHash.h>
85 
86 #include <ATen/TensorUtils.h>
87 
88 #include <functional>
89 #include <iterator>
90 #include <sstream>
91 #include <algorithm>
92 #include <memory>
93 #include <mutex>
94 #include <stdint.h>
95 #include <unordered_map>
96 
97 // Note [behavior of cudnnFind and cudnnGet]
98 // You'll notice that by default, in the ConvolutionDescriptor, we do the following:
99 //
100 // AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH));
101 // if(dataType == CUDNN_DATA_HALF)
102 // AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH));
103 //
104 // When cudnnSetConvolutionMathType is called before cudnnGet/cudnnFind, it tells
105 // cudnnGet/cudnnFind to consider both tensor-core and non-tensor-core algos when iterating.
106 // If you don't call cudnnSetConvolutionMathType before calling cudnnGet/cudnnFind,
107 // cudnnGet/cudnnFind may not pick tensor core algos.
108 //
109 // Now, after it has run, cudnnGet/cudnnFind comes up with the best pair of algo+mathType
110 // given all the initial knowledge it's given. It then becomes the user's responsibility
111 // to update the mathType of the convolution descriptor and call the subsequent cudnn calls with
112 // the best algo and the updated descriptor. If we don't update the descriptor but just run
113 // with the best algo, under the hood cudnn will run with a slower kernel,
114 // since it sees the fastest algorithm combination with a suboptimal mathType.
115 
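// A minimal sketch of the ordering described in the note above, assuming a cuDNN
// handle and already-initialized tensor/filter/convolution descriptors; error
// handling and workspace management are omitted for brevity.
static cudnnConvolutionFwdAlgoPerf_t example_pick_fwd_algo(
    cudnnHandle_t handle,
    cudnnTensorDescriptor_t idesc,
    cudnnFilterDescriptor_t wdesc,
    cudnnConvolutionDescriptor_t cdesc,
    cudnnTensorDescriptor_t odesc,
    cudnnDataType_t dataType) {
  // 1. Let cudnnGet consider tensor-core algos as well (relevant for fp16).
  cudnnSetConvolutionMathType(cdesc, CUDNN_DEFAULT_MATH);
  if (dataType == CUDNN_DATA_HALF) {
    cudnnSetConvolutionMathType(cdesc, CUDNN_TENSOR_OP_MATH);
  }
  // 2. Ask for the best algo + mathType pair.
  cudnnConvolutionFwdAlgoPerf_t perf;
  int returned = 0;
  cudnnGetConvolutionForwardAlgorithm_v7(
      handle, idesc, wdesc, cdesc, odesc, /*requestedAlgoCount=*/1, &returned, &perf);
  // 3. Write the chosen mathType back into the descriptor so the subsequent
  //    cudnnConvolutionForward call does not fall back to a slower kernel.
  cudnnSetConvolutionMathType(cdesc, perf.mathType);
  return perf;
}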
116 // Note [blacklist fft algorithms for strided dgrad]
117 // This is a workaround for a CuDNN bug that gave wrong results in certain strided convolution
118 // gradient setups. See Issue #16610 for details. The bug is present in cuDNN versions < 7.5.
119 
120 namespace at { namespace native {
121 
122 // TODO: Go through all the checking code again and make sure
123 // we haven't missed anything.
124 
125 // ---------------------------------------------------------------------
126 //
127 // Math
128 //
129 // ---------------------------------------------------------------------
130 
131 constexpr int input_batch_size_dim = 0; // also grad_input
132 constexpr int input_channels_dim = 1;
133 constexpr int output_batch_size_dim = 0; // also grad_output
134 constexpr int output_channels_dim = 1;
135 constexpr int weight_output_channels_dim = 0;
136 constexpr int weight_input_channels_dim = 1;
137 
138 // Often written as 2 + max_dim (extra dims for batch size and channels)
139 constexpr int max_dim = 3;
140 
141 // NB: conv_output_size and conv_input_size are not bijections,
142 // as conv_output_size loses information; this is why conv_input_size
143 // takes an extra output_padding argument to resolve the ambiguity.
144 
145 static std::vector<int64_t> conv_output_size(
146  IntArrayRef input_size, IntArrayRef weight_size,
147  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
148 ) {
149  // ASSERT(input_size.size() > 2)
150  // ASSERT(input_size.size() == weight_size.size())
151  auto dim = input_size.size();
152  std::vector<int64_t> output_size(dim);
153  output_size[0] = input_size[input_batch_size_dim];
154  output_size[1] = weight_size[weight_output_channels_dim];
155  for (size_t d = 2; d < dim; ++d) {
156  auto kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
157  output_size[d] = (input_size[d] + (2 * padding[d - 2])
158  - kernel) / stride[d - 2] + 1;
159  }
160  return output_size;
161 }
162 
163 std::vector<int64_t> conv_input_size(
164  IntArrayRef output_size, IntArrayRef weight_size,
165  IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
166 ) {
167  // ASSERT(output_size.size() > 2)
168  // ASSERT(output_size.size() == weight_size.size())
169  auto dim = output_size.size();
170  std::vector<int64_t> input_size(dim);
171  input_size[0] = output_size[output_batch_size_dim];
172  input_size[1] = weight_size[weight_input_channels_dim] * groups;
173  for (size_t d = 2; d < dim; ++d) {
174  int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1;
175  input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) +
176  kernel + output_padding[d - 2];
177  }
178  return input_size;
179 }
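// A small illustrative check of the ambiguity mentioned above: with kernel 3,
// padding 1, stride 2, dilation 1, input spatial sizes 7 and 8 both produce
// output size 4, and conv_input_size needs output_padding to recover one of them.
// (Sketch only; not called anywhere in this file.)
static void example_output_padding_ambiguity() {
  // N=1, C=3 input; 8 output channels, 3x3 kernel.
  auto out7 = conv_output_size({1, 3, 7, 7}, {8, 3, 3, 3}, {1, 1}, {2, 2}, {1, 1}, 1);
  auto out8 = conv_output_size({1, 3, 8, 8}, {8, 3, 3, 3}, {1, 1}, {2, 2}, {1, 1}, 1);
  AT_CHECK(out7[2] == 4 && out8[2] == 4, "both inputs collapse to the same output size");
  // output_padding = 0 recovers 7, output_padding = 1 recovers 8.
  auto in0 = conv_input_size({1, 8, 4, 4}, {8, 3, 3, 3}, {1, 1}, {0, 0}, {2, 2}, {1, 1}, 1);
  auto in1 = conv_input_size({1, 8, 4, 4}, {8, 3, 3, 3}, {1, 1}, {1, 1}, {2, 2}, {1, 1}, 1);
  AT_CHECK(in0[2] == 7 && in1[2] == 8, "output_padding resolves the ambiguity");
}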
180 
181 std::vector<int64_t> conv_weight_size(
182  IntArrayRef input_size, IntArrayRef output_size,
183  IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups
184 ) {
185  auto dim = input_size.size();
186  std::vector<int64_t> weight_size(dim);
187  weight_size[0] = output_size[1];
188  weight_size[1] = input_size[1] / groups;
189  for (size_t d = 2; d < dim; ++d) {
190  int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2]
191  + 2 * padding[d - 2] - output_padding[d - 2];
192  weight_size[d] = (kernel - 1) / dilation[d - 2] + 1;
193  }
194  return weight_size;
195 }
196 
197 // TODO: Move this into the standard library, with a better name?
198 Tensor narrowGroup(const Tensor& t, int dim, int group_idx, int64_t groups) {
199  auto group_size = t.size(dim) / groups;
200  return t.narrow(dim, group_idx * group_size, group_size);
201 }
202 
203 // ---------------------------------------------------------------------
204 //
205 // Checking
206 //
207 // ---------------------------------------------------------------------
208 
209 // Note [Legacy CuDNN grouped convolution support]
210 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
211 // CuDNN earlier than CuDNN 7 does not directly support group
212 // convolution, so we provide support for it by sequentially
213 // running a convolution per group with appropriately
214 // adjusted sizes. https://blog.yani.io/filter-group-tutorial/
215 // has a fairly good diagram explaining how it works.
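// An illustrative sketch of that per-group emulation, using the narrowGroup helper
// and channel-dim constants defined above; the actual single-group convolution call
// is elided, since the legacy path is not spelled out in this file.
static void example_forward_per_group(
    const Tensor& input, const Tensor& weight, const Tensor& output, int64_t groups) {
  for (int64_t g = 0; g < groups; ++g) {
    Tensor input_g  = narrowGroup(input,  input_channels_dim,         g, groups);
    Tensor weight_g = narrowGroup(weight, weight_output_channels_dim, g, groups);
    Tensor output_g = narrowGroup(output, output_channels_dim,        g, groups);
    // ... run a groups == 1 convolution on (input_g, weight_g), writing into output_g ...
  }
}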
216 
217 // Used on pad, stride and dilation
218 static void check_args(CheckedFrom c, IntArrayRef args, size_t expected_size, const char* arg_name)
219 {
220  AT_CHECK(args.size() <= expected_size,
221  "Too many ", arg_name, " values (", args.size(), ") supplied, expecting ",
222  expected_size, " (while checking arguments for ", c, ")");
223  AT_CHECK(args.size() >= expected_size,
224  "Not enough ", arg_name, " values (", args.size(), ") supplied, expecting ",
225  expected_size, " (while checking arguments for ", c, ")");
226 
227  auto num_negative_values = std::count_if(args.begin(), args.end(), [](int x){return x < 0;});
228  if (num_negative_values > 0){
229  std::stringstream ss;
230  ss << arg_name << " should be greater than zero but got (";
231  std::copy(args.begin(), args.end() - 1, std::ostream_iterator<int>(ss,", "));
232  ss << args.back() << ")" << " (while checking arguments for " << c << ")";
233  AT_ERROR(ss.str());
234  }
235 }
236 
237 
238 // NOTE [ Convolution checks ]
239 //
240 // NB: For many call sites, it is not strictly necessary to check all of
241 // these relationships (for example, for forward convolution, we compute
242 // the size of output ourselves, so we don't actually need to check
243 // output). However, writing a single function that does everything
244 // means we get to reuse it for both forwards and all backwards
245 // variants, even when the set of "real" inputs varies. The magic of
246 // relational computing!
247 //
248 // (There is one downside, which is that it is slightly harder to write
249 // error messages which are able to distinguish between real inputs
250 // (which the user can change) and computed inputs (which the user can
251 // only indirectly affect). It would be an interesting exercise to
252 // come up with a general framework to handle such situations.)
253 static void convolution_shape_check(
254  CheckedFrom c,
255  const TensorGeometryArg& input, const TensorGeometryArg& weight, const TensorGeometryArg& output,
256  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups)
257 {
258  check_args(c, padding, input->dim() - 2, "padding");
259  check_args(c, stride, padding.size(), "stride");
260  check_args(c, dilation, padding.size(), "dilation");
261 
262  // Input
263  checkDimRange(c, input, 3, 6 /* exclusive */);
264  checkSize(c, input, input_channels_dim, weight->size(1) * groups);
265 
266  // Weight
267  checkSameDim(c, input, weight);
268 
269  // TODO: check that output->size() matches output_sizes
270  // TODO: check that weight matches output->sizes()
271  checkSameDim(c, input, output);
272 }
273 
274 // This POD struct is used to let us easily compute hashes of the
275 // parameters
276 struct ConvolutionParams
277 {
278  cudnnDataType_t dataType;
279  int input_size[2 + max_dim];
280  int input_stride[2 + max_dim];
281  int weight_size[2 + max_dim];
282  int padding[max_dim];
283  int stride[max_dim];
284  int dilation[max_dim];
285  int64_t groups;
286  bool deterministic;
287  // NB: transposed purposely omitted: transposed just swaps
288 // forward and backward, so you can reuse the benchmark entry.
289 };
290 
291 // NB: This can't be a constructor, because then ConvolutionParams
292 // would not be a POD anymore.
293 // TODO: Use TensorGeometry here instead of the entire Tensor, which we
294 // don't actually need. (OTOH: We can always pass in
295 // grad_input/grad_output, so this is not very pressing)
296 void setConvolutionParams(
297  ConvolutionParams* params,
298  const at::Tensor& input, const at::Tensor& weight,
299  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
300  int64_t groups, bool deterministic) {
301 
302  cudnnDataType_t dataType = getCudnnDataType(input);
303  memset(params, 0, sizeof(ConvolutionParams));
304  params->dataType = dataType;
305  // ASSERT(weight.dim() == input.dim())
306  for (int i = 0; i != input.dim(); ++i) {
307  params->input_size[i] = (int) input.size(i);
308  params->input_stride[i] = (int) input.stride(i);
309  params->weight_size[i] = (int) weight.size(i);
310  }
311  // ASSERT(padding.size() == stride.size())
312  // ASSERT(padding.size() == dilation.size())
313  for (size_t i = 0; i != padding.size(); ++i) {
314  params->padding[i] = padding[i];
315  params->stride[i] = stride[i];
316  params->dilation[i] = dilation[i];
317  }
318  // In principle, we shouldn't parametrize by groups for legacy
319  // CuDNN, but it doesn't seem worth the effort to actually do this.
320  params->groups = groups;
321  params->deterministic = deterministic;
322 }
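// An illustrative (simplified) version of what ParamsHash/ParamsEqual do with this
// struct: hash and compare its raw bytes. This is exactly why the memset above is
// required; otherwise uninitialized padding bytes inside ConvolutionParams could make
// two logically identical keys hash or compare differently. (Sketch only, not the
// actual ParamsHash.h code.)
struct ExampleBytewiseHash {
  size_t operator()(const ConvolutionParams& p) const {
    const auto* bytes = reinterpret_cast<const unsigned char*>(&p);
    size_t h = 14695981039346656037ULL;        // FNV-1a offset basis
    for (size_t i = 0; i < sizeof(ConvolutionParams); ++i) {
      h = (h ^ bytes[i]) * 1099511628211ULL;   // FNV-1a prime
    }
    return h;
  }
};
struct ExampleBytewiseEqual {
  bool operator()(const ConvolutionParams& a, const ConvolutionParams& b) const {
    return memcmp(&a, &b, sizeof(ConvolutionParams)) == 0;
  }
};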
323 
324 // Convenience struct for passing around descriptors and data
325 // pointers
326 struct ConvolutionArgs {
327  cudnnHandle_t handle;
328  ConvolutionParams params;
329  TensorDescriptor idesc, odesc;
330  FilterDescriptor wdesc;
331  const Tensor& input, output, weight;
332  ConvolutionDescriptor cdesc;
333 
334  ConvolutionArgs(const Tensor& input, const Tensor& output, const Tensor& weight) : input(input), output(output), weight(weight) {
335  }
336 };
337 
338 // ---------------------------------------------------------------------
339 //
340 // Benchmarking
341 //
342 // ---------------------------------------------------------------------
343 
344 // TODO: Use something less heavy duty than a big honking mutex
345 template <typename T>
346 struct BenchmarkCache {
347  std::mutex mutex;
348  std::unordered_map<ConvolutionParams, T, ParamsHash<ConvolutionParams>, ParamsEqual<ConvolutionParams>> map;
349 
350  bool find(const ConvolutionParams& params, T* results) {
351  std::lock_guard<std::mutex> guard(mutex);
352  auto it = map.find(params);
353  if (it == map.end()) {
354  return false;
355  }
356  *results = it->second;
357  return true;
358  }
359 
360  void insert(const ConvolutionParams& params, const T& results) {
361  std::lock_guard<std::mutex> guard(mutex);
362  map[params] = results;
363  }
364 };
365 
366 BenchmarkCache<cudnnConvolutionFwdAlgoPerf_t> fwd_algos;
367 BenchmarkCache<cudnnConvolutionBwdDataAlgoPerf_t> bwd_data_algos;
368 BenchmarkCache<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filter_algos;
369 
370 // TODO: Stop manually allocating CUDA memory; allocate an ATen byte
371 // tensor instead.
372 struct Workspace {
373  Workspace(size_t size) : size(size), data(NULL) {
374  data = THCudaMalloc(globalContext().lazyInitCUDA(), size);
375  }
376  Workspace(const Workspace&) = delete;
377  Workspace(Workspace&&) = default;
378  Workspace& operator=(Workspace&&) = default;
379  ~Workspace() {
380  if (data) {
381  THCudaFree(globalContext().lazyInitCUDA(), data);
382  }
383  }
384 
385  size_t size;
386  void* data;
387 };
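// A possible shape of the TODO above (sketch only): back the workspace with an ATen
// byte tensor so the caching CUDA allocator owns the memory instead of raw
// THCudaMalloc/THCudaFree calls.
struct ExampleTensorWorkspace {
  explicit ExampleTensorWorkspace(size_t size)
    : buffer(at::empty({static_cast<int64_t>(size)},
                       at::TensorOptions().dtype(at::kByte).device(at::kCUDA))) {}
  void* data() { return buffer.data_ptr(); }
  size_t size() const { return static_cast<size_t>(buffer.numel()); }
  at::Tensor buffer;
};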
388 
389 template<typename perf_t>
390 struct algorithm_search {
391 };
392 
393 cudnnStatus_t getWorkspaceSize(
394  const ConvolutionArgs& args,
395  cudnnConvolutionFwdAlgo_t algo, size_t* sz)
396 {
397  return cudnnGetConvolutionForwardWorkspaceSize(
398  args.handle,
399  args.idesc.desc(),
400  args.wdesc.desc(),
401  args.cdesc.desc(),
402  args.odesc.desc(),
403  algo,
404  sz
405  );
406 }
407 cudnnStatus_t getWorkspaceSize(
408  const ConvolutionArgs& args,
409  cudnnConvolutionBwdDataAlgo_t algo, size_t* sz)
410 {
411  return cudnnGetConvolutionBackwardDataWorkspaceSize(
412  args.handle,
413  args.wdesc.desc(),
414  args.odesc.desc(),
415  args.cdesc.desc(),
416  args.idesc.desc(),
417  algo,
418  sz);
419 }
420 cudnnStatus_t getWorkspaceSize(
421  const ConvolutionArgs& args,
422  cudnnConvolutionBwdFilterAlgo_t algo, size_t* sz)
423 {
424  return cudnnGetConvolutionBackwardFilterWorkspaceSize(
425  args.handle,
426  args.idesc.desc(),
427  args.odesc.desc(),
428  args.cdesc.desc(),
429  args.wdesc.desc(),
430  algo,
431  sz);
432 }
433 
434 template<typename algo_t>
435 size_t getMaxWorkspaceSize(
436  const ConvolutionArgs& args,
437  const algo_t *algo, int n_algo)
438 {
439  THCState *state = globalContext().lazyInitCUDA();
440 
441  size_t max_ws_size = 0;
442  size_t max_block_size = 0;
443  size_t total_gpu_mem = 0;
444  size_t free_gpu_mem = 0;
445 
446  THCudaCheck(THCudaMemGetInfo(state, &free_gpu_mem, &total_gpu_mem, &max_block_size));
447 
448  for (int i = 0; i < n_algo; i++) {
449  cudnnStatus_t err;
450  size_t sz;
451  err = getWorkspaceSize(args, algo[i], &sz);
452  if (CUDNN_STATUS_SUCCESS != err || sz == 0
453  || sz < max_ws_size || sz > max_block_size) continue;
454  max_ws_size = sz;
455  }
456  return max_ws_size;
457 }
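// For reference (sketch, not used here): the plain CUDA runtime analogue of the
// free/total query above is cudaMemGetInfo. The extra max_block_size output of
// THCudaMemGetInfo is, roughly, the largest free block the caching allocator can
// hand out, which is why candidate workspaces larger than it are skipped above.
static void example_query_device_memory(size_t* free_bytes, size_t* total_bytes) {
  cudaMemGetInfo(free_bytes, total_bytes);   // device-wide free and total memory, in bytes
}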
458 
459 template<typename perf_t>
460 perf_t getBestAlgorithm(perf_t *perfResults, const ConvolutionArgs& args, int n_algo) {
461  int best_algo_idx;
462  bool is_deterministic = false;
463  if (args.params.deterministic) {
464  // iterate over perf results of all algorithms and find the best deterministic algo
465  for (int i = 0; i < n_algo; i++) {
466  // TODO: Shouldn't all returned results be successful?
467  // Double check documentation for cudnnFindConvolutionForwardAlgorithmEx
468  if (perfResults[i].status == CUDNN_STATUS_SUCCESS &&
469  perfResults[i].determinism == CUDNN_DETERMINISTIC) {
470  best_algo_idx = i;
471  is_deterministic = true;
472  break;
473  }
474  }
475  if (!is_deterministic) {
476  AT_ERROR("no deterministic convolution algorithms available in CuDNN");
477  }
478  } else {
479  best_algo_idx = 0;
480  }
481 
482  // See Note [blacklist fft algorithms for strided dgrad]
483 #if CUDNN_VERSION < 7500
484  if (std::is_same<decltype(perfResults[best_algo_idx].algo), cudnnConvolutionBwdDataAlgo_t>::value) {
485  int stride_dim = args.input.dim() - 2;
486  bool blacklist = std::any_of(std::begin(args.params.stride),
487  std::begin(args.params.stride) + stride_dim,
488  [=](int n){return n != 1;});
489  if (blacklist && (static_cast<cudnnConvolutionBwdDataAlgo_t>(perfResults[best_algo_idx].algo) == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
490  || static_cast<cudnnConvolutionBwdDataAlgo_t>(perfResults[best_algo_idx].algo) == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
491  perfResults[best_algo_idx].algo = algorithm_search<perf_t>::DEFAULT_ALGO;
492  if (args.params.dataType == CUDNN_DATA_HALF) {
493  perfResults[best_algo_idx].mathType = CUDNN_TENSOR_OP_MATH;
494  } else {
495  perfResults[best_algo_idx].mathType = CUDNN_DEFAULT_MATH;
496  }
497  }
498  }
499 #endif
500 
501  return perfResults[best_algo_idx];
502 }
503 
504 template<>
505 struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
506  using perf_t = cudnnConvolutionFwdAlgoPerf_t;
507  using algo_t = cudnnConvolutionFwdAlgo_t;
508 
509  static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
510  static BenchmarkCache<perf_t>& cache() { return fwd_algos; }
511 
512  static perf_t findAlgorithm(const ConvolutionArgs& args, bool benchmark) {
513  static const algo_t algos[] = {
514  CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
515  CUDNN_CONVOLUTION_FWD_ALGO_FFT,
516  CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
517  CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
518  CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
519  CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
520  CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
521  CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
522  };
523  static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
524  static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos,
525  "Missing cuDNN convolution forward algorithms");
526  int perf_count;
527  std::unique_ptr<perf_t[]> perf_results(new perf_t[num_algos]);
528  if (!benchmark) {
529  AT_CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(
530  args.handle,
531  args.idesc.desc(),
532  args.wdesc.desc(),
533  args.cdesc.desc(),
534  args.odesc.desc(),
535  num_algos,
536  &perf_count,
537  perf_results.get()));
538  } else {
539  size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
540  Workspace ws(max_ws_size);
541  AT_CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithmEx(
542  args.handle,
543  args.idesc.desc(), args.input.data_ptr(),
544  args.wdesc.desc(), args.weight.data_ptr(),
545  args.cdesc.desc(),
546  args.odesc.desc(), args.output.data_ptr(),
547  num_algos,
548  &perf_count,
549  perf_results.get(),
550  ws.data,
551  ws.size));
552  }
553  return getBestAlgorithm<perf_t>(perf_results.get(), args, perf_count);
554  }
555 
556  static void getWorkspaceSize(
557  const ConvolutionArgs& args,
558  algo_t algo, size_t* workspaceSize)
559  {
560  AT_CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
561  args.handle,
562  args.idesc.desc(),
563  args.wdesc.desc(),
564  args.cdesc.desc(),
565  args.odesc.desc(),
566  algo,
567  workspaceSize));
568  }
569 };
570 
571 template<>
572 struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
573  using perf_t = cudnnConvolutionBwdDataAlgoPerf_t;
574  using algo_t = cudnnConvolutionBwdDataAlgo_t;
575 
576  static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
577  static BenchmarkCache<perf_t>& cache() { return bwd_data_algos; }
578 
579  static perf_t findAlgorithm(const ConvolutionArgs& args, bool benchmark) {
580  static const algo_t algos[] = {
581  CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
582  CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
583  CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT,
584  CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
585  CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD,
586  CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED
587  };
588  static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
589  static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos,
590  "Missing cuDNN convolution backward data algorithms.");
591  int perf_count;
592  std::unique_ptr<perf_t[]> perf_results(new perf_t[num_algos]);
593  if (!benchmark) {
594  AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataAlgorithm_v7(
595  args.handle,
596  args.wdesc.desc(),
597  args.odesc.desc(),
598  args.cdesc.desc(),
599  args.idesc.desc(),
600  num_algos,
601  &perf_count,
602  perf_results.get()));
603  } else {
604  size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
605  Workspace ws(max_ws_size);
606  AT_CUDNN_CHECK(cudnnFindConvolutionBackwardDataAlgorithmEx(
607  args.handle,
608  args.wdesc.desc(), args.weight.data_ptr(),
609  args.odesc.desc(), args.output.data_ptr(),
610  args.cdesc.desc(),
611  args.idesc.desc(), args.input.data_ptr(),
612  num_algos,
613  &perf_count,
614  perf_results.get(),
615  ws.data,
616  ws.size));
617  }
618  return getBestAlgorithm<perf_t>(perf_results.get(), args, perf_count);
619  }
620 
621  static void getWorkspaceSize(
622  const ConvolutionArgs& args,
623  cudnnConvolutionBwdDataAlgo_t algo, size_t* workspaceSize)
624  {
625  AT_CUDNN_CHECK(cudnnGetConvolutionBackwardDataWorkspaceSize(
626  args.handle,
627  args.wdesc.desc(),
628  args.odesc.desc(),
629  args.cdesc.desc(),
630  args.idesc.desc(),
631  algo,
632  workspaceSize));
633  }
634 };
635 
636 template<>
637 struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
638  using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
639  using algo_t = cudnnConvolutionBwdFilterAlgo_t;
640 
641  static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
642 
643  static BenchmarkCache<perf_t>& cache() { return bwd_filter_algos; }
644 
645  static perf_t findAlgorithm(const ConvolutionArgs& args, bool benchmark) {
646  static const algo_t algos[] = {
647  CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
648  CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
649  CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
650  CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
651  CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED,
652  CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
653  };
654  // NOTE: - 1 because ALGO_WINOGRAD is not implemented
655  static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1;
656  static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos,
657  "Missing cuDNN convolution backward filter algorithms.");
658  std::unique_ptr<perf_t[]> perf_results(new perf_t[num_algos]);
659  int perf_count;
660  if (!benchmark) {
661  AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterAlgorithm_v7(
662  args.handle,
663  args.idesc.desc(),
664  args.odesc.desc(),
665  args.cdesc.desc(),
666  args.wdesc.desc(),
667  num_algos,
668  &perf_count,
669  perf_results.get()));
670  } else {
671  size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
672  Workspace ws(max_ws_size);
673  AT_CUDNN_CHECK(cudnnFindConvolutionBackwardFilterAlgorithmEx(
674  args.handle,
675  args.idesc.desc(), args.input.data_ptr(),
676  args.odesc.desc(), args.output.data_ptr(),
677  args.cdesc.desc(),
678  args.wdesc.desc(), args.weight.data_ptr(),
679  num_algos,
680  &perf_count,
681  perf_results.get(),
682  ws.data,
683  ws.size));
684  }
685  return getBestAlgorithm<perf_t>(perf_results.get(), args, perf_count);
686  }
687 
688  static void getWorkspaceSize(const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize)
689  {
690  AT_CUDNN_CHECK(cudnnGetConvolutionBackwardFilterWorkspaceSize(
691  args.handle,
692  args.idesc.desc(),
693  args.odesc.desc(),
694  args.cdesc.desc(),
695  args.wdesc.desc(),
696  algo,
697  workspaceSize));
698  }
699 };
700 
701 template<typename perf_t>
702 void findAlgorithm(const ConvolutionArgs& args, bool benchmark, perf_t* algoPerf) {
703  using search = algorithm_search<perf_t>;
704  auto& cache = search::cache();
705 
706  if (cache.find(args.params, algoPerf)) {
707  return;
708  }
709 
710  if (args.params.deterministic && !benchmark) {
711  algoPerf->algo = search::DEFAULT_ALGO;
712  if (args.params.dataType == CUDNN_DATA_HALF) {
713  algoPerf->mathType = CUDNN_TENSOR_OP_MATH;
714  } else {
715  algoPerf->mathType = CUDNN_DEFAULT_MATH;
716  }
717  search::getWorkspaceSize(args, algoPerf->algo, &(algoPerf->memory));
718  return;
719  }
720 
721  if (benchmark) {
722  if (cache.find(args.params, algoPerf)) {
723  // re-check cache since another thread may have benchmarked the algorithm
724  return;
725  }
726  }
727 
728  auto perfResults = search::findAlgorithm(args, benchmark);
729  // for deterministic algo, look at all the perf results and return the best
730  // deterministic algo
731  if (perfResults.status == CUDNN_STATUS_SUCCESS &&
732  !(args.params.deterministic && perfResults.determinism != CUDNN_DETERMINISTIC)) {
733 
734  // if benchmarking, map the original params with the found algo+math type for re-use
735  if (benchmark) {
736  cache.insert(args.params, perfResults);
737 
738 // Free the cached blocks in our caching allocator. This is
739 // needed here because the benchmarking above can use a huge amount of memory,
740  // e.g. a few GBs.
741  c10::cuda::CUDACachingAllocator::emptyCache();
742  }
743 
744  *algoPerf = perfResults;
745  } else {
746  algoPerf->algo = search::DEFAULT_ALGO;
747  if (args.params.dataType == CUDNN_DATA_HALF) {
748  algoPerf->mathType = CUDNN_TENSOR_OP_MATH;
749  } else {
750  algoPerf->mathType = CUDNN_DEFAULT_MATH;
751  }
752  search::getWorkspaceSize(args, algoPerf->algo, &(algoPerf->memory));
753  }
754 }
755 
756 template<typename perf_t>
757 Workspace chooseAlgorithm(
758  const ConvolutionArgs& args,
759  bool benchmark,
760  perf_t* algoPerf)
761 {
762  findAlgorithm(args, benchmark, algoPerf);
763 
764  using search = algorithm_search<perf_t>;
765  try {
766  return Workspace(algoPerf->memory);
767  } catch (const std::exception& e) {
768  cudaGetLastError(); // clear OOM error
769 
770  // switch to default algorithm and record it in the cache to prevent
771  // further OOM errors
772  algoPerf->algo = search::DEFAULT_ALGO;
773  if (args.params.dataType == CUDNN_DATA_HALF) {
774  algoPerf->mathType = CUDNN_TENSOR_OP_MATH;
775  } else {
776  algoPerf->mathType = CUDNN_DEFAULT_MATH;
777  }
778  search::getWorkspaceSize(args, algoPerf->algo, &(algoPerf->memory));
779  search::cache().insert(args.params, *algoPerf);
780  return Workspace(algoPerf->memory);
781  }
782 }
783 
784 // ---------------------------------------------------------------------
785 //
786 // Bias addition
787 //
788 // ---------------------------------------------------------------------
789 
790 // In-place!
791 void cudnn_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const TensorArg& bias)
792 {
793  checkAllSameType(c, {output, bias});
794  checkAllSameGPU(c, {output, bias});
795  checkSize(c, bias, { output->size(output_channels_dim) });
796 
797  // See Note [CuDNN broadcast padding]. Handle the left padding
798  // ourselves, but use TensorDescriptor's padding argument to do the rest.
799  TensorDescriptor bdesc, odesc;
800  bdesc.set(bias->expand({1, bias->size(0)}), output->dim());
801  odesc.set(*output);
802 
803  auto handle = getCudnnHandle();
804  auto dataType = getCudnnDataType(*bias);
805  Constant one(dataType, 1);
806 
807  AT_CUDNN_CHECK(cudnnAddTensor(handle, &one, bdesc.desc(), bias->data_ptr(),
808  &one, odesc.desc(), output->data_ptr()));
809 }
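// Shape-only illustration of the broadcast handling above: a bias of shape {C} is
// viewed as {1, C}, and TensorDescriptor then right-pads it to the output's
// dimensionality, so for an NCHW output it behaves like {1, C, 1, 1} and
// cudnnAddTensor broadcasts it over N, H and W. (Sketch; not called in this file.)
static void example_bias_broadcast_shape() {
  Tensor bias = at::zeros({64});
  Tensor b2d = bias.expand({1, bias.size(0)});   // {1, 64}
  AT_CHECK(b2d.dim() == 2 && b2d.size(1) == 64, "unexpected bias shape");
  // bdesc.set(b2d, /*pad=*/4) would then describe it to cuDNN as a {1, 64, 1, 1} tensor.
}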
810 
811 // NOTE [ Convolution design ]
812 //
813 // The general strategy:
814 //
815 // - cudnn_convolution (Tensor)
816 // Entry points for clients, takes bias
817 //
818 // - cudnn_convolution_forward (TensorArg)
819 // Entry point, which may be reused between regular
820 // convolution and transposed convolution. Does NOT take bias.
821 //
822 // - raw_cudnn_convolution_forward_out (Tensor)
823 // Low level function which invokes CuDNN, and takes an output
824 // tensor which is directly written to (thus _out).
825 //
826 // Where does argument checking happen? Here's the division of
827 // responsibility:
828 // - Things that happen in at::Tensor
829 // - TensorArg allocation
830 // - setCuDNNStreamToCurrent
831 // - Things that happen in TensorArg
832 // - Check arguments (type, GPU, shape)
833 //
834 // TODO: Consider renaming zero-indexed arguments to "self"
835 
836 
837 
838 // ---------------------------------------------------------------------
839 //
840 // Convolution forward / Transposed convolution backward
841 //
842 // ---------------------------------------------------------------------
843 
844 // The raw API directly invokes CuDNN and does not emulate support
845 // for group convolution on old versions of CuDNN.
846 //
847 // There are a few reasons this should never be directly exposed
848 // via ATen:
849 //
850 // - It takes output as a parameter (this should be computed!)
851 // - It doesn't do input checking
852 // - It doesn't resize output (it is assumed to be correctly sized)
853 //
854 void raw_cudnn_convolution_forward_out(
855  const Tensor& output, const Tensor& input, const Tensor& weight,
856  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
857  bool benchmark, bool deterministic) {
858 
859  auto dataType = getCudnnDataType(input);
860 
861  ConvolutionArgs args{ input, output, weight };
862  args.handle = getCudnnHandle();
863  setConvolutionParams(&args.params, input, weight, padding, stride, dilation, groups, deterministic);
864  args.idesc.set(input);
865  args.wdesc.set(weight);
866  args.odesc.set(output);
867  args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
868 
869  // TODO: when we do legacy group convolution support, we'll repeatedly
870  // reinitialize the workspace for each convolution we do. This is
871  // wasteful; we'd rather reuse the workspace. OTOH, legacy group
872  // convolution support is already pretty slow, so this might not
873  // matter. (This applies to raw_cudnn_convolution_backward_input as well.)
874  cudnnConvolutionFwdAlgoPerf_t fwdAlgPerf;
875  Workspace workspace = chooseAlgorithm(args, benchmark, &fwdAlgPerf);
876 
877  // update convDesc mathType since cudnn 7.4+ now requires both algo + mathType to figure out
878  // whether to use Tensor core kernels or not
879  // See Note [behavior of cudnnFind and cudnnGet]
880  AT_CUDNN_CHECK(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), fwdAlgPerf.mathType));
881 
882  Constant one(dataType, 1);
883  Constant zero(dataType, 0);
884 
885  AT_CUDNN_CHECK(cudnnConvolutionForward(
886  args.handle,
887  &one, args.idesc.desc(), input.data_ptr(),
888  args.wdesc.desc(), weight.data_ptr(),
889  args.cdesc.desc(), fwdAlgPerf.algo, workspace.data, workspace.size,
890  &zero, args.odesc.desc(), output.data_ptr()));
891 }
892 
893 Tensor cudnn_convolution_forward(
894  CheckedFrom c,
895  const TensorArg& input, const TensorArg& weight,
896  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
897  bool benchmark, bool deterministic)
898 {
899  checkAllSameType(c, {input, weight});
900  checkAllSameGPU(c, {input, weight});
901 
902  auto output_t = at::empty(
903  conv_output_size(input->sizes(), weight->sizes(),
904  padding, stride, dilation, groups),
905  input->options());
906 
907  // Avoid ambiguity of "output" when this is being used as backwards
908  TensorArg output{ output_t, "result", 0 };
909  convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups);
910 
911  // See #4500
912  Tensor weight_contig = weight->contiguous();
913 
914  raw_cudnn_convolution_forward_out(
915  *output, *input, weight_contig,
916  padding, stride, dilation, groups, benchmark, deterministic);
917 
918  return *output;
919 }
920 
921 Tensor cudnn_convolution(
922  const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t,
923  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
924  int64_t groups, bool benchmark, bool deterministic)
925 {
926  TensorArg input { input_t, "input", 1 },
927  weight { weight_t, "weight", 2 },
928  bias { bias_t, "bias", 3 };
929  setCuDNNStreamToCurrent();
930  CheckedFrom c = "cudnn_convolution";
931  auto output_t = cudnn_convolution_forward(
932  c, input, weight, padding, stride, dilation, groups, benchmark, deterministic);
933  if (bias->defined()) {
934  cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias);
935  }
936  return output_t;
937 }
938 
939 // NB: output_padding not needed here, as there is no ambiguity to
940 // resolve
941 Tensor cudnn_convolution_transpose_backward_input(
942  const Tensor& grad_output_t, const Tensor& weight_t,
943  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
944  int64_t groups, bool benchmark, bool deterministic)
945 {
946  TensorArg grad_output { grad_output_t, "grad_output", 1 },
947  weight { weight_t, "weight", 2 };
948  setCuDNNStreamToCurrent();
949  return cudnn_convolution_forward(
950  "cudnn_convolution_transpose_backward_input",
951  grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic);
952 }
953 
954 std::tuple<at::Tensor,at::Tensor,at::Tensor> cudnn_convolution_transpose_backward(
955  const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
956  IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
957  bool benchmark, bool deterministic, std::array<bool,3> output_mask) {
958 
959  Tensor grad_output = grad_output_t.contiguous();
960 
961  Tensor grad_input, grad_weight, grad_bias;
962  if (output_mask[0]) {
963  grad_input = at::cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic);
964  }
965  if (output_mask[1]) {
966  grad_weight = at::cudnn_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic);
967  }
968  if (output_mask[2]) {
969  grad_bias = at::cudnn_convolution_backward_bias(grad_output);
970  }
971 
972  return std::tuple<Tensor,Tensor,Tensor>{grad_input, grad_weight, grad_bias};
973 }
974 
975 // ---------------------------------------------------------------------
976 //
977 // Convolution backward / Transposed convolution forward
978 //
979 // ---------------------------------------------------------------------
980 
981 void raw_cudnn_convolution_backward_input_out(
982  const at::Tensor& grad_input,
983  const at::Tensor& grad_output,
984  const at::Tensor& weight,
985  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
986  bool benchmark, bool deterministic) {
987 
988  auto dataType = getCudnnDataType(grad_output);
989 
990  ConvolutionArgs args{ grad_input, grad_output, weight };
991  args.handle = getCudnnHandle();
992  setConvolutionParams(&args.params, grad_input, weight, padding, stride, dilation, groups, deterministic);
993  args.idesc.set(grad_input);
994  args.wdesc.set(weight);
995  args.odesc.set(grad_output);
996  args.cdesc.set(dataType, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
997 
998  cudnnConvolutionBwdDataAlgoPerf_t bwdDataAlgPerf;
999  Workspace workspace = chooseAlgorithm(args, benchmark, &bwdDataAlgPerf);
1000 
1001  // update convDesc mathType since cudnn 7.4+ now requires both algo + mathType to figure out
1002  // whether to use Tensor core kernels or not
1003  // See Note [behavior of cudnnFind and cudnnGet]
1004  AT_CUDNN_CHECK(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdDataAlgPerf.mathType));
1005 
1006  Constant one(dataType, 1);
1007  Constant zero(dataType, 0);
1008 
1009  AT_CUDNN_CHECK(cudnnConvolutionBackwardData(
1010  args.handle,
1011  &one, args.wdesc.desc(), weight.data_ptr(),
1012  args.odesc.desc(), grad_output.data_ptr(),
1013  args.cdesc.desc(), bwdDataAlgPerf.algo, workspace.data, workspace.size,
1014  &zero, args.idesc.desc(), grad_input.data_ptr()));
1015 }
1016 
1017 // NOTE [ Backward vs transpose convolutions ]
1018 //
1019 // Backward and transpose are algorithmically equivalent, but they
1020 // compute their geometry differently. In a backward, you know what
1021 // the original size of the input tensor was, so you can cache that
1022 // geometry and fill it directly. In transposed convolution, it is
1023 // more conventional to not explicitly specify the output (previously
1024 // input) size, and compute it. This, however, leaves a degree of
1025 // freedom; this degree of freedom is resolved using the
1026 // output_padding parameter. Both of these interfaces are equivalent,
1027 // but they are differently convenient depending on the use case.
1028 
1029 Tensor cudnn_convolution_backward_input(
1030  CheckedFrom c,
1031  IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight,
1032  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
1033  bool benchmark, bool deterministic)
1034 {
1035  checkAllSameType(c, {grad_output, weight});
1036  checkAllSameGPU(c, {grad_output, weight});
1037 
1038  auto grad_input_t = at::empty(input_size, grad_output->options());
1039 
1040  // Avoid "grad_input" when this is being used as transposed convolution
1041  TensorArg grad_input{ grad_input_t, "result", 0 };
1042  convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups);
1043 
1044  // See #4500
1045  Tensor weight_contig = weight->contiguous();
1046 
1047  raw_cudnn_convolution_backward_input_out(
1048  *grad_input, *grad_output, weight_contig,
1049  padding, stride, dilation, groups, benchmark, deterministic);
1050 
1051  return *grad_input;
1052 }
1053 
1054 Tensor cudnn_convolution_transpose_forward(
1055  CheckedFrom c,
1056  const TensorArg& grad_output, const TensorArg& weight,
1057  IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
1058  bool benchmark, bool deterministic)
1059 {
1060  auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(),
1061  padding, output_padding, stride, dilation, groups);
1062  return cudnn_convolution_backward_input(c, input_size, grad_output, weight,
1063  padding, stride, dilation, groups, benchmark, deterministic);
1064 }
1065 
1066 Tensor cudnn_convolution_backward_input(
1067  IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t,
1068  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
1069  bool benchmark, bool deterministic)
1070 {
1071  TensorArg grad_output{ grad_output_t, "grad_output", 1 },
1072  weight{ weight_t, "weight", 2 };
1073  setCuDNNStreamToCurrent();
1074  return cudnn_convolution_backward_input(
1075  "cudnn_convolution_backward_input",
1076  input_size, grad_output, weight,
1077  padding, stride, dilation, groups, benchmark, deterministic);
1078 }
1079 
1080 std::tuple<at::Tensor,at::Tensor,at::Tensor> cudnn_convolution_backward(
1081  const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight,
1082  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
1083  bool benchmark, bool deterministic, std::array<bool,3> output_mask) {
1084 
1085  Tensor grad_output = grad_output_t.contiguous();
1086 
1087  Tensor grad_input, grad_weight, grad_bias;
1088  if (output_mask[0]) {
1089  grad_input = at::cudnn_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic);
1090  }
1091  if (output_mask[1]) {
1092  grad_weight = at::cudnn_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic);
1093  }
1094  if (output_mask[2]) {
1095  grad_bias = at::cudnn_convolution_backward_bias(grad_output);
1096  }
1097 
1098  return std::tuple<Tensor,Tensor,Tensor>{grad_input, grad_weight, grad_bias};
1099 }
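// A usage sketch of the combined backward entry point above: output_mask lets the
// caller skip gradients it does not need, e.g. grad_bias when the convolution had
// no bias. (Illustrative values; the geometry must match the forward call.)
static std::tuple<at::Tensor, at::Tensor, at::Tensor> example_backward_call(
    const at::Tensor& input, const at::Tensor& weight, const at::Tensor& grad_output) {
  std::array<bool, 3> output_mask = {{true, true, false}};   // skip grad_bias
  return at::cudnn_convolution_backward(
      input, grad_output, weight,
      /*padding=*/{3, 3}, /*stride=*/{2, 2}, /*dilation=*/{1, 1},
      /*groups=*/1, /*benchmark=*/false, /*deterministic=*/false, output_mask);
}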
1100 
1101 Tensor cudnn_convolution_transpose(
1102  const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t,
1103  IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation,
1104  int64_t groups, bool benchmark, bool deterministic)
1105 {
1106  TensorArg input { input_t, "input", 1 },
1107  weight { weight_t, "weight", 2 },
1108  bias { bias_t, "bias", 3 };
1109  CheckedFrom c = "cudnn_convolution_transpose";
1110  auto output_t = cudnn_convolution_transpose_forward(
1111  c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic);
1112  if (bias->defined()) {
1113  cudnn_convolution_add_bias_(c, { output_t, "result", 0 }, bias);
1114  }
1115  return output_t;
1116 }
1117 
1118 // ---------------------------------------------------------------------
1119 //
1120 // Convolution backward (weight)
1121 //
1122 // ---------------------------------------------------------------------
1123 
1124 void raw_cudnn_convolution_backward_weight_out(
1125  const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input,
1126  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
1127  bool benchmark, bool deterministic) {
1128 
1129  auto dataType = getCudnnDataType(input);
1130 
1131  ConvolutionArgs args{ input, grad_output, grad_weight };
1132  args.handle = getCudnnHandle();
1133  setConvolutionParams(&args.params, input, grad_weight, padding, stride, dilation, groups, deterministic);
1134  args.idesc.set(input);
1135  args.wdesc.set(grad_weight);
1136  args.odesc.set(grad_output);
1137  args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups);
1138 
1139  cudnnConvolutionBwdFilterAlgoPerf_t bwdFilterAlgPerf;
1140  Workspace workspace = chooseAlgorithm(args, benchmark, &bwdFilterAlgPerf);
1141 
1142  // update convDesc mathType since cudnn 7.4+ now requires both algo + mathType to figure out
1143  // whether to use Tensor core kernels or not
1144  // See Note [behavior of cudnnFind and cudnnGet]
1145  AT_CUDNN_CHECK(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdFilterAlgPerf.mathType));
1146 
1147  Constant one(dataType, 1);
1148  Constant zero(dataType, 0);
1149 
1150  AT_CUDNN_CHECK(cudnnConvolutionBackwardFilter(
1151  args.handle,
1152  &one, args.idesc.desc(), input.data_ptr(),
1153  args.odesc.desc(), grad_output.data_ptr(),
1154  args.cdesc.desc(), bwdFilterAlgPerf.algo, workspace.data, workspace.size,
1155  &zero, args.wdesc.desc(), grad_weight.data_ptr()));
1156 }
1157 
1158 Tensor cudnn_convolution_backward_weight(
1159  CheckedFrom c,
1160  IntArrayRef weight_size, const TensorArg& grad_output, const TensorArg& input,
1161  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
1162  bool benchmark, bool deterministic)
1163 {
1164 
1165  checkAllSameType(c, {grad_output, input});
1166  checkAllSameGPU(c, {grad_output, input});
1167 
1168  auto grad_weight_t = at::empty(weight_size, grad_output->options());
1169 
1170  // For uniformity with everything else, although it seems grad_weight
1171  // would be unambiguous too.
1172  TensorArg grad_weight{ grad_weight_t, "result", 0 };
1173  convolution_shape_check(c, input, grad_weight, grad_output, padding, stride, dilation, groups);
1174 
1175  raw_cudnn_convolution_backward_weight_out(
1176  *grad_weight, *grad_output, *input,
1177  padding, stride, dilation, groups, benchmark, deterministic);
1178 
1179  return grad_weight_t;
1180 }
1181 
1182 Tensor cudnn_convolution_backward_weight(
1183  IntArrayRef weight_size,
1184  const Tensor& grad_output_t,
1185  const Tensor& input_t,
1186  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
1187  bool benchmark, bool deterministic)
1188 {
1189  TensorArg grad_output{ grad_output_t, "grad_output", 1 },
1190  input{ input_t, "input", 2 };
1191  setCuDNNStreamToCurrent();
1192  return cudnn_convolution_backward_weight(
1193  "cudnn_convolution_backward_weight",
1194  weight_size, grad_output, input,
1195  padding, stride, dilation, groups, benchmark, deterministic);
1196 }
1197 
1198 Tensor cudnn_convolution_transpose_backward_weight(
1199  IntArrayRef weight_size,
1200  const Tensor& grad_output_t,
1201  const Tensor& input_t,
1202  IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups,
1203  bool benchmark, bool deterministic)
1204 {
1205  TensorArg grad_output{ grad_output_t, "grad_output", 1 },
1206  input{ input_t, "input", 2 };
1207  setCuDNNStreamToCurrent();
1208  return cudnn_convolution_backward_weight(
1209  "cudnn_convolution_backward_weight",
1210  weight_size, input, grad_output,
1211  padding, stride, dilation, groups, benchmark, deterministic);
1212 }
1213 
1214 // ---------------------------------------------------------------------
1215 //
1216 // Convolution backward (bias)
1217 //
1218 // ---------------------------------------------------------------------
1219 
1220 Tensor cudnn_convolution_backward_bias(
1221  const Tensor& grad_output_t)
1222 {
1223  TensorArg grad_output{ grad_output_t, "grad_output", 1 };
1224  setCuDNNStreamToCurrent();
1225 
1226  auto grad_bias_t = at::empty(
1227  { grad_output->size(output_channels_dim) }, grad_output->options());
1228 
1229  TensorArg grad_bias{ grad_bias_t, "result", 0 };
1230 
1231  // See Note [CuDNN broadcast padding]. Handle the left padding
1232  // ourselves, but use TensorDescriptor's pad argument to do the rest.
1233  TensorDescriptor bdesc{grad_bias->expand({1, grad_bias->size(0)}),
1234  static_cast<size_t>(grad_output->dim())};
1235  TensorDescriptor odesc{*grad_output};
1236 
1237  auto handle = getCudnnHandle();
1238  auto dataType = getCudnnDataType(*grad_bias);
1239  Constant one(dataType, 1);
1240  Constant zero(dataType, 0);
1241 
1242  AT_CUDNN_CHECK(cudnnConvolutionBackwardBias(handle, &one, odesc.desc(), grad_output->data_ptr(),
1243  &zero, bdesc.desc(), grad_bias->data_ptr()));
1244  return *grad_bias;
1245 }
1246 
1247 
1248 }} // namespace
1249 
1250 #endif