Caffe2 - C++ API
A deep learning, cross-platform ML framework
RNN.cpp
1 #include <ATen/native/RNN.h>
2 #include <ATen/ATen.h>
3 #include <ATen/Config.h>
4 #include <ATen/InitialTensorOptions.h>
5 #include <ATen/MatrixRef.h>
6 #include <ATen/NativeFunctions.h>
7 #include <ATen/TensorUtils.h>
8 #include <ATen/cuda/CUDAConfig.h>
9 #include <ATen/cuda/CUDAEvent.h>
10 #include <ATen/cuda/Exceptions.h>
11 #include <c10/util/Exception.h>
12 
13 #if !AT_CUDNN_ENABLED()
14 
15 namespace at { namespace native {
16 
17 // See Note [ATen preprocessor philosophy]
18 
19 Tensor _cudnn_rnn_flatten_weight(
20  TensorList weight_arr, int64_t weight_stride0,
21  int64_t input_size,
22  int64_t fn_mode, int64_t fn_hidden_size,
23  int64_t fn_num_layers, bool batch_first,
24  bool fn_bidirectional
25  ) {
26  AT_ERROR("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support");
27 }
28 
29 std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _cudnn_rnn(
30  const Tensor& input_r,
31  TensorList weight, int64_t weight_stride0,
32  const Tensor& weight_buf_r, const Tensor& hx, const Tensor& cx,
33  int64_t fn_mode, int64_t fn_hidden_size,
34  int64_t fn_num_layers, bool batch_first, double fn_dropout,
35  bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes,
36  const Tensor& fn_dropout_state
37  ) {
38  AT_ERROR("_cudnn_rnn: ATen not compiled with cuDNN support");
39 }
40 
41 std::tuple<Tensor, Tensor, Tensor, std::vector<Tensor>> _cudnn_rnn_backward(
42  const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx,
43  const Tensor& output, const Tensor& grad_output_r, const Tensor& grad_hy_r,
44  const Tensor& grad_cy_r,
45  int64_t mode, int64_t hidden_size,
46  int64_t num_layers, bool batch_first, double dropout,
47  bool train, bool bidirectional, IntArrayRef batch_sizes,
48  const Tensor& dropout_state, const Tensor& reserve,
49  std::array<bool, 4> output_mask
50  ) {
51  AT_ERROR("_cudnn_rnn_backward: ATen not compiled with cuDNN support");
52 }
53 
54 Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, const TensorOptions& options) {
55  AT_ERROR("_cudnn_init_dropout_state: ATen not compiled with cuDNN support");
56 }
57 
58 }} // namespace at::native
59 
60 #else // AT_CUDNN_ENABLED()
61 
62 #include <ATen/cudnn/cudnn-wrapper.h>
63 #include <ATen/cudnn/Descriptors.h>
64 #include <ATen/cudnn/Types.h>
65 #include <ATen/cudnn/Utils.h>
66 
67 namespace at { namespace native {
68 
69 namespace {
70  // DropoutDescriptor
71 
72  struct DropoutDescriptorParams {
73  bool train;
74  double dropout;
75  Tensor dropout_state;
76  DropoutDescriptorParams() {}
77  void set(bool train_, double dropout_, Tensor dropout_state_) {
78  train = train_;
79  dropout = dropout_;
80  dropout_state = dropout_state_;
81  }
82  DropoutDescriptor descriptor(cudnnHandle_t handle) const {
83  auto dropout_p = train ? dropout : 0;
84  DropoutDescriptor dropout_desc;
85  if (dropout_p == 0) {
86  dropout_desc.set_no_dropout(handle);
87  } else {
88  dropout_desc.set(handle, dropout_p, dropout_state);
89  }
90  return dropout_desc;
91  }
92  };
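 // Illustrative sketch only, not used by anything below: how the params above
 // turn into a cuDNN dropout descriptor. The example_* name and the 0.5
 // probability are placeholders, and `dropout_state` is assumed to come from
 // something like _cudnn_init_dropout_state.
 inline DropoutDescriptor example_dropout_descriptor(
     cudnnHandle_t handle, const Tensor& dropout_state) {
   DropoutDescriptorParams dp;
   dp.set(/*train_=*/true, /*dropout_=*/0.5, dropout_state);
   // With train == false or dropout == 0, descriptor() takes the
   // set_no_dropout() path and never touches the state buffer.
   return dp.descriptor(handle);
 }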
93 
94  // RNNDescriptor
95 
96  struct RNNDescriptorParams {
97  int64_t hidden_size;
98  int64_t num_layers;
99  cudnnDirectionMode_t bidirectional;
100  cudnnRNNMode_t mode;
101  cudnnDataType_t datatype;
102  cudnnDataType_t input_datatype;
103  cudnnRNNAlgo_t algo = CUDNN_RNN_ALGO_STANDARD;
104  cudnnRNNInputMode_t input_mode = CUDNN_LINEAR_INPUT;
105 
106  int64_t num_directions() const {
107  return bidirectional ? 2 : 1;
108  }
109 
110  void set_mode(int64_t fn_mode) {
111  switch (fn_mode) {
112  case CUDNN_RNN_RELU:
113  mode = CUDNN_RNN_RELU;
114  break;
115  case CUDNN_RNN_TANH:
116  mode = CUDNN_RNN_TANH;
117  break;
118  case CUDNN_LSTM:
119  mode = CUDNN_LSTM;
120  break;
121  case CUDNN_GRU:
122  mode = CUDNN_GRU;
123  break;
124  default:
125  {
126  std::ostringstream oss;
127  oss << "unrecognized cuDNN RNN mode " << fn_mode;
128  AT_ERROR(oss.str());
129  }
130  }
131  }
132 
133  void set_bidirectional(bool fn_bidirectional) {
134  bidirectional = fn_bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL;
135  }
136 
137  void set_algo(cudnnRNNAlgo_t algo){
138  this->algo = algo;
139  }
140 
141  void set(int64_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional, cudnnDataType_t datatype, cudnnDataType_t input_datatype) {
142  this->set_mode(mode);
143  this->hidden_size = hidden_size;
144  this->num_layers = num_layers;
145  this->set_bidirectional(bidirectional);
146  this->datatype = datatype;
147  this->input_datatype = input_datatype;
148  }
149 
150 
151  RNNDescriptor descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const {
152  RNNDescriptor rnn_desc;
153  rnn_desc.set(handle, hidden_size, num_layers, std::move(dropout_desc), input_mode, bidirectional, mode, datatype, input_datatype, algo);
154  return rnn_desc;
155  }
156 
157  // In some cases, a use of RNNDescriptor does not rely on the
158  // DropoutDescriptor. In this case, we fake up a no-dropout
159  // descriptor to make the RNN descriptor initialization go through.
160  // This is used by _cudnn_rnn_flatten_weight, which needs an
161  // RNNDescriptor for get_parameters(), but does not actually need
162  // a fully initialized dropout descriptor. This lets us avoid
163  // having to pass the dropout state to flatten, which has no business
164  // knowing what the dropout state is.
165  RNNDescriptor descriptor(cudnnHandle_t handle) const {
166  DropoutDescriptor dropout_desc;
167  dropout_desc.set_no_dropout(handle);
168  return descriptor(handle, std::move(dropout_desc));
169  }
170  };
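 // Illustrative sketch only, not used by anything below: building an RNN
 // descriptor without a real dropout state, the same way
 // _cudnn_rnn_flatten_weight does further down. The concrete hidden size and
 // layer count are placeholder values for the example.
 inline RNNDescriptor example_lstm_descriptor(cudnnHandle_t handle,
                                              cudnnDataType_t datatype) {
   RNNDescriptorParams rnn;
   rnn.set(CUDNN_LSTM, /*hidden_size=*/20, /*num_layers=*/1,
           /*bidirectional=*/false, /*datatype=*/datatype,
           /*input_datatype=*/datatype);
   // The single-argument descriptor() overload substitutes the fake
   // no-dropout descriptor described in the comment above it.
   return rnn.descriptor(handle);
 }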
171 
172  // TensorDescriptor list
173 
174  std::vector<TensorDescriptor> rnn_descriptor_sequence(const Tensor& tensor, IntArrayRef batch_sizes) {
175  std::vector<TensorDescriptor> descriptors(batch_sizes.size());
176  size_t i = 0;
177  // To be mutated in the loop
178  auto batch_tensor_size = tensor.sizes().vec();
179  for (auto batch_size : batch_sizes) {
180  batch_tensor_size[0] = batch_size;
181  // NB: cuDNN RNN API does not support 2d descriptors, so we
182  // must pad it out to 3d.
183  descriptors[i].set(getCudnnDataType(tensor), batch_tensor_size, tensor.strides(), 3);
184  i++;
185  }
186  return descriptors;
187  }
188 
189  std::vector<TensorDescriptor> rnn_descriptor(const Tensor& tensor, int64_t N) {
190  std::vector<TensorDescriptor> descriptors(N);
191  for (int64_t i = 0; i < N; i++) {
192  descriptors[i].set(tensor, 5);
193  }
194  return descriptors;
195  }
196 
197  // The best way to understand the meaning of the values stored in
198  // this struct is to consider each of the possible ways our
199  // input can be structured.
200  //
201  // Suppose you want to run RNN on the following variable
202  // length inputs:
203  //
204  // Sequence 1: ABCD
205  // Sequence 2: EF
206  // Sequence 3: G
207  //
208  // (Let _ be padding when we have non-packed representations.)
209  //
210  // # Packed input (batch_sizes is non-empty)
211  //
212  //              input_size
213  //             +------+                      +
214  //             |  A   |                      |
215  //             |  E   |    mini_batch =      |
216  //             |  G   |  batch_sizes[0] = 3  |
217  //             +------+                      |
218  //             |  B   |                      |  batch_sizes_sum = 7
219  //             |  F   |  batch_sizes[1] = 2  |
220  //             +------+                      |
221  //             |  C   |  batch_sizes[2] = 1  |
222  //             +------+                      |
223  //             |  D   |  batch_sizes[3] = 1  |
224  //             +------+                      +
225  //
226  // (seq_length = 4)
227  //
228  // input.size() = batch_sizes_sum x input_size
229  //
230  // # Unpacked input (batch_first = false)
231  //
232  // mini_batch = 3
233  // +-------+
234  // | A E G |
235  // | B F _ | seq_length = 4
236  // | C _ _ |
237  // | D _ _ |
238  // +-------+
239  // ... input_size
240  // +-------+
241  //
242  // input.size() = seq_length x mini_batch x input_size
243  //
244  // # Unpacked input (batch_first = true)
245  //
246  // seq_length = 4
247  // +---------+
248  // | A B C D |
249  // | E F _ _ | mini_batch = 3
250  // | G _ _ _ |
251  // +---------+
252  // ... input_size
253  // +---------+
254  //
255  // input.size() = mini_batch x seq_length x input_size
256  //
257  struct TensorDescriptorListParams {
258  IntArrayRef batch_sizes;
259  int64_t seq_length;
260  int64_t mini_batch;
261  // NB: this is not input.size(), which is an IntArrayRef; instead, this is
262  // the size of the inner-most dimension. In NLP applications, this is usually
263  // the size of the embedding. You can also think of this as the size
264  // of the "channel" dimension (at risk of confusing vision researchers :)
265  int64_t input_size;
266  // Only valid when is_input_packed()
267  int64_t batch_sizes_sum; // == sum(batch_sizes)
268 
269  bool is_input_packed() const {
270  return batch_sizes.size() != 0;
271  }
272 
273  void set(IntArrayRef input_sizes, IntArrayRef batch_sizes_, bool batch_first) {
274  batch_sizes = batch_sizes_;
275  if (is_input_packed()) {
276  seq_length = batch_sizes.size();
277  mini_batch = batch_sizes[0];
278  // NB: When input is packed, the mini_batch size is NOT the size
279  // of the outer dimension
280  batch_sizes_sum = input_sizes[0];
281  input_size = input_sizes[1];
282  } else {
283  if (batch_first) {
284  seq_length = input_sizes[1];
285  mini_batch = input_sizes[0];
286  } else {
287  seq_length = input_sizes[0];
288  mini_batch = input_sizes[1];
289  }
290  input_size = input_sizes[2];
291  // TODO: Actually, would this make ASAN's job harder catching
292  // an uninitialized access?
293  batch_sizes_sum = -1; // something bogus in case we access it
294  }
295  }
296 
297  // TODO: check x for consistency with input_size?
298  std::vector<TensorDescriptor> descriptors(Tensor x) const {
299  auto is_input_packed = batch_sizes.size() != 0;
300  if (is_input_packed) {
301  return rnn_descriptor_sequence(x, batch_sizes);
302  } else {
303  return rnn_descriptor(x[0], seq_length);
304  }
305  }
306  };
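 // Illustrative sketch only, not used by anything below: filling the struct
 // above for the packed ABCD / EF / G example from the comment, with
 // input_size assumed to be 10.
 inline TensorDescriptorListParams example_packed_tensor_params() {
   static const int64_t ex_input_sizes[] = {7, 10};      // batch_sizes_sum x input_size
   static const int64_t ex_batch_sizes[] = {3, 2, 1, 1}; // one entry per time step
   TensorDescriptorListParams t;
   t.set(ex_input_sizes, ex_batch_sizes, /*batch_first=*/false);
   // Afterwards: t.is_input_packed() == true, t.seq_length == 4,
   // t.mini_batch == 3, t.batch_sizes_sum == 7, t.input_size == 10.
   return t;
 }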
307 
308  // Everything together
309 
310  struct RNNParams {
311  DropoutDescriptorParams dropout;
312  RNNDescriptorParams rnn;
313  TensorDescriptorListParams tensors;
314  };
315 
316  // NB: Doesn't include the weight descriptor
317  struct RNNDescriptors {
318  RNNDescriptor rnn_desc;
319  // NB: this won't actually lay out the tensor descriptor pointers
320  // in the right way, so you'll have to preprocess them
321  std::vector<TensorDescriptor> x_descs;
322  std::vector<TensorDescriptor> y_descs;
323  TensorDescriptor hx_desc;
324  TensorDescriptor hy_desc;
325  TensorDescriptor cx_desc;
326  TensorDescriptor cy_desc;
327 
328  RNNDescriptors(const RNNParams& fn, cudnnHandle_t handle, Tensor x, Tensor y, Tensor hx, Tensor cx) {
329  rnn_desc = fn.rnn.descriptor(handle, fn.dropout.descriptor(handle));
330  x_descs = fn.tensors.descriptors(x);
331  y_descs = fn.tensors.descriptors(y);
332  hx_desc.set(hx, 5);
333  hy_desc.set(hx, 5);
334  if (cx.defined()) {
335  cx_desc.set(cx, 5);
336  cy_desc.set(cx, 5);
337  }
338  }
339 
340  // TODO: This is annoying, having to put the cudnnTensorDescriptor_t
341  // in a contiguous array...
342  std::vector<cudnnTensorDescriptor_t> get_descs(const std::vector<TensorDescriptor>& descs) {
343  std::vector<cudnnTensorDescriptor_t> r;
344  r.reserve(descs.size());
345  for (auto& desc : descs) {
346  r.emplace_back(desc.desc());
347  }
348  return r;
349  }
350 
351  std::vector<cudnnTensorDescriptor_t> get_x_descs() {
352  return get_descs(x_descs);
353  }
354 
355  std::vector<cudnnTensorDescriptor_t> get_y_descs() {
356  return get_descs(y_descs);
357  }
358  };
359 
360  int64_t get_num_weights(cudnnHandle_t handle, const RNNDescriptor& rnn_desc,
361  const TensorDescriptor& x_desc, cudnnDataType_t datatype) {
362  size_t weight_size;
363  AT_CUDNN_CHECK(cudnnGetRNNParamsSize(handle, rnn_desc.desc(), x_desc.desc(), &weight_size, datatype));
364  auto elem_size = dataSize(datatype);
365  AT_ASSERTM(weight_size % elem_size == 0, "cudnnGetRNNParamsSize returned nonsensical weight_size");
366  return weight_size / elem_size;
367  }
368 
369  int64_t _num_linear_layers(cudnnRNNMode_t mode) {
370  switch(mode) {
371  case CUDNN_LSTM:
372  return 8;
373  case CUDNN_GRU:
374  return 6;
375  case CUDNN_RNN_RELU:
376  return 2;
377  case CUDNN_RNN_TANH:
378  return 2;
379  default:
380  AT_ERROR("unknown cuDNN RNN mode ", mode);
381  }
382  }
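 // The counts returned above are cuDNN "linear layers", i.e. the per-gate
 // input-hidden and hidden-hidden matrices: 4 gates x 2 for LSTM, 3 x 2 for
 // GRU, and 1 x 2 for the plain tanh/ReLU RNN.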
383 
384  /*
385  Returns weight and bias tensors for each layer of the RNN. These tensors
386  are views on the underlying weight buffer allocated by CuDNN.
387 
388  Note: for LSTM and GRU, which have multiple parameters of each type (4 and 3, respectively),
389  these parameters are concatenated along the first dimension.
390  These parameters are returned in a consistent order by CuDNN:
391  (input, forget, cell, output) for LSTM
392  (reset, update, new) for GRU
393  Args:
394  fn: The RNN function object holding the RNN state
395  handle: a CuDNN handle
396  weight_buf: a 1D tensor containing the CuDNN-allocated weight (or grad_weight) buffer
397  Returns:
398  parameters: [(weight_ih, weight_hh, bias_ih, bias_hh)*], one group per layer and direction.
399  This is represented as a pair: a flat vector of tensors plus the outer-dimension stride
400  (NB: Can't return MatrixRef because we need to allocate the underlying tensor)
401  */
402  std::pair<std::vector<Tensor>, size_t> // stride0
403  get_parameters(
404  cudnnHandle_t handle,
405  const RNNDescriptorParams& rnn,
406  const RNNDescriptor& rnn_desc,
407  const TensorDescriptor& x_desc,
408  const FilterDescriptor& w_desc,
409  const Tensor& weight_buf
410  ) {
411  auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams };
412  std::vector<Tensor> params;
413  int64_t num_linear_layers = _num_linear_layers(rnn.mode);
414  int64_t num_layers = rnn.num_directions() * rnn.num_layers;
415  size_t cur_offset = 0;
416  size_t global_layer_params_count = 0;
417  for (int64_t layer = 0; layer < num_layers; layer++) {
418  size_t layer_params_count = 0;
419  for (auto cudnn_method : cudnn_methods) {
420  for (int64_t linear_id = 0; linear_id < num_linear_layers; linear_id++) {
421  FilterDescriptor lin_layer_mat_desc;
422  void* matrix_pointer;
423  AT_CUDNN_CHECK(cudnn_method(
424  handle,
425  rnn_desc.desc(),
426  layer,
427  x_desc.desc(),
428  w_desc.desc(),
429  weight_buf.data_ptr(),
430  linear_id,
431  lin_layer_mat_desc.mut_desc(),
432  &matrix_pointer
433  ));
434  cudnnDataType_t data_type;
435  cudnnTensorFormat_t format;
436  int nb_dims;
437  constexpr int min_dim = 3;
438  // TODO: The use of CPU tensor here is a bit goofy in C++,
439  // some sort of alloca would be good enough except that it is
440  // kind of convenient to be able to prod() on it.
441  Tensor filter_dim_a = at::empty(min_dim, at::initialTensorOptions().dtype(kInt));
442  AT_CUDNN_CHECK(cudnnGetFilterNdDescriptor(
443  lin_layer_mat_desc.desc(),
444  min_dim,
445  &data_type,
446  &format,
447  &nb_dims,
448  filter_dim_a.data<int>()
449  ));
450 
451  AT_ASSERTM(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim);
452  filter_dim_a = filter_dim_a.slice(0, 0, nb_dims);
453  auto elem_size = dataSize(getCudnnDataType(weight_buf));
454  auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr();
455  AT_ASSERTM(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size);
456  size_t offset = offset_bytes / elem_size;
457 
458  // for all the RNN types provided by CUDNN, all the ih weights
459  // are the same size and are allocated in a contiguous chunk
460  // (same for the hh weights, and the ih and hh biases).
461  // Since we're storing all the weights in a single tensor anyway,
462  // might as well merge the CUDNN ones into a single tensor as well
463  int mat_numel = *filter_dim_a.prod(at::ScalarType::Int).data<int>();
464  if (linear_id == 0 || linear_id == num_linear_layers / 2) {
465  std::initializer_list<int64_t> size = {
466  mat_numel * num_linear_layers / 2, 1};
467  // Generate a new parameter tensor which is a view into the
468  // weight_buf.
469  Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size);
470  params.emplace_back(std::move(param));
471  layer_params_count++;
472  } else {
473  AT_ASSERTM(cur_offset == offset, "cur_offset = ", cur_offset, "; offset = ", offset);
474  }
475  cur_offset = offset + mat_numel;
476  }
477  } // for cudnn_method
478  if (layer == 0) {
479  global_layer_params_count = layer_params_count;
480  } else {
481  AT_ASSERTM(global_layer_params_count == layer_params_count,
482  "global_layer_params_count = ", global_layer_params_count,
483  "; layer_params_count = ", layer_params_count);
484  }
485  } // for layer
486  return std::make_pair(params, global_layer_params_count);
487  }
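 // Illustrative sketch only, not used by anything below: indexing the
 // (params, stride0) pair returned by get_parameters(). With biases present,
 // stride0 is 4 and row l is (weight_ih, weight_hh, bias_ih, bias_hh) for
 // pseudo-layer l, i.e. layer * num_directions + direction.
 inline Tensor example_weight_ih(
     const std::pair<std::vector<Tensor>, size_t>& params_and_stride,
     int64_t pseudo_layer) {
   MatrixRef<Tensor> params{params_and_stride.first, params_and_stride.second};
   return params[pseudo_layer][0];  // first entry of the row is weight_ih
 }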
488 
489  // This is a lightweight version of the method above used to quickly get the expected
490  // parameter offsets.
491  std::vector<void*> get_expected_data_ptrs(
492  const Tensor& weight_buf, cudnnHandle_t handle, const RNNDescriptorParams& rnn,
493  const RNNDescriptor& rnn_desc, const TensorDescriptor& x_desc, cudnnDataType_t datatype) {
494  FilterDescriptor w_desc;
495  w_desc.set(weight_buf, 3);
496 
497  int64_t num_linear_layers = _num_linear_layers(rnn.mode);
498  int64_t num_dir_layers = rnn.num_directions() * rnn.num_layers;
499  const auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams };
500  std::vector<void*> data_ptrs;
501  data_ptrs.reserve(num_dir_layers * 2 * 2);
502  for (int64_t layer = 0; layer < num_dir_layers; layer++) {
503  for (auto cudnn_method : cudnn_methods) {
504  // This API returns a separate pointer for the weight of every gate,
505  // but we represent them as a single tensor, so we're only interested
506  // in a very limited subset of possible values.
507  const std::array<int64_t, 2> linear_offsets = { 0, num_linear_layers / 2 };
508  for (int64_t linear_id : linear_offsets) {
509  FilterDescriptor lin_layer_mat_desc;
510  void* matrix_pointer;
511  AT_CUDNN_CHECK(cudnn_method(
512  handle,
513  rnn_desc.desc(),
514  layer,
515  x_desc.desc(),
516  w_desc.desc(),
517  weight_buf.data_ptr(),
518  linear_id,
519  lin_layer_mat_desc.mut_desc(),
520  &matrix_pointer
521  ));
522  data_ptrs.push_back(matrix_pointer);
523  }
524  }
525  }
526  return data_ptrs;
527  }
528 
529  void _viewOrCopyParams(MatrixRef<Tensor> params_from, MatrixRef<Tensor> params_to, bool copy) {
530  AT_ASSERTM(params_from.size(0) == params_to.size(0), "number of layers mismatch");
531  for (size_t i = 0; i < params_from.size(0); i++) {
532  auto layer_params_from = params_from[i];
533  auto layer_params_to = params_to[i];
534  // NOTE: these lists have all weights before all biases, so if the layer
535  // doesn't use biases, iteration will terminate once the shorter list ends
536  // and the extra biases are ignored.
537  for (auto a = layer_params_from.begin(), b = layer_params_to.begin();
538  a != layer_params_from.end() && b != layer_params_to.end();
539  ++a, ++b) {
540  auto param_from = *a, param_to = *b;
541  AT_ASSERTM(param_from.type() == param_to.type(), "parameter types mismatch");
542  if (copy) {
543  param_to.copy_(param_from.view_as(param_to));
544  } else {
545  param_from.resize_as_(param_to);
546  }
547  }
548  }
549  }
550 
551  void _copyParams(MatrixRef<Tensor> params_from, MatrixRef<Tensor> params_to) {
552  _viewOrCopyParams(params_from, params_to, true);
553  }
554 
555  void _viewParams(MatrixRef<Tensor> params_from, MatrixRef<Tensor> params_to) {
556  _viewOrCopyParams(params_from, params_to, false);
557  }
558 
559 
560  std::vector<int64_t> _input_size(const TensorDescriptorListParams& tensors) {
561  if (tensors.is_input_packed()) {
562  return {tensors.batch_sizes_sum, tensors.input_size};
563  } else {
564  return {tensors.seq_length, tensors.mini_batch, tensors.input_size};
565  }
566  }
567 
568  std::vector<int64_t> _hidden_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) {
569  return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.hidden_size};
570  }
571 
572  std::vector<int64_t> _output_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) {
573  if (tensors.is_input_packed()) {
574  return {tensors.batch_sizes_sum, rnn.hidden_size * rnn.num_directions()};
575  } else {
576  return {tensors.seq_length, tensors.mini_batch, rnn.hidden_size * rnn.num_directions()};
577  }
578  }
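 // For concreteness (illustrative numbers only): with the unpacked example
 // from the layout comment above (seq_length 4, mini_batch 3, input_size 10)
 // and a single-layer bidirectional RNN with hidden_size 20, these helpers give
 //   _input_size(tensors)       -> {4, 3, 10}
 //   _hidden_size(rnn, tensors) -> {1 * 2, 3, 20} == {2, 3, 20}
 //   _output_size(rnn, tensors) -> {4, 3, 20 * 2} == {4, 3, 40}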
579 
580  cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input){
581 #if CUDNN_VERSION < 7200 || CUDA_VERSION < 9010
582  return CUDNN_RNN_ALGO_STANDARD;
583 #else
584  cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
585  const int64_t bsize = tensors.mini_batch;
586  // Exclude Turing (compute capability 7.5) from using the persistent RNN algorithm.
587  if (prop->major == 7 && prop->minor != 5 && getCudnnDataType(input) == CUDNN_DATA_HALF && !tensors.is_input_packed()) {
588  if (rnn.num_layers == 1 && rnn.hidden_size <= 1024 && rnn.num_directions() == 1 &&
589  rnn.hidden_size % 128 == 0 && tensors.input_size % 128 == 0){
590  // Technically the batch size should be a multiple of 8, but there are quite a few
591  // multiple-of-8 batch sizes that give bad performance; weed them out.
592  if ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8){
593  if ((tensors.seq_length >=40 && bsize <=128) ||
594  (tensors.seq_length >=20 && bsize <=96) ||
595  (tensors.seq_length >=10 && bsize <=32)) {
596  return CUDNN_RNN_ALGO_PERSIST_STATIC;
597  }
598  }
599  }
600  }
601  return CUDNN_RNN_ALGO_STANDARD;
602 #endif
603  }
604 
605  cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) {
606 #if CUDNN_VERSION != 7103
607 // cuDNN 7.1.3 requires the RNN descriptor datatype to be identical to the input/weight
608 // datatype, and throws an error on type promotion. That check was removed in later releases.
609  if (dtype == CUDNN_DATA_HALF) {
610  return CUDNN_DATA_FLOAT;
611  }
612 #endif
613  return dtype;
614  }
615 
616 } // anonymous namespace
617 
618 // NB: does an in-place update into TensorList
619 // It would be a relatively simple matter to refactor this into multiple
620 // functions, only one of which does an inplace update, but we leave this
621 // for future work
622 Tensor _cudnn_rnn_flatten_weight(
623  TensorList weight_arr, int64_t weight_stride0,
624  int64_t input_size,
625  int64_t fn_mode, int64_t fn_hidden_size,
626  int64_t fn_num_layers, bool batch_first,
627  bool fn_bidirectional
628  ) {
629 
630  AT_CHECK(weight_arr.size() > 0,
631  "_cudnn_rnn_flatten_weight: cannot flatten empty weight list");
632 
633  auto any_param = weight_arr[0];
634  auto datatype = getCudnnDataType(any_param);
635 
636  RNNDescriptorParams rnn;
637  rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype);
638 
639  auto handle = getCudnnHandle();
640  RNNDescriptor rnn_desc = rnn.descriptor(handle);
641 
642  TensorGeometry x_geom({1, input_size});
643  TensorDescriptor x_desc;
644  x_desc.set(getCudnnDataType(any_param), x_geom.sizes(), x_geom.strides(), 5);
645 
646  auto num_weights = get_num_weights(handle, rnn_desc, x_desc, datatype);
647  auto weight_buf = at::zeros(num_weights, any_param.options());
648 
649  FilterDescriptor w_desc;
650  w_desc.set(weight_buf, 3);
651 
652  // Slice off views into weight_buf
653  std::vector<Tensor> params_arr;
654  size_t params_stride0;
655  std::tie(params_arr, params_stride0) = get_parameters(handle, rnn, rnn_desc, x_desc, w_desc, weight_buf);
656 
657  MatrixRef<Tensor> weight{weight_arr, static_cast<size_t>(weight_stride0)},
658  params{params_arr, params_stride0};
659 
660  // Copy weights
661  _copyParams(weight, params);
662 
663  // Update the storage
664  for (size_t i = 0; i < weight.size(0); i++) {
665  for (auto orig_param_it = weight[i].begin(), new_param_it = params[i].begin();
666  orig_param_it != weight[i].end() && new_param_it != params[i].end();
667  orig_param_it++, new_param_it++) {
668  auto orig_param = *orig_param_it, new_param = *new_param_it;
669  orig_param.set_(new_param.view_as(orig_param));
670  }
671  }
672 
673  return weight_buf;
674 }
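 // Illustrative sketch only, not used by anything below: flattening the
 // {w_ih, w_hh, b_ih, b_hh} parameter list of a single-layer, unidirectional
 // LSTM with input_size 10 and hidden_size 20, all CUDA tensors. The sizes are
 // placeholder assumptions, not requirements of the function; after the call,
 // each tensor in `weights` is a view into the returned buffer.
 inline Tensor example_flatten_lstm_weight(TensorList weights) {
   return _cudnn_rnn_flatten_weight(
       weights, /*weight_stride0=*/4, /*input_size=*/10,
       /*fn_mode=*/CUDNN_LSTM, /*fn_hidden_size=*/20, /*fn_num_layers=*/1,
       /*batch_first=*/false, /*fn_bidirectional=*/false);
 }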
675 
676 // NB: when fn_batch_sizes is empty, that means no batch sizes were specified
677 std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _cudnn_rnn(
678  const Tensor& input_r,
679  TensorList weight, int64_t weight_stride0,
680  const Tensor& weight_buf_r, const Tensor& hx, const Tensor& cx,
681  int64_t fn_mode, int64_t fn_hidden_size,
682  int64_t fn_num_layers, bool batch_first, double fn_dropout,
683  bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes,
684  const Tensor& fn_dropout_state
685  ) {
686 
687  check_device(input_r, weight, {hx, cx});
688  auto input = input_r;
689  auto weight_buf = weight_buf_r;
690  if (fn_dropout_state.defined()) {
691  auto input_arg = TensorArg(input, "input", 1);
692  auto dropout_state_arg = TensorArg(fn_dropout_state, "dropout_states", 15);
693  checkSameGPU("cudnn_rnn", input_arg, dropout_state_arg);
694  }
695  RNNParams fn;
696  auto datatype = getCudnnDataType(input);
697  fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype);
698  fn.dropout.set(fn_train, fn_dropout, fn_dropout_state);
699  fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first);
700 
701  // TODO: Set device to input
702 
703  if (fn.rnn.mode != CUDNN_LSTM) {
704  AT_CHECK(!cx.defined(),
705  "rnn: illegal defined cx for non-LSTM RNN");
706  }
707 
708  // TODO: can batch_first be a wrapper around this function?
709  auto is_input_packed = fn.tensors.batch_sizes.size() != 0;
710  if (batch_first && !is_input_packed) {
711  input = input.transpose(0, 1);
712  }
713 
714  auto hidden_size = _hidden_size(fn.rnn, fn.tensors);
715  auto output_size = _output_size(fn.rnn, fn.tensors);
716 
717  AT_CHECK(hx.is_contiguous(),
718  "rnn: hx is not contiguous");
719  AT_CHECK(!cx.defined() || cx.is_contiguous(),
720  "rnn: cx is not contiguous");
721 
722  auto x = input.contiguous();
723  auto output = at::empty(output_size, input.options());
724  auto hy = at::empty(hidden_size, hx.options());
725  Tensor cy;
726  if (cx.defined()) {
727  cy = at::empty(hidden_size, cx.options());
728  } else {
729  cy = at::empty({0}, hx.options()); // NB: Not allowed to return undefined tensors
730  }
731  auto y = output;
732 
733  auto handle = getCudnnHandle();
734  cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input);
735  fn.rnn.set_algo(algo);
736  RNNDescriptors descs(fn, handle, x, y, hx, cx);
737 
738  FilterDescriptor w_desc;
739  if (!weight_buf.defined()) {
740  auto num_weights = get_num_weights(handle, descs.rnn_desc, descs.x_descs[0], datatype);
741  weight_buf = at::empty(num_weights, x.options());
742  w_desc.set(weight_buf, 3);
743  weight_buf.zero_();
744  std::vector<Tensor> params;
745  size_t params_stride0;
746  std::tie(params, params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, weight_buf);
747  _copyParams(MatrixRef<Tensor>{weight, static_cast<size_t>(weight_stride0)},
748  MatrixRef<Tensor>{params, params_stride0});
749  } else {
750  w_desc.set(weight_buf, 3);
751  }
752 
753  AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size),
754  "Expected cell size ", IntArrayRef{hidden_size}, ", got ", cx.sizes());
755 
756  size_t workspace_size;
757  auto x_descs_arr = descs.get_x_descs();
758  auto y_descs_arr = descs.get_y_descs();
759  AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize(
760  handle,
761  descs.rnn_desc.desc(),
762  fn.tensors.seq_length,
763  x_descs_arr.data(),
764  &workspace_size
765  ));
766  Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte));
767 
768  Tensor reserve;
769  // NB: Previously, the test was for fn.requires_grad, but we don't have
770  // this information. Use 'train' as a proxy.
771  if (fn_train) {
772  size_t reserve_size;
773  AT_CUDNN_CHECK(cudnnGetRNNTrainingReserveSize(
774  handle,
775  descs.rnn_desc.desc(),
776  fn.tensors.seq_length,
777  x_descs_arr.data(),
778  &reserve_size
779  ));
780  reserve = at::empty(reserve_size, input.options().dtype(kByte));
781  AT_CUDNN_CHECK(cudnnRNNForwardTraining(
782  handle,
783  descs.rnn_desc.desc(),
784  fn.tensors.seq_length,
785  x_descs_arr.data(), x.data_ptr(),
786  descs.hx_desc.desc(), hx.data_ptr(),
787  descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr,
788  w_desc.desc(), weight_buf.data_ptr(),
789  y_descs_arr.data(), y.data_ptr(),
790  descs.hy_desc.desc(), hy.data_ptr(),
791  descs.cy_desc.desc(), cy.defined() ? cy.data_ptr() : nullptr,
792  workspace.data_ptr(), workspace.size(0),
793  reserve.data_ptr(), reserve.size(0)
794  ));
795  } else { // inference
796  reserve = at::empty({0}, input.options().dtype(kByte));
797  AT_CUDNN_CHECK(cudnnRNNForwardInference(
798  handle,
799  descs.rnn_desc.desc(),
800  fn.tensors.seq_length,
801  x_descs_arr.data(), x.data_ptr(),
802  descs.hx_desc.desc(), hx.data_ptr(),
803  descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr,
804  w_desc.desc(), weight_buf.data_ptr(),
805  y_descs_arr.data(), y.data_ptr(),
806  descs.hy_desc.desc(), hy.data_ptr(),
807  descs.cy_desc.desc(), cy.defined() ? cy.data_ptr() : nullptr,
808  workspace.data_ptr(), workspace.size(0)
809  ));
810 
811  }
812 
813  if (batch_first && !is_input_packed) {
814  output.transpose_(0, 1);
815  }
816 
817  return std::make_tuple(output, hy, cy, reserve, weight_buf);
818 }
819 
820 std::tuple<Tensor, Tensor, Tensor> _cudnn_rnn_backward_input(
821  const Tensor& input_r, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx,
822  const Tensor& output_r, const Tensor& grad_output_r, const Tensor& grad_hy,
823  const Tensor& grad_cy,
824  int64_t fn_mode, int64_t fn_hidden_size,
825  int64_t fn_num_layers, bool batch_first, double fn_dropout,
826  bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes,
827  const Tensor& fn_dropout_state, const Tensor& fn_reserve,
828  std::array<bool, 3> output_mask
829  ) {
830 
831  auto input = input_r;
832  auto grad_output = grad_output_r;
833  auto output = output_r;
834 
835  RNNParams fn;
836  auto datatype = getCudnnDataType(input);
837  fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype);
838  fn.dropout.set(fn_train, fn_dropout, fn_dropout_state);
839  fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first);
840 
841  // TODO: Set device to input
842  auto handle = getCudnnHandle();
843 
844  if (fn.rnn.mode != CUDNN_LSTM) {
845  AT_CHECK(!cx.defined(),
846  "rnn: illegal defined cx for non-LSTM RNN");
847  }
848 
849  auto is_input_packed = fn_batch_sizes.size() != 0;
850  if (batch_first && !is_input_packed) {
851  input = input.transpose(0, 1);
852  grad_output = grad_output.transpose(0, 1);
853  output = output.transpose(0, 1);
854  }
855 
856  auto input_size = _input_size(fn.tensors);
857  auto hidden_size = _hidden_size(fn.rnn, fn.tensors);
858  auto output_size = _output_size(fn.rnn, fn.tensors);
859 
860  AT_CHECK(hx.is_contiguous(),
861  "rnn: hx is not contiguous");
862  AT_CHECK(!cx.defined() || cx.is_contiguous(),
863  "rnn: cx is not contiguous");
864 
865  auto x = input.contiguous();
866  auto dy = grad_output.contiguous();
867  auto y = output;
868  auto w = weight_buf;
869  auto dx = at::empty(input.sizes(), input.options()); // TODO: more compact way of saying this
870  auto dhy = grad_hy.contiguous().view(hidden_size);
871  auto dcy = grad_cy.defined() ? grad_cy.contiguous().view(hidden_size) : Tensor();
872  auto dhx = at::empty(hidden_size, hx.options());
873  AT_ASSERTM(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN");
874  auto dcx = cx.defined() ? at::empty(hidden_size, cx.options()) : Tensor();
875 
876  AT_CHECK(fn_train,
877  "cudnn RNN backward can only be called in training mode");
878 
879  AT_CHECK(input.sizes().equals(input_size),
880  "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes());
881  AT_CHECK(output.sizes().equals(output_size),
882  "Expected output size ", IntArrayRef{output_size}, ", got ", output.sizes());
883 
884  AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size),
885  "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes());
886  AT_CHECK(!cx.defined() || cx.sizes().equals(hidden_size),
887  "Expected cell size ", IntArrayRef{hidden_size}, ", got ", cx.sizes());
888  AT_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size),
889  "Expected d_hidden size ", IntArrayRef{hidden_size}, ", got ", dhy.sizes());
890  AT_CHECK(!dcy.defined() || dcy.sizes().equals(hidden_size),
891  "Expected d_cell size ", IntArrayRef{hidden_size}, ", got ", dcy.sizes());
892 
893  AT_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()),
894  "Gradients aren't CUDA tensors");
895 
896  cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input);
897  fn.rnn.set_algo(algo);
898  RNNDescriptors descs(fn, handle, x, y, hx, cx);
899 
900  FilterDescriptor w_desc;
901  w_desc.set(weight_buf, 3);
902 
903  size_t workspace_size;
904  auto x_descs_arr = descs.get_x_descs();
905  auto y_descs_arr = descs.get_y_descs();
906  AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize(
907  handle,
908  descs.rnn_desc.desc(),
909  fn.tensors.seq_length,
910  x_descs_arr.data(),
911  &workspace_size
912  ));
913  // TODO: put this in the correct device???
914  Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte));
915 
916  AT_CUDNN_CHECK(cudnnRNNBackwardData(
917  handle,
918  descs.rnn_desc.desc(),
919  fn.tensors.seq_length,
920  y_descs_arr.data(), y.data_ptr(),
921  y_descs_arr.data(), dy.data_ptr(),
922  descs.hy_desc.desc(), dhy.data_ptr(),
923  descs.cy_desc.desc(), cx.defined() ? dcy.data_ptr() : nullptr,
924  w_desc.desc(), w.data_ptr(),
925  descs.hx_desc.desc(), hx.data_ptr(),
926  descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr,
927  x_descs_arr.data(), dx.data_ptr(),
928  descs.hx_desc.desc(), dhx.data_ptr(),
929  descs.cx_desc.desc(), cx.defined() ? dcx.data_ptr() : nullptr,
930  workspace.data_ptr(), workspace.size(0),
931  fn_reserve.data_ptr(), fn_reserve.size(0)
932  ));
933 
934  if (batch_first && !is_input_packed) {
935  dx = dx.transpose_(0, 1);
936  }
937 
938  return std::make_tuple(dx, dhx, dcx);
939 }
940 
941 // NB: This MUST BE CALLED AFTER _cudnn_rnn_backward_input.
942 // We'll give a user-friendly combined function...
943 std::vector<Tensor> _cudnn_rnn_backward_weight(
944  // TODO: I think tensor geometry is sufficient for weight_buf/weight
945  const Tensor& input_r, TensorList weight_arr, int64_t weight_stride0,
946  const Tensor& weight_buf, const Tensor& hx, const Tensor& cx,
947  const Tensor& output_r,
948  int64_t fn_mode, int64_t fn_hidden_size,
949  int64_t fn_num_layers, bool batch_first, double fn_dropout,
950  bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes,
951  const Tensor& fn_dropout_state, const Tensor& fn_reserve
952  ) {
953 
954  MatrixRef<Tensor> weight{ weight_arr, static_cast<size_t>(weight_stride0) };
955 
956  auto input = input_r;
957  auto output = output_r;
958 
959  RNNParams fn;
960  auto datatype = getCudnnDataType(input);
961  fn.rnn.set(fn_mode, fn_hidden_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype);
962  fn.dropout.set(fn_train, fn_dropout, fn_dropout_state);
963  fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first);
964 
965  auto handle = getCudnnHandle();
966 
967  if (fn.rnn.mode != CUDNN_LSTM) {
968  AT_CHECK(!cx.defined(),
969  "rnn: illegal defined cx for non-LSTM RNN");
970  }
971 
972  auto is_input_packed = fn_batch_sizes.size() != 0;
973  if (batch_first && !is_input_packed) {
974  input = input.transpose(0, 1);
975  output = output.transpose(0, 1);
976  }
977 
978  auto input_size = _input_size(fn.tensors);
979  auto hidden_size = _hidden_size(fn.rnn, fn.tensors);
980 
981  AT_CHECK(fn_train,
982  "cudnn RNN backward can only be called in training mode");
983 
984  AT_CHECK(input.sizes().equals(input_size),
985  "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes());
986  AT_CHECK(!hx.defined() || hx.sizes().equals(hidden_size),
987  "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes());
988 
989  // TODO: the above were the only checks in rnn.py, but it doesn't seem
990  // like these checks are enough
991 
992  AT_CHECK(hx.is_contiguous(),
993  "rnn: hx is not contiguous");
994  AT_CHECK(!cx.defined() || cx.is_contiguous(),
995  "rnn: cx is not contiguous");
996 
997  auto x = input.contiguous();
998  const auto& y = output;
999  auto dw = at::zeros(weight_buf.sizes(), weight_buf.options());
1000 
1001  cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input);
1002  fn.rnn.set_algo(algo);
1003  RNNDescriptors descs(fn, handle, x, y, hx, cx);
1004 
1005  FilterDescriptor w_desc;
1006  w_desc.set(weight_buf, 3);
1007 
1008  size_t workspace_size;
1009  auto x_descs_arr = descs.get_x_descs();
1010  auto y_descs_arr = descs.get_y_descs();
1011  AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize(
1012  handle,
1013  descs.rnn_desc.desc(),
1014  fn.tensors.seq_length,
1015  x_descs_arr.data(),
1016  &workspace_size
1017  ));
1018  Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte));
1019 
1020  AT_CUDNN_CHECK(cudnnRNNBackwardWeights(
1021  handle,
1022  descs.rnn_desc.desc(),
1023  fn.tensors.seq_length,
1024  x_descs_arr.data(), x.data_ptr(),
1025  descs.hx_desc.desc(), hx.data_ptr(),
1026  y_descs_arr.data(), y.data_ptr(),
1027  workspace.data_ptr(), workspace.size(0),
1028  w_desc.desc(), dw.data_ptr(),
1029  fn_reserve.data_ptr(), fn_reserve.size(0)
1030  ));
1031 
1032 
1033  std::vector<Tensor> grad_params_arr;
1034  size_t grad_params_stride0;
1035  std::tie(grad_params_arr, grad_params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw);
1036  if (grad_params_stride0 == static_cast<size_t>(weight_stride0)) {
1037  _viewParams(MatrixRef<Tensor>{grad_params_arr, grad_params_stride0},
1038  MatrixRef<Tensor>{weight_arr, static_cast<size_t>(weight_stride0)});
1039  return grad_params_arr;
1040  } else {
1041  std::vector<Tensor> grad_weight_arr;
1042  grad_weight_arr.reserve( weight.numel() );
1043  for (const auto& w : weight_arr) {
1044  grad_weight_arr.emplace_back(at::empty(w.sizes(), w.options()));
1045  }
1046  _copyParams(MatrixRef<Tensor>{grad_params_arr, grad_params_stride0},
1047  MatrixRef<Tensor>{grad_weight_arr, static_cast<size_t>(weight_stride0)});
1048  return grad_weight_arr;
1049  }
1050 }
1051 
1052 // We need this dispatcher because _cudnn_rnn_backward_weight has a stringent
1053 // ordering requirement with _cudnn_rnn_backward_input
1054 std::tuple<Tensor, Tensor, Tensor, std::vector<Tensor>> _cudnn_rnn_backward(
1055  const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx,
1056  const Tensor& output, const Tensor& grad_output_r, const Tensor& grad_hy_r,
1057  const Tensor& grad_cy_r,
1058  int64_t mode, int64_t hidden_size,
1059  int64_t num_layers, bool batch_first, double dropout,
1060  bool train, bool bidirectional, IntArrayRef batch_sizes,
1061  const Tensor& dropout_state, const Tensor& reserve,
1062  std::array<bool, 4> output_mask
1063  ) {
1064 
1065  auto grad_output = grad_output_r.defined() ? grad_output_r : at::zeros_like(output);
1066  auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx);
1067  auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : at::zeros_like(cx)) : grad_cy_r;
1068 
1069  Tensor dx, dhx, dcx;
1070  // NB: unconditionally compute this gradient, because it mutates reserve
1071  std::tie(dx, dhx, dcx) = at::native::_cudnn_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]});
1072  std::vector<Tensor> dw;
1073  if (output_mask[3]) {
1074  dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve);
1075  }
1076  return std::tuple<Tensor, Tensor, Tensor, std::vector<Tensor>>{dx, dhx, dcx, dw};
1077 }
1078 
1079 // TODO: I am not sure if we actually need the 'dropout' and 'train' parameters
1080 // to initialize just the state tensor
1081 Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, const TensorOptions& options) {
1082  auto handle = getCudnnHandle();
1083  DropoutDescriptor dropout_desc;
1084  auto dropout_p = train ? dropout : 0;
1085  dropout_desc.initialize_rng(handle, dropout_p, dropout_seed, options);
1086  return dropout_desc.state;
1087 }
1088 
1089 ////////////////////////////////////////////////////////////////////////////////
1090 // CUDA dispatch for the generic RNN ops (at::lstm, at::gru, ...)
1091 ////////////////////////////////////////////////////////////////////////////////
1092 
1093 namespace {
1094 
1095 // Helpers for working with different hidden types.
1096 std::tuple<Tensor, Tensor> unpack_hidden(const Tensor& hidden) {
1097  return std::make_tuple(hidden, at::Tensor{});
1098 }
1099 
1100 std::tuple<Tensor, Tensor> unpack_hidden(const std::tuple<Tensor, Tensor>& hidden) {
1101  return hidden;
1102 }
1103 
1104 template<typename hidden_type>
1105 hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) {
1106  static_assert(std::is_same<hidden_type, void>::value, "pack_hidden not implemented for this type");
1107  AT_ERROR("NOT IMPLEMENTED");
1108 }
1109 
1110 template<>
1111 Tensor pack_hidden<Tensor>(const Tensor& hx, const Tensor& cx) {
1112  AT_ASSERT(cx.numel() == 0);
1113  return hx;
1114 }
1115 
1116 template<>
1117 std::tuple<Tensor, Tensor> pack_hidden<std::tuple<Tensor, Tensor>>(const Tensor& hx, const Tensor& cx) {
1118  return std::make_tuple(hx, cx);
1119 }
1120 
1121 struct DropoutState {
1122  // Both buffer and event are lazily instantiated when a dropout state is needed
1123  // for the first time. Note that in this case needed != used, as we don't need
1124  // a buffer to e.g. run RNNs in test mode.
1125  at::Tensor buffer;
1126  c10::optional<cuda::CUDAEvent> event;
1127  std::mutex mutex;
1128 
1129  // Every time we use a dropout state, we need to synchronize with its event,
1130  // to make sure all previous uses finish running before this one starts. Once
1131  // we're done, we record the event to allow others to synchronize with this kernel.
1132  // Those events are really needed only for inter-stream sync on a single GPU.
1133  // I doubt anyone will want to run cuDNN RNNs in parallel on a single GPU, so
1134  // they should end up being complete no-ops.
1135  void lock() {
1136  // NB: We can't ignore the lock even when event is undefined, because someone
1137  // could then define it before we get to unlock().
1138  mutex.lock();
1139  if (event) {
1140  event->block(cuda::getCurrentCUDAStream());
1141  }
1142  }
1143 
1144  void unlock() {
1145  if (event) {
1146  event->record();
1147  }
1148  mutex.unlock();
1149  }
1150 };
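 // Illustrative sketch only; the real call sites are in _cudnn_impl below.
 // lock()/unlock() above make DropoutState satisfy BasicLockable, so a scoped
 // std::unique_lock can guard the cuDNN calls that consume the buffer.
 inline void example_use_dropout_state(DropoutState& state) {
   std::unique_lock<DropoutState> lock{state};
   // ... launch cuDNN RNN kernels that read state.buffer here ...
 }  // ~unique_lock() calls unlock(), which records the event if it exists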
1151 
1152 DropoutState& get_dropout_state(double dropout_p, bool train, TensorOptions options) {
1153  // Each state is slightly over 2MB and initialized lazily, so it's fine to cache them.
1154  static std::vector<DropoutState> ten_dropout_state_cache { static_cast<size_t>(cuda::getNumGPUs()) };
1155  static std::vector<DropoutState> var_dropout_state_cache { static_cast<size_t>(cuda::getNumGPUs()) };
1156  static std::mutex state_cache_mut;
1157 
1158  int device = cuda::current_device();
1159  std::unique_lock<std::mutex> lock {state_cache_mut};
1160  auto& state = options.is_variable() ? var_dropout_state_cache.at(device)
1161  : ten_dropout_state_cache.at(device);
1162  if (train && dropout_p > 0 && !state.buffer.defined()) {
1163  std::unique_lock<std::mutex> lock {state.mutex};
1164  int64_t seed = at::empty({}, at::kLong).random_().item<int64_t>();
1165  state.buffer = at::_cudnn_init_dropout_state(
1166  dropout_p, train, seed, options.dtype(at::kByte));
1167  // NB: CUDA binds the event to a device at creation time, so we can initialize it
1168  // only now, when we know we're on the correct device.
1169  state.event.emplace();
1170  }
1171  return state;
1172 }
1173 
1174 Tensor try_get_weight_buf(
1175  const Tensor& input, TensorList parameters, bool has_biases,
1176  cudnnRNNMode_t mode, int64_t hidden_size, int64_t num_layers, bool bidirectional) {
1177  // Prepare all relevant descriptors
1178  auto handle = getCudnnHandle();
1179  auto datatype = getCudnnDataType(input);
1180 
1181  RNNDescriptorParams rnn;
1182  rnn.set(mode, hidden_size, num_layers, bidirectional, promote_rnn_math_type(datatype), datatype);
1183  RNNDescriptor rnn_desc = rnn.descriptor(handle);
1184 
1185  TensorGeometry x_geom ({1, input.size(-1)});
1186  TensorDescriptor x_desc;
1187  x_desc.set(datatype, x_geom.sizes(), x_geom.strides(), 5);
1188 
1189  auto num_params = get_num_weights(handle, rnn_desc, x_desc, datatype);
1190 
1191  // Try to get parameter storage
1192  auto & any_param = parameters.at(0);
1193  auto param_storage = any_param.storage();
1194  auto weight_buf = at::empty({0}, any_param.options()).set_(param_storage);
1195  if (weight_buf.size(0) < num_params) {
1196  return {};
1197  } else if (weight_buf.size(0) > num_params) {
1198  weight_buf = weight_buf.narrow(0, 0, num_params);
1199  }
1200 
1201  // Get and check data pointers
1202  auto expected_data_ptrs = get_expected_data_ptrs(
1203  weight_buf, handle, rnn, rnn_desc, x_desc, datatype);
1204 
1205  int64_t num_parameters = parameters.size();
1206  int64_t num_ptrs = expected_data_ptrs.size();
1207  AT_ASSERT(num_ptrs == (num_parameters * (has_biases ? 1 : 2)));
1208  AT_ASSERT(num_ptrs % (has_biases ? 4 : 2) == 0);
1209  for (int64_t param_i = 0, ptr_i = 0;
1210  ptr_i < num_ptrs;
1211  ptr_i += (has_biases ? 2 : 4), param_i += 2) {
1212  if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {};
1213  if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {};
1214  }
1215  if (!parameters[num_parameters - 1].is_contiguous()) return {};
1216  return weight_buf;
1217 }
1218 
1219 const char * WEIGHT_FORMAT_WARN = "RNN module weights are not part of single contiguous "
1220  "chunk of memory. This means they need to be compacted "
1221  "at every call, possibly greatly increasing memory usage. "
1222  "To compact weights again call flatten_parameters().";
1223 
1224 template<typename hidden_type>
1225 std::pair<Tensor, hidden_type> _cudnn_impl(
1226  const Tensor& input, const Tensor& _batch_sizes, const hidden_type& hidden,
1227  TensorList params, bool has_biases, cudnnRNNMode_t mode,
1228  int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
1229  Tensor hx, cx;
1230  std::tie(hx, cx) = unpack_hidden(hidden);
1231  int64_t hidden_size = hx.size(2);
1232 
1233  auto weight_buf = try_get_weight_buf(
1234  input, params, has_biases, mode, hidden_size, num_layers, bidirectional);
1235  if (!weight_buf.defined()) {
1236  AT_WARN(WEIGHT_FORMAT_WARN);
1237  }
1238 
1239  AT_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D");
1240  IntArrayRef batch_sizes { _batch_sizes.data<int64_t>(), static_cast<size_t>(_batch_sizes.size(0)) };
1241 
1242  auto & dropout_state = get_dropout_state(dropout_p, train, input.options());
1243  std::unique_lock<DropoutState> lock { dropout_state };
1244  // cudnn_output = std::tuple<output, hy, cy, reserve, new_weight_buf>
1245  auto cudnn_output = at::_cudnn_rnn(
1246  input, params, has_biases ? 4 : 2, weight_buf,
1247  hx, cx, static_cast<int>(mode), hidden_size, num_layers, /*batch_first=*/false,
1248  dropout_p, train, bidirectional, batch_sizes, dropout_state.buffer);
1249 
1250  return {std::get<0>(cudnn_output),
1251  pack_hidden<hidden_type>(std::get<1>(cudnn_output), std::get<2>(cudnn_output))};
1252 }
1253 
1254 template<typename hidden_type>
1255 std::pair<Tensor, hidden_type> _cudnn_impl(
1256  const Tensor& input, const hidden_type& hidden,
1257  TensorList params, bool has_biases, cudnnRNNMode_t mode,
1258  int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
1259  Tensor hx, cx;
1260  std::tie(hx, cx) = unpack_hidden(hidden);
1261  int64_t hidden_size = hx.size(2);
1262 
1263  auto weight_buf = try_get_weight_buf(
1264  input, params, has_biases, mode, hidden_size, num_layers, bidirectional);
1265  if (!weight_buf.defined()) {
1266  AT_WARN(WEIGHT_FORMAT_WARN);
1267  }
1268 
1269  auto & dropout_state = get_dropout_state(dropout_p, train, input.options());
1270  std::unique_lock<DropoutState> lock { dropout_state };
1271  // cudnn_output = std::tuple<output, hy, cy, reserve, new_weight_buf>
1272  auto cudnn_output = at::_cudnn_rnn(
1273  input, params, has_biases ? 4 : 2, weight_buf,
1274  hx, cx, static_cast<int>(mode), hidden_size, num_layers, batch_first, dropout_p,
1275  train, bidirectional, /*batch_sizes=*/{}, dropout_state.buffer);
1276 
1277  return {std::get<0>(cudnn_output),
1278  pack_hidden<hidden_type>(std::get<1>(cudnn_output), std::get<2>(cudnn_output))};
1279 }
1280 
1281 #define ONE_HIDDEN_RNN(NAME, MODE) \
1282 void NAME##_cudnn(Tensor& output, Tensor& hy, \
1283  const Tensor& input, const Tensor& hx, \
1284  TensorList params, bool has_biases, \
1285  int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { \
1286  std::tie(output, hy) = _cudnn_impl(input, hx, params, has_biases, \
1287  MODE, num_layers, dropout_p, train, bidirectional, batch_first); \
1288 } \
1289  \
1290 void NAME##_packed_cudnn(Tensor& output, Tensor& hy, \
1291  const Tensor& data, const Tensor& batch_sizes, const Tensor& hx, \
1292  TensorList params, bool has_biases, \
1293  int64_t num_layers, double dropout_p, bool train, bool bidirectional) { \
1294  std::tie(output, hy) = _cudnn_impl(data, batch_sizes, hx, params, \
1295  has_biases, MODE, num_layers, dropout_p, train, bidirectional); \
1296 } \
1297  \
1298 REGISTER_CUDA_DISPATCH(NAME##_cudnn_stub, &NAME##_cudnn); \
1299 REGISTER_CUDA_DISPATCH(NAME##_packed_cudnn_stub, &NAME##_packed_cudnn);
1300 
1301 ONE_HIDDEN_RNN(gru, CUDNN_GRU)
1302 ONE_HIDDEN_RNN(rnn_tanh, CUDNN_RNN_TANH)
1303 ONE_HIDDEN_RNN(rnn_relu, CUDNN_RNN_RELU)
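 // Each ONE_HIDDEN_RNN(NAME, MODE) line above stamps out NAME##_cudnn and
 // NAME##_packed_cudnn wrappers around _cudnn_impl and registers them as the
 // CUDA implementations behind the corresponding dispatch stubs
 // (e.g. gru_cudnn_stub, gru_packed_cudnn_stub).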
1304 
1305 void lstm_cudnn(Tensor& output, Tensor& hy, Tensor& cy,
1306  const Tensor& input, TensorList hx,
1307  TensorList params, bool has_biases,
1308  int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) {
1309  auto result = _cudnn_impl(input, std::make_tuple(hx[0], hx[1]), params, has_biases,
1310  CUDNN_LSTM, num_layers, dropout_p, train, bidirectional, batch_first);
1311  output = result.first;
1312  hy = std::get<0>(result.second);
1313  cy = std::get<1>(result.second);
1314 }
1315 
1316 void lstm_packed_cudnn(Tensor& output, Tensor& hy, Tensor& cy,
1317  const Tensor& data, const Tensor& batch_sizes, TensorList hx,
1318  TensorList params, bool has_biases,
1319  int64_t num_layers, double dropout_p, bool train, bool bidirectional) {
1320  auto result = _cudnn_impl(data, batch_sizes, std::make_tuple(hx[0], hx[1]),
1321  params, has_biases, CUDNN_LSTM, num_layers, dropout_p, train, bidirectional);
1322  output = result.first;
1323  hy = std::get<0>(result.second);
1324  cy = std::get<1>(result.second);
1325 }
1326 
1327 REGISTER_CUDA_DISPATCH(lstm_cudnn_stub, &lstm_cudnn);
1328 REGISTER_CUDA_DISPATCH(lstm_packed_cudnn_stub, &lstm_packed_cudnn);
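 // Illustrative sketch only; the shapes in the comments are assumptions for the
 // example. User code reaches lstm_cudnn through the generic at::lstm op, whose
 // CUDA dispatch lands on the stubs registered above when the tensors live on a
 // CUDA device.
 inline std::tuple<Tensor, Tensor, Tensor> example_call_lstm(
     const Tensor& input,   // e.g. {seq_length, batch, input_size}, CUDA
     TensorList hx,         // {h0, c0}, each {num_layers * num_dirs, batch, hidden_size}
     TensorList params) {   // per-layer (w_ih, w_hh, b_ih, b_hh) tensors
   return at::lstm(input, hx, params, /*has_biases=*/true, /*num_layers=*/1,
                   /*dropout=*/0.0, /*train=*/false, /*bidirectional=*/false,
                   /*batch_first=*/false);
 }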
1329 
1330 } // anonymous namespace
1331 
1332 }} // namespace at::native
1333 
1334 #endif // AT_CUDNN_ENABLED()