Caffe2 - Python API
A deep learning, cross platform ML framework
rnn.py
1 import math
2 import torch
3 import warnings
4 import itertools
5 import numbers
6 
7 from .module import Module
8 from ..parameter import Parameter
9 from ..utils.rnn import PackedSequence, get_packed_sequence
10 from .. import init
11 from .. import _VF
12 from ..._jit_internal import weak_module, weak_script_method, weak_script, \
13  _parameter_list
14 
15 _rnn_impls = {
16  'GRU': _VF.gru,
17  'RNN_TANH': _VF.rnn_tanh,
18  'RNN_RELU': _VF.rnn_relu,
19 }
20 
21 
22 @weak_script
23 def apply_permutation(tensor, permutation, dim=1):
24  # type: (Tensor, Tensor, int) -> Tensor
25  return tensor.index_select(dim, permutation)
26 
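# Editor's sketch (not part of the original source): a minimal illustration of
# apply_permutation above. index_select along dim=1 reorders the batch
# dimension of a hidden-state tensor shaped
# (num_layers * num_directions, batch, hidden_size); the RNN modules below use
# this to re-align hx with the sorted/unsorted batch order of a PackedSequence.
# The helper name is hypothetical and exists only for illustration.
def _example_apply_permutation():
    h = torch.arange(6.0).view(1, 3, 2)    # (num_layers * num_directions, batch, hidden_size)
    perm = torch.tensor([2, 0, 1])         # desired order of the batch entries
    return apply_permutation(h, perm)      # batch rows now appear in order 2, 0, 1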
27 
28 class RNNBase(Module):
29  __constants__ = ['mode', 'input_size', 'hidden_size', 'num_layers', 'bias',
30  'batch_first', 'dropout', 'bidirectional', '_flat_parameters']
31 
32  def __init__(self, mode, input_size, hidden_size,
33  num_layers=1, bias=True, batch_first=False,
34  dropout=0., bidirectional=False):
35  super(RNNBase, self).__init__()
36  self.mode = mode
37  self.input_size = input_size
38  self.hidden_size = hidden_size
39  self.num_layers = num_layers
40  self.bias = bias
41  self.batch_first = batch_first
42  self.dropout = dropout
43  self.bidirectional = bidirectional
44  num_directions = 2 if bidirectional else 1
45 
46  if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \
47  isinstance(dropout, bool):
48  raise ValueError("dropout should be a number in range [0, 1] "
49  "representing the probability of an element being "
50  "zeroed")
51  if dropout > 0 and num_layers == 1:
52  warnings.warn("dropout option adds dropout after all but last "
53  "recurrent layer, so non-zero dropout expects "
54  "num_layers greater than 1, but got dropout={} and "
55  "num_layers={}".format(dropout, num_layers))
56 
57  if mode == 'LSTM':
58  gate_size = 4 * hidden_size
59  elif mode == 'GRU':
60  gate_size = 3 * hidden_size
61  elif mode == 'RNN_TANH':
62  gate_size = hidden_size
63  elif mode == 'RNN_RELU':
64  gate_size = hidden_size
65  else:
66  raise ValueError("Unrecognized RNN mode: " + mode)
67 
68  self._all_weights = []
69  for layer in range(num_layers):
70  for direction in range(num_directions):
71  layer_input_size = input_size if layer == 0 else hidden_size * num_directions
72 
73  w_ih = Parameter(torch.Tensor(gate_size, layer_input_size))
74  w_hh = Parameter(torch.Tensor(gate_size, hidden_size))
75  b_ih = Parameter(torch.Tensor(gate_size))
76  # Second bias vector included for CuDNN compatibility. Only one
77  # bias vector is needed in standard definition.
78  b_hh = Parameter(torch.Tensor(gate_size))
79  layer_params = (w_ih, w_hh, b_ih, b_hh)
80 
81  suffix = '_reverse' if direction == 1 else ''
82  param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}']
83  if bias:
84  param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}']
85  param_names = [x.format(layer, suffix) for x in param_names]
86 
87  for name, param in zip(param_names, layer_params):
88  setattr(self, name, param)
89  self._all_weights.append(param_names)
90 
91  self.flatten_parameters()
92  self.reset_parameters()
93 
94  def flatten_parameters(self):
95  """Resets parameter data pointer so that they can use faster code paths.
96 
97  Right now, this works only if the module is on the GPU and cuDNN is enabled.
98  Otherwise, it's a no-op.
99  """
100  any_param = next(self.parameters()).data
101  if not any_param.is_cuda or not torch.backends.cudnn.is_acceptable(any_param):
102  return
103 
104  # If any parameters alias, we fall back to the slower, copying code path. This is
105  # a sufficient check, because overlapping parameter buffers that don't completely
106  # alias would break the assumptions of the uniqueness check in
107  # Module.named_parameters().
108  all_weights = self._flat_weights
109  unique_data_ptrs = set(p.data_ptr() for p in all_weights)
110  if len(unique_data_ptrs) != len(all_weights):
111  return
112 
113  with torch.cuda.device_of(any_param):
114  import torch.backends.cudnn.rnn as rnn
115 
116  # NB: This is a temporary hack while we still don't have Tensor
117  # bindings for ATen functions
118  with torch.no_grad():
119  # NB: this is an INPLACE function on all_weights, that's why the
120  # no_grad() is necessary.
121  torch._cudnn_rnn_flatten_weight(
122  all_weights, (4 if self.bias else 2),
123  self.input_size, rnn.get_cudnn_mode(self.mode), self.hidden_size, self.num_layers,
124  self.batch_first, bool(self.bidirectional))
125 
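# Editor's note (not part of the original source): flatten_parameters() is a
# no-op unless the parameters live on a CUDA device and cuDNN accepts them; on
# the GPU it regroups the per-layer weights into one contiguous buffer so the
# fused cuDNN kernels can be used and the "weights are not part of single
# contiguous chunk of memory" warning is avoided. A minimal usage sketch,
# assuming a CUDA-capable machine:
#
#     >>> rnn = torch.nn.LSTM(10, 20, 2).cuda()
#     >>> rnn.flatten_parameters()   # harmless on CPU, regroups weights on GPU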
126  def _apply(self, fn):
127  ret = super(RNNBase, self)._apply(fn)
128  self.flatten_parameters()
129  return ret
130 
131  def reset_parameters(self):
132  stdv = 1.0 / math.sqrt(self.hidden_size)
133  for weight in self.parameters():
134  init.uniform_(weight, -stdv, stdv)
135 
136  @_parameter_list
137  def get_flat_weights(self):
138  return self._flat_weights
139 
140  @weak_script_method
141  def check_input(self, input, batch_sizes):
142  # type: (Tensor, Optional[Tensor]) -> None
143  expected_input_dim = 2 if batch_sizes is not None else 3
144  if input.dim() != expected_input_dim:
145  raise RuntimeError(
146  'input must have {} dimensions, got {}'.format(
147  expected_input_dim, input.dim()))
148  if self.input_size != input.size(-1):
149  raise RuntimeError(
150  'input.size(-1) must be equal to input_size. Expected {}, got {}'.format(
151  self.input_size, input.size(-1)))
152 
153  @weak_script_method
154  def get_expected_hidden_size(self, input, batch_sizes):
155  # type: (Tensor, Optional[Tensor]) -> Tuple[int, int, int]
156  if batch_sizes is not None:
157  mini_batch = batch_sizes[0]
158  mini_batch = int(mini_batch)
159  else:
160  mini_batch = input.size(0) if self.batch_first else input.size(1)
161  num_directions = 2 if self.bidirectional else 1
162  expected_hidden_size = (self.num_layers * num_directions,
163  mini_batch, self.hidden_size)
164  return expected_hidden_size
165 
166  @weak_script_method
167  def check_hidden_size(self, hx, expected_hidden_size, msg='Expected hidden size {}, got {}'):
168  # type: (Tensor, Tuple[int, int, int], str) -> None
169  if hx.size() != expected_hidden_size:
170  raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
171 
172  def check_forward_args(self, input, hidden, batch_sizes):
173  self.check_input(input, batch_sizes)
174  expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
175 
176  self.check_hidden_size(hidden, expected_hidden_size)
177 
178  def permute_hidden(self, hx, permutation):
179  if permutation is None:
180  return hx
181  return apply_permutation(hx, permutation)
182 
183  def forward(self, input, hx=None):
184  is_packed = isinstance(input, PackedSequence)
185  if is_packed:
186  input, batch_sizes, sorted_indices, unsorted_indices = input
187  max_batch_size = batch_sizes[0]
188  max_batch_size = int(max_batch_size)
189  else:
190  batch_sizes = None
191  max_batch_size = input.size(0) if self.batch_first else input.size(1)
192  sorted_indices = None
193  unsorted_indices = None
194 
195  if hx is None:
196  num_directions = 2 if self.bidirectional else 1
197  hx = torch.zeros(self.num_layers * num_directions,
198  max_batch_size, self.hidden_size,
199  dtype=input.dtype, device=input.device)
200  else:
201  # Each batch of the hidden state should match the input sequence that
202  # the user believes he/she is passing in.
203  hx = self.permute_hidden(hx, sorted_indices)
204 
205  self.check_forward_args(input, hx, batch_sizes)
206  _impl = _rnn_impls[self.mode]
207  if batch_sizes is None:
208  result = _impl(input, hx, self.get_flat_weights(), self.bias, self.num_layers,
209  self.dropout, self.training, self.bidirectional, self.batch_first)
210  else:
211  result = _impl(input, batch_sizes, hx, self.get_flat_weights(), self.bias,
212  self.num_layers, self.dropout, self.training, self.bidirectional)
213  output = result[0]
214  hidden = result[1]
215 
216  if is_packed:
217  output = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
218  return output, self.permute_hidden(hidden, unsorted_indices)
219 
220  def extra_repr(self):
221  s = '{input_size}, {hidden_size}'
222  if self.num_layers != 1:
223  s += ', num_layers={num_layers}'
224  if self.bias is not True:
225  s += ', bias={bias}'
226  if self.batch_first is not False:
227  s += ', batch_first={batch_first}'
228  if self.dropout != 0:
229  s += ', dropout={dropout}'
230  if self.bidirectional is not False:
231  s += ', bidirectional={bidirectional}'
232  return s.format(**self.__dict__)
233 
234  def __setstate__(self, d):
235  super(RNNBase, self).__setstate__(d)
236  if 'all_weights' in d:
237  self._all_weights = d['all_weights']
238  if isinstance(self._all_weights[0][0], str):
239  return
240  num_layers = self.num_layers
241  num_directions = 2 if self.bidirectional else 1
242  self._all_weights = []
243  for layer in range(num_layers):
244  for direction in range(num_directions):
245  suffix = '_reverse' if direction == 1 else ''
246  weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}']
247  weights = [x.format(layer, suffix) for x in weights]
248  if self.bias:
249  self._all_weights += [weights]
250  else:
251  self._all_weights += [weights[:2]]
252 
253  @property
254  def _flat_weights(self):
255  return [p for layerparams in self.all_weights for p in layerparams]
256 
257  @property
258  def all_weights(self):
259  return [[getattr(self, weight) for weight in weights] for weights in self._all_weights]
260 
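# Editor's sketch (not part of the original source): RNNBase.__init__ above
# registers one (w_ih, w_hh, b_ih, b_hh) group per layer and direction under
# the names weight_ih_l{k}, weight_hh_l{k}, bias_ih_l{k}, bias_hh_l{k}, with a
# '_reverse' suffix for the backward direction. The helper name is
# hypothetical and exists only for illustration.
def _example_parameter_names():
    import torch.nn as nn
    gru = nn.GRU(10, 20, num_layers=2, bidirectional=True)
    names = [name for name, _ in gru.named_parameters()]
    # ['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0',
    #  'weight_ih_l0_reverse', ..., 'bias_hh_l1_reverse']
    return names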
261 
262 class RNN(RNNBase):
263  r"""Applies a multi-layer Elman RNN with :math:`tanh` or :math:`ReLU` non-linearity to an
264  input sequence.
265 
266 
267  For each element in the input sequence, each layer computes the following
268  function:
269 
270  .. math::
271  h_t = \text{tanh}(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})
272 
273  where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
274  the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the
275  previous layer at time `t-1` or the initial hidden state at time `0`.
276  If :attr:`nonlinearity` is ``'relu'``, then `ReLU` is used instead of `tanh`.
277 
278  Args:
279  input_size: The number of expected features in the input `x`
280  hidden_size: The number of features in the hidden state `h`
281  num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
282  would mean stacking two RNNs together to form a `stacked RNN`,
283  with the second RNN taking in outputs of the first RNN and
284  computing the final results. Default: 1
285  nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
286  bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
287  Default: ``True``
288  batch_first: If ``True``, then the input and output tensors are provided
289  as `(batch, seq, feature)`. Default: ``False``
290  dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
291  RNN layer except the last layer, with dropout probability equal to
292  :attr:`dropout`. Default: 0
293  bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False``
294 
295  Inputs: input, h_0
296  - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features
297  of the input sequence. The input can also be a packed variable length
298  sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
299  or :func:`torch.nn.utils.rnn.pack_sequence`
300  for details.
301  - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
302  containing the initial hidden state for each element in the batch.
303  Defaults to zero if not provided. If the RNN is bidirectional,
304  num_directions should be 2, else it should be 1.
305 
306  Outputs: output, h_n
307  - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor
308  containing the output features (`h_t`) from the last layer of the RNN,
309  for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has
310  been given as the input, the output will also be a packed sequence.
311 
312  For the unpacked case, the directions can be separated
313  using ``output.view(seq_len, batch, num_directions, hidden_size)``,
314  with forward and backward being direction `0` and `1` respectively.
315  Similarly, the directions can be separated in the packed case.
316  - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
317  containing the hidden state for `t = seq_len`.
318 
319  Like *output*, the layers can be separated using
320  ``h_n.view(num_layers, num_directions, batch, hidden_size)``.
321 
322  Shape:
323  - Input1: :math:`(L, N, H_{in})` tensor containing input features where
324  :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length.
325  - Input2: :math:`(S, N, H_{out})` tensor
326  containing the initial hidden state for each element in the batch.
327  :math:`H_{out}=\text{hidden\_size}`
328  Defaults to zero if not provided, where :math:`S=\text{num\_layers} * \text{num\_directions}`.
329  If the RNN is bidirectional, num_directions should be 2, else it should be 1.
330  - Output1: :math:`(L, N, H_{all})` where :math:`H_{all}=\text{num\_directions} * \text{hidden\_size}`
331  - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state
332  for each element in the batch
333 
334  Attributes:
335  weight_ih_l[k]: the learnable input-hidden weights of the k-th layer,
336  of shape `(hidden_size, input_size)` for `k = 0`. Otherwise, the shape is
337  `(hidden_size, num_directions * hidden_size)`
338  weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer,
339  of shape `(hidden_size, hidden_size)`
340  bias_ih_l[k]: the learnable input-hidden bias of the k-th layer,
341  of shape `(hidden_size)`
342  bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer,
343  of shape `(hidden_size)`
344 
345  .. note::
346  All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
347  where :math:`k = \frac{1}{\text{hidden\_size}}`
348 
349  .. include:: cudnn_persistent_rnn.rst
350 
351  Examples::
352 
353  >>> rnn = nn.RNN(10, 20, 2)
354  >>> input = torch.randn(5, 3, 10)
355  >>> h0 = torch.randn(2, 3, 20)
356  >>> output, hn = rnn(input, h0)
357  """
358 
359  def __init__(self, *args, **kwargs):
360  if 'nonlinearity' in kwargs:
361  if kwargs['nonlinearity'] == 'tanh':
362  mode = 'RNN_TANH'
363  elif kwargs['nonlinearity'] == 'relu':
364  mode = 'RNN_RELU'
365  else:
366  raise ValueError("Unknown nonlinearity '{}'".format(
367  kwargs['nonlinearity']))
368  del kwargs['nonlinearity']
369  else:
370  mode = 'RNN_TANH'
371 
372  super(RNN, self).__init__(mode, *args, **kwargs)
373 
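# Editor's sketch (not part of the original source): a small shape check for
# the RNN module above with nonlinearity='relu' and batch_first=True. With
# batch_first the input and output are (batch, seq, feature), while h_n keeps
# its (num_layers * num_directions, batch, hidden_size) layout. The helper
# name is hypothetical and exists only for illustration.
def _example_rnn_relu_batch_first():
    import torch.nn as nn
    rnn = nn.RNN(10, 20, num_layers=2, nonlinearity='relu', batch_first=True)
    x = torch.randn(3, 5, 10)             # (batch=3, seq_len=5, input_size=10)
    output, h_n = rnn(x)                  # h_0 defaults to zeros
    assert output.shape == (3, 5, 20)     # (batch, seq_len, hidden_size)
    assert h_n.shape == (2, 3, 20)        # (num_layers, batch, hidden_size)
    return output, h_n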
374 
375 @weak_module
376 class LSTM(RNNBase):
377  r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input
378  sequence.
379 
380 
381  For each element in the input sequence, each layer computes the following
382  function:
383 
384  .. math::
385  \begin{array}{ll} \\
386  i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
387  f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
388  g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
389  o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
390  c_t = f_t * c_{(t-1)} + i_t * g_t \\
391  h_t = o_t * \tanh(c_t) \\
392  \end{array}
393 
394  where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
395  state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{(t-1)}`
396  is the hidden state of the layer at time `t-1` or the initial hidden
397  state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`,
398  :math:`o_t` are the input, forget, cell, and output gates, respectively.
399  :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.
400 
401  In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
402  (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
403  dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
404  variable which is :math:`0` with probability :attr:`dropout`.
405 
406  Args:
407  input_size: The number of expected features in the input `x`
408  hidden_size: The number of features in the hidden state `h`
409  num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
410  would mean stacking two LSTMs together to form a `stacked LSTM`,
411  with the second LSTM taking in outputs of the first LSTM and
412  computing the final results. Default: 1
413  bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
414  Default: ``True``
415  batch_first: If ``True``, then the input and output tensors are provided
416  as (batch, seq, feature). Default: ``False``
417  dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
418  LSTM layer except the last layer, with dropout probability equal to
419  :attr:`dropout`. Default: 0
420  bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False``
421 
422  Inputs: input, (h_0, c_0)
423  - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features
424  of the input sequence.
425  The input can also be a packed variable length sequence.
426  See :func:`torch.nn.utils.rnn.pack_padded_sequence` or
427  :func:`torch.nn.utils.rnn.pack_sequence` for details.
428  - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
429  containing the initial hidden state for each element in the batch.
430  If the LSTM is bidirectional, num_directions should be 2, else it should be 1.
431  - **c_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
432  containing the initial cell state for each element in the batch.
433 
434  If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.
435 
436 
437  Outputs: output, (h_n, c_n)
438  - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor
439  containing the output features `(h_t)` from the last layer of the LSTM,
440  for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
441  given as the input, the output will also be a packed sequence.
442 
443  For the unpacked case, the directions can be separated
444  using ``output.view(seq_len, batch, num_directions, hidden_size)``,
445  with forward and backward being direction `0` and `1` respectively.
446  Similarly, the directions can be separated in the packed case.
447  - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
448  containing the hidden state for `t = seq_len`.
449 
450  Like *output*, the layers can be separated using
451  ``h_n.view(num_layers, num_directions, batch, hidden_size)`` and similarly for *c_n*.
452  - **c_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
453  containing the cell state for `t = seq_len`.
454 
455  Attributes:
456  weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
457  `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`.
458  Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)`
459  weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
460  `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)`
461  bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
462  `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)`
463  bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
464  `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)`
465 
466  .. note::
467  All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
468  where :math:`k = \frac{1}{\text{hidden\_size}}`
469 
470  .. include:: cudnn_persistent_rnn.rst
471 
472  Examples::
473 
474  >>> rnn = nn.LSTM(10, 20, 2)
475  >>> input = torch.randn(5, 3, 10)
476  >>> h0 = torch.randn(2, 3, 20)
477  >>> c0 = torch.randn(2, 3, 20)
478  >>> output, (hn, cn) = rnn(input, (h0, c0))
479  """
480  __overloads__ = {'forward': ['forward_packed', 'forward_tensor']}
481 
482  def __init__(self, *args, **kwargs):
483  super(LSTM, self).__init__('LSTM', *args, **kwargs)
484 
485  @weak_script_method
486  def check_forward_args(self, input, hidden, batch_sizes):
487  # type: (Tensor, Tuple[Tensor, Tensor], Optional[Tensor]) -> None
488  self.check_input(input, batch_sizes)
489  expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
490 
491  self.check_hidden_size(hidden[0], expected_hidden_size,
492  'Expected hidden[0] size {}, got {}')
493  self.check_hidden_size(hidden[1], expected_hidden_size,
494  'Expected hidden[1] size {}, got {}')
495 
496  @weak_script_method
497  def permute_hidden(self, hx, permutation):
498  # type: (Tuple[Tensor, Tensor], Optional[Tensor]) -> Tuple[Tensor, Tensor]
499  if permutation is None:
500  return hx
501  return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation)
502 
503  @weak_script_method
504  def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices):
505  # type: (Tensor, Optional[Tuple[Tensor, Tensor]], Optional[Tensor], int, Optional[Tensor]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] # noqa
506  if hx is None:
507  num_directions = 2 if self.bidirectional else 1
508  zeros = torch.zeros(self.num_layers * num_directions,
509  max_batch_size, self.hidden_size,
510  dtype=input.dtype, device=input.device)
511  hx = (zeros, zeros)
512  else:
513  # Each batch of the hidden state should match the input sequence that
514  # the user believes he/she is passing in.
515  hx = self.permute_hidden(hx, sorted_indices)
516 
517  self.check_forward_args(input, hx, batch_sizes)
518  if batch_sizes is None:
519  result = _VF.lstm(input, hx, self.get_flat_weights(), self.bias, self.num_layers,
520  self.dropout, self.training, self.bidirectional, self.batch_first)
521  else:
522  result = _VF.lstm(input, batch_sizes, hx, self.get_flat_weights(), self.bias,
523  self.num_layers, self.dropout, self.training, self.bidirectional)
524  output = result[0]
525  hidden = result[1:]
526 
527  return output, hidden
528 
529  @weak_script_method
530  def forward_tensor(self, input, hx=None):
531  # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tuple[Tensor, Tensor]]
532  batch_sizes = None
533  max_batch_size = input.size(0) if self.batch_first else input.size(1)
534  sorted_indices = None
535  unsorted_indices = None
536 
537  output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
538 
539  return output, self.permute_hidden(hidden, unsorted_indices)
540 
541  @weak_script_method
542  def forward_packed(self, input, hx=None):
543  # type: (Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]], Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]], Tuple[Tensor, Tensor]] # noqa
544  input, batch_sizes, sorted_indices, unsorted_indices = input
545  max_batch_size = batch_sizes[0]
546  max_batch_size = int(max_batch_size)
547 
548  output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
549 
550  output = get_packed_sequence(output, batch_sizes, sorted_indices, unsorted_indices)
551  return output, self.permute_hidden(hidden, unsorted_indices)
552 
553  def forward(self, input, hx=None):
554  if isinstance(input, PackedSequence):
555  return self.forward_packed(input, hx)
556  else:
557  return self.forward_tensor(input, hx)
558 
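# Editor's sketch (not part of the original source): driving the LSTM above
# with a variable-length batch via pack_padded_sequence, then unpacking the
# packed output. Sequences are padded to the longest length and sorted by
# decreasing length for compatibility with older releases. The helper name is
# hypothetical and exists only for illustration.
def _example_lstm_packed():
    import torch.nn as nn
    from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
    lstm = nn.LSTM(10, 20, num_layers=2)
    padded = torch.randn(5, 3, 10)        # (max_seq_len, batch, input_size)
    lengths = [5, 3, 2]                   # per-sequence lengths, descending
    packed = pack_padded_sequence(padded, lengths)
    packed_out, (h_n, c_n) = lstm(packed)             # output is a PackedSequence
    output, out_lengths = pad_packed_sequence(packed_out)
    assert output.shape == (5, 3, 20)     # (max_seq_len, batch, hidden_size)
    assert h_n.shape == (2, 3, 20) and c_n.shape == (2, 3, 20)
    return output, out_lengths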
559 
560 class GRU(RNNBase):
561  r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
562 
563 
564  For each element in the input sequence, each layer computes the following
565  function:
566 
567  .. math::
568  \begin{array}{ll}
569  r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
570  z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
571  n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
572  h_t = (1 - z_t) * n_t + z_t * h_{(t-1)}
573  \end{array}
574 
575  where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input
576  at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer
577  at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`,
578  :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively.
579  :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.
580 
581  In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer
582  (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
583  dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random
584  variable which is :math:`0` with probability :attr:`dropout`.
585 
586  Args:
587  input_size: The number of expected features in the input `x`
588  hidden_size: The number of features in the hidden state `h`
589  num_layers: Number of recurrent layers. E.g., setting ``num_layers=2``
590  would mean stacking two GRUs together to form a `stacked GRU`,
591  with the second GRU taking in outputs of the first GRU and
592  computing the final results. Default: 1
593  bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
594  Default: ``True``
595  batch_first: If ``True``, then the input and output tensors are provided
596  as (batch, seq, feature). Default: ``False``
597  dropout: If non-zero, introduces a `Dropout` layer on the outputs of each
598  GRU layer except the last layer, with dropout probability equal to
599  :attr:`dropout`. Default: 0
600  bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False``
601 
602  Inputs: input, h_0
603  - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features
604  of the input sequence. The input can also be a packed variable length
605  sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence`
606  for details.
607  - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
608  containing the initial hidden state for each element in the batch.
609  Defaults to zero if not provided. If the RNN is bidirectional,
610  num_directions should be 2, else it should be 1.
611 
612  Outputs: output, h_n
613  - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor
614  containing the output features h_t from the last layer of the GRU,
615  for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
616  given as the input, the output will also be a packed sequence.
617  For the unpacked case, the directions can be separated
618  using ``output.view(seq_len, batch, num_directions, hidden_size)``,
619  with forward and backward being direction `0` and `1` respectively.
620 
621  Similarly, the directions can be separated in the packed case.
622  - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor
623  containing the hidden state for `t = seq_len`
624 
625  Like *output*, the layers can be separated using
626  ``h_n.view(num_layers, num_directions, batch, hidden_size)``.
627 
628  Shape:
629  - Input1: :math:`(L, N, H_{in})` tensor containing input features where
630  :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length.
631  - Input2: :math:`(S, N, H_{out})` tensor
632  containing the initial hidden state for each element in the batch.
633  :math:`H_{out}=\text{hidden\_size}`
634  Defaults to zero if not provided, where :math:`S=\text{num\_layers} * \text{num\_directions}`.
635  If the RNN is bidirectional, num_directions should be 2, else it should be 1.
636  - Output1: :math:`(L, N, H_{all})` where :math:`H_{all}=\text{num\_directions} * \text{hidden\_size}`
637  - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state
638  for each element in the batch
639 
640  Attributes:
641  weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer
642  (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`.
643  Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)`
644  weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer
645  (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)`
646  bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer
647  (b_ir|b_iz|b_in), of shape `(3*hidden_size)`
648  bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer
649  (b_hr|b_hz|b_hn), of shape `(3*hidden_size)`
650 
651  .. note::
652  All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
653  where :math:`k = \frac{1}{\text{hidden\_size}}`
654 
655  .. include:: cudnn_persistent_rnn.rst
656 
657  Examples::
658 
659  >>> rnn = nn.GRU(10, 20, 2)
660  >>> input = torch.randn(5, 3, 10)
661  >>> h0 = torch.randn(2, 3, 20)
662  >>> output, hn = rnn(input, h0)
663  """
664 
665  def __init__(self, *args, **kwargs):
666  super(GRU, self).__init__('GRU', *args, **kwargs)
667 
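# Editor's sketch (not part of the original source): separating the forward
# and backward directions of a bidirectional GRU's output with the view
# described in the docstring above. The helper name is hypothetical and
# exists only for illustration.
def _example_gru_bidirectional():
    import torch.nn as nn
    gru = nn.GRU(10, 20, bidirectional=True)
    x = torch.randn(5, 3, 10)                     # (seq_len, batch, input_size)
    output, h_n = gru(x)                          # output: (5, 3, 2 * 20)
    directions = output.view(5, 3, 2, 20)         # (seq_len, batch, num_directions, hidden_size)
    forward_out = directions[..., 0, :]           # direction 0 = forward
    backward_out = directions[..., 1, :]          # direction 1 = backward
    assert h_n.shape == (2, 3, 20)                # (num_layers * num_directions, batch, hidden_size)
    return forward_out, backward_out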
668 
669 class RNNCellBase(Module):
670  __constants__ = ['input_size', 'hidden_size', 'bias']
671 
672  def __init__(self, input_size, hidden_size, bias, num_chunks):
673  super(RNNCellBase, self).__init__()
674  self.input_size = input_size
675  self.hidden_size = hidden_size
676  self.bias = bias
677  self.weight_ih = Parameter(torch.Tensor(num_chunks * hidden_size, input_size))
678  self.weight_hh = Parameter(torch.Tensor(num_chunks * hidden_size, hidden_size))
679  if bias:
680  self.bias_ih = Parameter(torch.Tensor(num_chunks * hidden_size))
681  self.bias_hh = Parameter(torch.Tensor(num_chunks * hidden_size))
682  else:
683  self.register_parameter('bias_ih', None)
684  self.register_parameter('bias_hh', None)
685  self.reset_parameters()
686 
687  def extra_repr(self):
688  s = '{input_size}, {hidden_size}'
689  if 'bias' in self.__dict__ and self.bias is not True:
690  s += ', bias={bias}'
691  if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
692  s += ', nonlinearity={nonlinearity}'
693  return s.format(**self.__dict__)
694 
695  @weak_script_method
696  def check_forward_input(self, input):
697  if input.size(1) != self.input_size:
698  raise RuntimeError(
699  "input has inconsistent input_size: got {}, expected {}".format(
700  input.size(1), self.input_size))
701 
702  @weak_script_method
703  def check_forward_hidden(self, input, hx, hidden_label=''):
704  # type: (Tensor, Tensor, str) -> None
705  if input.size(0) != hx.size(0):
706  raise RuntimeError(
707  "Input batch size {} doesn't match hidden{} batch size {}".format(
708  input.size(0), hidden_label, hx.size(0)))
709 
710  if hx.size(1) != self.hidden_size:
711  raise RuntimeError(
712  "hidden{} has inconsistent hidden_size: got {}, expected {}".format(
713  hidden_label, hx.size(1), self.hidden_size))
714 
715  def reset_parameters(self):
716  stdv = 1.0 / math.sqrt(self.hidden_size)
717  for weight in self.parameters():
718  init.uniform_(weight, -stdv, stdv)
719 
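# Editor's sketch (not part of the original source): reset_parameters() above
# draws every weight and bias from U(-sqrt(k), sqrt(k)) with k = 1/hidden_size,
# which can be checked against the parameter values. The helper name is
# hypothetical and exists only for illustration.
def _example_cell_init_range():
    import torch.nn as nn
    cell = nn.GRUCell(10, 20)
    bound = 1.0 / math.sqrt(cell.hidden_size)
    assert all(p.abs().max().item() <= bound for p in cell.parameters())
    return bound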
720 
721 @weak_module
722 class RNNCell(RNNCellBase):
723  r"""An Elman RNN cell with tanh or ReLU non-linearity.
724 
725  .. math::
726 
727  h' = \tanh(W_{ih} x + b_{ih} + W_{hh} h + b_{hh})
728 
729  If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh.
730 
731  Args:
732  input_size: The number of expected features in the input `x`
733  hidden_size: The number of features in the hidden state `h`
734  bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`.
735  Default: ``True``
736  nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'``
737 
738  Inputs: input, hidden
739  - **input** of shape `(batch, input_size)`: tensor containing input features
740  - **hidden** of shape `(batch, hidden_size)`: tensor containing the initial hidden
741  state for each element in the batch.
742  Defaults to zero if not provided.
743 
744  Outputs: h'
745  - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state
746  for each element in the batch
747 
748  Shape:
749  - Input1: :math:`(N, H_{in})` tensor containing input features where
750  :math:`H_{in}` = `input_size`
751  - Input2: :math:`(N, H_{out})` tensor containing the initial hidden
752  state for each element in the batch where :math:`H_{out}` = `hidden_size`
753  Defaults to zero if not provided.
754  - Output: :math:`(N, H_{out})` tensor containing the next hidden state
755  for each element in the batch
756 
757  Attributes:
758  weight_ih: the learnable input-hidden weights, of shape
759  `(hidden_size, input_size)`
760  weight_hh: the learnable hidden-hidden weights, of shape
761  `(hidden_size, hidden_size)`
762  bias_ih: the learnable input-hidden bias, of shape `(hidden_size)`
763  bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)`
764 
765  .. note::
766  All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
767  where :math:`k = \frac{1}{\text{hidden\_size}}`
768 
769  Examples::
770 
771  >>> rnn = nn.RNNCell(10, 20)
772  >>> input = torch.randn(6, 3, 10)
773  >>> hx = torch.randn(3, 20)
774  >>> output = []
775  >>> for i in range(6):
776  hx = rnn(input[i], hx)
777  output.append(hx)
778  """
779  __constants__ = ['input_size', 'hidden_size', 'bias', 'nonlinearity']
780 
781  def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"):
782  super(RNNCell, self).__init__(input_size, hidden_size, bias, num_chunks=1)
783  self.nonlinearity = nonlinearity
784 
785  @weak_script_method
786  def forward(self, input, hx=None):
787  # type: (Tensor, Optional[Tensor]) -> Tensor
788  self.check_forward_input(input)
789  if hx is None:
790  hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
791  self.check_forward_hidden(input, hx, '')
792  if self.nonlinearity == "tanh":
793  ret = _VF.rnn_tanh_cell(
794  input, hx,
795  self.weight_ih, self.weight_hh,
796  self.bias_ih, self.bias_hh,
797  )
798  elif self.nonlinearity == "relu":
799  ret = _VF.rnn_relu_cell(
800  input, hx,
801  self.weight_ih, self.weight_hh,
802  self.bias_ih, self.bias_hh,
803  )
804  else:
805  ret = input # TODO: remove when jit supports exception flow
806  raise RuntimeError(
807  "Unknown nonlinearity: {}".format(self.nonlinearity))
808  return ret
809 
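# Editor's sketch (not part of the original source): the tanh RNNCell above
# computes h' = tanh(x @ W_ih.T + b_ih + h @ W_hh.T + b_hh), which can be
# verified directly against the module's own parameters. The helper name is
# hypothetical and exists only for illustration.
def _example_rnncell_matches_formula():
    import torch.nn as nn
    cell = nn.RNNCell(10, 20)             # nonlinearity='tanh' by default
    x, h = torch.randn(3, 10), torch.randn(3, 20)
    h_next = cell(x, h)
    manual = torch.tanh(x @ cell.weight_ih.t() + cell.bias_ih
                        + h @ cell.weight_hh.t() + cell.bias_hh)
    assert torch.allclose(h_next, manual, atol=1e-5)
    return h_next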
810 
811 @weak_module
812 class LSTMCell(RNNCellBase):
813  r"""A long short-term memory (LSTM) cell.
814 
815  .. math::
816 
817  \begin{array}{ll}
818  i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
819  f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
820  g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\
821  o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
822  c' = f * c + i * g \\
823  h' = o * \tanh(c') \\
824  \end{array}
825 
826  where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.
827 
828  Args:
829  input_size: The number of expected features in the input `x`
830  hidden_size: The number of features in the hidden state `h`
831  bias: If ``False``, then the layer does not use bias weights `b_ih` and
832  `b_hh`. Default: ``True``
833 
834  Inputs: input, (h_0, c_0)
835  - **input** of shape `(batch, input_size)`: tensor containing input features
836  - **h_0** of shape `(batch, hidden_size)`: tensor containing the initial hidden
837  state for each element in the batch.
838  - **c_0** of shape `(batch, hidden_size)`: tensor containing the initial cell state
839  for each element in the batch.
840 
841  If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero.
842 
843  Outputs: (h_1, c_1)
844  - **h_1** of shape `(batch, hidden_size)`: tensor containing the next hidden state
845  for each element in the batch
846  - **c_1** of shape `(batch, hidden_size)`: tensor containing the next cell state
847  for each element in the batch
848 
849  Attributes:
850  weight_ih: the learnable input-hidden weights, of shape
851  `(4*hidden_size, input_size)`
852  weight_hh: the learnable hidden-hidden weights, of shape
853  `(4*hidden_size, hidden_size)`
854  bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)`
855  bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)`
856 
857  .. note::
858  All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
859  where :math:`k = \frac{1}{\text{hidden\_size}}`
860 
861  Examples::
862 
863  >>> rnn = nn.LSTMCell(10, 20)
864  >>> input = torch.randn(6, 3, 10)
865  >>> hx = torch.randn(3, 20)
866  >>> cx = torch.randn(3, 20)
867  >>> output = []
868  >>> for i in range(6):
869  hx, cx = rnn(input[i], (hx, cx))
870  output.append(hx)
871  """
872 
873  def __init__(self, input_size, hidden_size, bias=True):
874  super(LSTMCell, self).__init__(input_size, hidden_size, bias, num_chunks=4)
875 
876  @weak_script_method
877  def forward(self, input, hx=None):
878  # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor]
879  self.check_forward_input(input)
880  if hx is None:
881  zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
882  hx = (zeros, zeros)
883  self.check_forward_hidden(input, hx[0], '[0]')
884  self.check_forward_hidden(input, hx[1], '[1]')
885  return _VF.lstm_cell(
886  input, hx,
887  self.weight_ih, self.weight_hh,
888  self.bias_ih, self.bias_hh,
889  )
890 
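# Editor's sketch (not part of the original source): the LSTMCell above packs
# its four gates into one (4*hidden_size, ...) projection in the order input,
# forget, cell, output (i, f, g, o), matching the W_ii|W_if|W_ig|W_io layout
# documented in the Attributes section. The helper name is hypothetical and
# exists only for illustration.
def _example_lstmcell_matches_formula():
    import torch.nn as nn
    cell = nn.LSTMCell(10, 20)
    x = torch.randn(3, 10)
    h, c = torch.randn(3, 20), torch.randn(3, 20)
    h_next, c_next = cell(x, (h, c))
    gates = (x @ cell.weight_ih.t() + cell.bias_ih
             + h @ cell.weight_hh.t() + cell.bias_hh)
    i, f, g, o = gates.chunk(4, dim=1)    # gate order: input, forget, cell, output
    c_manual = torch.sigmoid(f) * c + torch.sigmoid(i) * torch.tanh(g)
    h_manual = torch.sigmoid(o) * torch.tanh(c_manual)
    assert torch.allclose(c_next, c_manual, atol=1e-5)
    assert torch.allclose(h_next, h_manual, atol=1e-5)
    return h_next, c_next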
891 
892 @weak_module
893 class GRUCell(RNNCellBase):
894  r"""A gated recurrent unit (GRU) cell
895 
896  .. math::
897 
898  \begin{array}{ll}
899  r = \sigma(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\
900  z = \sigma(W_{iz} x + b_{iz} + W_{hz} h + b_{hz}) \\
901  n = \tanh(W_{in} x + b_{in} + r * (W_{hn} h + b_{hn})) \\
902  h' = (1 - z) * n + z * h
903  \end{array}
904 
905  where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product.
906 
907  Args:
908  input_size: The number of expected features in the input `x`
909  hidden_size: The number of features in the hidden state `h`
910  bias: If ``False``, then the layer does not use bias weights `b_ih` and
911  `b_hh`. Default: ``True``
912 
913  Inputs: input, hidden
914  - **input** of shape `(batch, input_size)`: tensor containing input features
915  - **hidden** of shape `(batch, hidden_size)`: tensor containing the initial hidden
916  state for each element in the batch.
917  Defaults to zero if not provided.
918 
919  Outputs: h'
920  - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state
921  for each element in the batch
922 
923  Shape:
924  - Input1: :math:`(N, H_{in})` tensor containing input features where
925  :math:`H_{in}` = `input_size`
926  - Input2: :math:`(N, H_{out})` tensor containing the initial hidden
927  state for each element in the batch where :math:`H_{out}` = `hidden_size`
928  Defaults to zero if not provided.
929  - Output: :math:`(N, H_{out})` tensor containing the next hidden state
930  for each element in the batch
931 
932  Attributes:
933  weight_ih: the learnable input-hidden weights, of shape
934  `(3*hidden_size, input_size)`
935  weight_hh: the learnable hidden-hidden weights, of shape
936  `(3*hidden_size, hidden_size)`
937  bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)`
938  bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)`
939 
940  .. note::
941  All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`
942  where :math:`k = \frac{1}{\text{hidden\_size}}`
943 
944  Examples::
945 
946  >>> rnn = nn.GRUCell(10, 20)
947  >>> input = torch.randn(6, 3, 10)
948  >>> hx = torch.randn(3, 20)
949  >>> output = []
950  >>> for i in range(6):
951  hx = rnn(input[i], hx)
952  output.append(hx)
953  """
954 
955  def __init__(self, input_size, hidden_size, bias=True):
956  super(GRUCell, self).__init__(input_size, hidden_size, bias, num_chunks=3)
957 
958  @weak_script_method
959  def forward(self, input, hx=None):
960  # type: (Tensor, Optional[Tensor]) -> Tensor
961  self.check_forward_input(input)
962  if hx is None:
963  hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
964  self.check_forward_hidden(input, hx, '')
965  return _VF.gru_cell(
966  input, hx,
967  self.weight_ih, self.weight_hh,
968  self.bias_ih, self.bias_hh,
969  )
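# Editor's sketch (not part of the original source): the GRUCell above packs
# its three gates in the order reset, update, new (r, z, n), matching the
# W_ir|W_iz|W_in layout documented in the Attributes section; note that the
# reset gate scales only the hidden-side contribution of the new gate. The
# helper name is hypothetical and exists only for illustration.
def _example_grucell_matches_formula():
    import torch.nn as nn
    cell = nn.GRUCell(10, 20)
    x, h = torch.randn(3, 10), torch.randn(3, 20)
    h_next = cell(x, h)
    gi = x @ cell.weight_ih.t() + cell.bias_ih    # input-side gate pre-activations
    gh = h @ cell.weight_hh.t() + cell.bias_hh    # hidden-side gate pre-activations
    i_r, i_z, i_n = gi.chunk(3, dim=1)
    h_r, h_z, h_n = gh.chunk(3, dim=1)
    r = torch.sigmoid(i_r + h_r)
    z = torch.sigmoid(i_z + h_z)
    n = torch.tanh(i_n + r * h_n)
    manual = (1 - z) * n + z * h
    assert torch.allclose(h_next, manual, atol=1e-5)
    return h_next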