import math
import numbers
import warnings

import torch

from .module import Module
from ..parameter import Parameter
from ..utils.rnn import PackedSequence, get_packed_sequence
from .. import init
from torch import _VF
from ..._jit_internal import weak_module, weak_script_method, weak_script, \
    _parameter_list

_rnn_impls = {
    'GRU': _VF.gru,
    'RNN_TANH': _VF.rnn_tanh,
    'RNN_RELU': _VF.rnn_relu,
}


def apply_permutation(tensor, permutation, dim=1):
    return tensor.index_select(dim, permutation)
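
# Illustrative sketch (not from the original source): apply_permutation just
# reorders the batch dimension of a hidden state via Tensor.index_select. With a
# hypothetical hidden state of shape (num_layers, batch, hidden_size):
#
#     >>> h = torch.randn(2, 3, 5)
#     >>> perm = torch.tensor([2, 0, 1])
#     >>> apply_permutation(h, perm).shape   # batch entries reordered, same shape
#     torch.Size([2, 3, 5])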


class RNNBase(Module):
    __constants__ = ['mode', 'input_size', 'hidden_size', 'num_layers', 'bias',
                     'batch_first', 'dropout', 'bidirectional', '_flat_parameters']

    def __init__(self, mode, input_size, hidden_size,
                 num_layers=1, bias=True, batch_first=False,
                 dropout=0., bidirectional=False):
        super(RNNBase, self).__init__()
        self.mode = mode
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = dropout
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1

        if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \
                isinstance(dropout, bool):
            raise ValueError("dropout should be a number in range [0, 1] "
                             "representing the probability of an element being "
                             "zeroed")
        if dropout > 0 and num_layers == 1:
            warnings.warn("dropout option adds dropout after all but last "
                          "recurrent layer, so non-zero dropout expects "
                          "num_layers greater than 1, but got dropout={} and "
                          "num_layers={}".format(dropout, num_layers))

        if mode == 'LSTM':
            gate_size = 4 * hidden_size
        elif mode == 'GRU':
            gate_size = 3 * hidden_size
        elif mode == 'RNN_TANH':
            gate_size = hidden_size
        elif mode == 'RNN_RELU':
            gate_size = hidden_size
        else:
            raise ValueError("Unrecognized RNN mode: " + mode)

        self._all_weights = []
        for layer in range(num_layers):
            for direction in range(num_directions):
                layer_input_size = input_size if layer == 0 else hidden_size * num_directions

                w_ih = Parameter(torch.Tensor(gate_size, layer_input_size))
                w_hh = Parameter(torch.Tensor(gate_size, hidden_size))
                b_ih = Parameter(torch.Tensor(gate_size))
                b_hh = Parameter(torch.Tensor(gate_size))
                layer_params = (w_ih, w_hh, b_ih, b_hh)

                suffix = '_reverse' if direction == 1 else ''
                param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}']
                if bias:
                    param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}']
                param_names = [x.format(layer, suffix) for x in param_names]

                for name, param in zip(param_names, layer_params):
                    setattr(self, name, param)
                self._all_weights.append(param_names)

        self.flatten_parameters()
        self.reset_parameters()
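
    # Illustrative note (assumption, not in the original source): for a hypothetical
    # bidirectional RNN with num_layers=2 and bias=True, the loop above registers one
    # (w_ih, w_hh, b_ih, b_hh) group per layer and per direction, under names such as
    #
    #     weight_ih_l0, weight_hh_l0, bias_ih_l0, bias_hh_l0,
    #     weight_ih_l0_reverse, ..., weight_ih_l1, ..., bias_hh_l1_reverse
    #
    # with the '_reverse' suffix marking the backward direction.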
95 """Resets parameter data pointer so that they can use faster code paths. 97 Right now, this works only if the module is on the GPU and cuDNN is enabled. 98 Otherwise, it's a no-op. 100 any_param = next(self.parameters()).data
109 unique_data_ptrs = set(p.data_ptr()
for p
in all_weights)
110 if len(unique_data_ptrs) != len(all_weights):
118 with torch.no_grad():
121 torch._cudnn_rnn_flatten_weight(
122 all_weights, (4
if self.
bias else 2),

    def _apply(self, fn):
        ret = super(RNNBase, self)._apply(fn)
        self.flatten_parameters()
        return ret

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            init.uniform_(weight, -stdv, stdv)
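
    # Worked example (sketch, not in the original source): with hidden_size=20 the
    # bound above is stdv = 1 / sqrt(20) ≈ 0.2236, so every weight and bias is drawn
    # from U(-0.2236, 0.2236), matching the U(-sqrt(k), sqrt(k)) note in the class
    # docstrings where k = 1 / hidden_size.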

    def get_flat_weights(self):
        return self._flat_weights

    def check_input(self, input, batch_sizes):
        expected_input_dim = 2 if batch_sizes is not None else 3
        if input.dim() != expected_input_dim:
            raise RuntimeError(
                'input must have {} dimensions, got {}'.format(
                    expected_input_dim, input.dim()))
        if self.input_size != input.size(-1):
            raise RuntimeError(
                'input.size(-1) must be equal to input_size. Expected {}, got {}'.format(
                    self.input_size, input.size(-1)))

    def get_expected_hidden_size(self, input, batch_sizes):
        if batch_sizes is not None:
            mini_batch = batch_sizes[0]
            mini_batch = int(mini_batch)
        else:
            mini_batch = input.size(0) if self.batch_first else input.size(1)
        num_directions = 2 if self.bidirectional else 1
        expected_hidden_size = (self.num_layers * num_directions,
                                mini_batch, self.hidden_size)
        return expected_hidden_size
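
    # Illustrative sketch (assumption, not in the original source): for a hypothetical
    # bidirectional, 2-layer module with hidden_size=20 and a padded input of shape
    # (seq_len=5, batch=3, input_size=10), get_expected_hidden_size returns
    #
    #     (2 * 2, 3, 20)   # (num_layers * num_directions, mini_batch, hidden_size)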

    def check_hidden_size(self, hx, expected_hidden_size, msg='Expected hidden size {}, got {}'):
        if hx.size() != expected_hidden_size:
            raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))

    def check_forward_args(self, input, hidden, batch_sizes):
        self.check_input(input, batch_sizes)
        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)
        self.check_hidden_size(hidden, expected_hidden_size)

    def permute_hidden(self, hx, permutation):
        if permutation is None:
            return hx
        return apply_permutation(hx, permutation)

    def forward(self, input, hx=None):
        is_packed = isinstance(input, PackedSequence)
        if is_packed:
            input, batch_sizes, sorted_indices, unsorted_indices = input
            max_batch_size = batch_sizes[0]
            max_batch_size = int(max_batch_size)
        else:
            batch_sizes = None
            max_batch_size = input.size(0) if self.batch_first else input.size(1)
            sorted_indices = None
            unsorted_indices = None

        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            hx = torch.zeros(self.num_layers * num_directions,
                             max_batch_size, self.hidden_size,
                             dtype=input.dtype, device=input.device)
        else:
            # Each batch of the hidden state should match the input sequence that
            # the user believes is being passed in.
            hx = self.permute_hidden(hx, sorted_indices)

        self.check_forward_args(input, hx, batch_sizes)
        _impl = _rnn_impls[self.mode]
        if batch_sizes is None:
            result = _impl(input, hx, self.get_flat_weights(), self.bias, self.num_layers,
                           self.dropout, self.training, self.bidirectional, self.batch_first)
        else:
            result = _impl(input, batch_sizes, hx, self.get_flat_weights(), self.bias,
                           self.num_layers, self.dropout, self.training, self.bidirectional)
        output = result[0]
        hidden = result[1]

        if is_packed:
            output = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices)
        return output, self.permute_hidden(hidden, unsorted_indices)

    def extra_repr(self):
        s = '{input_size}, {hidden_size}'
        if self.num_layers != 1:
            s += ', num_layers={num_layers}'
        if self.bias is not True:
            s += ', bias={bias}'
        if self.batch_first is not False:
            s += ', batch_first={batch_first}'
        if self.dropout != 0:
            s += ', dropout={dropout}'
        if self.bidirectional is not False:
            s += ', bidirectional={bidirectional}'
        return s.format(**self.__dict__)

    def __setstate__(self, d):
        super(RNNBase, self).__setstate__(d)
        if 'all_weights' in d:
            self._all_weights = d['all_weights']
        if isinstance(self._all_weights[0][0], str):
            return
        # Rebuild the per-layer parameter name lists for checkpoints saved in the
        # old format, which stored the parameters themselves rather than their names.
        num_layers = self.num_layers
        num_directions = 2 if self.bidirectional else 1
        self._all_weights = []
        for layer in range(num_layers):
            for direction in range(num_directions):
                suffix = '_reverse' if direction == 1 else ''
                weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}',
                           'bias_ih_l{}{}', 'bias_hh_l{}{}']
                weights = [x.format(layer, suffix) for x in weights]
                if self.bias:
                    self._all_weights += [weights]
                else:
                    self._all_weights += [weights[:2]]

    @property
    def _flat_weights(self):
        return [p for layerparams in self.all_weights for p in layerparams]

    @property
    def all_weights(self):
        return [[getattr(self, weight) for weight in weights] for weights in self._all_weights]
263 r"""Applies a multi-layer Elman RNN with :math:`tanh` or :math:`ReLU` non-linearity to an 267 For each element in the input sequence, each layer computes the following 271 h_t = \text{tanh}(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh}) 273 where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is 274 the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the 275 previous layer at time `t-1` or the initial hidden state at time `0`. 276 If :attr:`nonlinearity` is ``'relu'``, then `ReLU` is used instead of `tanh`. 279 input_size: The number of expected features in the input `x` 280 hidden_size: The number of features in the hidden state `h` 281 num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` 282 would mean stacking two RNNs together to form a `stacked RNN`, 283 with the second RNN taking in outputs of the first RNN and 284 computing the final results. Default: 1 285 nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'`` 286 bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. 288 batch_first: If ``True``, then the input and output tensors are provided 289 as `(batch, seq, feature)`. Default: ``False`` 290 dropout: If non-zero, introduces a `Dropout` layer on the outputs of each 291 RNN layer except the last layer, with dropout probability equal to 292 :attr:`dropout`. Default: 0 293 bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False`` 296 - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features 297 of the input sequence. The input can also be a packed variable length 298 sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` 299 or :func:`torch.nn.utils.rnn.pack_sequence` 301 - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor 302 containing the initial hidden state for each element in the batch. 303 Defaults to zero if not provided. If the RNN is bidirectional, 304 num_directions should be 2, else it should be 1. 307 - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor 308 containing the output features (`h_t`) from the last layer of the RNN, 309 for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has 310 been given as the input, the output will also be a packed sequence. 312 For the unpacked case, the directions can be separated 313 using ``output.view(seq_len, batch, num_directions, hidden_size)``, 314 with forward and backward being direction `0` and `1` respectively. 315 Similarly, the directions can be separated in the packed case. 316 - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor 317 containing the hidden state for `t = seq_len`. 319 Like *output*, the layers can be separated using 320 ``h_n.view(num_layers, num_directions, batch, hidden_size)``. 323 - Input1: :math:`(L, N, H_{in})` tensor containing input features where 324 :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length. 325 - Input2: :math:`(S, N, H_{out})` tensor 326 containing the initial hidden state for each element in the batch. 327 :math:`H_{out}=\text{hidden\_size}` 328 Defaults to zero if not provided. where :math:`S=\text{num\_layers} * \text{num\_directions}` 329 If the RNN is bidirectional, num_directions should be 2, else it should be 1. 
330 - Output1: :math:`(L, N, H_{all})` where :math:`H_all=\text{num\_directions} * \text{hidden\_size}` 331 - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state 332 for each element in the batch 335 weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, 336 of shape `(hidden_size, input_size)` for `k = 0`. Otherwise, the shape is 337 `(hidden_size, num_directions * hidden_size)` 338 weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, 339 of shape `(hidden_size, hidden_size)` 340 bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, 341 of shape `(hidden_size)` 342 bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, 343 of shape `(hidden_size)` 346 All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` 347 where :math:`k = \frac{1}{\text{hidden\_size}}` 349 .. include:: cudnn_persistent_rnn.rst 353 >>> rnn = nn.RNN(10, 20, 2) 354 >>> input = torch.randn(5, 3, 10) 355 >>> h0 = torch.randn(2, 3, 20) 356 >>> output, hn = rnn(input, h0) 359 def __init__(self, *args, **kwargs):
        if 'nonlinearity' in kwargs:
            if kwargs['nonlinearity'] == 'tanh':
                mode = 'RNN_TANH'
            elif kwargs['nonlinearity'] == 'relu':
                mode = 'RNN_RELU'
            else:
                raise ValueError("Unknown nonlinearity '{}'".format(
                    kwargs['nonlinearity']))
            del kwargs['nonlinearity']
        else:
            mode = 'RNN_TANH'

        super(RNN, self).__init__(mode, *args, **kwargs)
377 r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input 381 For each element in the input sequence, each layer computes the following 386 i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\ 387 f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\ 388 g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\ 389 o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\ 390 c_t = f_t * c_{(t-1)} + i_t * g_t \\ 391 h_t = o_t * \tanh(c_t) \\ 394 where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell 395 state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{(t-1)}` 396 is the hidden state of the layer at time `t-1` or the initial hidden 397 state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`, 398 :math:`o_t` are the input, forget, cell, and output gates, respectively. 399 :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. 401 In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer 402 (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by 403 dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random 404 variable which is :math:`0` with probability :attr:`dropout`. 407 input_size: The number of expected features in the input `x` 408 hidden_size: The number of features in the hidden state `h` 409 num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` 410 would mean stacking two LSTMs together to form a `stacked LSTM`, 411 with the second LSTM taking in outputs of the first LSTM and 412 computing the final results. Default: 1 413 bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. 415 batch_first: If ``True``, then the input and output tensors are provided 416 as (batch, seq, feature). Default: ``False`` 417 dropout: If non-zero, introduces a `Dropout` layer on the outputs of each 418 LSTM layer except the last layer, with dropout probability equal to 419 :attr:`dropout`. Default: 0 420 bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False`` 422 Inputs: input, (h_0, c_0) 423 - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features 424 of the input sequence. 425 The input can also be a packed variable length sequence. 426 See :func:`torch.nn.utils.rnn.pack_padded_sequence` or 427 :func:`torch.nn.utils.rnn.pack_sequence` for details. 428 - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor 429 containing the initial hidden state for each element in the batch. 430 If the LSTM is bidirectional, num_directions should be 2, else it should be 1. 431 - **c_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor 432 containing the initial cell state for each element in the batch. 434 If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero. 437 Outputs: output, (h_n, c_n) 438 - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor 439 containing the output features `(h_t)` from the last layer of the LSTM, 440 for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been 441 given as the input, the output will also be a packed sequence. 443 For the unpacked case, the directions can be separated 444 using ``output.view(seq_len, batch, num_directions, hidden_size)``, 445 with forward and backward being direction `0` and `1` respectively. 446 Similarly, the directions can be separated in the packed case. 
447 - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor 448 containing the hidden state for `t = seq_len`. 450 Like *output*, the layers can be separated using 451 ``h_n.view(num_layers, num_directions, batch, hidden_size)`` and similarly for *c_n*. 452 - **c_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor 453 containing the cell state for `t = seq_len`. 456 weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer 457 `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`. 458 Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)` 459 weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer 460 `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)` 461 bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer 462 `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)` 463 bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer 464 `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)` 467 All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` 468 where :math:`k = \frac{1}{\text{hidden\_size}}` 470 .. include:: cudnn_persistent_rnn.rst 474 >>> rnn = nn.LSTM(10, 20, 2) 475 >>> input = torch.randn(5, 3, 10) 476 >>> h0 = torch.randn(2, 3, 20) 477 >>> c0 = torch.randn(2, 3, 20) 478 >>> output, (hn, cn) = rnn(input, (h0, c0)) 480 __overloads__ = {
'forward': [
'forward_packed',
'forward_tensor']}

    def __init__(self, *args, **kwargs):
        super(LSTM, self).__init__('LSTM', *args, **kwargs)

    def check_forward_args(self, input, hidden, batch_sizes):
        self.check_input(input, batch_sizes)
        expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes)

        self.check_hidden_size(hidden[0], expected_hidden_size,
                               'Expected hidden[0] size {}, got {}')
        self.check_hidden_size(hidden[1], expected_hidden_size,
                               'Expected hidden[1] size {}, got {}')

    def permute_hidden(self, hx, permutation):
        if permutation is None:
            return hx
        return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation)

    def forward_impl(self, input, hx, batch_sizes, max_batch_size, sorted_indices):
        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            zeros = torch.zeros(self.num_layers * num_directions,
                                max_batch_size, self.hidden_size,
                                dtype=input.dtype, device=input.device)
            hx = (zeros, zeros)
        else:
            # Each batch of the hidden state should match the input sequence that
            # the user believes is being passed in.
            hx = self.permute_hidden(hx, sorted_indices)

        self.check_forward_args(input, hx, batch_sizes)
        if batch_sizes is None:
            result = _VF.lstm(input, hx, self.get_flat_weights(), self.bias, self.num_layers,
                              self.dropout, self.training, self.bidirectional, self.batch_first)
        else:
            result = _VF.lstm(input, batch_sizes, hx, self.get_flat_weights(), self.bias,
                              self.num_layers, self.dropout, self.training, self.bidirectional)
        output = result[0]
        hidden = result[1:]

        return output, hidden

    def forward_tensor(self, input, hx=None):
        batch_sizes = None
        max_batch_size = input.size(0) if self.batch_first else input.size(1)
        sorted_indices = None
        unsorted_indices = None

        output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)

        return output, self.permute_hidden(hidden, unsorted_indices)

    def forward_packed(self, input, hx=None):
        input, batch_sizes, sorted_indices, unsorted_indices = input
        max_batch_size = batch_sizes[0]
        max_batch_size = int(max_batch_size)

        output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)

        output = get_packed_sequence(output, batch_sizes, sorted_indices, unsorted_indices)
        return output, self.permute_hidden(hidden, unsorted_indices)

    def forward(self, input, hx=None):
        if isinstance(input, PackedSequence):
            return self.forward_packed(input, hx)
        else:
            return self.forward_tensor(input, hx)
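
    # Usage sketch (assumption, not in the original source): forward dispatches on the
    # input type, so the same module handles padded tensors and packed sequences.
    # A minimal example with hypothetical sizes (nn refers to torch.nn):
    #
    #     >>> lstm = nn.LSTM(10, 20, 2)
    #     >>> padded = torch.randn(5, 3, 10)              # (seq_len, batch, input_size)
    #     >>> lengths = [5, 3, 2]                         # sorted, as pack_padded_sequence expects
    #     >>> packed = nn.utils.rnn.pack_padded_sequence(padded, lengths)
    #     >>> out, (h_n, c_n) = lstm(packed)              # forward_packed path
    #     >>> out, (h_n, c_n) = lstm(padded)              # forward_tensor path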
561 r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence. 564 For each element in the input sequence, each layer computes the following 569 r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ 570 z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ 571 n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ 572 h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} 575 where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input 576 at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer 577 at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`, 578 :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively. 579 :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. 581 In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer 582 (:math:`l >= 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by 583 dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random 584 variable which is :math:`0` with probability :attr:`dropout`. 587 input_size: The number of expected features in the input `x` 588 hidden_size: The number of features in the hidden state `h` 589 num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` 590 would mean stacking two GRUs together to form a `stacked GRU`, 591 with the second GRU taking in outputs of the first GRU and 592 computing the final results. Default: 1 593 bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. 595 batch_first: If ``True``, then the input and output tensors are provided 596 as (batch, seq, feature). Default: ``False`` 597 dropout: If non-zero, introduces a `Dropout` layer on the outputs of each 598 GRU layer except the last layer, with dropout probability equal to 599 :attr:`dropout`. Default: 0 600 bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False`` 603 - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features 604 of the input sequence. The input can also be a packed variable length 605 sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` 607 - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor 608 containing the initial hidden state for each element in the batch. 609 Defaults to zero if not provided. If the RNN is bidirectional, 610 num_directions should be 2, else it should be 1. 613 - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor 614 containing the output features h_t from the last layer of the GRU, 615 for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been 616 given as the input, the output will also be a packed sequence. 617 For the unpacked case, the directions can be separated 618 using ``output.view(seq_len, batch, num_directions, hidden_size)``, 619 with forward and backward being direction `0` and `1` respectively. 621 Similarly, the directions can be separated in the packed case. 622 - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor 623 containing the hidden state for `t = seq_len` 625 Like *output*, the layers can be separated using 626 ``h_n.view(num_layers, num_directions, batch, hidden_size)``. 629 - Input1: :math:`(L, N, H_{in})` tensor containing input features where 630 :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length. 
631 - Input2: :math:`(S, N, H_{out})` tensor 632 containing the initial hidden state for each element in the batch. 633 :math:`H_{out}=\text{hidden\_size}` 634 Defaults to zero if not provided. where :math:`S=\text{num\_layers} * \text{num\_directions}` 635 If the RNN is bidirectional, num_directions should be 2, else it should be 1. 636 - Output1: :math:`(L, N, H_{all})` where :math:`H_all=\text{num\_directions} * \text{hidden\_size}` 637 - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state 638 for each element in the batch 641 weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer 642 (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`. 643 Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)` 644 weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer 645 (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)` 646 bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer 647 (b_ir|b_iz|b_in), of shape `(3*hidden_size)` 648 bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer 649 (b_hr|b_hz|b_hn), of shape `(3*hidden_size)` 652 All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` 653 where :math:`k = \frac{1}{\text{hidden\_size}}` 655 .. include:: cudnn_persistent_rnn.rst 659 >>> rnn = nn.GRU(10, 20, 2) 660 >>> input = torch.randn(5, 3, 10) 661 >>> h0 = torch.randn(2, 3, 20) 662 >>> output, hn = rnn(input, h0) 665 def __init__(self, *args, **kwargs):
        super(GRU, self).__init__('GRU', *args, **kwargs)


class RNNCellBase(Module):
    __constants__ = ['input_size', 'hidden_size', 'bias']

    def __init__(self, input_size, hidden_size, bias, num_chunks):
        super(RNNCellBase, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.weight_ih = Parameter(torch.Tensor(num_chunks * hidden_size, input_size))
        self.weight_hh = Parameter(torch.Tensor(num_chunks * hidden_size, hidden_size))
        if bias:
            self.bias_ih = Parameter(torch.Tensor(num_chunks * hidden_size))
            self.bias_hh = Parameter(torch.Tensor(num_chunks * hidden_size))
        else:
            self.register_parameter('bias_ih', None)
            self.register_parameter('bias_hh', None)
        self.reset_parameters()
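
    # Shape note (sketch, not in the original source): num_chunks is the number of
    # gates stacked along dim 0 of each weight matrix. For example, LSTMCell passes
    # num_chunks=4, so with input_size=10 and hidden_size=20 the registered parameters
    # have shapes weight_ih: (80, 10), weight_hh: (80, 20), bias_ih/bias_hh: (80,).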

    def extra_repr(self):
        s = '{input_size}, {hidden_size}'
        if 'bias' in self.__dict__ and self.bias is not True:
            s += ', bias={bias}'
        if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
            s += ', nonlinearity={nonlinearity}'
        return s.format(**self.__dict__)

    def check_forward_input(self, input):
        if input.size(1) != self.input_size:
            raise RuntimeError(
                "input has inconsistent input_size: got {}, expected {}".format(
                    input.size(1), self.input_size))

    def check_forward_hidden(self, input, hx, hidden_label=''):
        if input.size(0) != hx.size(0):
            raise RuntimeError(
                "Input batch size {} doesn't match hidden{} batch size {}".format(
                    input.size(0), hidden_label, hx.size(0)))

        if hx.size(1) != self.hidden_size:
            raise RuntimeError(
                "hidden{} has inconsistent hidden_size: got {}, expected {}".format(
                    hidden_label, hx.size(1), self.hidden_size))

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            init.uniform_(weight, -stdv, stdv)
723 r"""An Elman RNN cell with tanh or ReLU non-linearity. 727 h' = \tanh(W_{ih} x + b_{ih} + W_{hh} h + b_{hh}) 729 If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh. 732 input_size: The number of expected features in the input `x` 733 hidden_size: The number of features in the hidden state `h` 734 bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. 736 nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'`` 738 Inputs: input, hidden 739 - **input** of shape `(batch, input_size)`: tensor containing input features 740 - **hidden** of shape `(batch, hidden_size)`: tensor containing the initial hidden 741 state for each element in the batch. 742 Defaults to zero if not provided. 745 - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state 746 for each element in the batch 749 - Input1: :math:`(N, H_{in})` tensor containing input features where 750 :math:`H_{in}` = `input_size` 751 - Input2: :math:`(N, H_{out})` tensor containing the initial hidden 752 state for each element in the batch where :math:`H_{out}` = `hidden_size` 753 Defaults to zero if not provided. 754 - Output: :math:`(N, H_{out})` tensor containing the next hidden state 755 for each element in the batch 758 weight_ih: the learnable input-hidden weights, of shape 759 `(hidden_size, input_size)` 760 weight_hh: the learnable hidden-hidden weights, of shape 761 `(hidden_size, hidden_size)` 762 bias_ih: the learnable input-hidden bias, of shape `(hidden_size)` 763 bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)` 766 All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` 767 where :math:`k = \frac{1}{\text{hidden\_size}}` 771 >>> rnn = nn.RNNCell(10, 20) 772 >>> input = torch.randn(6, 3, 10) 773 >>> hx = torch.randn(3, 20) 775 >>> for i in range(6): 776 hx = rnn(input[i], hx) 779 __constants__ = [
'input_size',
'hidden_size',
'bias',
'nonlinearity']

    def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"):
        super(RNNCell, self).__init__(input_size, hidden_size, bias, num_chunks=1)
        self.nonlinearity = nonlinearity

    def forward(self, input, hx=None):
        self.check_forward_input(input)
        if hx is None:
            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
        self.check_forward_hidden(input, hx, '')
        if self.nonlinearity == "tanh":
            ret = _VF.rnn_tanh_cell(
                input, hx,
                self.weight_ih, self.weight_hh,
                self.bias_ih, self.bias_hh,
            )
        elif self.nonlinearity == "relu":
            ret = _VF.rnn_relu_cell(
                input, hx,
                self.weight_ih, self.weight_hh,
                self.bias_ih, self.bias_hh,
            )
        else:
            raise RuntimeError(
                "Unknown nonlinearity: {}".format(self.nonlinearity))
        return ret
813 r"""A long short-term memory (LSTM) cell. 818 i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ 819 f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ 820 g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\ 821 o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ 822 c' = f * c + i * g \\ 823 h' = o * \tanh(c') \\ 826 where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. 829 input_size: The number of expected features in the input `x` 830 hidden_size: The number of features in the hidden state `h` 831 bias: If ``False``, then the layer does not use bias weights `b_ih` and 832 `b_hh`. Default: ``True`` 834 Inputs: input, (h_0, c_0) 835 - **input** of shape `(batch, input_size)`: tensor containing input features 836 - **h_0** of shape `(batch, hidden_size)`: tensor containing the initial hidden 837 state for each element in the batch. 838 - **c_0** of shape `(batch, hidden_size)`: tensor containing the initial cell state 839 for each element in the batch. 841 If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero. 844 - **h_1** of shape `(batch, hidden_size)`: tensor containing the next hidden state 845 for each element in the batch 846 - **c_1** of shape `(batch, hidden_size)`: tensor containing the next cell state 847 for each element in the batch 850 weight_ih: the learnable input-hidden weights, of shape 851 `(4*hidden_size, input_size)` 852 weight_hh: the learnable hidden-hidden weights, of shape 853 `(4*hidden_size, hidden_size)` 854 bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)` 855 bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)` 858 All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` 859 where :math:`k = \frac{1}{\text{hidden\_size}}` 863 >>> rnn = nn.LSTMCell(10, 20) 864 >>> input = torch.randn(6, 3, 10) 865 >>> hx = torch.randn(3, 20) 866 >>> cx = torch.randn(3, 20) 868 >>> for i in range(6): 869 hx, cx = rnn(input[i], (hx, cx)) 873 def __init__(self, input_size, hidden_size, bias=True):
874 super(LSTMCell, self).__init__(input_size, hidden_size, bias, num_chunks=4)

    def forward(self, input, hx=None):
        self.check_forward_input(input)
        if hx is None:
            zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
            hx = (zeros, zeros)
        self.check_forward_hidden(input, hx[0], '[0]')
        self.check_forward_hidden(input, hx[1], '[1]')
        return _VF.lstm_cell(
            input, hx,
            self.weight_ih, self.weight_hh,
            self.bias_ih, self.bias_hh,
        )
894 r"""A gated recurrent unit (GRU) cell 899 r = \sigma(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\ 900 z = \sigma(W_{iz} x + b_{iz} + W_{hz} h + b_{hz}) \\ 901 n = \tanh(W_{in} x + b_{in} + r * (W_{hn} h + b_{hn})) \\ 902 h' = (1 - z) * n + z * h 905 where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. 908 input_size: The number of expected features in the input `x` 909 hidden_size: The number of features in the hidden state `h` 910 bias: If ``False``, then the layer does not use bias weights `b_ih` and 911 `b_hh`. Default: ``True`` 913 Inputs: input, hidden 914 - **input** of shape `(batch, input_size)`: tensor containing input features 915 - **hidden** of shape `(batch, hidden_size)`: tensor containing the initial hidden 916 state for each element in the batch. 917 Defaults to zero if not provided. 920 - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state 921 for each element in the batch 924 - Input1: :math:`(N, H_{in})` tensor containing input features where 925 :math:`H_{in}` = `input_size` 926 - Input2: :math:`(N, H_{out})` tensor containing the initial hidden 927 state for each element in the batch where :math:`H_{out}` = `hidden_size` 928 Defaults to zero if not provided. 929 - Output: :math:`(N, H_{out})` tensor containing the next hidden state 930 for each element in the batch 933 weight_ih: the learnable input-hidden weights, of shape 934 `(3*hidden_size, input_size)` 935 weight_hh: the learnable hidden-hidden weights, of shape 936 `(3*hidden_size, hidden_size)` 937 bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)` 938 bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)` 941 All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` 942 where :math:`k = \frac{1}{\text{hidden\_size}}` 946 >>> rnn = nn.GRUCell(10, 20) 947 >>> input = torch.randn(6, 3, 10) 948 >>> hx = torch.randn(3, 20) 950 >>> for i in range(6): 951 hx = rnn(input[i], hx) 955 def __init__(self, input_size, hidden_size, bias=True):
956 super(GRUCell, self).__init__(input_size, hidden_size, bias, num_chunks=3)

    def forward(self, input, hx=None):
        self.check_forward_input(input)
        if hx is None:
            hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device)
        self.check_forward_hidden(input, hx, '')
        return _VF.gru_cell(
            input, hx,
            self.weight_ih, self.weight_hh,
            self.bias_ih, self.bias_hh,
        )