sparse.py
import torch
from torch.nn.parameter import Parameter

from .module import Module
from .. import functional as F
from .. import init
from torch._jit_internal import weak_module, weak_script, weak_script_method


@weak_module
class Embedding(Module):
    r"""A simple lookup table that stores embeddings of a fixed dictionary and size.

    This module is often used to store word embeddings and retrieve them using indices.
    The input to the module is a list of indices, and the output is the corresponding
    word embeddings.

    Args:
        num_embeddings (int): size of the dictionary of embeddings
        embedding_dim (int): the size of each embedding vector
        padding_idx (int, optional): If given, pads the output with the embedding vector at :attr:`padding_idx`
                                     (initialized to zeros) whenever it encounters the index.
        max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
                                    is renormalized to have norm :attr:`max_norm`.
        norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``.
        scale_grad_by_freq (boolean, optional): If given, this will scale gradients by the inverse of frequency of
                                                the words in the mini-batch. Default ``False``.
        sparse (bool, optional): If ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor.
                                 See Notes for more details regarding sparse gradients.

    Attributes:
        weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
                         initialized from :math:`\mathcal{N}(0, 1)`

    Shape:
        - Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract
        - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`

    .. note::
        Keep in mind that only a limited number of optimizers support
        sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`),
        :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)

    .. note::
        With :attr:`padding_idx` set, the embedding vector at
        :attr:`padding_idx` is initialized to all zeros. However, note that this
        vector can be modified afterwards, e.g., using a customized
        initialization method, and thus changing the vector used to pad the
        output. The gradient for this vector from :class:`~torch.nn.Embedding`
        is always zero.

    Examples::

        >>> # an Embedding module containing 10 tensors of size 3
        >>> embedding = nn.Embedding(10, 3)
        >>> # a batch of 2 samples of 4 indices each
        >>> input = torch.LongTensor([[1,2,4,5],[4,3,2,9]])
        >>> embedding(input)
        tensor([[[-0.0251, -1.6902,  0.7172],
                 [-0.6431,  0.0748,  0.6969],
                 [ 1.4970,  1.3448, -0.9685],
                 [-0.3677, -2.7265, -0.1685]],

                [[ 1.4970,  1.3448, -0.9685],
                 [ 0.4362, -0.4004,  0.9400],
                 [-0.6431,  0.0748,  0.6969],
                 [ 0.9124, -2.3616,  1.1151]]])


        >>> # example with padding_idx
        >>> embedding = nn.Embedding(10, 3, padding_idx=0)
        >>> input = torch.LongTensor([[0,2,0,5]])
        >>> embedding(input)
        tensor([[[ 0.0000,  0.0000,  0.0000],
                 [ 0.1535, -2.0309,  0.9315],
                 [ 0.0000,  0.0000,  0.0000],
                 [-0.1655,  0.9897,  0.0635]]])
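
        >>> # a sketch: the vector at padding_idx can be overwritten after
        >>> # construction, as described in the note above; the in-place
        >>> # assignment below produces no output
        >>> padding_idx = 0
        >>> embedding = nn.Embedding(10, 3, padding_idx=padding_idx)
        >>> with torch.no_grad():
        ...     embedding.weight[padding_idx] = torch.ones(3)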
    """
    __constants__ = ['num_embeddings', 'embedding_dim', 'padding_idx', 'max_norm',
                     'norm_type', 'scale_grad_by_freq', 'sparse', '_weight']

    def __init__(self, num_embeddings, embedding_dim, padding_idx=None,
                 max_norm=None, norm_type=2., scale_grad_by_freq=False,
                 sparse=False, _weight=None):
        super(Embedding, self).__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        if padding_idx is not None:
            if padding_idx > 0:
                assert padding_idx < self.num_embeddings, 'Padding_idx must be within num_embeddings'
            elif padding_idx < 0:
                assert padding_idx >= -self.num_embeddings, 'Padding_idx must be within num_embeddings'
                padding_idx = self.num_embeddings + padding_idx
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        if _weight is None:
            self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim))
            self.reset_parameters()
        else:
            assert list(_weight.shape) == [num_embeddings, embedding_dim], \
                'Shape of weight does not match num_embeddings and embedding_dim'
            self.weight = Parameter(_weight)
        self.sparse = sparse

    def reset_parameters(self):
        init.normal_(self.weight)
        if self.padding_idx is not None:
            with torch.no_grad():
                self.weight[self.padding_idx].fill_(0)

    @weak_script_method
    def forward(self, input):
        return F.embedding(
            input, self.weight, self.padding_idx, self.max_norm,
            self.norm_type, self.scale_grad_by_freq, self.sparse)

    def extra_repr(self):
        s = '{num_embeddings}, {embedding_dim}'
        if self.padding_idx is not None:
            s += ', padding_idx={padding_idx}'
        if self.max_norm is not None:
            s += ', max_norm={max_norm}'
        if self.norm_type != 2:
            s += ', norm_type={norm_type}'
        if self.scale_grad_by_freq is not False:
            s += ', scale_grad_by_freq={scale_grad_by_freq}'
        if self.sparse is not False:
            s += ', sparse=True'
        return s.format(**self.__dict__)

    @classmethod
    def from_pretrained(cls, embeddings, freeze=True, padding_idx=None,
                        max_norm=None, norm_type=2., scale_grad_by_freq=False,
                        sparse=False):
        r"""Creates Embedding instance from given 2-dimensional FloatTensor.

        Args:
            embeddings (Tensor): FloatTensor containing weights for the Embedding.
                First dimension is being passed to Embedding as ``num_embeddings``, second as ``embedding_dim``.
            freeze (boolean, optional): If ``True``, the tensor does not get updated in the learning process.
                Equivalent to ``embedding.weight.requires_grad = False``. Default: ``True``
            padding_idx (int, optional): See module initialization documentation.
            max_norm (float, optional): See module initialization documentation.
            norm_type (float, optional): See module initialization documentation. Default ``2``.
            scale_grad_by_freq (boolean, optional): See module initialization documentation. Default ``False``.
            sparse (bool, optional): See module initialization documentation.

        Examples::

            >>> # FloatTensor containing pretrained weights
            >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
            >>> embedding = nn.Embedding.from_pretrained(weight)
            >>> # Get embeddings for index 1
            >>> input = torch.LongTensor([1])
            >>> embedding(input)
            tensor([[ 4.0000,  5.1000,  6.3000]])
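
            >>> # a sketch: freeze defaults to True, so the loaded weight is a
            >>> # non-trainable parameter
            >>> embedding.weight.requires_grad
            False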
        """
        assert embeddings.dim() == 2, \
            'Embeddings parameter is expected to be 2-dimensional'
        rows, cols = embeddings.shape
        embedding = cls(
            num_embeddings=rows,
            embedding_dim=cols,
            _weight=embeddings,
            padding_idx=padding_idx,
            max_norm=max_norm,
            norm_type=norm_type,
            scale_grad_by_freq=scale_grad_by_freq,
            sparse=sparse)
        embedding.weight.requires_grad = not freeze
        return embedding


@weak_module
class EmbeddingBag(Module):
    r"""Computes sums or means of 'bags' of embeddings, without instantiating the
    intermediate embeddings.

    For bags of constant length, this class

        * with ``mode="sum"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.sum(dim=0)``,
        * with ``mode="mean"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.mean(dim=0)``,
        * with ``mode="max"`` is equivalent to :class:`~torch.nn.Embedding` followed by ``torch.max(dim=0)``.

    However, :class:`~torch.nn.EmbeddingBag` is much more time and memory efficient than using a chain of these
    operations.

    Args:
        num_embeddings (int): size of the dictionary of embeddings
        embedding_dim (int): the size of each embedding vector
        max_norm (float, optional): If given, each embedding vector with norm larger than :attr:`max_norm`
                                    is renormalized to have norm :attr:`max_norm`.
        norm_type (float, optional): The p of the p-norm to compute for the :attr:`max_norm` option. Default ``2``.
        scale_grad_by_freq (boolean, optional): if given, this will scale gradients by the inverse of frequency of
                                                the words in the mini-batch. Default ``False``.
                                                Note: this option is not supported when ``mode="max"``.
        mode (string, optional): ``"sum"``, ``"mean"`` or ``"max"``. Specifies the way to reduce the bag.
                                 Default: ``"mean"``
        sparse (bool, optional): if ``True``, gradient w.r.t. :attr:`weight` matrix will be a sparse tensor. See
                                 Notes for more details regarding sparse gradients. Note: this option is not
                                 supported when ``mode="max"``.

    Attributes:
        weight (Tensor): the learnable weights of the module of shape `(num_embeddings, embedding_dim)`
                         initialized from :math:`\mathcal{N}(0, 1)`.

    Inputs: :attr:`input` (LongTensor) and :attr:`offsets` (LongTensor, optional)

        - If :attr:`input` is 2D of shape `(B, N)`,

          it will be treated as ``B`` bags (sequences) each of fixed length ``N``, and
          this will return ``B`` values aggregated in a way depending on the :attr:`mode`.
          :attr:`offsets` is ignored and required to be ``None`` in this case.

        - If :attr:`input` is 1D of shape `(N)`,

          it will be treated as a concatenation of multiple bags (sequences).
          :attr:`offsets` is required to be a 1D tensor containing the
          starting index positions of each bag in :attr:`input`. Therefore,
          for :attr:`offsets` of shape `(B)`, :attr:`input` will be viewed as
          having ``B`` bags. Empty bags (i.e., having 0-length) will have
          returned vectors filled by zeros.

    Output shape: `(B, embedding_dim)`

    Examples::

        >>> # an Embedding module containing 10 tensors of size 3
        >>> embedding_sum = nn.EmbeddingBag(10, 3, mode='sum')
        >>> # a batch of 2 samples of 4 indices each
        >>> input = torch.LongTensor([1,2,4,5,4,3,2,9])
        >>> offsets = torch.LongTensor([0,4])
        >>> embedding_sum(input, offsets)
        tensor([[-0.8861, -5.4350, -0.0523],
                [ 1.1306, -2.5798, -1.0044]])
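
        >>> # a sketch of the 2D-input form described above: offsets must be
        >>> # omitted and each row is treated as one bag; only the output shape
        >>> # is checked here, since the values depend on the random weights
        >>> input_2d = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
        >>> embedding_sum(input_2d).size()
        torch.Size([2, 3])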
    """
    __constants__ = ['num_embeddings', 'embedding_dim', 'max_norm', 'norm_type',
                     'scale_grad_by_freq', 'mode', 'sparse', '_weight']

    def __init__(self, num_embeddings, embedding_dim,
                 max_norm=None, norm_type=2., scale_grad_by_freq=False,
                 mode='mean', sparse=False, _weight=None):
        super(EmbeddingBag, self).__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        if _weight is None:
            self.weight = Parameter(torch.Tensor(num_embeddings, embedding_dim))
            self.reset_parameters()
        else:
            assert list(_weight.shape) == [num_embeddings, embedding_dim], \
                'Shape of weight does not match num_embeddings and embedding_dim'
            self.weight = Parameter(_weight)
        self.mode = mode
        self.sparse = sparse

    def reset_parameters(self):
        init.normal_(self.weight)

    @weak_script_method
    def forward(self, input, offsets=None):
        # type: (Tensor, Optional[Tensor]) -> Tensor
        return F.embedding_bag(input, self.weight, offsets,
                               self.max_norm, self.norm_type,
                               self.scale_grad_by_freq, self.mode, self.sparse)

    def extra_repr(self):
        s = '{num_embeddings}, {embedding_dim}'
        if self.max_norm is not None:
            s += ', max_norm={max_norm}'
        if self.norm_type != 2:
            s += ', norm_type={norm_type}'
        if self.scale_grad_by_freq is not False:
            s += ', scale_grad_by_freq={scale_grad_by_freq}'
        s += ', mode={mode}'
        return s.format(**self.__dict__)

    @classmethod
    def from_pretrained(cls, embeddings, freeze=True, max_norm=None,
                        norm_type=2., scale_grad_by_freq=False,
                        mode='mean', sparse=False):
        r"""Creates EmbeddingBag instance from given 2-dimensional FloatTensor.

        Args:
            embeddings (Tensor): FloatTensor containing weights for the EmbeddingBag.
                First dimension is being passed to EmbeddingBag as ``num_embeddings``, second as ``embedding_dim``.
            freeze (boolean, optional): If ``True``, the tensor does not get updated in the learning process.
                Equivalent to ``embeddingbag.weight.requires_grad = False``. Default: ``True``
            max_norm (float, optional): See module initialization documentation. Default: ``None``
            norm_type (float, optional): See module initialization documentation. Default ``2``.
            scale_grad_by_freq (boolean, optional): See module initialization documentation. Default ``False``.
            mode (string, optional): See module initialization documentation. Default: ``"mean"``
            sparse (bool, optional): See module initialization documentation. Default: ``False``.

        Examples::

            >>> # FloatTensor containing pretrained weights
            >>> weight = torch.FloatTensor([[1, 2.3, 3], [4, 5.1, 6.3]])
            >>> embeddingbag = nn.EmbeddingBag.from_pretrained(weight)
            >>> # get the mean of the embeddings for the single bag of indices [1, 0]
            >>> input = torch.LongTensor([[1, 0]])
            >>> embeddingbag(input)
            tensor([[ 2.5000,  3.7000,  4.6500]])
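
            >>> # a sketch: pass freeze=False to load the same weights as a
            >>> # trainable parameter ("trainable" is just an illustrative name)
            >>> trainable = nn.EmbeddingBag.from_pretrained(weight, freeze=False)
            >>> trainable.weight.requires_grad
            True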
        """
        assert embeddings.dim() == 2, \
            'Embeddings parameter is expected to be 2-dimensional'
        rows, cols = embeddings.shape
        embeddingbag = cls(
            num_embeddings=rows,
            embedding_dim=cols,
            _weight=embeddings,
            max_norm=max_norm,
            norm_type=norm_type,
            scale_grad_by_freq=scale_grad_by_freq,
            mode=mode,
            sparse=sparse)
        embeddingbag.weight.requires_grad = not freeze
        return embeddingbag