Caffe2 - Python API
A deep learning, cross platform ML framework
loss.py
1 import warnings
2 
3 import torch
4 from .module import Module
5 from .container import Sequential
6 from .activation import LogSoftmax
7 from .. import functional as F
8 from .. import _reduction as _Reduction
9 from ..._jit_internal import weak_module, weak_script_method
10 
11 
12 class _Loss(Module):
13  def __init__(self, size_average=None, reduce=None, reduction='mean'):
14  super(_Loss, self).__init__()
15  if size_average is not None or reduce is not None:
16  self.reduction = _Reduction.legacy_get_string(size_average, reduce)
17  else:
18  self.reduction = reduction
19 
20 
21 class _WeightedLoss(_Loss):
22  def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean'):
23  super(_WeightedLoss, self).__init__(size_average, reduce, reduction)
24  self.register_buffer('weight', weight)
25 
26 
27 @weak_module
28 class L1Loss(_Loss):
29  r"""Creates a criterion that measures the mean absolute error (MAE) between each element in
30  the input :math:`x` and target :math:`y`.
31 
32  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
33 
34  .. math::
35  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
36  l_n = \left| x_n - y_n \right|,
37 
38  where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
39  (default ``'mean'``), then:
40 
41  .. math::
42  \ell(x, y) =
43  \begin{cases}
44  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
45  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
46  \end{cases}
47 
48  :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
49  of :math:`n` elements each.
50 
51  The sum operation still operates over all the elements, and divides by :math:`n`.
52 
53  The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
54 
55  Args:
56  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
57  the losses are averaged over each loss element in the batch. Note that for
58  some losses, there are multiple elements per sample. If the field :attr:`size_average`
59  is set to ``False``, the losses are instead summed for each minibatch. Ignored
60  when reduce is ``False``. Default: ``True``
61  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
62  losses are averaged or summed over observations for each minibatch depending
63  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
64  batch element instead and ignores :attr:`size_average`. Default: ``True``
65  reduction (string, optional): Specifies the reduction to apply to the output:
66  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
67  ``'mean'``: the sum of the output will be divided by the number of
68  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
69  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
70  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
71 
72  Shape:
73  - Input: :math:`(N, *)` where :math:`*` means, any number of additional
74  dimensions
75  - Target: :math:`(N, *)`, same shape as the input
76  - Output: scalar. If :attr:`reduction` is ``'none'``, then
77  :math:`(N, *)`, same shape as the input
78 
79  Examples::
80 
81  >>> loss = nn.L1Loss()
82  >>> input = torch.randn(3, 5, requires_grad=True)
83  >>> target = torch.randn(3, 5)
84  >>> output = loss(input, target)
85  >>> output.backward()
86  """
87  __constants__ = ['reduction']
88 
89  def __init__(self, size_average=None, reduce=None, reduction='mean'):
90  super(L1Loss, self).__init__(size_average, reduce, reduction)
91 
92  @weak_script_method
93  def forward(self, input, target):
94  return F.l1_loss(input, target, reduction=self.reduction)
95 
96 
97 @weak_module
98 class NLLLoss(_WeightedLoss):
99  r"""The negative log likelihood loss. It is useful to train a classification
100  problem with `C` classes.
101 
102  If provided, the optional argument :attr:`weight` should be a 1D Tensor assigning
103  weight to each of the classes. This is particularly useful when you have an
104  unbalanced training set.
105 
106  The `input` given through a forward call is expected to contain
107  log-probabilities of each class. `input` has to be a Tensor of size either
108  :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)`
109  with :math:`K \geq 1` for the `K`-dimensional case (described later).
110 
111  Obtaining log-probabilities in a neural network is easily achieved by
112  adding a `LogSoftmax` layer in the last layer of your network.
113  You may use `CrossEntropyLoss` instead, if you prefer not to add an extra
114  layer.
115 
116  The `target` that this loss expects should be a class index in the range :math:`[0, C-1]`
117  where `C = number of classes`; if `ignore_index` is specified, this loss also accepts
118  this class index (this index may not necessarily be in the class range).
119 
120  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
121 
122  .. math::
123  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
124  l_n = - w_{y_n} x_{n,y_n}, \quad
125  w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\},
126 
127  where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
128  (default ``'mean'``), then
129 
130  .. math::
131  \ell(x, y) = \begin{cases}
132  \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, &
133  \text{if reduction} = \text{'mean';}\\
134  \sum_{n=1}^N l_n, &
135  \text{if reduction} = \text{'sum'.}
136  \end{cases}
137 
138  Can also be used for higher dimension inputs, such as 2D images, by providing
139  an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`,
140  where :math:`K` is the number of dimensions, and a target of appropriate shape
141  (see below). In the case of images, it computes NLL loss per-pixel.
142 
143  Args:
144  weight (Tensor, optional): a manual rescaling weight given to each
145  class. If given, it has to be a Tensor of size `C`. Otherwise, it is
146  treated as if having all ones.
147  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
148  the losses are averaged over each loss element in the batch. Note that for
149  some losses, there are multiple elements per sample. If the field :attr:`size_average`
150  is set to ``False``, the losses are instead summed for each minibatch. Ignored
151  when reduce is ``False``. Default: ``True``
152  ignore_index (int, optional): Specifies a target value that is ignored
153  and does not contribute to the input gradient. When
154  :attr:`size_average` is ``True``, the loss is averaged over
155  non-ignored targets.
156  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
157  losses are averaged or summed over observations for each minibatch depending
158  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
159  batch element instead and ignores :attr:`size_average`. Default: ``True``
160  reduction (string, optional): Specifies the reduction to apply to the output:
161  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
162  ``'mean'``: the sum of the output will be divided by the number of
163  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
164  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
165  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
166 
167  Shape:
168  - Input: :math:`(N, C)` where `C = number of classes`, or
169  :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
170  in the case of `K`-dimensional loss.
171  - Target: :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or
172  :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
173  K-dimensional loss.
174  - Output: scalar.
175  If :attr:`reduction` is ``'none'``, then the same size as the target: :math:`(N)`, or
176  :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case
177  of K-dimensional loss.
178 
179  Examples::
180 
181  >>> m = nn.LogSoftmax(dim=1)
182  >>> loss = nn.NLLLoss()
183  >>> # input is of size N x C = 3 x 5
184  >>> input = torch.randn(3, 5, requires_grad=True)
185  >>> # each element in target has to have 0 <= value < C
186  >>> target = torch.tensor([1, 0, 4])
187  >>> output = loss(m(input), target)
188  >>> output.backward()
189  >>>
190  >>>
191  >>> # 2D loss example (used, for example, with image inputs)
192  >>> N, C = 5, 4
193  >>> loss = nn.NLLLoss()
194  >>> # input is of size N x C x height x width
195  >>> data = torch.randn(N, 16, 10, 10)
196  >>> conv = nn.Conv2d(16, C, (3, 3))
197  >>> m = nn.LogSoftmax(dim=1)
198  >>> # each element in target has to have 0 <= value < C
199  >>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
200  >>> output = loss(m(conv(data)), target)
201  >>> output.backward()
202  """
203  __constants__ = ['ignore_index', 'weight', 'reduction']
204 
205  def __init__(self, weight=None, size_average=None, ignore_index=-100,
206  reduce=None, reduction='mean'):
207  super(NLLLoss, self).__init__(weight, size_average, reduce, reduction)
208  self.ignore_index = ignore_index
209 
210  @weak_script_method
211  def forward(self, input, target):
212  return F.nll_loss(input, target, weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction)
213 
214 
215 @weak_module
216 class NLLLoss2d(NLLLoss):
217  def __init__(self, weight=None, size_average=None, ignore_index=-100,
218  reduce=None, reduction='mean'):
219  warnings.warn("NLLLoss2d has been deprecated. "
220  "Please use NLLLoss instead as a drop-in replacement and see "
221  "https://pytorch.org/docs/master/nn.html#torch.nn.NLLLoss for more details.")
222  super(NLLLoss2d, self).__init__(weight, size_average, ignore_index, reduce, reduction)
223 
224 
225 @weak_module
226 class PoissonNLLLoss(_Loss):
227  r"""Negative log likelihood loss with Poisson distribution of target.
228 
229  The loss can be described as:
230 
231  .. math::
232  \text{target} \sim \mathrm{Poisson}(\text{input})
233 
234  \text{loss}(\text{input}, \text{target}) = \text{input} - \text{target} * \log(\text{input})
235  + \log(\text{target!})
236 
237  The last term can be omitted or approximated with Stirling's formula. The
238  approximation is used for target values greater than 1. For targets less than or
239  equal to 1, zeros are added to the loss.
240 
241  Args:
242  log_input (bool, optional): if ``True`` the loss is computed as
243  :math:`\exp(\text{input}) - \text{target}*\text{input}`, if ``False`` the loss is
244  :math:`\text{input} - \text{target}*\log(\text{input}+\text{eps})`.
245  full (bool, optional): whether to compute the full loss, i.e. to add the
246  Stirling approximation term
247 
248  .. math::
249  \text{target}*\log(\text{target}) - \text{target} + 0.5 * \log(2\pi\text{target}).
250  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
251  the losses are averaged over each loss element in the batch. Note that for
252  some losses, there are multiple elements per sample. If the field :attr:`size_average`
253  is set to ``False``, the losses are instead summed for each minibatch. Ignored
254  when reduce is ``False``. Default: ``True``
255  eps (float, optional): Small value to avoid evaluation of :math:`\log(0)` when
256  :attr:`log_input = False`. Default: 1e-8
257  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
258  losses are averaged or summed over observations for each minibatch depending
259  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
260  batch element instead and ignores :attr:`size_average`. Default: ``True``
261  reduction (string, optional): Specifies the reduction to apply to the output:
262  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
263  ``'mean'``: the sum of the output will be divided by the number of
264  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
265  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
266  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
267 
268  Examples::
269 
270  >>> loss = nn.PoissonNLLLoss()
271  >>> log_input = torch.randn(5, 2, requires_grad=True)
272  >>> target = torch.randn(5, 2)
273  >>> output = loss(log_input, target)
274  >>> output.backward()
275 
276  Shape:
277  - Input: :math:`(N, *)` where :math:`*` means, any number of additional
278  dimensions
279  - Target: :math:`(N, *)`, same shape as the input
280  - Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`,
281  the same shape as the input
282  """
283  __constants__ = ['log_input', 'full', 'eps', 'reduction']
284 
285  def __init__(self, log_input=True, full=False, size_average=None,
286  eps=1e-8, reduce=None, reduction='mean'):
287  super(PoissonNLLLoss, self).__init__(size_average, reduce, reduction)
288  self.log_input = log_input
289  self.full = full
290  self.eps = eps
291 
292  @weak_script_method
293  def forward(self, log_input, target):
294  return F.poisson_nll_loss(log_input, target, log_input=self.log_input, full=self.full,
295  eps=self.eps, reduction=self.reduction)
296 
297 
298 @weak_module
299 class KLDivLoss(_Loss):
300  r"""The `Kullback-Leibler divergence`_ Loss
301 
302  KL divergence is a useful distance measure for continuous distributions
303  and is often useful when performing direct regression over the space of
304  (discretely sampled) continuous output distributions.
305 
306  As with :class:`~torch.nn.NLLLoss`, the `input` given is expected to contain
307  *log-probabilities* and is not restricted to a 2D Tensor.
308  The targets are given as *probabilities* (i.e. without taking the logarithm).
309 
310  This criterion expects a `target` `Tensor` of the same size as the
311  `input` `Tensor`.
312 
313  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
314 
315  .. math::
316  l(x,y) = L = \{ l_1,\dots,l_N \}, \quad
317  l_n = y_n \cdot \left( \log y_n - x_n \right)
318 
319  where the index :math:`N` spans all dimensions of ``input`` and :math:`L` has the same
320  shape as ``input``. If :attr:`reduction` is not ``'none'`` (default ``'mean'``), then:
321 
322  .. math::
323  \ell(x, y) = \begin{cases}
324  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';} \\
325  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
326  \end{cases}
327 
328  In default :attr:`reduction` mode ``'mean'``, the losses are averaged for each minibatch over observations
329  **as well as** over dimensions. ``'batchmean'`` mode gives the correct KL divergence where losses
330  are averaged over batch dimension only. ``'mean'`` mode's behavior will be changed to the same as
331  ``'batchmean'`` in the next major release.
332 
333  .. _Kullback-Leibler divergence:
334  https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
335 
336  Args:
337  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
338  the losses are averaged over each loss element in the batch. Note that for
339  some losses, there are multiple elements per sample. If the field :attr:`size_average`
340  is set to ``False``, the losses are instead summed for each minibatch. Ignored
341  when reduce is ``False``. Default: ``True``
342  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
343  losses are averaged or summed over observations for each minibatch depending
344  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
345  batch element instead and ignores :attr:`size_average`. Default: ``True``
346  reduction (string, optional): Specifies the reduction to apply to the output:
347  ``'none'`` | ``'batchmean'`` | ``'sum'`` | ``'mean'``.
348  ``'none'``: no reduction will be applied.
349  ``'batchmean'``: the sum of the output will be divided by batchsize.
350  ``'sum'``: the output will be summed.
351  ``'mean'``: the output will be divided by the number of elements in the output.
352  Default: ``'mean'``
353 
354  .. note::
355  :attr:`size_average` and :attr:`reduce` are in the process of being deprecated,
356  and in the meantime, specifying either of those two args will override :attr:`reduction`.
357 
358  .. note::
359  :attr:`reduction` = ``'mean'`` doesn't return the true KL divergence value; please use
360  :attr:`reduction` = ``'batchmean'``, which aligns with the mathematical definition of KL divergence.
361  In the next major release, ``'mean'`` will be changed to be the same as ``'batchmean'``.
362 
363  Shape:
364  - Input: :math:`(N, *)` where :math:`*` means, any number of additional
365  dimensions
366  - Target: :math:`(N, *)`, same shape as the input
367  - Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`,
368  the same shape as the input
369 
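 Example (a minimal usage sketch; the shapes and values below are illustrative)::

 >>> loss = nn.KLDivLoss(reduction='batchmean')
 >>> # input holds log-probabilities, target holds probabilities
 >>> log_probs = nn.LogSoftmax(dim=1)(torch.randn(3, 5, requires_grad=True))
 >>> target = nn.Softmax(dim=1)(torch.randn(3, 5))
 >>> output = loss(log_probs, target)
 >>> output.backward()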
370  """
371  __constants__ = ['reduction']
372 
373  def __init__(self, size_average=None, reduce=None, reduction='mean'):
374  super(KLDivLoss, self).__init__(size_average, reduce, reduction)
375 
376  @weak_script_method
377  def forward(self, input, target):
378  return F.kl_div(input, target, reduction=self.reduction)
379 
380 
381 @weak_module
382 class MSELoss(_Loss):
383  r"""Creates a criterion that measures the mean squared error (squared L2 norm) between
384  each element in the input :math:`x` and target :math:`y`.
385 
386  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
387 
388  .. math::
389  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
390  l_n = \left( x_n - y_n \right)^2,
391 
392  where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
393  (default ``'mean'``), then:
394 
395  .. math::
396  \ell(x, y) =
397  \begin{cases}
398  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
399  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
400  \end{cases}
401 
402  :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
403  of :math:`n` elements each.
404 
405  The sum operation still operates over all the elements, and divides by :math:`n`.
406 
407  The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
408 
409  Args:
410  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
411  the losses are averaged over each loss element in the batch. Note that for
412  some losses, there are multiple elements per sample. If the field :attr:`size_average`
413  is set to ``False``, the losses are instead summed for each minibatch. Ignored
414  when reduce is ``False``. Default: ``True``
415  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
416  losses are averaged or summed over observations for each minibatch depending
417  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
418  batch element instead and ignores :attr:`size_average`. Default: ``True``
419  reduction (string, optional): Specifies the reduction to apply to the output:
420  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
421  ``'mean'``: the sum of the output will be divided by the number of
422  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
423  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
424  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
425 
426  Shape:
427  - Input: :math:`(N, *)` where :math:`*` means, any number of additional
428  dimensions
429  - Target: :math:`(N, *)`, same shape as the input
430 
431  Examples::
432 
433  >>> loss = nn.MSELoss()
434  >>> input = torch.randn(3, 5, requires_grad=True)
435  >>> target = torch.randn(3, 5)
436  >>> output = loss(input, target)
437  >>> output.backward()
438  """
439  __constants__ = ['reduction']
440 
441  def __init__(self, size_average=None, reduce=None, reduction='mean'):
442  super(MSELoss, self).__init__(size_average, reduce, reduction)
443 
444  @weak_script_method
445  def forward(self, input, target):
446  return F.mse_loss(input, target, reduction=self.reduction)
447 
448 
449 @weak_module
450 class BCELoss(_WeightedLoss):
451  r"""Creates a criterion that measures the Binary Cross Entropy
452  between the target and the output:
453 
454  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
455 
456  .. math::
457  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
458  l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right],
459 
460  where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
461  (default ``'mean'``), then
462 
463  .. math::
464  \ell(x, y) = \begin{cases}
465  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
466  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
467  \end{cases}
468 
469  This is used for measuring the error of a reconstruction in, for example,
470  an auto-encoder. Note that the targets :math:`y` should be numbers
471  between 0 and 1.
472 
473  Args:
474  weight (Tensor, optional): a manual rescaling weight given to the loss
475  of each batch element. If given, has to be a Tensor of size `nbatch`.
476  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
477  the losses are averaged over each loss element in the batch. Note that for
478  some losses, there are multiple elements per sample. If the field :attr:`size_average`
479  is set to ``False``, the losses are instead summed for each minibatch. Ignored
480  when reduce is ``False``. Default: ``True``
481  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
482  losses are averaged or summed over observations for each minibatch depending
483  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
484  batch element instead and ignores :attr:`size_average`. Default: ``True``
485  reduction (string, optional): Specifies the reduction to apply to the output:
486  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
487  ``'mean'``: the sum of the output will be divided by the number of
488  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
489  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
490  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
491 
492  Shape:
493  - Input: :math:`(N, *)` where :math:`*` means, any number of additional
494  dimensions
495  - Target: :math:`(N, *)`, same shape as the input
496  - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same
497  shape as input.
498 
499  Examples::
500 
501  >>> m = nn.Sigmoid()
502  >>> loss = nn.BCELoss()
503  >>> input = torch.randn(3, requires_grad=True)
504  >>> target = torch.empty(3).random_(2)
505  >>> output = loss(m(input), target)
506  >>> output.backward()
507  """
508  __constants__ = ['reduction', 'weight']
509 
510  def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean'):
511  super(BCELoss, self).__init__(weight, size_average, reduce, reduction)
512 
513  @weak_script_method
514  def forward(self, input, target):
515  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
516 
517 
518 @weak_module
519 class BCEWithLogitsLoss(_Loss):
520  r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single
521  class. This version is more numerically stable than using a plain `Sigmoid`
522  followed by a `BCELoss` as, by combining the operations into one layer,
523  we take advantage of the log-sum-exp trick for numerical stability.
524 
525  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
526 
527  .. math::
528  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
529  l_n = - w_n \left[ y_n \cdot \log \sigma(x_n)
530  + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],
531 
532  where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
533  (default ``'mean'``), then
534 
535  .. math::
536  \ell(x, y) = \begin{cases}
537  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
538  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
539  \end{cases}
540 
541  This is used for measuring the error of a reconstruction in, for example,
542  an auto-encoder. Note that the targets `t[i]` should be numbers
543  between 0 and 1.
544 
545  It's possible to trade off recall and precision by adding weights to positive examples.
546  In this case the loss can be described as:
547 
548  .. math::
549  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
550  l_n = - w_n \left[ p_n y_n \cdot \log \sigma(x_n)
551  + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],
552 
553  where :math:`p_n` is the weight of the positive class for sample :math:`n` in the batch.
554  :math:`p_n > 1` increases the recall, :math:`p_n < 1` increases the precision.
555 
556  For example, if a dataset contains 100 positive and 300 negative examples of a single class,
557  then `pos_weight` for the class should be equal to :math:`\frac{300}{100}=3`.
558  The loss would act as if the dataset contains :math:`3\times 100=300` positive examples.
559 
560  Args:
561  weight (Tensor, optional): a manual rescaling weight given to the loss
562  of each batch element. If given, has to be a Tensor of size `nbatch`.
563  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
564  the losses are averaged over each loss element in the batch. Note that for
565  some losses, there are multiple elements per sample. If the field :attr:`size_average`
566  is set to ``False``, the losses are instead summed for each minibatch. Ignored
567  when reduce is ``False``. Default: ``True``
568  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
569  losses are averaged or summed over observations for each minibatch depending
570  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
571  batch element instead and ignores :attr:`size_average`. Default: ``True``
572  reduction (string, optional): Specifies the reduction to apply to the output:
573  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
574  ``'mean'``: the sum of the output will be divided by the number of
575  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
576  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
577  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
578  pos_weight (Tensor, optional): a weight of positive examples.
579  Must be a vector with length equal to the number of classes.
580 
581  Shape:
582  - Input: :math:`(N, *)` where :math:`*` means, any number of additional dimensions
583  - Target: :math:`(N, *)`, same shape as the input
584  - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same
585  shape as input.
586 
587  Examples::
588 
589  >>> loss = nn.BCEWithLogitsLoss()
590  >>> input = torch.randn(3, requires_grad=True)
591  >>> target = torch.empty(3).random_(2)
592  >>> output = loss(input, target)
593  >>> output.backward()
594  """
595  __constants__ = ['weight', 'pos_weight', 'reduction']
596 
597  def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean', pos_weight=None):
598  super(BCEWithLogitsLoss, self).__init__(size_average, reduce, reduction)
599  self.register_buffer('weight', weight)
600  self.register_buffer('pos_weight', pos_weight)
601 
602  @weak_script_method
603  def forward(self, input, target):
604  return F.binary_cross_entropy_with_logits(input, target,
605  self.weight,
606  pos_weight=self.pos_weight,
607  reduction=self.reduction)
608 
609 
610 @weak_module
611 class HingeEmbeddingLoss(_Loss):
612  r"""Measures the loss given an input tensor :math:`x` and a label tensor :math:`y`
613  (containing 1 or -1).
614  This is usually used for measuring whether two inputs are similar or
615  dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically
616  used for learning nonlinear embeddings or semi-supervised learning.
617 
618  The loss function for :math:`n`-th sample in the mini-batch is
619 
620  .. math::
621  l_n = \begin{cases}
622  x_n, & \text{if}\; y_n = 1,\\
623  \max \{0, \Delta - x_n\}, & \text{if}\; y_n = -1,
624  \end{cases}
625 
626  and the total loss functions is
627 
628  .. math::
629  \ell(x, y) = \begin{cases}
630  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
631  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
632  \end{cases}
633 
634  where :math:`L = \{l_1,\dots,l_N\}^\top`.
635 
636  Args:
637  margin (float, optional): Has a default value of `1`.
638  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
639  the losses are averaged over each loss element in the batch. Note that for
640  some losses, there are multiple elements per sample. If the field :attr:`size_average`
641  is set to ``False``, the losses are instead summed for each minibatch. Ignored
642  when reduce is ``False``. Default: ``True``
643  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
644  losses are averaged or summed over observations for each minibatch depending
645  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
646  batch element instead and ignores :attr:`size_average`. Default: ``True``
647  reduction (string, optional): Specifies the reduction to apply to the output:
648  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
649  ``'mean'``: the sum of the output will be divided by the number of
650  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
651  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
652  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
653 
654  Shape:
655  - Input: :math:`(*)` where :math:`*` means, any number of dimensions. The sum operation
656  operates over all the elements.
657  - Target: :math:`(*)`, same shape as the input
658  - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input
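
 Example (a minimal usage sketch; the shapes and values below are illustrative)::

 >>> loss = nn.HingeEmbeddingLoss(margin=1.0)
 >>> # input typically holds distances (e.g. pairwise L1 distances)
 >>> input = torch.randn(3, 5, requires_grad=True)
 >>> # target contains 1 or -1
 >>> target = torch.empty(3, 5).random_(2) * 2 - 1
 >>> output = loss(input, target)
 >>> output.backward()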
659  """
660  __constants__ = ['margin', 'reduction']
661 
662  def __init__(self, margin=1.0, size_average=None, reduce=None, reduction='mean'):
663  super(HingeEmbeddingLoss, self).__init__(size_average, reduce, reduction)
664  self.margin = margin
665 
666  @weak_script_method
667  def forward(self, input, target):
668  return F.hinge_embedding_loss(input, target, margin=self.margin, reduction=self.reduction)
669 
670 
671 @weak_module
672 class MultiLabelMarginLoss(_Loss):
673  r"""Creates a criterion that optimizes a multi-class multi-classification
674  hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`)
675  and output :math:`y` (which is a 2D `Tensor` of target class indices).
676  For each sample in the mini-batch:
677 
678  .. math::
679  \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)}
680 
681  where :math:`x \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \
682  :math:`y \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \
683  :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \
684  and :math:`i \neq y[j]` for all :math:`i` and :math:`j`.
685 
686  :math:`y` and :math:`x` must have the same size.
687 
688  The criterion only considers a contiguous block of non-negative targets that
689  starts at the front.
690 
691  This allows for different samples to have variable amounts of target classes.
692 
693  Args:
694  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
695  the losses are averaged over each loss element in the batch. Note that for
696  some losses, there are multiple elements per sample. If the field :attr:`size_average`
697  is set to ``False``, the losses are instead summed for each minibatch. Ignored
698  when reduce is ``False``. Default: ``True``
699  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
700  losses are averaged or summed over observations for each minibatch depending
701  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
702  batch element instead and ignores :attr:`size_average`. Default: ``True``
703  reduction (string, optional): Specifies the reduction to apply to the output:
704  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
705  ``'mean'``: the sum of the output will be divided by the number of
706  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
707  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
708  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
709 
710  Shape:
711  - Input: :math:`(C)` or :math:`(N, C)` where `N` is the batch size and `C`
712  is the number of classes.
713  - Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input.
714  - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
715 
716  Examples::
717 
718  >>> loss = nn.MultiLabelMarginLoss()
719  >>> x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]])
720  >>> # for target y, only consider labels 3 and 0, not after label -1
721  >>> y = torch.LongTensor([[3, 0, -1, 1]])
722  >>> loss(x, y)
723  >>> # 0.25 * ((1-(0.1-0.2)) + (1-(0.1-0.4)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
724  tensor(0.8500)
725 
726  """
727  __constants__ = ['reduction']
728 
729  def __init__(self, size_average=None, reduce=None, reduction='mean'):
730  super(MultiLabelMarginLoss, self).__init__(size_average, reduce, reduction)
731 
732  @weak_script_method
733  def forward(self, input, target):
734  return F.multilabel_margin_loss(input, target, reduction=self.reduction)
735 
736 
737 @weak_module
738 class SmoothL1Loss(_Loss):
739  r"""Creates a criterion that uses a squared term if the absolute
740  element-wise error falls below 1 and an L1 term otherwise.
741  It is less sensitive to outliers than the `MSELoss` and in some cases
742  prevents exploding gradients (e.g. see "Fast R-CNN" paper by Ross Girshick).
743  Also known as the Huber loss:
744 
745  .. math::
746  \text{loss}(x, y) = \frac{1}{n} \sum_{i} z_{i}
747 
748  where :math:`z_{i}` is given by:
749 
750  .. math::
751  z_{i} =
752  \begin{cases}
753  0.5 (x_i - y_i)^2, & \text{if } |x_i - y_i| < 1 \\
754  |x_i - y_i| - 0.5, & \text{otherwise }
755  \end{cases}
756 
757  :math:`x` and :math:`y` are tensors of arbitrary shapes with a total of :math:`n` elements each;
758  the sum operation still operates over all the elements, and divides by :math:`n`.
759 
760  The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.
761 
762  Args:
763  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
764  the losses are averaged over each loss element in the batch. Note that for
765  some losses, there are multiple elements per sample. If the field :attr:`size_average`
766  is set to ``False``, the losses are instead summed for each minibatch. Ignored
767  when reduce is ``False``. Default: ``True``
768  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
769  losses are averaged or summed over observations for each minibatch depending
770  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
771  batch element instead and ignores :attr:`size_average`. Default: ``True``
772  reduction (string, optional): Specifies the reduction to apply to the output:
773  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
774  ``'mean'``: the sum of the output will be divided by the number of
775  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
776  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
777  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
778 
779  Shape:
780  - Input: :math:`(N, *)` where :math:`*` means, any number of additional
781  dimensions
782  - Target: :math:`(N, *)`, same shape as the input
783  - Output: scalar. If :attr:`reduction` is ``'none'``, then
784  :math:`(N, *)`, same shape as the input
785 
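 Example (a minimal usage sketch; the shapes below are illustrative)::

 >>> loss = nn.SmoothL1Loss()
 >>> input = torch.randn(3, 5, requires_grad=True)
 >>> target = torch.randn(3, 5)
 >>> output = loss(input, target)
 >>> output.backward()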
786  """
787  __constants__ = ['reduction']
788 
789  def __init__(self, size_average=None, reduce=None, reduction='mean'):
790  super(SmoothL1Loss, self).__init__(size_average, reduce, reduction)
791 
792  @weak_script_method
793  def forward(self, input, target):
794  return F.smooth_l1_loss(input, target, reduction=self.reduction)
795 
796 
797 @weak_module
798 class SoftMarginLoss(_Loss):
799  r"""Creates a criterion that optimizes a two-class classification
800  logistic loss between input tensor :math:`x` and target tensor :math:`y`
801  (containing 1 or -1).
802 
803  .. math::
804  \text{loss}(x, y) = \sum_i \frac{\log(1 + \exp(-y[i]*x[i]))}{\text{x.nelement}()}
805 
806  Args:
807  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
808  the losses are averaged over each loss element in the batch. Note that for
809  some losses, there are multiple elements per sample. If the field :attr:`size_average`
810  is set to ``False``, the losses are instead summed for each minibatch. Ignored
811  when reduce is ``False``. Default: ``True``
812  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
813  losses are averaged or summed over observations for each minibatch depending
814  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
815  batch element instead and ignores :attr:`size_average`. Default: ``True``
816  reduction (string, optional): Specifies the reduction to apply to the output:
817  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
818  ``'mean'``: the sum of the output will be divided by the number of
819  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
820  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
821  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
822 
823  Shape:
824  - Input: :math:`(*)` where :math:`*` means, any number of additional
825  dimensions
826  - Target: :math:`(*)`, same shape as the input
827  - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input
828 
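 Example (a minimal usage sketch; the shapes and values below are illustrative)::

 >>> loss = nn.SoftMarginLoss()
 >>> input = torch.randn(3, 5, requires_grad=True)
 >>> # target contains 1 or -1
 >>> target = torch.empty(3, 5).random_(2) * 2 - 1
 >>> output = loss(input, target)
 >>> output.backward()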
829  """
830  __constants__ = ['reduction']
831 
832  def __init__(self, size_average=None, reduce=None, reduction='mean'):
833  super(SoftMarginLoss, self).__init__(size_average, reduce, reduction)
834 
835  @weak_script_method
836  def forward(self, input, target):
837  return F.soft_margin_loss(input, target, reduction=self.reduction)
838 
839 
840 @weak_module
841 class CrossEntropyLoss(_WeightedLoss):
842  r"""This criterion combines :func:`nn.LogSoftmax` and :func:`nn.NLLLoss` in one single class.
843 
844  It is useful when training a classification problem with `C` classes.
845  If provided, the optional argument :attr:`weight` should be a 1D `Tensor`
846  assigning weight to each of the classes.
847  This is particularly useful when you have an unbalanced training set.
848 
849  The `input` is expected to contain raw, unnormalized scores for each class.
850 
851  `input` has to be a Tensor of size either :math:`(minibatch, C)` or
852  :math:`(minibatch, C, d_1, d_2, ..., d_K)`
853  with :math:`K \geq 1` for the `K`-dimensional case (described later).
854 
855  This criterion expects a class index in the range :math:`[0, C-1]` as the
856  `target` for each value of a 1D tensor of size `minibatch`; if `ignore_index`
857  is specified, this criterion also accepts this class index (this index may not
858  necessarily be in the class range).
859 
860  The loss can be described as:
861 
862  .. math::
863  \text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right)
864  = -x[class] + \log\left(\sum_j \exp(x[j])\right)
865 
866  or in the case of the :attr:`weight` argument being specified:
867 
868  .. math::
869  \text{loss}(x, class) = weight[class] \left(-x[class] + \log\left(\sum_j \exp(x[j])\right)\right)
870 
871  The losses are averaged across observations for each minibatch.
872 
873  Can also be used for higher dimension inputs, such as 2D images, by providing
874  an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`,
875  where :math:`K` is the number of dimensions, and a target of appropriate shape
876  (see below).
877 
878 
879  Args:
880  weight (Tensor, optional): a manual rescaling weight given to each class.
881  If given, has to be a Tensor of size `C`
882  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
883  the losses are averaged over each loss element in the batch. Note that for
884  some losses, there are multiple elements per sample. If the field :attr:`size_average`
885  is set to ``False``, the losses are instead summed for each minibatch. Ignored
886  when reduce is ``False``. Default: ``True``
887  ignore_index (int, optional): Specifies a target value that is ignored
888  and does not contribute to the input gradient. When :attr:`size_average` is
889  ``True``, the loss is averaged over non-ignored targets.
890  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
891  losses are averaged or summed over observations for each minibatch depending
892  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
893  batch element instead and ignores :attr:`size_average`. Default: ``True``
894  reduction (string, optional): Specifies the reduction to apply to the output:
895  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
896  ``'mean'``: the sum of the output will be divided by the number of
897  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
898  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
899  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
900 
901  Shape:
902  - Input: :math:`(N, C)` where `C = number of classes`, or
903  :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
904  in the case of `K`-dimensional loss.
905  - Target: :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or
906  :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
907  K-dimensional loss.
908  - Output: scalar.
909  If :attr:`reduction` is ``'none'``, then the same size as the target:
910  :math:`(N)`, or
911  :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case
912  of K-dimensional loss.
913 
914  Examples::
915 
916  >>> loss = nn.CrossEntropyLoss()
917  >>> input = torch.randn(3, 5, requires_grad=True)
918  >>> target = torch.empty(3, dtype=torch.long).random_(5)
919  >>> output = loss(input, target)
920  >>> output.backward()
921  """
922  __constants__ = ['weight', 'ignore_index', 'reduction']
923 
924  def __init__(self, weight=None, size_average=None, ignore_index=-100,
925  reduce=None, reduction='mean'):
926  super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
927  self.ignore_index = ignore_index
928 
929  @weak_script_method
930  def forward(self, input, target):
931  return F.cross_entropy(input, target, weight=self.weight,
932  ignore_index=self.ignore_index, reduction=self.reduction)
933 
934 
935 @weak_module
936 class MultiLabelSoftMarginLoss(_WeightedLoss):
937  r"""Creates a criterion that optimizes a multi-label one-versus-all
938  loss based on max-entropy, between input :math:`x` and target :math:`y` of size
939  :math:`(N, C)`.
940  For each sample in the minibatch:
941 
942  .. math::
943  loss(x, y) = - \frac{1}{C} * \sum_i y[i] * \log((1 + \exp(-x[i]))^{-1})
944  + (1-y[i]) * \log\left(\frac{\exp(-x[i])}{(1 + \exp(-x[i]))}\right)
945 
946  where :math:`i \in \left\{0, \; \cdots , \; \text{x.nElement}() - 1\right\}`,
947  :math:`y[i] \in \left\{0, \; 1\right\}`.
948 
949  Args:
950  weight (Tensor, optional): a manual rescaling weight given to each
951  class. If given, it has to be a Tensor of size `C`. Otherwise, it is
952  treated as if having all ones.
953  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
954  the losses are averaged over each loss element in the batch. Note that for
955  some losses, there are multiple elements per sample. If the field :attr:`size_average`
956  is set to ``False``, the losses are instead summed for each minibatch. Ignored
957  when reduce is ``False``. Default: ``True``
958  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
959  losses are averaged or summed over observations for each minibatch depending
960  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
961  batch element instead and ignores :attr:`size_average`. Default: ``True``
962  reduction (string, optional): Specifies the reduction to apply to the output:
963  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
964  ``'mean'``: the sum of the output will be divided by the number of
965  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
966  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
967  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
968 
969  Shape:
970  - Input: :math:`(N, C)` where `N` is the batch size and `C` is the number of classes.
971  - Target: :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input.
972  - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
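
 Example (a minimal usage sketch; the shapes and values below are illustrative)::

 >>> loss = nn.MultiLabelSoftMarginLoss()
 >>> input = torch.randn(3, 5, requires_grad=True)
 >>> # multi-hot targets with entries in {0, 1}
 >>> target = torch.empty(3, 5).random_(2)
 >>> output = loss(input, target)
 >>> output.backward()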
973  """
974  __constants__ = ['weight', 'reduction']
975 
976  def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean'):
977  super(MultiLabelSoftMarginLoss, self).__init__(weight, size_average, reduce, reduction)
978 
979  @weak_script_method
980  def forward(self, input, target):
981  return F.multilabel_soft_margin_loss(input, target, weight=self.weight, reduction=self.reduction)
982 
983 
984 @weak_module
985 class CosineEmbeddingLoss(_Loss):
986  r"""Creates a criterion that measures the loss given input tensors
987  :math:`x_1`, :math:`x_2` and a `Tensor` label :math:`y` with values 1 or -1.
988  This is used for measuring whether two inputs are similar or dissimilar,
989  using the cosine distance, and is typically used for learning nonlinear
990  embeddings or semi-supervised learning.
991 
992  The loss function for each sample is:
993 
994  .. math::
995  \text{loss}(x, y) =
996  \begin{cases}
997  1 - \cos(x_1, x_2), & \text{if } y = 1 \\
998  \max(0, \cos(x_1, x_2) - \text{margin}), & \text{if } y = -1
999  \end{cases}
1000 
1001  Args:
1002  margin (float, optional): Should be a number from :math:`-1` to :math:`1`,
1003  :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the
1004  default value is :math:`0`.
1005  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
1006  the losses are averaged over each loss element in the batch. Note that for
1007  some losses, there are multiple elements per sample. If the field :attr:`size_average`
1008  is set to ``False``, the losses are instead summed for each minibatch. Ignored
1009  when reduce is ``False``. Default: ``True``
1010  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
1011  losses are averaged or summed over observations for each minibatch depending
1012  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
1013  batch element instead and ignores :attr:`size_average`. Default: ``True``
1014  reduction (string, optional): Specifies the reduction to apply to the output:
1015  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
1016  ``'mean'``: the sum of the output will be divided by the number of
1017  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
1018  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
1019  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
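
 Example (a minimal usage sketch; the shapes and values below are illustrative)::

 >>> loss = nn.CosineEmbeddingLoss(margin=0.5)
 >>> input1 = torch.randn(3, 128, requires_grad=True)
 >>> input2 = torch.randn(3, 128, requires_grad=True)
 >>> # target contains 1 (similar) or -1 (dissimilar)
 >>> target = torch.tensor([1., -1., 1.])
 >>> output = loss(input1, input2, target)
 >>> output.backward()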
1020  """
1021  __constants__ = ['margin', 'reduction']
1022 
1023  def __init__(self, margin=0., size_average=None, reduce=None, reduction='mean'):
1024  super(CosineEmbeddingLoss, self).__init__(size_average, reduce, reduction)
1025  self.margin = margin
1026 
1027  @weak_script_method
1028  def forward(self, input1, input2, target):
1029  return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
1030 
1031 
1032 @weak_module
1033 class MarginRankingLoss(_Loss):
1034  r"""Creates a criterion that measures the loss given
1035  inputs :math:`x1`, :math:`x2`, two 1D mini-batch `Tensor`s,
1036  and a label 1D mini-batch tensor :math:`y` (containing 1 or -1).
1037 
1038  If :math:`y = 1` then it is assumed the first input should be ranked higher
1039  (have a larger value) than the second input, and vice-versa for :math:`y = -1`.
1040 
1041  The loss function for each sample in the mini-batch is:
1042 
1043  .. math::
1044  \text{loss}(x, y) = \max(0, -y * (x1 - x2) + \text{margin})
1045 
1046  Args:
1047  margin (float, optional): Has a default value of :math:`0`.
1048  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
1049  the losses are averaged over each loss element in the batch. Note that for
1050  some losses, there are multiple elements per sample. If the field :attr:`size_average`
1051  is set to ``False``, the losses are instead summed for each minibatch. Ignored
1052  when reduce is ``False``. Default: ``True``
1053  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
1054  losses are averaged or summed over observations for each minibatch depending
1055  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
1056  batch element instead and ignores :attr:`size_average`. Default: ``True``
1057  reduction (string, optional): Specifies the reduction to apply to the output:
1058  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
1059  ``'mean'``: the sum of the output will be divided by the number of
1060  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
1061  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
1062  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
1063 
1064  Shape:
1065  - Input: :math:`(N, D)` where `N` is the batch size and `D` is the size of a sample.
1066  - Target: :math:`(N)`
1067  - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
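
 Example (a minimal usage sketch; the shapes and values below are illustrative)::

 >>> loss = nn.MarginRankingLoss(margin=0.1)
 >>> input1 = torch.randn(3, requires_grad=True)
 >>> input2 = torch.randn(3, requires_grad=True)
 >>> # target is 1 when input1 should rank higher than input2, -1 otherwise
 >>> target = torch.tensor([1., -1., 1.])
 >>> output = loss(input1, input2, target)
 >>> output.backward()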
1068  """
1069  __constants__ = ['margin', 'reduction']
1070 
1071  def __init__(self, margin=0., size_average=None, reduce=None, reduction='mean'):
1072  super(MarginRankingLoss, self).__init__(size_average, reduce, reduction)
1073  self.margin = margin
1074 
1075  @weak_script_method
1076  def forward(self, input1, input2, target):
1077  return F.margin_ranking_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
1078 
1079 
1080 @weak_module
1081 class MultiMarginLoss(_WeightedLoss):
1082  r"""Creates a criterion that optimizes a multi-class classification hinge
1083  loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) and
1084  output :math:`y` (which is a 1D tensor of target class indices,
1085  :math:`0 \leq y \leq \text{x.size}(1)-1`):
1086 
1087  For each mini-batch sample, the loss in terms of the 1D input :math:`x` and scalar
1088  output :math:`y` is:
1089 
1090  .. math::
1091  \text{loss}(x, y) = \frac{\sum_i \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)}
1092 
1093  where :math:`x \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`
1094  and :math:`i \neq y`.
1095 
1096  Optionally, you can give non-equal weighting on the classes by passing
1097  a 1D :attr:`weight` tensor into the constructor.
1098 
1099  The loss function then becomes:
1100 
1101  .. math::
1102  \text{loss}(x, y) = \frac{\sum_i \max(0, w[y] * (\text{margin} - x[y] + x[i]))^p}{\text{x.size}(0)}
1103 
1104  Args:
1105  p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2`
1106  are the only supported values.
1107  margin (float, optional): Has a default value of :math:`1`.
1108  weight (Tensor, optional): a manual rescaling weight given to each
1109  class. If given, it has to be a Tensor of size `C`. Otherwise, it is
1110  treated as if having all ones.
1111  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
1112  the losses are averaged over each loss element in the batch. Note that for
1113  some losses, there are multiple elements per sample. If the field :attr:`size_average`
1114  is set to ``False``, the losses are instead summed for each minibatch. Ignored
1115  when reduce is ``False``. Default: ``True``
1116  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
1117  losses are averaged or summed over observations for each minibatch depending
1118  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
1119  batch element instead and ignores :attr:`size_average`. Default: ``True``
1120  reduction (string, optional): Specifies the reduction to apply to the output:
1121  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
1122  ``'mean'``: the sum of the output will be divided by the number of
1123  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
1124  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
1125  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
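
 Example (a minimal usage sketch; the shapes and values below are illustrative)::

 >>> loss = nn.MultiMarginLoss()
 >>> input = torch.randn(3, 5, requires_grad=True)
 >>> # each target entry is a class index in [0, C-1]
 >>> target = torch.tensor([1, 0, 4])
 >>> output = loss(input, target)
 >>> output.backward()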
1126  """
1127  __constants__ = ['p', 'margin', 'weight', 'reduction']
1128 
1129  def __init__(self, p=1, margin=1., weight=None, size_average=None,
1130  reduce=None, reduction='mean'):
1131  super(MultiMarginLoss, self).__init__(weight, size_average, reduce, reduction)
1132  if p != 1 and p != 2:
1133  raise ValueError("only p == 1 and p == 2 supported")
1134  assert weight is None or weight.dim() == 1
1135  self.p = p
1136  self.margin = margin
1137 
1138  @weak_script_method
1139  def forward(self, input, target):
1140  return F.multi_margin_loss(input, target, p=self.p, margin=self.margin,
1141  weight=self.weight, reduction=self.reduction)
1142 
1143 
1144 @weak_module
1145 class TripletMarginLoss(_Loss):
1146  r"""Creates a criterion that measures the triplet loss given input
1147  tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
1148  This is used for measuring a relative similarity between samples. A triplet
1149  is composed of `a`, `p` and `n`: `anchor`, `positive examples` and `negative
1150  examples` respectively. The shapes of all input tensors should be
1151  :math:`(N, D)`.
1152 
1153  The distance swap is described in detail in the paper `Learning shallow
1154  convolutional feature descriptors with triplet losses`_ by
1155  V. Balntas, E. Riba et al.
1156 
1157  The loss function for each sample in the mini-batch is:
1158 
1159  .. math::
1160  L(a, p, n) = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}
1161 
1162 
1163  where
1164 
1165  .. math::
1166  d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p
1167 
1168  Args:
1169  margin (float, optional): Default: :math:`1`.
1170  p (int, optional): The norm degree for pairwise distance. Default: :math:`2`.
1171  swap (bool, optional): The distance swap is described in detail in the paper
1172  `Learning shallow convolutional feature descriptors with triplet losses` by
1173  V. Balntas, E. Riba et al. Default: ``False``.
1174  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
1175  the losses are averaged over each loss element in the batch. Note that for
1176  some losses, there are multiple elements per sample. If the field :attr:`size_average`
1177  is set to ``False``, the losses are instead summed for each minibatch. Ignored
1178  when reduce is ``False``. Default: ``True``
1179  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
1180  losses are averaged or summed over observations for each minibatch depending
1181  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
1182  batch element instead and ignores :attr:`size_average`. Default: ``True``
1183  reduction (string, optional): Specifies the reduction to apply to the output:
1184  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
1185  ``'mean'``: the sum of the output will be divided by the number of
1186  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
1187  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
1188  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
1189 
1190  Shape:
1191  - Input: :math:`(N, D)` where :math:`D` is the vector dimension.
1192  - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
1193 
1194  >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
1195  >>> input1 = torch.randn(100, 128, requires_grad=True)
1196  >>> input2 = torch.randn(100, 128, requires_grad=True)
1197  >>> input3 = torch.randn(100, 128, requires_grad=True)
1198  >>> output = triplet_loss(input1, input2, input3)
1199  >>> output.backward()
1200 
1201  .. _Learning shallow convolutional feature descriptors with triplet losses:
1202  http://www.iis.ee.ic.ac.uk/%7Evbalnt/shallow_descr/TFeat_paper.pdf
1203  """
1204  __constants__ = ['margin', 'p', 'eps', 'swap', 'reduction']
1205 
1206  def __init__(self, margin=1.0, p=2., eps=1e-6, swap=False, size_average=None,
1207  reduce=None, reduction='mean'):
1208  super(TripletMarginLoss, self).__init__(size_average, reduce, reduction)
1209  self.margin = margin
1210  self.p = p
1211  self.eps = eps
1212  self.swap = swap
1213 
1214  @weak_script_method
1215  def forward(self, anchor, positive, negative):
1216  return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p,
1217  eps=self.eps, swap=self.swap, reduction=self.reduction)
1218 
1219 
1220 @weak_module
1221 class CTCLoss(_Loss):
1222  r"""The Connectionist Temporal Classification loss.
1223 
1224  Args:
1225  blank (int, optional): blank label. Default :math:`0`.
1226  reduction (string, optional): Specifies the reduction to apply to the output:
1227  'none' | 'mean' | 'sum'. 'none': no reduction will be applied,
1228  'mean': the output losses will be divided by the target lengths and
1229  then the mean over the batch is taken, 'sum': the output losses will be summed. Default: 'mean'
1230  zero_infinity (bool, optional):
1231  Whether to zero infinite losses and the associated gradients.
1232  Default: ``False``
1233  Infinite losses mainly occur when the inputs are too short
1234  to be aligned to the targets.
1235 
1236  Inputs:
1237  log_probs: Tensor of size :math:`(T, N, C)` where `C = number of characters in alphabet including blank`,
1238  `T = input length`, and `N = batch size`.
1239  The logarithmized probabilities of the outputs
1240  (e.g. obtained with :func:`torch.nn.functional.log_softmax`).
1241  targets: Tensor of size :math:`(N, S)` or `(sum(target_lengths))`.
1242  Targets (cannot be blank). In the second form, the targets are assumed to be concatenated.
1243  input_lengths: Tuple or tensor of size :math:`(N)`.
1244  Lengths of the inputs (must each be :math:`\leq T`)
1245  target_lengths: Tuple or tensor of size :math:`(N)`.
1246  Lengths of the targets
1247 
1248  Example::
1249 
1250  >>> ctc_loss = nn.CTCLoss()
1251  >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_()
1252  >>> targets = torch.randint(1, 20, (16, 30), dtype=torch.long)
1253  >>> input_lengths = torch.full((16,), 50, dtype=torch.long)
1254  >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long)
1255  >>> loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
1256  >>> loss.backward()
1257 
1258  Reference:
1259  A. Graves et al.: Connectionist Temporal Classification:
1260  Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
1261  https://www.cs.toronto.edu/~graves/icml_2006.pdf
1262 
1263  .. Note::
1264  In order to use CuDNN, the following must be satisfied: :attr:`targets` must be
1265  in concatenated format, all :attr:`input_lengths` must be `T`. :math:`blank=0`,
1266  :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of
1267  dtype :attr:`torch.int32`.
1268 
1269  The regular implementation uses the (more common in PyTorch) `torch.long` dtype.
1270 
1271 
1272  .. include:: cudnn_deterministic.rst
1273 
1274 
1275  """
1276  __constants__ = ['blank', 'reduction']
1277 
1278  def __init__(self, blank=0, reduction='mean', zero_infinity=False):
1279  super(CTCLoss, self).__init__(reduction=reduction)
1280  self.blank = blank
1281  self.zero_infinity = zero_infinity
1282 
1283  @weak_script_method
1284  def forward(self, log_probs, targets, input_lengths, target_lengths):
1285  return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction,
1286  self.zero_infinity)
1287 
1288 # TODO: L1HingeEmbeddingCriterion
1289 # TODO: MSECriterion weight
1290 # TODO: ClassSimplexCriterion