Caffe2 - Python API
A deep learning, cross platform ML framework
1 import warnings
3 import torch
4 from .module import Module
5 from .container import Sequential
6 from .activation import LogSoftmax
7 from .. import functional as F
8 from .. import _reduction as _Reduction
9 from ..._jit_internal import weak_module, weak_script_method
12 class _Loss(Module):
13  def __init__(self, size_average=None, reduce=None, reduction='mean'):
14  super(_Loss, self).__init__()
15  if size_average is not None or reduce is not None:
16  self.reduction = _Reduction.legacy_get_string(size_average, reduce)
17  else:
18  self.reduction = reduction
22  def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean'):
23  super(_WeightedLoss, self).__init__(size_average, reduce, reduction)
24  self.register_buffer('weight', weight)
@weak_module
class L1Loss(_Loss):
    r"""Criterion computing the mean absolute error (MAE) between every element
    of the input :math:`x` and the target :math:`y`.

    With :attr:`reduction` set to ``'none'`` the (unreduced) loss is:

    .. math::
        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = \left| x_n - y_n \right|,

    where :math:`N` is the batch size. For any other :attr:`reduction`
    (the default is ``'mean'``):

    .. math::
        \ell(x, y) =
        \begin{cases}
            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
        \end{cases}

    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total of
    :math:`n` elements each.

    The reduction always runs over all :math:`n` elements. ``'mean'`` divides
    the resulting sum by :math:`n`; set ``reduction = 'sum'`` to skip that
    division.

    Args:
        size_average (bool, optional): Deprecated in favour of :attr:`reduction`.
            When ``True`` (the default) the per-element losses of the batch are
            averaged; when ``False`` they are summed for each minibatch. Note
            that some losses have several elements per sample. Has no effect
            when :attr:`reduce` is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated in favour of :attr:`reduction`.
            When ``True`` (the default) the losses are averaged or summed over
            the observations of each minibatch, as selected by
            :attr:`size_average`. When ``False`` one loss per batch element is
            returned and :attr:`size_average` is ignored. Default: ``True``
        reduction (string, optional): Reduction applied to the output, one of
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'`` applies no reduction,
            ``'mean'`` divides the summed output by its number of elements and
            ``'sum'`` sums the output. While :attr:`size_average` and
            :attr:`reduce` are being deprecated, passing either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, *)`, where :math:`*` stands for any number of
          additional dimensions
        - Target: :math:`(N, *)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then
          :math:`(N, *)`, same shape as the input

    Examples::

        >>> loss = nn.L1Loss()
        >>> input = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.randn(3, 5)
        >>> output = loss(input, target)
        >>> output.backward()
    """
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        """Forward all three arguments, unchanged, to the ``_Loss`` initializer."""
        super(L1Loss, self).__init__(
            size_average,
            reduce,
            reduction,
        )

    @weak_script_method
    def forward(self, input, target):
        """Return ``F.l1_loss`` of ``input`` and ``target`` under ``self.reduction``."""
        return F.l1_loss(
            input,
            target,
            reduction=self.reduction,
        )
97 @weak_module
99  r"""The negative log likelihood loss. It is useful to train a classification
100  problem with `C` classes.
102  If provided, the optional argument :attr:`weight` should be a 1D Tensor assigning
103  weight to each of the classes. This is particularly useful when you have an
104  unbalanced training set.
106  The `input` given through a forward call is expected to contain
107  log-probabilities of each class. `input` has to be a Tensor of size either
108  :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)`
109  with :math:`K \geq 1` for the `K`-dimensional case (described later).
111  Obtaining log-probabilities in a neural network is easily achieved by
112  adding a `LogSoftmax` layer in the last layer of your network.
113  You may use `CrossEntropyLoss` instead, if you prefer not to add an extra
114  layer.
116  The `target` that this loss expects should be a class index in the range :math:`[0, C-1]`
117  where `C = number of classes`; if `ignore_index` is specified, this loss also accepts
118  this class index (this index may not necessarily be in the class range).
120  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
122  .. math::
123  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
124  l_n = - w_{y_n} x_{n,y_n}, \quad
125  w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\},
127  where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
128  (default ``'mean'``), then
130  .. math::
131  \ell(x, y) = \begin{cases}
132  \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, &
133  \text{if reduction} = \text{'mean';}\\
134  \sum_{n=1}^N l_n, &
135  \text{if reduction} = \text{'sum'.}
136  \end{cases}
138  Can also be used for higher dimension inputs, such as 2D images, by providing
139  an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`,
140  where :math:`K` is the number of dimensions, and a target of appropriate shape
141  (see below). In the case of images, it computes NLL loss per-pixel.
143  Args:
144  weight (Tensor, optional): a manual rescaling weight given to each
145  class. If given, it has to be a Tensor of size `C`. Otherwise, it is
146  treated as if having all ones.
147  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
148  the losses are averaged over each loss element in the batch. Note that for
149  some losses, there are multiple elements per sample. If the field :attr:`size_average`
150  is set to ``False``, the losses are instead summed for each minibatch. Ignored
151  when reduce is ``False``. Default: ``True``
152  ignore_index (int, optional): Specifies a target value that is ignored
153  and does not contribute to the input gradient. When
154  :attr:`size_average` is ``True``, the loss is averaged over
155  non-ignored targets.
156  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
157  losses are averaged or summed over observations for each minibatch depending
158  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
159  batch element instead and ignores :attr:`size_average`. Default: ``True``
160  reduction (string, optional): Specifies the reduction to apply to the output:
161  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
162  ``'mean'``: the sum of the output will be divided by the number of
163  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
164  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
165  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
167  Shape:
168  - Input: :math:`(N, C)` where `C = number of classes`, or
169  :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
170  in the case of `K`-dimensional loss.
171  - Target: :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or
172  :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
173  K-dimensional loss.
174  - Output: scalar.
175  If :attr:`reduction` is ``'none'``, then the same size as the target: :math:`(N)`, or
176  :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case
177  of K-dimensional loss.
179  Examples::
181  >>> m = nn.LogSoftmax(dim=1)
182  >>> loss = nn.NLLLoss()
183  >>> # input is of size N x C = 3 x 5
184  >>> input = torch.randn(3, 5, requires_grad=True)
185  >>> # each element in target has to have 0 <= value < C
186  >>> target = torch.tensor([1, 0, 4])
187  >>> output = loss(m(input), target)
188  >>> output.backward()
189  >>>
190  >>>
191  >>> # 2D loss example (used, for example, with image inputs)
192  >>> N, C = 5, 4
193  >>> loss = nn.NLLLoss()
194  >>> # input is of size N x C x height x width
195  >>> data = torch.randn(N, 16, 10, 10)
196  >>> conv = nn.Conv2d(16, C, (3, 3))
197  >>> m = nn.LogSoftmax(dim=1)
198  >>> # each element in target has to have 0 <= value < C
199  >>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
200  >>> output = loss(m(conv(data)), target)
201  >>> output.backward()
202  """
203  __constants__ = ['ignore_index', 'weight', 'reduction']
205  def __init__(self, weight=None, size_average=None, ignore_index=-100,
206  reduce=None, reduction='mean'):
207  super(NLLLoss, self).__init__(weight, size_average, reduce, reduction)
208  self.ignore_index = ignore_index
210  @weak_script_method
211  def forward(self, input, target):
212  return F.nll_loss(input, target, weight=self.weight, ignore_index=self.ignore_index, reduction=self.reduction)
215 @weak_module
217  def __init__(self, weight=None, size_average=None, ignore_index=-100,
218  reduce=None, reduction='mean'):
219  warnings.warn("NLLLoss2d has been deprecated. "
220  "Please use NLLLoss instead as a drop-in replacement and see "
221  " for more details.")
222  super(NLLLoss2d, self).__init__(weight, size_average, ignore_index, reduce, reduction)
225 @weak_module
227  r"""Negative log likelihood loss with Poisson distribution of target.
229  The loss can be described as:
231  .. math::
232  \text{target} \sim \mathrm{Poisson}(\text{input})
234  \text{loss}(\text{input}, \text{target}) = \text{input} - \text{target} * \log(\text{input})
235  + \log(\text{target!})
237  The last term can be omitted or approximated with Stirling formula. The
238  approximation is used for target values more than 1. For targets less or
239  equal to 1 zeros are added to the loss.
241  Args:
242  log_input (bool, optional): if ``True`` the loss is computed as
243  :math:`\exp(\text{input}) - \text{target}*\text{input}`, if ``False`` the loss is
244  :math:`\text{input} - \text{target}*\log(\text{input}+\text{eps})`.
245  full (bool, optional): whether to compute full loss, i. e. to add the
246  Stirling approximation term
248  .. math::
249  \text{target}*\log(\text{target}) - \text{target} + 0.5 * \log(2\pi\text{target}).
250  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
251  the losses are averaged over each loss element in the batch. Note that for
252  some losses, there are multiple elements per sample. If the field :attr:`size_average`
253  is set to ``False``, the losses are instead summed for each minibatch. Ignored
254  when reduce is ``False``. Default: ``True``
255  eps (float, optional): Small value to avoid evaluation of :math:`\log(0)` when
256  :attr:`log_input = False`. Default: 1e-8
257  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
258  losses are averaged or summed over observations for each minibatch depending
259  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
260  batch element instead and ignores :attr:`size_average`. Default: ``True``
261  reduction (string, optional): Specifies the reduction to apply to the output:
262  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
263  ``'mean'``: the sum of the output will be divided by the number of
264  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
265  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
266  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
268  Examples::
270  >>> loss = nn.PoissonNLLLoss()
271  >>> log_input = torch.randn(5, 2, requires_grad=True)
272  >>> target = torch.randn(5, 2)
273  >>> output = loss(log_input, target)
274  >>> output.backward()
276  Shape:
277  - Input: :math:`(N, *)` where :math:`*` means, any number of additional
278  dimensions
279  - Target: :math:`(N, *)`, same shape as the input
280  - Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`,
281  the same shape as the input
282  """
283  __constants__ = ['log_input', 'full', 'eps', 'reduction']
285  def __init__(self, log_input=True, full=False, size_average=None,
286  eps=1e-8, reduce=None, reduction='mean'):
287  super(PoissonNLLLoss, self).__init__(size_average, reduce, reduction)
288  self.log_input = log_input
289  self.full = full
290  self.eps = eps
292  @weak_script_method
293  def forward(self, log_input, target):
294  return F.poisson_nll_loss(log_input, target, log_input=self.log_input, full=self.full,
295  eps=self.eps, reduction=self.reduction)
298 @weak_module
300  r"""The `Kullback-Leibler divergence`_ Loss
302  KL divergence is a useful distance measure for continuous distributions
303  and is often useful when performing direct regression over the space of
304  (discretely sampled) continuous output distributions.
306  As with :class:`~torch.nn.NLLLoss`, the `input` given is expected to contain
307  *log-probabilities* and is not restricted to a 2D Tensor.
308  The targets are given as *probabilities* (i.e. without taking the logarithm).
310  This criterion expects a `target` `Tensor` of the same size as the
311  `input` `Tensor`.
313  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
315  .. math::
316  l(x,y) = L = \{ l_1,\dots,l_N \}, \quad
317  l_n = y_n \cdot \left( \log y_n - x_n \right)
319  where the index :math:`N` spans all dimensions of ``input`` and :math:`L` has the same
320  shape as ``input``. If :attr:`reduction` is not ``'none'`` (default ``'mean'``), then:
322  .. math::
323  \ell(x, y) = \begin{cases}
324  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';} \\
325  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
326  \end{cases}
328  In default :attr:`reduction` mode ``'mean'``, the losses are averaged for each minibatch over observations
329  **as well as** over dimensions. ``'batchmean'`` mode gives the correct KL divergence where losses
330  are averaged over batch dimension only. ``'mean'`` mode's behavior will be changed to the same as
331  ``'batchmean'`` in the next major release.
333  .. _Kullback-Leibler divergence:
334  https://en.wikipedia.org/wiki/Kullback-Leibler_divergence
336  Args:
337  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
338  the losses are averaged over each loss element in the batch. Note that for
339  some losses, there are multiple elements per sample. If the field :attr:`size_average`
340  is set to ``False``, the losses are instead summed for each minibatch. Ignored
341  when reduce is ``False``. Default: ``True``
342  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
343  losses are averaged or summed over observations for each minibatch depending
344  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
345  batch element instead and ignores :attr:`size_average`. Default: ``True``
346  reduction (string, optional): Specifies the reduction to apply to the output:
347  ``'none'`` | ``'batchmean'`` | ``'sum'`` | ``'mean'``.
348  ``'none'``: no reduction will be applied.
349  ``'batchmean'``: the sum of the output will be divided by batchsize.
350  ``'sum'``: the output will be summed.
351  ``'mean'``: the output will be divided by the number of elements in the output.
352  Default: ``'mean'``
354  .. note::
355  :attr:`size_average` and :attr:`reduce` are in the process of being deprecated,
356  and in the meantime, specifying either of those two args will override :attr:`reduction`.
358  .. note::
359  :attr:`reduction` = ``'mean'`` doesn't return the true KL divergence value; please use
360  :attr:`reduction` = ``'batchmean'``, which aligns with the mathematical definition of KL divergence.
361  In the next major release, ``'mean'`` will be changed to be the same as ``'batchmean'``.
363  Shape:
364  - Input: :math:`(N, *)` where :math:`*` means, any number of additional
365  dimensions
366  - Target: :math:`(N, *)`, same shape as the input
367  - Output: scalar by default. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`,
368  the same shape as the input
370  """
371  __constants__ = ['reduction']
373  def __init__(self, size_average=None, reduce=None, reduction='mean'):
374  super(KLDivLoss, self).__init__(size_average, reduce, reduction)
376  @weak_script_method
377  def forward(self, input, target):
378  return F.kl_div(input, target, reduction=self.reduction)
@weak_module
class MSELoss(_Loss):
    r"""Criterion computing the mean squared error (squared L2 norm) between
    every element of the input :math:`x` and the target :math:`y`.

    With :attr:`reduction` set to ``'none'`` the (unreduced) loss is:

    .. math::
        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = \left( x_n - y_n \right)^2,

    where :math:`N` is the batch size. For any other :attr:`reduction`
    (the default is ``'mean'``):

    .. math::
        \ell(x, y) =
        \begin{cases}
            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
        \end{cases}

    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total of
    :math:`n` elements each.

    The reduction always runs over all :math:`n` elements. ``'mean'`` divides
    the resulting sum by :math:`n`; set ``reduction = 'sum'`` to skip that
    division.

    Args:
        size_average (bool, optional): Deprecated in favour of :attr:`reduction`.
            When ``True`` (the default) the per-element losses of the batch are
            averaged; when ``False`` they are summed for each minibatch. Note
            that some losses have several elements per sample. Has no effect
            when :attr:`reduce` is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated in favour of :attr:`reduction`.
            When ``True`` (the default) the losses are averaged or summed over
            the observations of each minibatch, as selected by
            :attr:`size_average`. When ``False`` one loss per batch element is
            returned and :attr:`size_average` is ignored. Default: ``True``
        reduction (string, optional): Reduction applied to the output, one of
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'`` applies no reduction,
            ``'mean'`` divides the summed output by its number of elements and
            ``'sum'`` sums the output. While :attr:`size_average` and
            :attr:`reduce` are being deprecated, passing either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, *)`, where :math:`*` stands for any number of
          additional dimensions
        - Target: :math:`(N, *)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then
          :math:`(N, *)`, same shape as the input

    Examples::

        >>> loss = nn.MSELoss()
        >>> input = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.randn(3, 5)
        >>> output = loss(input, target)
        >>> output.backward()
    """
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        """Forward all three arguments, unchanged, to the ``_Loss`` initializer."""
        super(MSELoss, self).__init__(
            size_average,
            reduce,
            reduction,
        )

    @weak_script_method
    def forward(self, input, target):
        """Return ``F.mse_loss`` of ``input`` and ``target`` under ``self.reduction``."""
        return F.mse_loss(
            input,
            target,
            reduction=self.reduction,
        )
449 @weak_module
451  r"""Creates a criterion that measures the Binary Cross Entropy
452  between the target and the output:
454  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
456  .. math::
457  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
458  l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right],
460  where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
461  (default ``'mean'``), then
463  .. math::
464  \ell(x, y) = \begin{cases}
465  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
466  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
467  \end{cases}
469  This is used for measuring the error of a reconstruction in for example
470  an auto-encoder. Note that the targets :math:`y` should be numbers
471  between 0 and 1.
473  Args:
474  weight (Tensor, optional): a manual rescaling weight given to the loss
475  of each batch element. If given, has to be a Tensor of size `nbatch`.
476  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
477  the losses are averaged over each loss element in the batch. Note that for
478  some losses, there are multiple elements per sample. If the field :attr:`size_average`
479  is set to ``False``, the losses are instead summed for each minibatch. Ignored
480  when reduce is ``False``. Default: ``True``
481  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
482  losses are averaged or summed over observations for each minibatch depending
483  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
484  batch element instead and ignores :attr:`size_average`. Default: ``True``
485  reduction (string, optional): Specifies the reduction to apply to the output:
486  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
487  ``'mean'``: the sum of the output will be divided by the number of
488  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
489  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
490  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
492  Shape:
493  - Input: :math:`(N, *)` where :math:`*` means, any number of additional
494  dimensions
495  - Target: :math:`(N, *)`, same shape as the input
496  - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same
497  shape as input.
499  Examples::
501  >>> m = nn.Sigmoid()
502  >>> loss = nn.BCELoss()
503  >>> input = torch.randn(3, requires_grad=True)
504  >>> target = torch.empty(3).random_(2)
505  >>> output = loss(m(input), target)
506  >>> output.backward()
507  """
508  __constants__ = ['reduction', 'weight']
510  def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean'):
511  super(BCELoss, self).__init__(weight, size_average, reduce, reduction)
513  @weak_script_method
514  def forward(self, input, target):
515  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
518 @weak_module
520  r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single
521  class. This version is more numerically stable than using a plain `Sigmoid`
522  followed by a `BCELoss` as, by combining the operations into one layer,
523  we take advantage of the log-sum-exp trick for numerical stability.
525  The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as:
527  .. math::
528  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
529  l_n = - w_n \left[ y_n \cdot \log \sigma(x_n)
530  + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],
532  where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
533  (default ``'mean'``), then
535  .. math::
536  \ell(x, y) = \begin{cases}
537  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
538  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
539  \end{cases}
541  This is used for measuring the error of a reconstruction in for example
542  an auto-encoder. Note that the targets `t[i]` should be numbers
543  between 0 and 1.
545  It's possible to trade off recall and precision by adding weights to positive examples.
546  In this case the loss can be described as:
548  .. math::
549  \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
550  l_n = - w_n \left[ p_n y_n \cdot \log \sigma(x_n)
551  + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],
553  where :math:`p_n` is the weight of the positive class for sample :math:`n` in the batch.
554  :math:`p_n > 1` increases the recall, :math:`p_n < 1` increases the precision.
556  For example, if a dataset contains 100 positive and 300 negative examples of a single class,
557  then `pos_weight` for the class should be equal to :math:`\frac{300}{100}=3`.
558  The loss would act as if the dataset contains :math:`3\times 100=300` positive examples.
560  Args:
561  weight (Tensor, optional): a manual rescaling weight given to the loss
562  of each batch element. If given, has to be a Tensor of size `nbatch`.
563  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
564  the losses are averaged over each loss element in the batch. Note that for
565  some losses, there are multiple elements per sample. If the field :attr:`size_average`
566  is set to ``False``, the losses are instead summed for each minibatch. Ignored
567  when reduce is ``False``. Default: ``True``
568  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
569  losses are averaged or summed over observations for each minibatch depending
570  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
571  batch element instead and ignores :attr:`size_average`. Default: ``True``
572  reduction (string, optional): Specifies the reduction to apply to the output:
573  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
574  ``'mean'``: the sum of the output will be divided by the number of
575  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
576  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
577  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
578  pos_weight (Tensor, optional): a weight of positive examples.
579  Must be a vector with length equal to the number of classes.
581  Shape:
582  - Input: :math:`(N, *)` where :math:`*` means, any number of additional dimensions
583  - Target: :math:`(N, *)`, same shape as the input
584  - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same
585  shape as input.
587  Examples::
589  >>> loss = nn.BCEWithLogitsLoss()
590  >>> input = torch.randn(3, requires_grad=True)
591  >>> target = torch.empty(3).random_(2)
592  >>> output = loss(input, target)
593  >>> output.backward()
594  """
595  __constants__ = ['weight', 'pos_weight', 'reduction']
597  def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean', pos_weight=None):
598  super(BCEWithLogitsLoss, self).__init__(size_average, reduce, reduction)
599  self.register_buffer('weight', weight)
600  self.register_buffer('pos_weight', pos_weight)
602  @weak_script_method
603  def forward(self, input, target):
604  return F.binary_cross_entropy_with_logits(input, target,
605  self.weight,
606  pos_weight=self.pos_weight,
607  reduction=self.reduction)
610 @weak_module
612  r"""Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`
613  (containing 1 or -1).
614  This is usually used for measuring whether two inputs are similar or
615  dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically
616  used for learning nonlinear embeddings or semi-supervised learning.
618  The loss function for :math:`n`-th sample in the mini-batch is
620  .. math::
621  l_n = \begin{cases}
622  x_n, & \text{if}\; y_n = 1,\\
623  \max \{0, \Delta - x_n\}, & \text{if}\; y_n = -1,
624  \end{cases}
626  and the total loss function is
628  .. math::
629  \ell(x, y) = \begin{cases}
630  \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
631  \operatorname{sum}(L), & \text{if reduction} = \text{'sum'.}
632  \end{cases}
634  where :math:`L = \{l_1,\dots,l_N\}^\top`.
636  Args:
637  margin (float, optional): Has a default value of `1`.
638  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
639  the losses are averaged over each loss element in the batch. Note that for
640  some losses, there are multiple elements per sample. If the field :attr:`size_average`
641  is set to ``False``, the losses are instead summed for each minibatch. Ignored
642  when reduce is ``False``. Default: ``True``
643  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
644  losses are averaged or summed over observations for each minibatch depending
645  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
646  batch element instead and ignores :attr:`size_average`. Default: ``True``
647  reduction (string, optional): Specifies the reduction to apply to the output:
648  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
649  ``'mean'``: the sum of the output will be divided by the number of
650  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
651  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
652  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
654  Shape:
655  - Input: :math:`(*)` where :math:`*` means, any number of dimensions. The sum operation
656  operates over all the elements.
657  - Target: :math:`(*)`, same shape as the input
658  - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input
659  """
660  __constants__ = ['margin', 'reduction']
662  def __init__(self, margin=1.0, size_average=None, reduce=None, reduction='mean'):
663  super(HingeEmbeddingLoss, self).__init__(size_average, reduce, reduction)
664  self.margin = margin
666  @weak_script_method
667  def forward(self, input, target):
668  return F.hinge_embedding_loss(input, target, margin=self.margin, reduction=self.reduction)
671 @weak_module
673  r"""Creates a criterion that optimizes a multi-class multi-classification
674  hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`)
675  and output :math:`y` (which is a 2D `Tensor` of target class indices).
676  For each sample in the mini-batch:
678  .. math::
679  \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)}
681  where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \
682  :math:`j \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \
683  :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \
684  and :math:`i \neq y[j]` for all :math:`i` and :math:`j`.
686  :math:`y` and :math:`x` must have the same size.
688  The criterion only considers a contiguous block of non-negative targets that
689  starts at the front.
691  This allows for different samples to have variable amounts of target classes.
693  Args:
694  size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
695  the losses are averaged over each loss element in the batch. Note that for
696  some losses, there are multiple elements per sample. If the field :attr:`size_average`
697  is set to ``False``, the losses are instead summed for each minibatch. Ignored
698  when reduce is ``False``. Default: ``True``
699  reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
700  losses are averaged or summed over observations for each minibatch depending
701  on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
702  batch element instead and ignores :attr:`size_average`. Default: ``True``
703  reduction (string, optional): Specifies the reduction to apply to the output:
704  ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
705  ``'mean'``: the sum of the output will be divided by the number of
706  elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
707  and :attr:`reduce` are in the process of being deprecated, and in the meantime,
708  specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
710  Shape:
711  - Input: :math:`(C)` or :math:`(N, C)` where `N` is the batch size and `C`
712  is the number of classes.
713  - Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1 ensuring same shape as the input.
714  - Output: scalar. If :attr:``reduction`` is ``'none'``, then :math:`(N)`.
716  Examples::
718  >>> loss = nn.MultiLabelMarginLoss()
719  >>> x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]])
720  >>> # for target y, only consider labels 3 and 0, not after label -1
721  >>> y = torch.LongTensor([[3, 0, -1, 1]])
722  >>> loss(x, y)
723  >>> # 0.25 * ((1-(0.1-0.2)) + (1-(0.1-0.4)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
724  tensor(0.8500)
726  """
727  __constants__ = ['reduction']
729  def __init__(self, size_average=None, reduce=None, reduction='mean'):
730  super(MultiLabelMarginLoss, self).__init__(size_average, reduce, reduction)
732  @weak_script_method
733  def forward(self, input, target):
734  return F.multilabel_margin_loss(input, target, reduction=self.reduction)
@weak_module
class SmoothL1Loss(_Loss):
    r"""Creates a criterion that uses a squared term if the absolute
    element-wise error falls below 1 and an L1 term otherwise.
    It is less sensitive to outliers than the `MSELoss` and in some cases
    prevents exploding gradients (e.g. see "Fast R-CNN" paper by Ross Girshick).
    Also known as the Huber loss:

    .. math::
        \text{loss}(x, y) = \frac{1}{n} \sum_{i} z_{i}

    where :math:`z_{i}` is given by:

    .. math::
        z_{i} =
        \begin{cases}
        0.5 (x_i - y_i)^2, & \text{if } |x_i - y_i| < 1 \\
        |x_i - y_i| - 0.5, & \text{otherwise }
        \end{cases}

    :math:`x` and :math:`y` can have arbitrary shapes with a total of :math:`n`
    elements each; the sum operation still operates over all the elements, and
    divides by :math:`n`.

    The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.

    Args:
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, *)` where :math:`*` means, any number of additional
          dimensions
        - Target: :math:`(N, *)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then
          :math:`(N, *)`, same shape as the input

    """
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        # The base class resolves the legacy size_average/reduce flags into
        # the single ``self.reduction`` string.
        super(SmoothL1Loss, self).__init__(size_average, reduce, reduction)

    @weak_script_method
    def forward(self, input, target):
        return F.smooth_l1_loss(input, target, reduction=self.reduction)
@weak_module
class SoftMarginLoss(_Loss):
    r"""Creates a criterion that optimizes a two-class classification
    logistic loss between input tensor :math:`x` and target tensor :math:`y`
    (containing 1 or -1).

    .. math::
        \text{loss}(x, y) = \sum_i \frac{\log(1 + \exp(-y[i]*x[i]))}{\text{x.nelement}()}

    Args:
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(*)` where :math:`*` means, any number of additional
          dimensions
        - Target: :math:`(*)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input

    """
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        # The base class resolves the legacy size_average/reduce flags into
        # the single ``self.reduction`` string.
        super(SoftMarginLoss, self).__init__(size_average, reduce, reduction)

    @weak_script_method
    def forward(self, input, target):
        return F.soft_margin_loss(input, target, reduction=self.reduction)
@weak_module
class CrossEntropyLoss(_WeightedLoss):
    r"""This criterion combines :func:`nn.LogSoftmax` and :func:`nn.NLLLoss` in one single class.

    It is useful when training a classification problem with `C` classes.
    If provided, the optional argument :attr:`weight` should be a 1D `Tensor`
    assigning weight to each of the classes.
    This is particularly useful when you have an unbalanced training set.

    The `input` is expected to contain raw, unnormalized scores for each class.

    `input` has to be a Tensor of size either :math:`(minibatch, C)` or
    :math:`(minibatch, C, d_1, d_2, ..., d_K)`
    with :math:`K \geq 1` for the `K`-dimensional case (described later).

    This criterion expects a class index in the range :math:`[0, C-1]` as the
    `target` for each value of a 1D tensor of size `minibatch`; if `ignore_index`
    is specified, this criterion also accepts this class index (this index may not
    necessarily be in the class range).

    The loss can be described as:

    .. math::
        \text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right)
                       = -x[class] + \log\left(\sum_j \exp(x[j])\right)

    or in the case of the :attr:`weight` argument being specified:

    .. math::
        \text{loss}(x, class) = weight[class] \left(-x[class] + \log\left(\sum_j \exp(x[j])\right)\right)

    The losses are averaged across observations for each minibatch.

    Can also be used for higher dimension inputs, such as 2D images, by providing
    an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`,
    where :math:`K` is the number of dimensions, and a target of appropriate shape
    (see below).


    Args:
        weight (Tensor, optional): a manual rescaling weight given to each class.
            If given, has to be a Tensor of size `C`
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        ignore_index (int, optional): Specifies a target value that is ignored
            and does not contribute to the input gradient. When :attr:`size_average` is
            ``True``, the loss is averaged over non-ignored targets.
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, C)` where `C = number of classes`, or
          :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
          in the case of `K`-dimensional loss.
        - Target: :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or
          :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
          K-dimensional loss.
        - Output: scalar.
          If :attr:`reduction` is ``'none'``, then the same size as the target:
          :math:`(N)`, or
          :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case
          of K-dimensional loss.

    Examples::

        >>> loss = nn.CrossEntropyLoss()
        >>> input = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.empty(3, dtype=torch.long).random_(5)
        >>> output = loss(input, target)
        >>> output.backward()
    """
    __constants__ = ['weight', 'ignore_index', 'reduction']

    def __init__(self, weight=None, size_average=None, ignore_index=-100,
                 reduce=None, reduction='mean'):
        # The weighted base class registers ``weight`` as a buffer and
        # resolves the legacy size_average/reduce flags into ``self.reduction``.
        super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
        self.ignore_index = ignore_index

    @weak_script_method
    def forward(self, input, target):
        return F.cross_entropy(input, target, weight=self.weight,
                               ignore_index=self.ignore_index, reduction=self.reduction)
@weak_module
class MultiLabelSoftMarginLoss(_WeightedLoss):
    r"""Creates a criterion that optimizes a multi-label one-versus-all
    loss based on max-entropy, between input :math:`x` and target :math:`y` of size
    :math:`(N, C)`.
    For each sample in the minibatch:

    .. math::
        loss(x, y) = - \frac{1}{C} * \sum_i y[i] * \log((1 + \exp(-x[i]))^{-1})
                         + (1-y[i]) * \log\left(\frac{\exp(-x[i])}{(1 + \exp(-x[i]))}\right)

    where :math:`i \in \left\{0, \; \cdots , \; \text{x.nElement}() - 1\right\}`,
    :math:`y[i] \in \left\{0, \; 1\right\}`.

    Args:
        weight (Tensor, optional): a manual rescaling weight given to each
            class. If given, it has to be a Tensor of size `C`. Otherwise, it is
            treated as if having all ones.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, C)` where `N` is the batch size and `C` is the number of classes.
        - Target: :math:`(N, C)`, same shape as the input, with each label
          :math:`y[i] \in \left\{0, \; 1\right\}` as in the formula above.
        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
    """
    __constants__ = ['weight', 'reduction']

    def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean'):
        # The weighted base class registers ``weight`` as a buffer and
        # resolves the legacy size_average/reduce flags into ``self.reduction``.
        super(MultiLabelSoftMarginLoss, self).__init__(weight, size_average, reduce, reduction)

    @weak_script_method
    def forward(self, input, target):
        return F.multilabel_soft_margin_loss(input, target, weight=self.weight, reduction=self.reduction)
@weak_module
class CosineEmbeddingLoss(_Loss):
    r"""Creates a criterion that measures the loss given input tensors
    :math:`x_1`, :math:`x_2` and a `Tensor` label :math:`y` with values 1 or -1.
    This is used for measuring whether two inputs are similar or dissimilar,
    using the cosine distance, and is typically used for learning nonlinear
    embeddings or semi-supervised learning.

    The loss function for each sample is:

    .. math::
        \text{loss}(x, y) =
        \begin{cases}
        1 - \cos(x_1, x_2), & \text{if } y = 1 \\
        \max(0, \cos(x_1, x_2) - \text{margin}), & \text{if } y = -1
        \end{cases}

    Args:
        margin (float, optional): Should be a number from :math:`-1` to :math:`1`,
            :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the
            default value is :math:`0`.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
    """
    __constants__ = ['margin', 'reduction']

    def __init__(self, margin=0., size_average=None, reduce=None, reduction='mean'):
        # The base class resolves the legacy size_average/reduce flags into
        # the single ``self.reduction`` string.
        super(CosineEmbeddingLoss, self).__init__(size_average, reduce, reduction)
        self.margin = margin

    @weak_script_method
    def forward(self, input1, input2, target):
        return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
@weak_module
class MarginRankingLoss(_Loss):
    r"""Creates a criterion that measures the loss given
    inputs :math:`x1`, :math:`x2`, two 1D mini-batch `Tensor`\ s,
    and a label 1D mini-batch tensor :math:`y` (containing 1 or -1).

    If :math:`y = 1` then it is assumed the first input should be ranked higher
    (have a larger value) than the second input, and vice-versa for :math:`y = -1`.

    The loss function for each sample in the mini-batch is:

    .. math::
        \text{loss}(x, y) = \max(0, -y * (x1 - x2) + \text{margin})

    Args:
        margin (float, optional): Has a default value of :math:`0`.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, D)` where `N` is the batch size and `D` is the size of a sample.
        - Target: :math:`(N)`
        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
    """
    __constants__ = ['margin', 'reduction']

    def __init__(self, margin=0., size_average=None, reduce=None, reduction='mean'):
        # The base class resolves the legacy size_average/reduce flags into
        # the single ``self.reduction`` string.
        super(MarginRankingLoss, self).__init__(size_average, reduce, reduction)
        self.margin = margin

    @weak_script_method
    def forward(self, input1, input2, target):
        return F.margin_ranking_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
@weak_module
class MultiMarginLoss(_WeightedLoss):
    r"""Creates a criterion that optimizes a multi-class classification hinge
    loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) and
    output :math:`y` (which is a 1D tensor of target class indices,
    :math:`0 \leq y \leq \text{x.size}(1)-1`):

    For each mini-batch sample, the loss in terms of the 1D input :math:`x` and scalar
    output :math:`y` is:

    .. math::
        \text{loss}(x, y) = \frac{\sum_i \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)}

    where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`
    and :math:`i \neq y`.

    Optionally, you can give non-equal weighting on the classes by passing
    a 1D :attr:`weight` tensor into the constructor.

    The loss function then becomes:

    .. math::
        \text{loss}(x, y) = \frac{\sum_i \max(0, w[y] * (\text{margin} - x[y] + x[i]))^p}{\text{x.size}(0)}

    Args:
        p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2`
            are the only supported values.
        margin (float, optional): Has a default value of :math:`1`.
        weight (Tensor, optional): a manual rescaling weight given to each
            class. If given, it has to be a Tensor of size `C`. Otherwise, it is
            treated as if having all ones.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Raises:
        ValueError: if :attr:`p` is neither 1 nor 2.
        AssertionError: if :attr:`weight` is given and is not 1-dimensional.
    """
    __constants__ = ['p', 'margin', 'weight', 'reduction']

    def __init__(self, p=1, margin=1., weight=None, size_average=None,
                 reduce=None, reduction='mean'):
        super(MultiMarginLoss, self).__init__(weight, size_average, reduce, reduction)
        if p != 1 and p != 2:
            raise ValueError("only p == 1 and p == 2 supported")
        # NOTE(review): an assert is skipped under ``python -O``; it is kept
        # as-is so the exception type seen by existing callers does not change.
        assert weight is None or weight.dim() == 1
        self.p = p
        self.margin = margin

    @weak_script_method
    def forward(self, input, target):
        return F.multi_margin_loss(input, target, p=self.p, margin=self.margin,
                                   weight=self.weight, reduction=self.reduction)
@weak_module
class TripletMarginLoss(_Loss):
    r"""Creates a criterion that measures the triplet loss given input
    tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
    This is used for measuring a relative similarity between samples. A triplet
    is composed of `a`, `p` and `n`: `anchor`, `positive examples` and `negative
    examples` respectively. The shapes of all input tensors should be
    :math:`(N, D)`.

    The distance swap is described in detail in the paper `Learning shallow
    convolutional feature descriptors with triplet losses`_ by
    V. Balntas, E. Riba et al.

    The loss function for each sample in the mini-batch is:

    .. math::
        L(a, p, n) = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}


    where

    .. math::
        d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p

    Args:
        margin (float, optional): Default: :math:`1`.
        p (int, optional): The norm degree for pairwise distance. Default: :math:`2`.
        eps (float, optional): Forwarded unchanged as the ``eps`` argument of
            :func:`torch.nn.functional.triplet_margin_loss`. Default: :math:`1e-6`.
        swap (bool, optional): The distance swap is described in detail in the paper
            `Learning shallow convolutional feature descriptors with triplet losses` by
            V. Balntas, E. Riba et al. Default: ``False``.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, D)` where :math:`D` is the vector dimension.
        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.

    >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
    >>> input1 = torch.randn(100, 128, requires_grad=True)
    >>> input2 = torch.randn(100, 128, requires_grad=True)
    >>> input3 = torch.randn(100, 128, requires_grad=True)
    >>> output = triplet_loss(input1, input2, input3)
    >>> output.backward()

    .. _Learning shallow convolutional feature descriptors with triplet losses:
        http://www.bmva.org/bmvc/2016/papers/paper119/index.html
    """
    __constants__ = ['margin', 'p', 'eps', 'swap', 'reduction']

    def __init__(self, margin=1.0, p=2., eps=1e-6, swap=False, size_average=None,
                 reduce=None, reduction='mean'):
        # The base class resolves the legacy size_average/reduce flags into
        # the single ``self.reduction`` string.
        super(TripletMarginLoss, self).__init__(size_average, reduce, reduction)
        self.margin = margin
        self.p = p
        self.eps = eps
        self.swap = swap

    @weak_script_method
    def forward(self, anchor, positive, negative):
        return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p,
                                     eps=self.eps, swap=self.swap, reduction=self.reduction)
@weak_module
class CTCLoss(_Loss):
    r"""The Connectionist Temporal Classification loss.

    Args:
        blank (int, optional): blank label. Default :math:`0`.
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the output losses will be divided by the target lengths and
            then the mean over the batch is taken. Default: ``'mean'``
        zero_infinity (bool, optional):
            Whether to zero infinite losses and the associated gradients.
            Default: ``False``
            Infinite losses mainly occur when the inputs are too short
            to be aligned to the targets.

    Inputs:
        log_probs: Tensor of size :math:`(T, N, C)` where `C = number of characters in alphabet including blank`,
            `T = input length`, and `N = batch size`.
            The logarithmized probabilities of the outputs
            (e.g. obtained with :func:`torch.nn.functional.log_softmax`).
        targets: Tensor of size :math:`(N, S)` or `(sum(target_lengths))`.
            Targets (cannot be blank). In the second form, the targets are assumed to be concatenated.
        input_lengths: Tuple or tensor of size :math:`(N)`.
            Lengths of the inputs (must each be :math:`\leq T`)
        target_lengths: Tuple or tensor of size :math:`(N)`.
            Lengths of the targets

    Example::

        >>> ctc_loss = nn.CTCLoss()
        >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_()
        >>> targets = torch.randint(1, 20, (16, 30), dtype=torch.long)
        >>> input_lengths = torch.full((16,), 50, dtype=torch.long)
        >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long)
        >>> loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
        >>> loss.backward()

    Reference:
        A. Graves et al.: Connectionist Temporal Classification:
        Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
        https://www.cs.toronto.edu/~graves/icml_2006.pdf

    .. Note::
        In order to use CuDNN, the following must be satisfied: :attr:`targets` must be
        in concatenated format, all :attr:`input_lengths` must be `T`.  :math:`blank=0`,
        :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of
        dtype :attr:`torch.int32`.

        The regular implementation uses the (more common in PyTorch) `torch.long` dtype.


    .. include:: cudnn_deterministic.rst


    """
    # ``zero_infinity`` is read inside the scripted ``forward`` below, so it is
    # listed as a constant alongside the other attributes ``forward`` reads.
    __constants__ = ['blank', 'reduction', 'zero_infinity']

    def __init__(self, blank=0, reduction='mean', zero_infinity=False):
        # CTCLoss never accepted the legacy size_average/reduce flags, so only
        # ``reduction`` is handed to the base class.
        super(CTCLoss, self).__init__(reduction=reduction)
        self.blank = blank
        self.zero_infinity = zero_infinity

    @weak_script_method
    def forward(self, log_probs, targets, input_lengths, target_lengths):
        return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction,
                          self.zero_infinity)
1288 # TODO: L1HingeEmbeddingCriterion
1289 # TODO: MSECriterion weight
1290 # TODO: ClassSimplexCriterion