# Caffe2 - Python API: a deep learning, cross-platform ML framework
# loss.py
1 import warnings
2
3 import torch
4 from .module import Module
5 from .container import Sequential
6 from .activation import LogSoftmax
7 from .. import functional as F
8 from .. import _reduction as _Reduction
9 from ..._jit_internal import weak_module, weak_script_method
10
11
12 class _Loss(Module):
13  def __init__(self, size_average=None, reduce=None, reduction='mean'):
14  super(_Loss, self).__init__()
15  if size_average is not None or reduce is not None:
16  self.reduction = _Reduction.legacy_get_string(size_average, reduce)
17  else:
18  self.reduction = reduction
19
20
class _WeightedLoss(_Loss):
    """Base class for losses that carry an optional ``weight`` tensor.

    Args:
        weight: optional tensor, registered as the buffer ``'weight'``
            (may be ``None``).
        size_average: legacy flag forwarded to ``_Loss``.
        reduce: legacy flag forwarded to ``_Loss``.
        reduction: reduction mode forwarded to ``_Loss``.
    """

    def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean'):
        super(_WeightedLoss, self).__init__(size_average, reduce, reduction)
        # Stored via register_buffer rather than as a plain attribute.
        self.register_buffer('weight', weight)
25
26
@weak_module
class L1Loss(_Loss):
    r"""Criterion measuring the mean absolute error (MAE) between each element
    of the input :math:`x` and the target :math:`y`.

    With :attr:`reduction` set to ``'none'`` the (unreduced) loss is:

    .. math::
        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = \left| x_n - y_n \right|,

    where :math:`N` is the batch size. For any other :attr:`reduction`
    (default ``'mean'``):

    .. math::
        \ell(x, y) =
        \begin{cases}
            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
        \end{cases}

    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
    of :math:`n` elements each.

    The sum operation still operates over all the elements, and divides by
    :math:`n`. The division by :math:`n` can be avoided by setting
    ``reduction = 'sum'``.

    Args:
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged over each loss element in the
            batch. Note that for some losses, there are multiple elements per
            sample. If :attr:`size_average` is ``False``, the losses are
            instead summed for each minibatch. Ignored when ``reduce`` is
            ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged or summed over observations for
            each minibatch depending on :attr:`size_average`. When
            :attr:`reduce` is ``False``, returns a loss per batch element
            instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Reduction applied to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction is
            applied; ``'mean'``: the sum of the output is divided by the
            number of elements in the output; ``'sum'``: the output is summed.
            Note: :attr:`size_average` and :attr:`reduce` are being
            deprecated, and in the meantime specifying either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, *)` where :math:`*` means any number of
          additional dimensions
        - Target: :math:`(N, *)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then
          :math:`(N, *)`, same shape as the input

    Examples::

        >>> loss = nn.L1Loss()
        >>> input = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.randn(3, 5)
        >>> output = loss(input, target)
        >>> output.backward()
    """
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        super(L1Loss, self).__init__(
            size_average=size_average, reduce=reduce, reduction=reduction)

    @weak_script_method
    def forward(self, input, target):
        # Delegates entirely to the functional form with the stored reduction.
        return F.l1_loss(
            input,
            target,
            reduction=self.reduction,
        )
95
96
@weak_module
class NLLLoss(_WeightedLoss):
    r"""The negative log likelihood loss. It is useful to train a
    classification problem with `C` classes.

    If provided, the optional argument :attr:`weight` should be a 1D Tensor
    assigning weight to each of the classes. This is particularly useful when
    you have an unbalanced training set.

    The `input` given through a forward call is expected to contain
    log-probabilities of each class. `input` has to be a Tensor of size either
    :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)`
    with :math:`K \geq 1` for the `K`-dimensional case (described later).

    Obtaining log-probabilities in a neural network is easily achieved by
    adding a `LogSoftmax` layer in the last layer of your network.
    You may use `CrossEntropyLoss` instead, if you prefer not to add an extra
    layer.

    The `target` that this loss expects should be a class index in the range
    :math:`[0, C-1]` where `C = number of classes`; if `ignore_index` is
    specified, this loss also accepts this class index (this index may not
    necessarily be in the class range).

    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be
    described as:

    .. math::
        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = - w_{y_n} x_{n,y_n}, \quad
        w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\},

    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
    (default ``'mean'``), then

    .. math::
        \ell(x, y) = \begin{cases}
            \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, &
            \text{if reduction} = \text{'mean';}\\
            \sum_{n=1}^N l_n,  &
            \text{if reduction} = \text{'sum'.}
        \end{cases}

    Can also be used for higher dimension inputs, such as 2D images, by
    providing an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with
    :math:`K \geq 1`, where :math:`K` is the number of dimensions, and a
    target of appropriate shape (see below). In the case of images, it
    computes NLL loss per-pixel.

    Args:
        weight (Tensor, optional): a manual rescaling weight given to each
            class. If given, it has to be a Tensor of size `C`. Otherwise, it
            is treated as if having all ones.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged over each loss element in the
            batch. Note that for some losses, there are multiple elements per
            sample. If :attr:`size_average` is ``False``, the losses are
            instead summed for each minibatch. Ignored when ``reduce`` is
            ``False``. Default: ``True``
        ignore_index (int, optional): Specifies a target value that is ignored
            and does not contribute to the input gradient. When
            :attr:`size_average` is ``True``, the loss is averaged over
            non-ignored targets.
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged or summed over observations for
            each minibatch depending on :attr:`size_average`. When
            :attr:`reduce` is ``False``, returns a loss per batch element
            instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Reduction applied to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction is
            applied; ``'mean'``: the sum of the output is divided by the
            number of elements in the output; ``'sum'``: the output is summed.
            Note: :attr:`size_average` and :attr:`reduce` are being
            deprecated, and in the meantime specifying either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, C)` where `C = number of classes`, or
          :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
          in the case of `K`-dimensional loss.
        - Target: :math:`(N)` where each value is
          :math:`0 \leq \text{targets}[i] \leq C-1`, or
          :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
          K-dimensional loss.
        - Output: scalar.
          If :attr:`reduction` is ``'none'``, then the same size as the
          target: :math:`(N)`, or :math:`(N, d_1, d_2, ..., d_K)` with
          :math:`K \geq 1` in the case of K-dimensional loss.

    Examples::

        >>> m = nn.LogSoftmax(dim=1)
        >>> loss = nn.NLLLoss()
        >>> # input is of size N x C = 3 x 5
        >>> input = torch.randn(3, 5, requires_grad=True)
        >>> # each element in target has to have 0 <= value < C
        >>> target = torch.tensor([1, 0, 4])
        >>> output = loss(m(input), target)
        >>> output.backward()
        >>>
        >>>
        >>> # 2D loss example (used, for example, with image inputs)
        >>> N, C = 5, 4
        >>> loss = nn.NLLLoss()
        >>> # input is of size N x C x height x width
        >>> data = torch.randn(N, 16, 10, 10)
        >>> conv = nn.Conv2d(16, C, (3, 3))
        >>> m = nn.LogSoftmax(dim=1)
        >>> # each element in target has to have 0 <= value < C
        >>> target = torch.empty(N, 8, 8, dtype=torch.long).random_(0, C)
        >>> output = loss(m(conv(data)), target)
        >>> output.backward()
    """
    __constants__ = ['ignore_index', 'weight', 'reduction']

    def __init__(self, weight=None, size_average=None, ignore_index=-100,
                 reduce=None, reduction='mean'):
        # ``weight`` and the reduction arguments are handled by _WeightedLoss.
        super(NLLLoss, self).__init__(weight, size_average, reduce, reduction)
        self.ignore_index = ignore_index

    @weak_script_method
    def forward(self, input, target):
        return F.nll_loss(input, target, weight=self.weight,
                          ignore_index=self.ignore_index, reduction=self.reduction)
213
214
@weak_module
class NLLLoss2d(NLLLoss):
    """Deprecated variant of ``NLLLoss``.

    Constructing an instance emits a deprecation warning and then forwards
    every argument unchanged to ``NLLLoss.__init__``; no other behaviour is
    defined here.

    Args:
        weight: forwarded to ``NLLLoss``.
        size_average: forwarded to ``NLLLoss``.
        ignore_index: forwarded to ``NLLLoss``.
        reduce: forwarded to ``NLLLoss``.
        reduction: forwarded to ``NLLLoss``.
    """

    def __init__(self, weight=None, size_average=None, ignore_index=-100,
                 reduce=None, reduction='mean'):
        # Warn on every construction so callers migrate to NLLLoss.
        warnings.warn("NLLLoss2d has been deprecated. "
                      "Please use NLLLoss instead as a drop-in replacement and see "
                      "https://pytorch.org/docs/master/nn.html#torch.nn.NLLLoss for more details.")
        super(NLLLoss2d, self).__init__(weight, size_average, ignore_index, reduce, reduction)
223
224
@weak_module
class PoissonNLLLoss(_Loss):
    r"""Negative log likelihood loss with Poisson distribution of target.

    The loss can be described as:

    .. math::
        \text{target} \sim \mathrm{Poisson}(\text{input})

        \text{loss}(\text{input}, \text{target}) = \text{input} - \text{target} * \log(\text{input})
                                    + \log(\text{target!})

    The last term can be omitted or approximated with Stirling formula. The
    approximation is used for target values more than 1. For targets less or
    equal to 1 zeros are added to the loss.

    Args:
        log_input (bool, optional): if ``True`` the loss is computed as
            :math:`\exp(\text{input}) - \text{target}*\text{input}`, if
            ``False`` the loss is
            :math:`\text{input} - \text{target}*\log(\text{input}+\text{eps})`.
        full (bool, optional): whether to compute full loss, i. e. to add the
            Stirling approximation term

            .. math::
                \text{target}*\log(\text{target}) - \text{target} + 0.5 * \log(2\pi\text{target}).
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged over each loss element in the
            batch. Note that for some losses, there are multiple elements per
            sample. If :attr:`size_average` is ``False``, the losses are
            instead summed for each minibatch. Ignored when ``reduce`` is
            ``False``. Default: ``True``
        eps (float, optional): Small value to avoid evaluation of
            :math:`\log(0)` when :attr:`log_input` ``= False``. Default: 1e-8
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged or summed over observations for
            each minibatch depending on :attr:`size_average`. When
            :attr:`reduce` is ``False``, returns a loss per batch element
            instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Reduction applied to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction is
            applied; ``'mean'``: the sum of the output is divided by the
            number of elements in the output; ``'sum'``: the output is summed.
            Note: :attr:`size_average` and :attr:`reduce` are being
            deprecated, and in the meantime specifying either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Examples::

        >>> loss = nn.PoissonNLLLoss()
        >>> log_input = torch.randn(5, 2, requires_grad=True)
        >>> target = torch.randn(5, 2)
        >>> output = loss(log_input, target)
        >>> output.backward()

    Shape:
        - Input: :math:`(N, *)` where :math:`*` means any number of
          additional dimensions
        - Target: :math:`(N, *)`, same shape as the input
        - Output: scalar by default. If :attr:`reduction` is ``'none'``, then
          :math:`(N, *)`, the same shape as the input
    """
    __constants__ = ['log_input', 'full', 'eps', 'reduction']

    def __init__(self, log_input=True, full=False, size_average=None,
                 eps=1e-8, reduce=None, reduction='mean'):
        super(PoissonNLLLoss, self).__init__(size_average, reduce, reduction)
        self.log_input = log_input
        self.full = full
        self.eps = eps

    @weak_script_method
    def forward(self, log_input, target):
        # The first positional argument is the prediction tensor; the keyword
        # ``log_input=self.log_input`` is the boolean mode flag.
        return F.poisson_nll_loss(log_input, target, log_input=self.log_input, full=self.full,
                                  eps=self.eps, reduction=self.reduction)
296
297
@weak_module
class KLDivLoss(_Loss):
    r"""The `Kullback-Leibler divergence`_ Loss

    KL divergence is a useful distance measure for continuous distributions
    and is often useful when performing direct regression over the space of
    (discretely sampled) continuous output distributions.

    As with :class:`~torch.nn.NLLLoss`, the `input` given is expected to
    contain *log-probabilities* and is not restricted to a 2D Tensor.
    The targets are given as *probabilities* (i.e. without taking the
    logarithm).

    This criterion expects a `target` `Tensor` of the same size as the
    `input` `Tensor`.

    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be
    described as:

    .. math::
        l(x,y) = L = \{ l_1,\dots,l_N \}, \quad
        l_n = y_n \cdot \left( \log y_n - x_n \right)

    where the index :math:`N` spans all dimensions of ``input`` and :math:`L`
    has the same shape as ``input``. If :attr:`reduction` is not ``'none'``
    (default ``'mean'``), then:

    .. math::
        \ell(x, y) = \begin{cases}
            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';} \\
            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
        \end{cases}

    In default :attr:`reduction` mode ``'mean'``, the losses are averaged for
    each minibatch over observations **as well as** over dimensions.
    ``'batchmean'`` mode gives the correct KL divergence where losses are
    averaged over batch dimension only. ``'mean'`` mode's behavior will be
    changed to the same as ``'batchmean'`` in the next major release.

    .. _Kullback-Leibler divergence:
        https://en.wikipedia.org/wiki/Kullback-Leibler_divergence

    Args:
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged over each loss element in the
            batch. Note that for some losses, there are multiple elements per
            sample. If :attr:`size_average` is ``False``, the losses are
            instead summed for each minibatch. Ignored when ``reduce`` is
            ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged or summed over observations for
            each minibatch depending on :attr:`size_average`. When
            :attr:`reduce` is ``False``, returns a loss per batch element
            instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Reduction applied to the output:
            ``'none'`` | ``'batchmean'`` | ``'sum'`` | ``'mean'``.
            ``'none'``: no reduction will be applied.
            ``'batchmean'``: the sum of the output will be divided by batchsize.
            ``'sum'``: the output will be summed.
            ``'mean'``: the output will be divided by the number of elements
            in the output.
            Default: ``'mean'``

    .. note::
        :attr:`size_average` and :attr:`reduce` are in the process of being
        deprecated, and in the meantime, specifying either of those two args
        will override :attr:`reduction`.

    .. note::
        :attr:`reduction` = ``'mean'`` doesn't return the true kl divergence
        value, please use :attr:`reduction` = ``'batchmean'`` which aligns
        with KL math definition.
        In the next major release, ``'mean'`` will be changed to be the same
        as ``'batchmean'``.

    Shape:
        - Input: :math:`(N, *)` where :math:`*` means any number of
          additional dimensions
        - Target: :math:`(N, *)`, same shape as the input
        - Output: scalar by default. If :attr:`reduction` is ``'none'``, then
          :math:`(N, *)`, the same shape as the input

    """
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        super(KLDivLoss, self).__init__(size_average, reduce, reduction)

    @weak_script_method
    def forward(self, input, target):
        return F.kl_div(input, target, reduction=self.reduction)
379
380
@weak_module
class MSELoss(_Loss):
    r"""Criterion measuring the mean squared error (squared L2 norm) between
    each element of the input :math:`x` and the target :math:`y`.

    With :attr:`reduction` set to ``'none'`` the (unreduced) loss is:

    .. math::
        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = \left( x_n - y_n \right)^2,

    where :math:`N` is the batch size. For any other :attr:`reduction`
    (default ``'mean'``):

    .. math::
        \ell(x, y) =
        \begin{cases}
            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
        \end{cases}

    :math:`x` and :math:`y` are tensors of arbitrary shapes with a total
    of :math:`n` elements each.

    The sum operation still operates over all the elements, and divides by
    :math:`n`. The division by :math:`n` can be avoided by setting
    ``reduction = 'sum'``.

    Args:
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged over each loss element in the
            batch. Note that for some losses, there are multiple elements per
            sample. If :attr:`size_average` is ``False``, the losses are
            instead summed for each minibatch. Ignored when ``reduce`` is
            ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged or summed over observations for
            each minibatch depending on :attr:`size_average`. When
            :attr:`reduce` is ``False``, returns a loss per batch element
            instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Reduction applied to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction is
            applied; ``'mean'``: the sum of the output is divided by the
            number of elements in the output; ``'sum'``: the output is summed.
            Note: :attr:`size_average` and :attr:`reduce` are being
            deprecated, and in the meantime specifying either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, *)` where :math:`*` means any number of
          additional dimensions
        - Target: :math:`(N, *)`, same shape as the input

    Examples::

        >>> loss = nn.MSELoss()
        >>> input = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.randn(3, 5)
        >>> output = loss(input, target)
        >>> output.backward()
    """
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        super(MSELoss, self).__init__(
            size_average=size_average, reduce=reduce, reduction=reduction)

    @weak_script_method
    def forward(self, input, target):
        # Delegates entirely to the functional form with the stored reduction.
        return F.mse_loss(
            input,
            target,
            reduction=self.reduction,
        )
447
448
@weak_module
class BCELoss(_WeightedLoss):
    r"""Creates a criterion that measures the Binary Cross Entropy
    between the target and the output:

    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be
    described as:

    .. math::
        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = - w_n \left[ y_n \cdot \log x_n + (1 - y_n) \cdot \log (1 - x_n) \right],

    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
    (default ``'mean'``), then

    .. math::
        \ell(x, y) = \begin{cases}
            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
        \end{cases}

    This is used for measuring the error of a reconstruction in for example
    an auto-encoder. Note that the targets :math:`y` should be numbers
    between 0 and 1.

    Args:
        weight (Tensor, optional): a manual rescaling weight given to the loss
            of each batch element. If given, has to be a Tensor of size
            `nbatch`.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged over each loss element in the
            batch. Note that for some losses, there are multiple elements per
            sample. If :attr:`size_average` is ``False``, the losses are
            instead summed for each minibatch. Ignored when ``reduce`` is
            ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged or summed over observations for
            each minibatch depending on :attr:`size_average`. When
            :attr:`reduce` is ``False``, returns a loss per batch element
            instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Reduction applied to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction is
            applied; ``'mean'``: the sum of the output is divided by the
            number of elements in the output; ``'sum'``: the output is summed.
            Note: :attr:`size_average` and :attr:`reduce` are being
            deprecated, and in the meantime specifying either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, *)` where :math:`*` means any number of
          additional dimensions
        - Target: :math:`(N, *)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then
          :math:`(N, *)`, same shape as input.

    Examples::

        >>> m = nn.Sigmoid()
        >>> loss = nn.BCELoss()
        >>> input = torch.randn(3, requires_grad=True)
        >>> target = torch.empty(3).random_(2)
        >>> output = loss(m(input), target)
        >>> output.backward()
    """
    __constants__ = ['reduction', 'weight']

    def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean'):
        super(BCELoss, self).__init__(weight, size_average, reduce, reduction)

    @weak_script_method
    def forward(self, input, target):
        return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
516
517
@weak_module
class BCEWithLogitsLoss(_Loss):
    r"""This loss combines a `Sigmoid` layer and the `BCELoss` in one single
    class. This version is more numerically stable than using a plain
    `Sigmoid` followed by a `BCELoss` as, by combining the operations into one
    layer, we take advantage of the log-sum-exp trick for numerical stability.

    The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be
    described as:

    .. math::
        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = - w_n \left[ y_n \cdot \log \sigma(x_n)
        + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],

    where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'``
    (default ``'mean'``), then

    .. math::
        \ell(x, y) = \begin{cases}
            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
        \end{cases}

    This is used for measuring the error of a reconstruction in for example
    an auto-encoder. Note that the targets `t[i]` should be numbers
    between 0 and 1.

    It's possible to trade off recall and precision by adding weights to
    positive examples. In this case the loss can be described as:

    .. math::
        \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad
        l_n = - w_n \left[ p_n y_n \cdot \log \sigma(x_n)
        + (1 - y_n) \cdot \log (1 - \sigma(x_n)) \right],

    where :math:`p_n` is the weight of the positive class for sample
    :math:`n` in the batch. :math:`p_n > 1` increases the recall,
    :math:`p_n < 1` increases the precision.

    For example, if a dataset contains 100 positive and 300 negative examples
    of a single class, then `pos_weight` for the class should be equal to
    :math:`\frac{300}{100}=3`. The loss would act as if the dataset contains
    :math:`3\times 100=300` positive examples.

    Args:
        weight (Tensor, optional): a manual rescaling weight given to the loss
            of each batch element. If given, has to be a Tensor of size
            `nbatch`.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged over each loss element in the
            batch. Note that for some losses, there are multiple elements per
            sample. If :attr:`size_average` is ``False``, the losses are
            instead summed for each minibatch. Ignored when ``reduce`` is
            ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged or summed over observations for
            each minibatch depending on :attr:`size_average`. When
            :attr:`reduce` is ``False``, returns a loss per batch element
            instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Reduction applied to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction is
            applied; ``'mean'``: the sum of the output is divided by the
            number of elements in the output; ``'sum'``: the output is summed.
            Note: :attr:`size_average` and :attr:`reduce` are being
            deprecated, and in the meantime specifying either of them
            overrides :attr:`reduction`. Default: ``'mean'``
        pos_weight (Tensor, optional): a weight of positive examples.
            Must be a vector with length equal to the number of classes.

    Shape:
        - Input: :math:`(N, *)` where :math:`*` means any number of
          additional dimensions
        - Target: :math:`(N, *)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then
          :math:`(N, *)`, same shape as input.

    Examples::

        >>> loss = nn.BCEWithLogitsLoss()
        >>> input = torch.randn(3, requires_grad=True)
        >>> target = torch.empty(3).random_(2)
        >>> output = loss(input, target)
        >>> output.backward()
    """
    __constants__ = ['weight', 'pos_weight', 'reduction']

    def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean', pos_weight=None):
        super(BCEWithLogitsLoss, self).__init__(size_average, reduce, reduction)
        # Both tensors are registered here directly; this class derives from
        # _Loss, so no parent registers ``weight`` for it.
        self.register_buffer('weight', weight)
        self.register_buffer('pos_weight', pos_weight)

    @weak_script_method
    def forward(self, input, target):
        # ``self.weight`` is passed as the third positional argument.
        return F.binary_cross_entropy_with_logits(input, target,
                                                  self.weight,
                                                  pos_weight=self.pos_weight,
                                                  reduction=self.reduction)
608
609
@weak_module
class HingeEmbeddingLoss(_Loss):
    r"""Measures the loss given an input tensor :math:`x` and a labels tensor
    :math:`y` (containing 1 or -1).
    This is usually used for measuring whether two inputs are similar or
    dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is
    typically used for learning nonlinear embeddings or semi-supervised
    learning.

    The loss function for :math:`n`-th sample in the mini-batch is

    .. math::
        l_n = \begin{cases}
            x_n, & \text{if}\; y_n = 1,\\
            \max \{0, \Delta - x_n\}, & \text{if}\; y_n = -1,
        \end{cases}

    and the total loss functions is

    .. math::
        \ell(x, y) = \begin{cases}
            \operatorname{mean}(L), & \text{if reduction} = \text{'mean';}\\
            \operatorname{sum}(L),  & \text{if reduction} = \text{'sum'.}
        \end{cases}

    where :math:`L = \{l_1,\dots,l_N\}^\top`.

    Args:
        margin (float, optional): Has a default value of `1`.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged over each loss element in the
            batch. Note that for some losses, there are multiple elements per
            sample. If :attr:`size_average` is ``False``, the losses are
            instead summed for each minibatch. Ignored when ``reduce`` is
            ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged or summed over observations for
            each minibatch depending on :attr:`size_average`. When
            :attr:`reduce` is ``False``, returns a loss per batch element
            instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Reduction applied to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction is
            applied; ``'mean'``: the sum of the output is divided by the
            number of elements in the output; ``'sum'``: the output is summed.
            Note: :attr:`size_average` and :attr:`reduce` are being
            deprecated, and in the meantime specifying either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(*)` where :math:`*` means any number of dimensions.
          The sum operation operates over all the elements.
        - Target: :math:`(*)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape
          as the input
    """
    __constants__ = ['margin', 'reduction']

    def __init__(self, margin=1.0, size_average=None, reduce=None, reduction='mean'):
        super(HingeEmbeddingLoss, self).__init__(size_average, reduce, reduction)
        self.margin = margin

    @weak_script_method
    def forward(self, input, target):
        return F.hinge_embedding_loss(input, target, margin=self.margin, reduction=self.reduction)
669
670
@weak_module
class MultiLabelMarginLoss(_Loss):
    r"""Creates a criterion that optimizes a multi-class multi-classification
    hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch
    `Tensor`) and output :math:`y` (which is a 2D `Tensor` of target class
    indices). For each sample in the mini-batch:

    .. math::
        \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)}

    where :math:`x \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \
    :math:`y \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \
    :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \
    and :math:`i \neq y[j]` for all :math:`i` and :math:`j`.

    :math:`y` and :math:`x` must have the same size.

    The criterion only considers a contiguous block of non-negative targets
    that starts at the front.

    This allows for different samples to have variable amounts of target
    classes.

    Args:
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged over each loss element in the
            batch. Note that for some losses, there are multiple elements per
            sample. If :attr:`size_average` is ``False``, the losses are
            instead summed for each minibatch. Ignored when ``reduce`` is
            ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By
            default, the losses are averaged or summed over observations for
            each minibatch depending on :attr:`size_average`. When
            :attr:`reduce` is ``False``, returns a loss per batch element
            instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Reduction applied to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction is
            applied; ``'mean'``: the sum of the output is divided by the
            number of elements in the output; ``'sum'``: the output is summed.
            Note: :attr:`size_average` and :attr:`reduce` are being
            deprecated, and in the meantime specifying either of them
            overrides :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(C)` or :math:`(N, C)` where `N` is the batch size and
          `C` is the number of classes.
        - Target: :math:`(C)` or :math:`(N, C)`, label targets padded by -1
          ensuring same shape as the input.
        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.

    Examples::

        >>> loss = nn.MultiLabelMarginLoss()
        >>> x = torch.FloatTensor([[0.1, 0.2, 0.4, 0.8]])
        >>> # for target y, only consider labels 3 and 0, not after label -1
        >>> y = torch.LongTensor([[3, 0, -1, 1]])
        >>> loss(x, y)
        >>> # 0.25 * ((1-(0.1-0.2)) + (1-(0.1-0.4)) + (1-(0.8-0.2)) + (1-(0.8-0.4)))
        tensor(0.8500)

    """
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        super(MultiLabelMarginLoss, self).__init__(size_average, reduce, reduction)

    @weak_script_method
    def forward(self, input, target):
        return F.multilabel_margin_loss(input, target, reduction=self.reduction)
735
736
@weak_module
class SmoothL1Loss(_Loss):
    r"""Creates a criterion that uses a squared term if the absolute
    element-wise error falls below 1 and an L1 term otherwise.
    It is less sensitive to outliers than the `MSELoss` and in some cases
    prevents exploding gradients (e.g. see "Fast R-CNN" paper by Ross Girshick).
    Also known as the Huber loss:

    .. math::
        \text{loss}(x, y) = \frac{1}{n} \sum_{i} z_{i}

    where :math:`z_{i}` is given by:

    .. math::
        z_{i} =
        \begin{cases}
        0.5 (x_i - y_i)^2, & \text{if } |x_i - y_i| < 1 \\
        |x_i - y_i| - 0.5, & \text{otherwise }
        \end{cases}

    :math:`x` and :math:`y` may have arbitrary shapes with a total of :math:`n`
    elements each; the sum operation still operates over all the elements, and
    divides by :math:`n`.

    The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``.

    Args:
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, *)` where :math:`*` means, any number of additional
          dimensions
        - Target: :math:`(N, *)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then
          :math:`(N, *)`, same shape as the input

    """
    # Attributes read inside the script method below.
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        # _Loss stores the effective reduction string on ``self.reduction``;
        # the deprecated size_average/reduce flags take precedence when given.
        super(SmoothL1Loss, self).__init__(size_average, reduce, reduction)

    @weak_script_method
    def forward(self, input, target):
        # Thin wrapper over the functional form.
        return F.smooth_l1_loss(input, target, reduction=self.reduction)
795
796
@weak_module
class SoftMarginLoss(_Loss):
    r"""Creates a criterion that optimizes a two-class classification
    logistic loss between input tensor :math:`x` and target tensor :math:`y`
    (containing 1 or -1).

    .. math::
        \text{loss}(x, y) = \sum_i \frac{\log(1 + \exp(-y[i]*x[i]))}{\text{x.nelement}()}

    Args:
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(*)` where :math:`*` means, any number of additional
          dimensions
        - Target: :math:`(*)`, same shape as the input
        - Output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input

    """
    # Attributes read inside the script method below.
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        # _Loss stores the effective reduction string on ``self.reduction``;
        # the deprecated size_average/reduce flags take precedence when given.
        super(SoftMarginLoss, self).__init__(size_average, reduce, reduction)

    @weak_script_method
    def forward(self, input, target):
        # Thin wrapper over the functional form.
        return F.soft_margin_loss(input, target, reduction=self.reduction)
838
839
@weak_module
class CrossEntropyLoss(_WeightedLoss):
    r"""This criterion combines :func:`nn.LogSoftmax` and :func:`nn.NLLLoss` in one single class.

    It is useful when training a classification problem with `C` classes.
    If provided, the optional argument :attr:`weight` should be a 1D `Tensor`
    assigning weight to each of the classes.
    This is particularly useful when you have an unbalanced training set.

    The `input` is expected to contain raw, unnormalized scores for each class.

    `input` has to be a Tensor of size either :math:`(minibatch, C)` or
    :math:`(minibatch, C, d_1, d_2, ..., d_K)`
    with :math:`K \geq 1` for the `K`-dimensional case (described later).

    This criterion expects a class index in the range :math:`[0, C-1]` as the
    `target` for each value of a 1D tensor of size `minibatch`; if `ignore_index`
    is specified, this criterion also accepts this class index (this index may not
    necessarily be in the class range).

    The loss can be described as:

    .. math::
        \text{loss}(x, class) = -\log\left(\frac{\exp(x[class])}{\sum_j \exp(x[j])}\right)
                       = -x[class] + \log\left(\sum_j \exp(x[j])\right)

    or in the case of the :attr:`weight` argument being specified:

    .. math::
        \text{loss}(x, class) = weight[class] \left(-x[class] + \log\left(\sum_j \exp(x[j])\right)\right)

    The losses are averaged across observations for each minibatch.

    Can also be used for higher dimension inputs, such as 2D images, by providing
    an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`,
    where :math:`K` is the number of dimensions, and a target of appropriate shape
    (see below).


    Args:
        weight (Tensor, optional): a manual rescaling weight given to each class.
            If given, has to be a Tensor of size `C`
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        ignore_index (int, optional): Specifies a target value that is ignored
            and does not contribute to the input gradient. When :attr:`size_average` is
            ``True``, the loss is averaged over non-ignored targets.
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, C)` where `C = number of classes`, or
          :math:`(N, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`
          in the case of `K`-dimensional loss.
        - Target: :math:`(N)` where each value is :math:`0 \leq \text{targets}[i] \leq C-1`, or
          :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case of
          K-dimensional loss.
        - Output: scalar.
          If :attr:`reduction` is ``'none'``, then the same size as the target:
          :math:`(N)`, or
          :math:`(N, d_1, d_2, ..., d_K)` with :math:`K \geq 1` in the case
          of K-dimensional loss.

    Examples::

        >>> loss = nn.CrossEntropyLoss()
        >>> input = torch.randn(3, 5, requires_grad=True)
        >>> target = torch.empty(3, dtype=torch.long).random_(5)
        >>> output = loss(input, target)
        >>> output.backward()
    """
    # Attributes read inside the script method below.
    __constants__ = ['weight', 'ignore_index', 'reduction']

    def __init__(self, weight=None, size_average=None, ignore_index=-100,
                 reduce=None, reduction='mean'):
        # _WeightedLoss registers ``weight`` as a buffer and resolves the
        # effective reduction string; only ``ignore_index`` is kept here.
        super(CrossEntropyLoss, self).__init__(weight, size_average, reduce, reduction)
        self.ignore_index = ignore_index

    @weak_script_method
    def forward(self, input, target):
        # Thin wrapper over the functional form.
        return F.cross_entropy(input, target, weight=self.weight,
                               ignore_index=self.ignore_index, reduction=self.reduction)
933
934
@weak_module
class MultiLabelSoftMarginLoss(_WeightedLoss):
    r"""Creates a criterion that optimizes a multi-label one-versus-all
    loss based on max-entropy, between input :math:`x` and target :math:`y` of size
    :math:`(N, C)`.
    For each sample in the minibatch:

    .. math::
        loss(x, y) = - \frac{1}{C} * \sum_i y[i] * \log((1 + \exp(-x[i]))^{-1})
                         + (1-y[i]) * \log\left(\frac{\exp(-x[i])}{(1 + \exp(-x[i]))}\right)

    where :math:`i \in \left\{0, \; \cdots , \; \text{x.nElement}() - 1\right\}`,
    :math:`y[i] \in \left\{0, \; 1\right\}`.

    Args:
        weight (Tensor, optional): a manual rescaling weight given to each
            class. If given, it has to be a Tensor of size `C`. Otherwise, it is
            treated as if having all ones.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, C)` where `N` is the batch size and `C` is the number of classes.
        - Target: :math:`(N, C)`, same shape as the input, with every entry
          :math:`y[i] \in \left\{0, \; 1\right\}` as stated above.
        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
    """
    # Attributes read inside the script method below.
    __constants__ = ['weight', 'reduction']

    def __init__(self, weight=None, size_average=None, reduce=None, reduction='mean'):
        # _WeightedLoss registers ``weight`` as a buffer and resolves the
        # effective reduction string.
        super(MultiLabelSoftMarginLoss, self).__init__(weight, size_average, reduce, reduction)

    @weak_script_method
    def forward(self, input, target):
        # Thin wrapper over the functional form.
        return F.multilabel_soft_margin_loss(input, target, weight=self.weight, reduction=self.reduction)
982
983
@weak_module
class CosineEmbeddingLoss(_Loss):
    r"""Creates a criterion that measures the loss given input tensors
    :math:`x_1`, :math:`x_2` and a `Tensor` label :math:`y` with values 1 or -1.
    This is used for measuring whether two inputs are similar or dissimilar,
    using the cosine distance, and is typically used for learning nonlinear
    embeddings or semi-supervised learning.

    The loss function for each sample is:

    .. math::
        \text{loss}(x, y) =
        \begin{cases}
        1 - \cos(x_1, x_2), & \text{if } y = 1 \\
        \max(0, \cos(x_1, x_2) - \text{margin}), & \text{if } y = -1
        \end{cases}

    Args:
        margin (float, optional): Should be a number from :math:`-1` to :math:`1`,
            :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the
            default value is :math:`0`.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``
    """
    # Attributes read inside the script method below.
    __constants__ = ['margin', 'reduction']

    def __init__(self, margin=0., size_average=None, reduce=None, reduction='mean'):
        # _Loss stores the effective reduction string on ``self.reduction``.
        super(CosineEmbeddingLoss, self).__init__(size_average, reduce, reduction)
        self.margin = margin

    @weak_script_method
    def forward(self, input1, input2, target):
        # Thin wrapper over the functional form.
        return F.cosine_embedding_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
1030
1031
@weak_module
class MarginRankingLoss(_Loss):
    r"""Creates a criterion that measures the loss given
    inputs :math:`x1`, :math:`x2`, two 1D mini-batch `Tensors`,
    and a label 1D mini-batch tensor :math:`y` (containing 1 or -1).

    If :math:`y = 1` then it is assumed the first input should be ranked higher
    (have a larger value) than the second input, and vice-versa for :math:`y = -1`.

    The loss function for each sample in the mini-batch is:

    .. math::
        \text{loss}(x, y) = \max(0, -y * (x1 - x2) + \text{margin})

    Args:
        margin (float, optional): Has a default value of :math:`0`.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, D)` where `N` is the batch size and `D` is the size of a sample.
        - Target: :math:`(N)`
        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.
    """
    # Attributes read inside the script method below.
    __constants__ = ['margin', 'reduction']

    def __init__(self, margin=0., size_average=None, reduce=None, reduction='mean'):
        # _Loss stores the effective reduction string on ``self.reduction``.
        super(MarginRankingLoss, self).__init__(size_average, reduce, reduction)
        self.margin = margin

    @weak_script_method
    def forward(self, input1, input2, target):
        # Thin wrapper over the functional form.
        return F.margin_ranking_loss(input1, input2, target, margin=self.margin, reduction=self.reduction)
1078
1079
@weak_module
class MultiMarginLoss(_WeightedLoss):
    r"""Creates a criterion that optimizes a multi-class classification hinge
    loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) and
    output :math:`y` (which is a 1D tensor of target class indices,
    :math:`0 \leq y \leq \text{x.size}(1)-1`):

    For each mini-batch sample, the loss in terms of the 1D input :math:`x` and scalar
    output :math:`y` is:

    .. math::
        \text{loss}(x, y) = \frac{\sum_i \max(0, \text{margin} - x[y] + x[i])^p}{\text{x.size}(0)}

    where :math:`i \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`
    and :math:`i \neq y`.

    Optionally, you can give non-equal weighting on the classes by passing
    a 1D :attr:`weight` tensor into the constructor.

    The loss function then becomes:

    .. math::
        \text{loss}(x, y) = \frac{\sum_i \max(0, w[y] * (\text{margin} - x[y] + x[i]))^p}{\text{x.size}(0)}

    Args:
        p (int, optional): Has a default value of :math:`1`. :math:`1` and :math:`2`
            are the only supported values.
        margin (float, optional): Has a default value of :math:`1`.
        weight (Tensor, optional): a manual rescaling weight given to each
            class. If given, it has to be a Tensor of size `C`. Otherwise, it is
            treated as if having all ones.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Raises:
        ValueError: if :attr:`p` is neither 1 nor 2.
        AssertionError: if :attr:`weight` is given and is not one-dimensional.
    """
    # Attributes read inside the script method below.
    __constants__ = ['p', 'margin', 'weight', 'reduction']

    def __init__(self, p=1, margin=1., weight=None, size_average=None,
                 reduce=None, reduction='mean'):
        # _WeightedLoss registers ``weight`` as a buffer and resolves the
        # effective reduction string.
        super(MultiMarginLoss, self).__init__(weight, size_average, reduce, reduction)
        if p != 1 and p != 2:
            raise ValueError("only p == 1 and p == 2 supported")
        # Kept as an assert so the exception type seen by callers is unchanged.
        assert weight is None or weight.dim() == 1
        self.p = p
        self.margin = margin

    @weak_script_method
    def forward(self, input, target):
        # Thin wrapper over the functional form.
        return F.multi_margin_loss(input, target, p=self.p, margin=self.margin,
                                   weight=self.weight, reduction=self.reduction)
1142
1143
@weak_module
class TripletMarginLoss(_Loss):
    r"""Creates a criterion that measures the triplet loss given input
    tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`.
    This is used for measuring a relative similarity between samples. A triplet
    is composed by `a`, `p` and `n`: `anchor`, `positive examples` and `negative
    examples` respectively. The shapes of all input tensors should be
    :math:`(N, D)`.

    The distance swap is described in detail in the paper `Learning shallow
    convolutional feature descriptors with triplet losses`_ by
    V. Balntas, E. Riba et al.

    The loss function for each sample in the mini-batch is:

    .. math::
        L(a, p, n) = \max \{d(a_i, p_i) - d(a_i, n_i) + {\rm margin}, 0\}


    where

    .. math::
        d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p

    Args:
        margin (float, optional): Default: :math:`1`.
        p (int, optional): The norm degree for pairwise distance. Default: :math:`2`.
        eps (float, optional): Forwarded unchanged as the ``eps`` argument of
            :func:`torch.nn.functional.triplet_margin_loss`. Default: :math:`1e-6`.
        swap (bool, optional): The distance swap is described in detail in the paper
            `Learning shallow convolutional feature descriptors with triplet losses` by
            V. Balntas, E. Riba et al. Default: ``False``.
        size_average (bool, optional): Deprecated (see :attr:`reduction`). By default,
            the losses are averaged over each loss element in the batch. Note that for
            some losses, there are multiple elements per sample. If the field :attr:`size_average`
            is set to ``False``, the losses are instead summed for each minibatch. Ignored
            when reduce is ``False``. Default: ``True``
        reduce (bool, optional): Deprecated (see :attr:`reduction`). By default, the
            losses are averaged or summed over observations for each minibatch depending
            on :attr:`size_average`. When :attr:`reduce` is ``False``, returns a loss per
            batch element instead and ignores :attr:`size_average`. Default: ``True``
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the sum of the output will be divided by the number of
            elements in the output, ``'sum'``: the output will be summed. Note: :attr:`size_average`
            and :attr:`reduce` are in the process of being deprecated, and in the meantime,
            specifying either of those two args will override :attr:`reduction`. Default: ``'mean'``

    Shape:
        - Input: :math:`(N, D)` where :math:`D` is the vector dimension.
        - Output: scalar. If :attr:`reduction` is ``'none'``, then :math:`(N)`.

    >>> triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)
    >>> input1 = torch.randn(100, 128, requires_grad=True)
    >>> input2 = torch.randn(100, 128, requires_grad=True)
    >>> input3 = torch.randn(100, 128, requires_grad=True)
    >>> output = triplet_loss(input1, input2, input3)
    >>> output.backward()

    .. _Learning shallow convolutional feature descriptors with triplet losses:
        http://www.iis.ee.ic.ac.uk/%7Evbalnt/shallow_descr/TFeat_paper.pdf
    """
    # Attributes read inside the script method below.
    __constants__ = ['margin', 'p', 'eps', 'swap', 'reduction']

    def __init__(self, margin=1.0, p=2., eps=1e-6, swap=False, size_average=None,
                 reduce=None, reduction='mean'):
        # _Loss stores the effective reduction string on ``self.reduction``.
        super(TripletMarginLoss, self).__init__(size_average, reduce, reduction)
        self.margin = margin
        self.p = p
        self.eps = eps
        self.swap = swap

    @weak_script_method
    def forward(self, anchor, positive, negative):
        # Thin wrapper over the functional form.
        return F.triplet_margin_loss(anchor, positive, negative, margin=self.margin, p=self.p,
                                     eps=self.eps, swap=self.swap, reduction=self.reduction)
1218
1219
@weak_module
class CTCLoss(_Loss):
    r"""The Connectionist Temporal Classification loss.

    Args:
        blank (int, optional): blank label. Default :math:`0`.
        reduction (string, optional): Specifies the reduction to apply to the output:
            ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied,
            ``'mean'``: the output losses will be divided by the target lengths and
            then the mean over the batch is taken. Default: ``'mean'``
        zero_infinity (bool, optional):
            Whether to zero infinite losses and the associated gradients.
            Default: ``False``
            Infinite losses mainly occur when the inputs are too short
            to be aligned to the targets.

    Inputs:
        log_probs: Tensor of size :math:`(T, N, C)` where `C = number of characters in alphabet including blank`,
            `T = input length`, and `N = batch size`.
            The logarithmized probabilities of the outputs
            (e.g. obtained with :func:`torch.nn.functional.log_softmax`).
        targets: Tensor of size :math:`(N, S)` or `(sum(target_lengths))`.
            Targets (cannot be blank). In the second form, the targets are assumed to be concatenated.
        input_lengths: Tuple or tensor of size :math:`(N)`.
            Lengths of the inputs (must each be :math:`\leq T`)
        target_lengths: Tuple or tensor of size :math:`(N)`.
            Lengths of the targets

    Example::

        >>> ctc_loss = nn.CTCLoss()
        >>> log_probs = torch.randn(50, 16, 20).log_softmax(2).detach().requires_grad_()
        >>> targets = torch.randint(1, 20, (16, 30), dtype=torch.long)
        >>> input_lengths = torch.full((16,), 50, dtype=torch.long)
        >>> target_lengths = torch.randint(10,30,(16,), dtype=torch.long)
        >>> loss = ctc_loss(log_probs, targets, input_lengths, target_lengths)
        >>> loss.backward()

    Reference:
        A. Graves et al.: Connectionist Temporal Classification:
        Labelling Unsegmented Sequence Data with Recurrent Neural Networks:
        https://www.cs.toronto.edu/~graves/icml_2006.pdf

    .. Note::
        In order to use CuDNN, the following must be satisfied: :attr:`targets` must be
        in concatenated format, all :attr:`input_lengths` must be `T`. :math:`blank=0`,
        :attr:`target_lengths` :math:`\leq 256`, the integer arguments must be of
        dtype :attr:`torch.int32`.

        The regular implementation uses the (more common in PyTorch) `torch.long` dtype.


    .. include:: cudnn_deterministic.rst


    """
    # Every attribute read by ``forward`` is listed, including
    # ``zero_infinity``, matching how the other losses in this file declare
    # the attributes their script methods use.
    __constants__ = ['blank', 'reduction', 'zero_infinity']

    def __init__(self, blank=0, reduction='mean', zero_infinity=False):
        # Only ``reduction`` is forwarded; the deprecated size_average/reduce
        # flags are not part of this constructor.
        super(CTCLoss, self).__init__(reduction=reduction)
        self.blank = blank
        self.zero_infinity = zero_infinity

    @weak_script_method
    def forward(self, log_probs, targets, input_lengths, target_lengths):
        # blank, reduction and zero_infinity are passed positionally, in that order.
        return F.ctc_loss(log_probs, targets, input_lengths, target_lengths, self.blank, self.reduction,
                          self.zero_infinity)
1287
1288 # TODO: L1HingeEmbeddingCriterion
1289 # TODO: MSECriterion weight
1290 # TODO: ClassSimplexCriterion