Caffe2 - Python API
A deep learning, cross-platform ML framework
1 """
2 ``torch.autograd`` provides classes and functions implementing automatic
3 differentiation of arbitrary scalar valued functions. It requires minimal
4 changes to the existing code - you only need to declare :class:`Tensor` s
5 for which gradients should be computed with the ``requires_grad=True`` keyword.
6 """
7 import torch
8 import warnings
10 from .variable import Variable
11 from .function import Function, NestedIOFunction
12 from .gradcheck import gradcheck, gradgradcheck
13 from .grad_mode import no_grad, enable_grad, set_grad_enabled
14 from .anomaly_mode import detect_anomaly, set_detect_anomaly
15 from . import profiler
# Names exported by a star-import of this package.
__all__ = [
    "Variable",
    "Function",
    "backward",
    "grad_mode",
]
def _make_grads(outputs, grads):
    """Normalize the gradient seeds that accompany ``outputs``.

    ``outputs`` and ``grads`` are walked in lockstep. For every pair:

    * a Tensor gradient is kept as is;
    * ``None`` for an output that requires grad becomes ``torch.ones_like``
      of that output, which is only allowed when the output holds exactly
      one element;
    * ``None`` for an output that does not require grad stays ``None``.

    Returns:
        tuple: one entry (Tensor or ``None``) per ``(output, grad)`` pair.

    Raises:
        RuntimeError: if a gradient would have to be created implicitly for
            an output with more than one element.
        TypeError: if a gradient is neither a Tensor nor ``None``.
    """
    def _seed_for(output, grad):
        # Resolve a single (output, grad) pair to the seed handed onwards.
        if grad is None:
            if not output.requires_grad:
                return None
            if output.numel() != 1:
                raise RuntimeError("grad can be implicitly created only for scalar outputs")
            return torch.ones_like(output)
        if isinstance(grad, torch.Tensor):
            return grad
        raise TypeError("gradients can be either Tensors or None, but got " +
                        type(grad).__name__)

    return tuple(_seed_for(output, grad) for output, grad in zip(outputs, grads))
def backward(tensors, grad_tensors=None, retain_graph=None, create_graph=False, grad_variables=None):
    r"""Computes the sum of gradients of given tensors w.r.t. graph leaves.

    The graph is differentiated with the chain rule. When any of ``tensors``
    is non-scalar (its data has more than one element) and requires gradient,
    a Jacobian-vector product is computed and ``grad_tensors`` has to be
    supplied. It should be a sequence of matching length holding the "vector"
    of that product, usually the gradient of the differentiated function
    w.r.t. the corresponding tensors (``None`` is acceptable for every tensor
    that does not need a gradient tensor).

    This function accumulates gradients in the leaves - you might need to zero
    them before calling it.

    Arguments:
        tensors (Tensor or sequence of Tensor): Tensors of which the
            derivative will be computed. A single Tensor is wrapped into a
            one-element tuple.
        grad_tensors (Tensor or sequence of (Tensor or None), optional): The
            "vector" in the Jacobian-vector product, usually gradients w.r.t.
            each element of the corresponding tensors. ``None`` entries can be
            used for scalar Tensors or for ones that don't require grad. If
            ``None`` would be acceptable for every entry, the argument can be
            omitted altogether.
        retain_graph (bool, optional): If ``False``, the graph used to compute
            the grad will be freed. Setting this to ``True`` is rarely needed
            and can usually be worked around in a much more efficient way.
            Defaults to the value of ``create_graph``.
        create_graph (bool, optional): If ``True``, graph of the derivative
            will be constructed, allowing to compute higher order derivative
            products. Defaults to ``False``.
        grad_variables (optional): Deprecated spelling of ``grad_tensors``.
            Supplying it emits a warning; its value is used only when
            ``grad_tensors`` was not given.

    Raises:
        RuntimeError: if both ``grad_tensors`` and ``grad_variables`` are
            passed.
    """
    if grad_variables is not None:
        # The warning is issued before the conflict check, so callers that
        # pass both arguments see the deprecation notice as well as the error.
        warnings.warn("'grad_variables' is deprecated. Use 'grad_tensors' instead.")
        if grad_tensors is not None:
            raise RuntimeError("'grad_tensors' and 'grad_variables' (deprecated) "
                               "arguments both passed to backward(). Please only "
                               "use 'grad_tensors'.")
        grad_tensors = grad_variables

    if isinstance(tensors, torch.Tensor):
        tensors = (tensors,)
    else:
        tensors = tuple(tensors)

    # Bring the seeds into list form: one slot per tensor when nothing was
    # given, a one-element list for a lone Tensor, a plain copy otherwise.
    if grad_tensors is None:
        seeds = [None] * len(tensors)
    elif isinstance(grad_tensors, torch.Tensor):
        seeds = [grad_tensors]
    else:
        seeds = list(grad_tensors)
    seeds = _make_grads(tensors, seeds)

    keep_graph = create_graph if retain_graph is None else retain_graph

    Variable._execution_engine.run_backward(
        tensors, seeds, keep_graph, create_graph,
        allow_unreachable=True)  # allow_unreachable flag
def grad(outputs, inputs, grad_outputs=None, retain_graph=None, create_graph=False,
         only_inputs=True, allow_unused=False):
    r"""Computes and returns the sum of gradients of outputs w.r.t. the inputs.

    ``grad_outputs`` should be a sequence of length matching ``outputs``
    containing the "vector" in the Jacobian-vector product, usually the
    pre-computed gradients w.r.t. each of the outputs. If an output doesn't
    require grad, its gradient can be ``None``.

    ``only_inputs`` is deprecated: passing a falsy value only triggers a
    warning and has no other effect on this function. To accumulate gradients
    for other parts of the graph, use ``torch.autograd.backward`` instead.

    Arguments:
        outputs (Tensor or sequence of Tensor): outputs of the differentiated
            function.
        inputs (Tensor or sequence of Tensor): Inputs w.r.t. which the
            gradient will be returned (and not accumulated into ``.grad``).
        grad_outputs (Tensor or sequence of (Tensor or None), optional): The
            "vector" in the Jacobian-vector product. Usually gradients w.r.t.
            each output. ``None`` entries can be used for scalar Tensors or
            for ones that don't require grad. If ``None`` would be acceptable
            for every entry, the argument can be omitted. Default: ``None``.
        retain_graph (bool, optional): If ``False``, the graph used to compute
            the grad will be freed. Setting this to ``True`` is rarely needed
            and can usually be worked around in a much more efficient way.
            Defaults to the value of ``create_graph``.
        create_graph (bool, optional): If ``True``, graph of the derivative
            will be constructed, allowing to compute higher order derivative
            products. Default: ``False``.
        only_inputs (bool, optional): Deprecated, see above.
            Default: ``True``.
        allow_unused (bool, optional): If ``False``, specifying inputs that
            were not used when computing outputs (and therefore their grad is
            always zero) is an error. Defaults to ``False``.

    Returns:
        Whatever the execution engine's ``run_backward`` call returns for the
        given ``inputs``.
    """
    def _as_tuple(value):
        # A lone Tensor becomes a 1-tuple; any other iterable is materialised.
        if isinstance(value, torch.Tensor):
            return (value,)
        return tuple(value)

    if not only_inputs:
        warnings.warn("only_inputs argument is deprecated and is ignored now "
                      "(defaults to True). To accumulate gradient for other "
                      "parts of the graph, please use torch.autograd.backward.")

    outputs = _as_tuple(outputs)
    inputs = _as_tuple(inputs)

    if grad_outputs is None:
        seeds = [None] * len(outputs)
    elif isinstance(grad_outputs, torch.Tensor):
        seeds = [grad_outputs]
    else:
        seeds = list(grad_outputs)
    seeds = _make_grads(outputs, seeds)

    keep_graph = create_graph if retain_graph is None else retain_graph

    return Variable._execution_engine.run_backward(
        outputs, seeds, keep_graph, create_graph,
        inputs, allow_unused)
# This helper exists for gradient checkpointing, which is used as a memory
# optimization. At present checkpointing only supports the imperative backward
# call, i.e. torch.autograd.backward(); torch.autograd.grad() will not work.
# The reason is that torch.autograd.grad() only computes gradients for the
# inputs passed by the user and for nothing else (e.g. model parameters such as
# weights and biases), whereas torch.autograd.backward() does compute the
# gradients for the weights as well.
#
# The function reports whether checkpointing is valid (we are running under
# torch.autograd.backward) or not (we are running under torch.autograd.grad).
# It relies on a thread-local variable maintained in
# torch/csrc/autograd/engine.cpp, which looks at the FunctionTask on the stack:
# before a FunctionTask is executed in evaluate_function, it checks whether the
# reentrant backward is imperative or not.
# NOTE(review): the original comment ended with "See ... for more
# discussion/context", but the link itself is missing from this copy.
def _is_checkpoint_valid():
    """Return what the execution engine's ``is_checkpoint_valid()`` reports."""
    engine = Variable._execution_engine
    return engine.is_checkpoint_valid()
def variable(*args, **kwargs):
    """Deprecated shim around ``torch.tensor``.

    Emits a deprecation warning, then forwards every positional and keyword
    argument unchanged to ``torch.tensor`` and returns its result.
    """
    deprecation_note = "torch.autograd.variable(...) is deprecated, use torch.tensor(...) instead"
    warnings.warn(deprecation_note)
    result = torch.tensor(*args, **kwargs)
    return result
# Runs at import time: call the `_autograd_init` hook exposed by `torch._C`
# and abort the import with a RuntimeError if it returns a falsy value.
if not torch._C._autograd_init():
    raise RuntimeError("autograd initialization failed")