Caffe2 - Python API
A deep learning, cross-platform ML framework
gradcheck.py
import torch
from torch._six import container_abcs, istuple
import torch.testing
import sys
from itertools import product
import warnings


def zero_gradients(x):
    if isinstance(x, torch.Tensor):
        if x.grad is not None:
            x.grad.detach_()
            x.grad.data.zero_()
    elif isinstance(x, container_abcs.Iterable):
        for elem in x:
            zero_gradients(elem)


def make_jacobian(input, num_out):
    if isinstance(input, torch.Tensor):
        if not input.is_floating_point():
            return None
        if not input.requires_grad:
            return None
        return torch.zeros(input.nelement(), num_out, dtype=input.dtype)
    elif isinstance(input, container_abcs.Iterable) and not isinstance(input, str):
        jacobians = list(filter(
            lambda x: x is not None, (make_jacobian(elem, num_out) for elem in input)))
        if not jacobians:
            return None
        return type(input)(jacobians)
    else:
        return None


def iter_tensors(x, only_requiring_grad=False):
    if isinstance(x, torch.Tensor):
        if x.requires_grad or not only_requiring_grad:
            yield x
    elif isinstance(x, container_abcs.Iterable) and not isinstance(x, str):
        for elem in x:
            for result in iter_tensors(elem, only_requiring_grad):
                yield result

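# Illustrative sketch (not part of the original module; underscore names are
# hypothetical): make_jacobian allocates one zero matrix per differentiable
# floating-point input, with one row per input element and one column per
# output element, while iter_tensors flattens (possibly nested) inputs into a
# plain stream of tensors.
_x = torch.randn(2, 3, dtype=torch.double, requires_grad=True)
_jac = make_jacobian(_x, num_out=4)      # torch.zeros(6, 4): 6 input elements, 4 outputs
_leaves = list(iter_tensors((_x, [_x]), only_requiring_grad=True))  # [_x, _x]
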
def get_numerical_jacobian(fn, input, target=None, eps=1e-3):
    """
    input: input to `fn`
    target: the Tensors with respect to which Jacobians are calculated (default=`input`)

    Note that `target` may not even be part of `input` to `fn`, so be
    **very careful** here not to clone `target`.
    """
    if target is None:
        target = input
    output_size = fn(input).numel()
    jacobian = make_jacobian(target, output_size)

    # It's much easier to iterate over flattened lists of tensors.
    # These are references to the same objects in jacobian, so any changes
    # will be reflected in it as well.
    x_tensors = [t for t in iter_tensors(target, True)]
    j_tensors = [t for t in iter_tensors(jacobian)]

    # TODO: compare structure
    for x_tensor, d_tensor in zip(x_tensors, j_tensors):
        # need .data here to get around the version check: without .data, the
        # following code updates the version counter but doesn't change the content
        if x_tensor.is_sparse:
            def get_stride(size):
                dim = len(size)
                tmp = 1
                stride = [0] * dim
                for i in reversed(range(dim)):
                    stride[i] = tmp
                    tmp *= size[i]
                return stride

            x_nnz = x_tensor._nnz()
            x_size = list(x_tensor.size())
            x_indices = x_tensor._indices().t()
            x_values = x_tensor._values().data
            x_stride = get_stride(x_size)

            for i in range(x_nnz):
                x_value = x_values[i]
                for x_idx in product(*[range(m) for m in x_values.size()[1:]]):
                    indices = x_indices[i].tolist() + list(x_idx)
                    d_idx = sum(indices[k] * x_stride[k] for k in range(len(x_size)))

                    orig = x_value[x_idx].item()
                    x_value[x_idx] = orig - eps
                    outa = fn(input).clone()
                    x_value[x_idx] = orig + eps
                    outb = fn(input).clone()
                    x_value[x_idx] = orig
                    r = (outb - outa) / (2 * eps)
                    d_tensor[d_idx] = r.detach().reshape(-1)
        else:
            x_tensor = x_tensor.data
            for d_idx, x_idx in enumerate(product(*[range(m) for m in x_tensor.size()])):
                orig = x_tensor[x_idx].item()
                x_tensor[x_idx] = orig - eps
                outa = fn(input).clone()
                x_tensor[x_idx] = orig + eps
                outb = fn(input).clone()
                x_tensor[x_idx] = orig
                r = (outb - outa) / (2 * eps)
                d_tensor[d_idx] = r.detach().reshape(-1)

    return jacobian

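# Illustrative sketch (not part of the original module; underscore names are
# hypothetical): the numerical Jacobian is filled row by row with the central
# difference (fn(x + eps*e_i) - fn(x - eps*e_i)) / (2*eps), so entry [i, j]
# approximates d(output_j) / d(input_i).
_x = torch.randn(3, dtype=torch.double, requires_grad=True)
_num_jac = get_numerical_jacobian(lambda inp: torch.tanh(inp[0]), (_x,), eps=1e-6)
# _num_jac is a tuple holding one (3, 3) matrix; for this elementwise op it is
# approximately diagonal with entries 1 - tanh(_x)**2.
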
def get_analytical_jacobian(input, output):
    # it is easier to call to_dense() on the sparse output than
    # to modify analytical jacobian
    if output.is_sparse:
        raise ValueError('Sparse output is not supported at gradcheck yet. '
                         'Please call to_dense() on the output of fn for gradcheck.')
    diff_input_list = list(iter_tensors(input, True))
    jacobian = make_jacobian(input, output.numel())
    jacobian_reentrant = make_jacobian(input, output.numel())
    grad_output = torch.zeros_like(output)
    flat_grad_output = grad_output.view(-1)
    reentrant = True
    correct_grad_sizes = True

    for i in range(flat_grad_output.numel()):
        flat_grad_output.zero_()
        flat_grad_output[i] = 1
        for jacobian_c in (jacobian, jacobian_reentrant):
            grads_input = torch.autograd.grad(output, diff_input_list, grad_output,
                                              retain_graph=True, allow_unused=True)
            for jacobian_x, d_x, x in zip(jacobian_c, grads_input, diff_input_list):
                if d_x is not None and d_x.size() != x.size():
                    correct_grad_sizes = False
                elif jacobian_x.numel() != 0:
                    if d_x is None:
                        jacobian_x[:, i].zero_()
                    else:
                        d_x_dense = d_x.to_dense() if d_x.is_sparse else d_x
                        assert jacobian_x[:, i].numel() == d_x_dense.numel()
                        jacobian_x[:, i] = d_x_dense.contiguous().view(-1)

    for jacobian_x, jacobian_reentrant_x in zip(jacobian, jacobian_reentrant):
        if jacobian_x.numel() != 0 and (jacobian_x - jacobian_reentrant_x).abs().max() != 0:
            reentrant = False

    return jacobian, reentrant, correct_grad_sizes

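# Illustrative sketch (not part of the original module; underscore names are
# hypothetical): each column i of the analytical Jacobian is the
# vector-Jacobian product obtained by backpropagating a one-hot grad_output
# e_i.  The result has the same (input.nelement(), output.numel()) layout as
# the numerical Jacobian above, which is what lets gradcheck compare them.
_x = torch.randn(3, dtype=torch.double, requires_grad=True)
_out = torch.tanh(_x)
_analytical, _reentrant, _correct_sizes = get_analytical_jacobian((_x,), _out)
# _analytical[0] is a (3, 3) matrix, approximately diag(1 - tanh(_x)**2).
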
def _as_tuple(x):
    if istuple(x):
        return x
    elif isinstance(x, list):
        return tuple(x)
    else:
        return x,


def _differentiable_outputs(x):
    return tuple(o for o in _as_tuple(x) if o.requires_grad)

def gradcheck(func, inputs, eps=1e-6, atol=1e-5, rtol=1e-3, raise_exception=True, check_sparse_nnz=False):
    r"""Check gradients computed via small finite differences against analytical
    gradients w.r.t. tensors in :attr:`inputs` that are of floating point type
    and with ``requires_grad=True``.

    The check between numerical and analytical gradients uses :func:`~torch.allclose`.

    .. note::
        The default values are designed for :attr:`input` of double precision.
        This check will likely fail if :attr:`input` is of less precision, e.g.,
        ``FloatTensor``.

    .. warning::
        If any checked tensor in :attr:`input` has overlapping memory, i.e.,
        different indices pointing to the same memory address (e.g., from
        :func:`torch.expand`), this check will likely fail because the numerical
        gradients computed by point perturbation at such indices will change
        values at all other indices that share the same memory address.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a Tensor or a tuple of Tensors
        inputs (tuple of Tensor or Tensor): inputs to the function
        eps (float, optional): perturbation for finite differences
        atol (float, optional): absolute tolerance
        rtol (float, optional): relative tolerance
        raise_exception (bool, optional): indicating whether to raise an exception if
            the check fails. The exception gives more information about the
            exact nature of the failure. This is helpful when debugging gradchecks.
        check_sparse_nnz (bool, optional): if True, gradcheck allows SparseTensor inputs,
            and for any SparseTensor in the input, gradcheck will perform its check at
            nnz positions only.

    Returns:
        True if all differences satisfy allclose condition
    """
    def fail_test(msg):
        if raise_exception:
            raise RuntimeError(msg)
        return False

    tupled_inputs = _as_tuple(inputs)
    if any(t.is_sparse for t in tupled_inputs if isinstance(t, torch.Tensor)) and not check_sparse_nnz:
        fail_test('gradcheck expects all tensor inputs '
                  'to be dense when check_sparse_nnz is set to False.')

    # Make sure that gradients are saved for all inputs
    any_input_requiring_grad = False
    for inp in tupled_inputs:
        if isinstance(inp, torch.Tensor):
            if inp.requires_grad:
                if inp.dtype != torch.float64:
                    warnings.warn(
                        'At least one of the inputs that requires gradient '
                        'is not of double precision floating point. '
                        'This check will likely fail if not all the inputs are '
                        'of double precision floating point. ')
                any_input_requiring_grad = True
                inp.retain_grad()
    if not any_input_requiring_grad:
        raise ValueError(
            'gradcheck expects at least one input tensor to require gradient, '
            'but none of them have requires_grad=True.')

    output = _differentiable_outputs(func(*tupled_inputs))

    for i, o in enumerate(output):
        if not o.requires_grad:
            continue

        def fn(input):
            return _as_tuple(func(*input))[i]

        analytical, reentrant, correct_grad_sizes = get_analytical_jacobian(tupled_inputs, o)
        numerical = get_numerical_jacobian(fn, tupled_inputs, eps=eps)

        if not correct_grad_sizes:
            return fail_test('Analytical gradient has incorrect size')

        for j, (a, n) in enumerate(zip(analytical, numerical)):
            if a.numel() != 0 or n.numel() != 0:
                if not torch.allclose(a, n, rtol, atol):
                    return fail_test('Jacobian mismatch for output %d with respect to input %d,\n'
                                     'numerical:%s\nanalytical:%s\n' % (i, j, n, a))

        if not reentrant:
            return fail_test('Backward is not reentrant, i.e., running backward with same '
                             'input and grad_output multiple times gives different values, '
                             'although analytical gradient matches numerical gradient')

    # check if the backward multiplies by grad_output
    output = _differentiable_outputs(func(*tupled_inputs))
    if any([o.requires_grad for o in output]):
        diff_input_list = list(iter_tensors(tupled_inputs, True))
        if not diff_input_list:
            raise RuntimeError("no Tensors requiring grad found in input")
        grads_input = torch.autograd.grad(output, diff_input_list, [torch.zeros_like(o) for o in output],
                                          allow_unused=True)
        for gi, i in zip(grads_input, diff_input_list):
            if gi is None:
                continue
            if isinstance(gi, torch.Tensor) and gi.is_sparse:
                if gi.layout != i.layout:
                    return fail_test('grad is sparse tensor, but has incorrect layout')
                if gi.sparse_dim() != i.sparse_dim():
                    return fail_test('grad is sparse tensor, but has incorrect sparse_dim')
                if gi.dense_dim() != i.dense_dim():
                    return fail_test('grad is sparse tensor, but has incorrect dense_dim')
                gi = gi.to_dense()
                i = i.to_dense()
            if not gi.eq(0).all():
                return fail_test('backward not multiplied by grad_output')
            if gi.type() != i.type():
                return fail_test("grad is incorrect type")
            if gi.size() != i.size():
                return fail_test('grad is incorrect size')

    return True

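# Usage sketch (not part of the original module; underscore names are
# hypothetical): gradcheck is typically called on double-precision inputs,
# since in single precision the finite-difference estimate is too noisy for
# the default tolerances.
_inputs = (torch.randn(4, 5, dtype=torch.double, requires_grad=True),)
assert gradcheck(torch.sigmoid, _inputs, eps=1e-6, atol=1e-4)
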
def gradgradcheck(func, inputs, grad_outputs=None, eps=1e-6, atol=1e-5, rtol=1e-3,
                  gen_non_contig_grad_outputs=False, raise_exception=True):
    r"""Check gradients of gradients computed via small finite differences
    against analytical gradients w.r.t. tensors in :attr:`inputs` and
    :attr:`grad_outputs` that are of floating point type and with
    ``requires_grad=True``.

    This function checks that backpropagating through the gradients computed
    to the given :attr:`grad_outputs` is correct.

    The check between numerical and analytical gradients uses :func:`~torch.allclose`.

    .. note::
        The default values are designed for :attr:`input` and
        :attr:`grad_outputs` of double precision. This check will likely fail if
        they are of less precision, e.g., ``FloatTensor``.

    .. warning::
        If any checked tensor in :attr:`input` and :attr:`grad_outputs` has
        overlapping memory, i.e., different indices pointing to the same memory
        address (e.g., from :func:`torch.expand`), this check will likely fail
        because the numerical gradients computed by point perturbation at such
        indices will change values at all other indices that share the same
        memory address.

    Args:
        func (function): a Python function that takes Tensor inputs and returns
            a Tensor or a tuple of Tensors
        inputs (tuple of Tensor or Tensor): inputs to the function
        grad_outputs (tuple of Tensor or Tensor, optional): The gradients with
            respect to the function's outputs.
        eps (float, optional): perturbation for finite differences
        atol (float, optional): absolute tolerance
        rtol (float, optional): relative tolerance
        gen_non_contig_grad_outputs (bool, optional): if :attr:`grad_outputs` is
            ``None`` and :attr:`gen_non_contig_grad_outputs` is ``True``, the
            randomly generated gradient outputs are made to be noncontiguous
        raise_exception (bool, optional): indicating whether to raise an exception if
            the check fails. The exception gives more information about the
            exact nature of the failure. This is helpful when debugging gradchecks.

    Returns:
        True if all differences satisfy allclose condition
    """
    tupled_inputs = _as_tuple(inputs)

    if grad_outputs is None:
        # If grad_outputs is not specified, create random Tensors of the same
        # shape, type, and device as the outputs
        def randn_like(x):
            y = torch.testing.randn_like(x if x.is_floating_point() else x.double())
            if gen_non_contig_grad_outputs:
                y = torch.testing.make_non_contiguous(y)
            return y.requires_grad_()
        outputs = _as_tuple(func(*tupled_inputs))
        tupled_grad_outputs = tuple(randn_like(x) for x in outputs)
    else:
        tupled_grad_outputs = _as_tuple(grad_outputs)

    num_outputs = len(tupled_grad_outputs)

    def new_func(*args):
        input_args = args[:-num_outputs]
        grad_outputs = args[-num_outputs:]
        outputs = _differentiable_outputs(func(*input_args))
        input_args = tuple(x for x in input_args if isinstance(x, torch.Tensor) and x.requires_grad)
        grad_inputs = torch.autograd.grad(outputs, input_args, grad_outputs, create_graph=True)
        return grad_inputs

    return gradcheck(new_func, tupled_inputs + tupled_grad_outputs, eps, atol, rtol, raise_exception)

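# Usage sketch (not part of the original module; underscore names are
# hypothetical): gradgradcheck wires the first-order gradients back into
# gradcheck, so it verifies gradients of gradients.  When grad_outputs is left
# as None, random (optionally non-contiguous) grad_outputs are generated.
_x = torch.randn(3, 4, dtype=torch.double, requires_grad=True)
assert gradgradcheck(lambda t: (t * t).sum(), (_x,))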