Caffe2 - Python API
A deep learning, cross platform ML framework
test_cuda.py
1 import io
2 import math
3 import tempfile
4 import re
5 import unittest
6 import sys
7 from itertools import repeat
8 import os
9 from contextlib import contextmanager
10 import threading
11 import queue
12 
13 import torch
14 import torch.cuda
15 import torch.cuda.comm as comm
16 from torch import multiprocessing as mp
17 from torch._six import inf, nan
18 
19 from test_torch import _TestTorchMixin
20 
21 from common_methods_invocations import tri_tests_args, tri_large_tests_args, \
22  run_additional_tri_tests, _compare_trilu_indices, _compare_large_trilu_indices
23 from common_utils import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, \
24  PY3, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, skipIfRocm, TEST_NUMPY, TEST_WITH_ROCM, load_tests, iter_indices
25 
26 # load_tests from common_utils is used to automatically filter tests for
27 # sharding on sandcastle. This line silences flake warnings
28 load_tests = load_tests
29 
30 # We cannot import TEST_CUDA and TEST_MULTIGPU from common_cuda here,
31 # because if we do that, the TEST_CUDNN line from common_cuda will be executed
32 # multiple times as well during the execution of this test suite, and it will
33 # cause CUDA OOM error on Windows.
34 TEST_CUDA = torch.cuda.is_available()
35 TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2
36 
37 if not TEST_CUDA:
38  print('CUDA not available, skipping tests')
39  TestCase = object # noqa: F811
40 
41 TEST_MAGMA = TEST_CUDA
42 TEST_LARGE_TENSOR = TEST_CUDA
43 if TEST_CUDA:
44  torch.ones(1).cuda() # has_magma shows up after cuda is initialized
45  TEST_MAGMA = torch.cuda.has_magma
46  TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 9e9
47 
48 floating_set = {torch.FloatTensor, torch.DoubleTensor, torch.cuda.FloatTensor,
49  torch.cuda.DoubleTensor, torch.HalfTensor, torch.cuda.HalfTensor}
50 
51 
def is_floating(t):
    """Tell whether the tensor *type* ``t`` is listed in ``floating_set``.

    Raises ``TypeError`` when ``t`` is not a class; ``torch.autograd.Variable``
    is rejected by an assertion.
    """
    if isinstance(t, type):
        assert t != torch.autograd.Variable
        return t in floating_set
    raise TypeError('t should be an instance of type')
57 
58 
def is_half(t):
    """Report whether ``t`` denotes half precision.

    ``t`` may be a tensor instance (its dtype is inspected) or a tensor
    class (membership in the CPU/CUDA ``HalfTensor`` pair is checked).
    """
    if not isinstance(t, torch.Tensor):
        # Class path: only real tensor classes are accepted.
        assert isinstance(t, type)
        assert t != torch.autograd.Variable
        return t in [torch.HalfTensor, torch.cuda.HalfTensor]
    return t.dtype == torch.float16
65 
66 
67 types = [
68  torch.FloatTensor,
69  torch.DoubleTensor,
70  torch.LongTensor,
71  torch.IntTensor,
72  torch.ShortTensor,
73  torch.CharTensor,
74  torch.ByteTensor,
75  torch.HalfTensor,
76 ]
77 
78 signed_types = [
79  torch.FloatTensor,
80  torch.DoubleTensor,
81  torch.LongTensor,
82  torch.IntTensor,
83  torch.ShortTensor,
84  torch.CharTensor,
85 ]
86 
87 unsigned_types = [
88  torch.ByteTensor,
89 ]
90 
91 float_types = [
92  torch.FloatTensor,
93  torch.DoubleTensor,
94  torch.HalfTensor,
95 ]
96 
97 float_types_no_half = [
98  torch.FloatTensor,
99  torch.DoubleTensor,
100 ]
101 
102 
def number(floating, integer, t):
    """Pick ``floating`` for floating-point tensor types, ``integer`` otherwise."""
    if is_floating(t):
        return floating
    return integer
105 
106 
def cast_tensor(tensor, t):
    """Copy ``tensor`` into a freshly constructed tensor of type ``t`` with the same size."""
    converted = t(tensor.size())
    # copy_ returns the destination tensor itself.
    return converted.copy_(tensor)
109 
110 S = 10
111 M = 50
112 G = 275000000
113 
114 
def make_tensor(t, *sizes):
    """Construct a randomly filled tensor of type ``t``.

    Half types are filled by copying from ``torch.randn``; other
    floating-point types use ``normal_()`` and integer types use
    ``random_(0, 10)``.
    """
    if 'Half' not in t.__name__:
        fresh = t(*sizes)
        return fresh.normal_() if fresh.is_floating_point() else fresh.random_(0, 10)
    return t(*sizes).copy_(torch.randn(*sizes))
124 
125 
def make_sparse_tensor(t, n, *sizes):
    """Build a sparse tensor of type ``t`` and shape ``sizes`` from ``n`` random entries.

    One random index row is drawn per dimension (bounded by that dimension's
    size) and the values are drawn from ``torch.randn``.
    """
    assert t.is_sparse
    prototype = t()
    index_proto = prototype._indices()
    index_rows = torch.cat([torch.LongTensor(1, n).random_(s) for s in sizes], 0)
    indices = index_proto.new(len(sizes), n).copy_(index_rows)
    value_proto = prototype._values()
    values = value_proto.new(n).copy_(torch.randn(n))
    return t(indices, values, torch.Size(sizes))
135 
136 
def tensor_clamp(t, min, max):
    """Clamp ``t`` to ``[min, max]``; half tensors are clamped in float and converted back."""
    if not is_half(t):
        return t.clamp(min, max)
    return t.float().clamp(min, max).half()
142 
143 
def tensor_mul(t, scale):
    """Multiply ``t`` by ``scale``; half tensors are multiplied in float and converted back."""
    if not is_half(t):
        return t.mul(scale)
    return t.float().mul(scale).half()
149 
150 
def tensor_abs_(t):
    """Return the absolute value of ``t``.

    Non-half tensors are modified in place via ``abs_()``. For half tensors
    the computation runs on a float conversion and the result is converted
    back to half, so the returned tensor is a new one.
    """
    if not is_half(t):
        return t.abs_()
    as_float = t.float()
    return as_float.abs_().half()
156 
157 
def constant_tensor_sub(a, b):
    """Compute ``a - b`` for a constant ``a`` and a tensor ``b``.

    Helper to address const - torch.HalfTensor where it doesn't have
    resize_as(): half tensors are subtracted in float and converted back.
    """
    if not is_half(b):
        return a - b
    difference = a - b.float()
    return difference.half()
165 
166 
def constant_tensor_add(a, b):
    """Compute ``a + b`` for a constant ``a`` and a tensor ``b``.

    Helper to address const + torch.HalfTensor where it doesn't have add():
    half tensors are added in float and converted back.
    """
    if not is_half(b):
        return a + b
    total = a + b.float()
    return total.half()
174 
175 
def small_0d(t):
    """Return ``make_tensor(t, (1,))`` with its size-one dimensions squeezed away."""
    seed = make_tensor(t, (1,))
    return seed.squeeze()
178 
179 
def small_2d(t):
    """Random ``S x S`` tensor of type ``t``."""
    shape = (S, S)
    return make_tensor(t, *shape)
182 
183 
def small_2d_scaled(t, scale=10):
    """Random ``S x S`` tensor of type ``t`` multiplied by ``scale``."""
    base = make_tensor(t, S, S)
    return tensor_mul(base, scale)
186 
187 
def small_2d_oneish(t):
    """``S x S`` tensor whose entries are close to one.

    Floating types get random values clamped to ``[0.99, 1.01]``; other
    types are filled with exactly 1.
    """
    if not is_floating(t):
        return t(S, S).fill_(1)
    return tensor_clamp(make_tensor(t, S, S), min=0.99, max=1.01)
193 
194 
def small_3d(t):
    """Random ``S x S x S`` tensor of type ``t``."""
    shape = (S, S, S)
    return make_tensor(t, *shape)
197 
198 
def medium_1d(t):
    """Random length-``M`` vector of type ``t``."""
    length = M
    return make_tensor(t, length)
201 
202 
def medium_2d(t):
    """Random ``M x M`` tensor of type ``t``."""
    shape = (M, M)
    return make_tensor(t, *shape)
205 
206 
def medium_2d_expanded(t):
    """``M x M`` expansion of a one-element tensor of type ``t`` (contents are not initialised here)."""
    single = t(1)
    return single.expand(M, M)
209 
210 
def medium_2d_scaled(t, scale=10):
    """Random ``M x M`` tensor of type ``t`` multiplied by ``scale``."""
    base = make_tensor(t, M, M)
    return tensor_mul(base, scale)
213 
214 
def small_3d_ones(t):
    """``S x S x S`` tensor of type ``t`` filled by copying from ``torch.ones``."""
    destination = t(S, S, S)
    return destination.copy_(torch.ones(S, S, S))
217 
218 
def small_3d_positive(t):
    """Random ``S x S x S`` tensor clamped to a strictly positive range (upper bound 120)."""
    # In div_tensor(), half cannot achieve float precision
    if is_floating(t) and not is_half(t):
        lower_bound = 1e-3
    else:
        lower_bound = 2
    return tensor_clamp(make_tensor(t, S, S, S), lower_bound, 120)
223 
224 
def small_3d_unique(t):
    """``S x S x S`` tensor of type ``t`` holding the distinct values ``1 .. S**3``."""
    sequence = torch.arange(1, S * S * S + 1).view(S, S, S)
    return t(S, S, S).copy_(sequence)
227 
228 
def small_1d_lapack(t):
    """``1 x 3`` tensor of type ``t`` holding the values 1, 2, 3."""
    values = torch.arange(1, 4).view(3)
    return t(1, 3).copy_(values)
231 
232 
def small_2d_lapack(t):
    """``3 x 3`` tensor of type ``t`` holding 1..9 in row-major order."""
    values = torch.arange(1, 10).view(3, 3)
    return t(3, 3).copy_(values)
235 
236 
def small_2d_lapack_skinny(t):
    """``3 x 4`` tensor of type ``t`` holding 1..12 in row-major order."""
    values = torch.arange(1, 13).view(3, 4)
    return t(3, 4).copy_(values)
239 
240 
def small_2d_lapack_fat(t):
    """``4 x 3`` tensor of type ``t`` holding 1..12 in row-major order."""
    values = torch.arange(1, 13).view(4, 3)
    return t(4, 3).copy_(values)
243 
244 
def large_2d_lapack(t):
    """``1000 x 1000`` tensor of type ``t`` filled in place by ``normal_()``."""
    matrix = t(1000, 1000)
    return matrix.normal_()
247 
248 
def giant_1d_ones(t):
    """Length-``G`` vector of type ``t`` filled by copying from ``torch.ones``."""
    destination = t(G)
    return destination.copy_(torch.ones(G))
251 
252 
def long_type(t):
    """Return the ``LongTensor`` class matching ``t``'s module (CUDA when ``'cuda'`` is in ``t.__module__``)."""
    if 'cuda' in t.__module__:
        return torch.cuda.LongTensor
    return torch.LongTensor
255 
256 
def new_t(*sizes):
    """Return a constructor ``fn(tensor_type) -> tensor`` producing random tensors of shape ``sizes``."""
    def construct(t):
        destination = t(*sizes)
        return destination.copy_(torch.randn(*sizes))
    return construct
261 
262 # Content of each tuple:
263 # - function name
264 # - constructor for the tensor, signature: fn(tensor_type) -> tensor
265 # - constructor for the arguments, signature: fn(tensor_type) -> list
266 # - postfix name for the test (must be unique for a given function) (default='')
267 # - tensor types to use (default=types)
268 # - disable inplace test, if set to True, no inplace test will be done (default=False)
269 # - decorator, e.g., unittest.skipIf (default is no decorator)
270 tests = [
271  ('add', small_3d, lambda t: [number(3.14, 3, t)]),
272  ('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
273  ('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor'),
274  ('sub', small_3d, lambda t: [number(3.14, 3, t)]),
275  ('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
276  ('mul', small_3d, lambda t: [number(3.14, 3, t)]),
277  ('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
278  ('mul', small_0d, lambda t: [small_0d(torch.IntTensor)], 'scalar', types, True),
279  ('div', small_3d, lambda t: [number(3.14, 3, t)]),
280  ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
281  ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types),
282  ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1'),
283  ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2'),
284  ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3'),
285  ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types),
286  # HalfTensor gives bad result at pow-2 with data sampled from torch.randn
287  ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half, False,
288  "skipIfRocm:FloatTensor"),
289  ('pow', small_3d, lambda t: [tensor_abs_(small_3d(t))], 'tensor', float_types),
290  ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types),
291  ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'),
292  ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'),
293  ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)],),
294  ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'),
295  ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'),
296  ('bmm', small_3d, lambda t: [small_3d(t)], '', float_types_no_half),
297  ('addcdiv', small_2d_lapack, lambda t: [tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)]),
298  ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)],
299  'scalar'),
300  ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)]),
301  ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'),
302  ('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)]),
303  ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar'),
304  ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars'),
305  ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)],),
306  ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar'),
307  ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars'),
308  ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)]),
309  ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar'),
310  ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars'),
311  ('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types + [torch.HalfTensor]),
312  ('fmod', small_3d, lambda t: [3], 'value',),
313  ('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
314  ('chunk', medium_2d, lambda t: [4],),
315  ('chunk', medium_2d, lambda t: [4, 1], 'dim'),
316  ('chunk', medium_2d, lambda t: [4, -2], 'neg_dim'),
317  ('clamp', medium_2d_scaled, lambda t: [-1, 5], None, signed_types),
318  ('clamp', medium_2d_scaled, lambda t: [1, 5], None, unsigned_types),
319  ('clone', medium_2d, lambda t: [],),
320  ('contiguous', medium_2d, lambda t: [],),
321  ('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)],),
322  ('cumprod', small_3d, lambda t: [1]),
323  ('cumprod', small_3d, lambda t: [-1], 'neg_dim'),
324  ('cumsum', small_3d, lambda t: [1]),
325  ('cumsum', small_3d, lambda t: [-1], 'neg_dim'),
326  ('dim', small_3d, lambda t: [],),
327  ('dist', small_2d, lambda t: [small_2d(t)]),
328  ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm'),
329  ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm'),
330  ('dot', medium_1d, lambda t: [medium_1d(t)], '', types, False, "skipIfRocm:HalfTensor"),
331  ('element_size', medium_1d, lambda t: [],),
332  ('eq', small_3d_ones, lambda t: [small_3d(t)],),
333  ('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'),
334  ('ne', small_3d_ones, lambda t: [small_3d(t)],),
335  ('ne', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'),
336  ('equal', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'),
337  ('equal', small_3d_ones, lambda t: [small_3d(t)],),
338  ('expand', new_t(M, 1, M), lambda t: [M, 4, M],),
339  ('expand_as', new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)],),
340  ('fill', medium_2d, lambda t: [number(3.14, 3, t)]),
341  ('ge', medium_2d, lambda t: [medium_2d(t)],),
342  ('le', medium_2d, lambda t: [medium_2d(t)],),
343  ('gt', medium_2d, lambda t: [medium_2d(t)],),
344  ('lt', medium_2d, lambda t: [medium_2d(t)],),
345  ('is_contiguous', medium_2d, lambda t: [],),
346  # TODO: can't check negative case - GPU copy will be contiguous
347  ('is_same_size', medium_2d, lambda t: [small_3d(t)], 'negative'),
348  ('is_same_size', medium_2d, lambda t: [medium_2d(t)], 'positive'),
349  ('is_set_to', medium_2d, lambda t: [medium_2d(t)],),
350  # TODO: positive case
351  ('kthvalue', small_3d_unique, lambda t: [3],),
352  ('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim'),
353  ('kthvalue', small_3d_unique, lambda t: [3, -1], 'neg_dim'),
354  ('lerp', small_3d, lambda t: [small_3d(t), 0.3]),
355  ('max', small_3d_unique, lambda t: []),
356  ('max', small_3d_unique, lambda t: [1], 'dim'),
357  ('max', small_3d_unique, lambda t: [-1], 'neg_dim'),
358  ('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise'),
359  ('min', small_3d_unique, lambda t: []),
360  ('min', small_3d_unique, lambda t: [1], 'dim'),
361  ('min', small_3d_unique, lambda t: [-1], 'neg_dim'),
362  ('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise'),
363  ('mean', small_3d, lambda t: []),
364  ('mean', small_3d, lambda t: [-1], 'neg_dim'),
365  ('mean', small_3d, lambda t: [1], 'dim'),
366  ('mean', giant_1d_ones, lambda t: [], '64bit_indexing',
367  # Double here because otherwise the CPU result will be
368  # wrong.
369  [torch.DoubleTensor]),
370  ('mode', small_3d, lambda t: []),
371  ('mode', small_3d, lambda t: [1], 'dim'),
372  ('mode', small_3d, lambda t: [-1], 'neg_dim'),
373  ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.1, 10), lambda t: [1], '2d_p=1', float_types_no_half),
374  ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.6, 10), lambda t: [2], '2d_p=2', float_types_no_half),
375  ('remainder', small_3d, lambda t: [3], 'value',),
376  ('remainder', small_3d, lambda t: [-3], 'negative_value', signed_types),
377  ('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
378  ('remainder', small_3d, lambda t: [constant_tensor_sub(0, small_3d_positive(t))], 'negative_tensor', signed_types),
379  ('std', small_3d, lambda t: []),
380  ('std', small_3d, lambda t: [1], 'dim', types, False),
381  ('std', small_3d, lambda t: [-1], 'neg_dim', types, False),
382  ('var', small_3d, lambda t: []),
383  ('var', small_3d, lambda t: [1], 'dim'),
384  ('var', small_3d, lambda t: [-1], 'neg_dim'),
385  ('ndimension', small_3d, lambda t: [],),
386  ('nelement', small_3d, lambda t: [],),
387  ('numel', small_3d, lambda t: [],),
388  ('narrow', small_3d, lambda t: [1, 3, 2],),
389  ('narrow', small_3d, lambda t: [-1, 3, 2], 'neg_dim'),
390  ('nonzero', small_3d, lambda t: [], '', types, False),
391  ('norm', small_3d, lambda t: []),
392  ('norm', small_3d, lambda t: [3], '3_norm'),
393  ('norm', small_3d, lambda t: [3, 0], '3_norm_dim'),
394  ('norm', small_3d, lambda t: [3, -2], '3_norm_neg_dim'),
395  ('ones', small_3d, lambda t: [1, 2, 3, 4, 5],),
396  ('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0],),
397  ('put_', new_t(2, 5, 3), lambda t: [long_type(t)([[0], [-2]]), t([[3], [4]])], '', types, False),
398  ('put_', new_t(2, 3), lambda t: [long_type(t)([]), t([])], 'empty'),
399  ('put_', new_t(2, 2), lambda t: [long_type(t)([[1], [-3]]), t([[1], [2]]), True], 'accumulate'),
400  ('prod', small_2d_oneish, lambda t: []),
401  ('prod', small_3d, lambda t: [1], 'dim'),
402  ('prod', small_3d, lambda t: [-1], 'neg_dim'),
403  ('sum', small_2d, lambda t: []),
404  ('sum', small_3d, lambda t: [1], 'dim'),
405  ('sum', small_3d, lambda t: [-1], 'neg_dim'),
406  ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm'),
407  ('renorm', small_3d, lambda t: [2, -1, 1], '2_norm_neg_dim'),
408  ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm'),
409  ('repeat', small_2d, lambda t: [2, 2, 2],),
410  ('size', new_t(1, 2, 3, 4), lambda t: [],),
411  ('size', new_t(1, 2, 3, 4), lambda t: [1], 'dim'),
412  ('size', new_t(1, 2, 3, 4), lambda t: [-2], 'neg_dim'),
413  ('sort', small_3d_unique, lambda t: [], ''),
414  ('sort', small_3d_unique, lambda t: [1], 'dim'),
415  ('sort', small_3d_unique, lambda t: [-1], 'neg_dim'),
416  ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'),
417  ('sort', small_3d_unique, lambda t: [-1, True], 'neg_dim_descending'),
418  ('split', small_3d, lambda t: [2],),
419  ('split', small_3d, lambda t: [2, 1], 'dim'),
420  ('split', small_3d, lambda t: [2, -3], 'neg_dim'),
421  ('squeeze', new_t(1, 2, 1, 4), lambda t: [],),
422  ('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim'),
423  ('squeeze', new_t(1, 2, 1, 4), lambda t: [-2], 'neg_dim'),
424  ('t', new_t(1, 2), lambda t: [],),
425  ('take', new_t(3, 4), lambda t: [long_type(t)([[0], [-2]])], '', types, False),
426  ('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2],),
427  ('transpose', new_t(1, 2, 3, 4), lambda t: [-1, -2], 'neg_dim'),
428  ('to_list', small_3d, lambda t: [],),
429  ('topk', small_3d_unique, lambda t: [2, 1, False, True], 'dim_sort',),
430  ('topk', small_3d_unique, lambda t: [2, -1, False, True], 'neg_dim_sort',),
431  ('topk', small_3d_unique, lambda t: [2, 1, True, True], 'dim_desc_sort',),
432  ('trace', medium_2d, lambda t: []),
433  ('tril', medium_2d, lambda t: [],),
434  ('tril', medium_2d_expanded, lambda t: [], 'zero_stride', types, True),
435  ('tril', medium_2d, lambda t: [2], 'positive'),
436  ('tril', medium_2d, lambda t: [-2], 'negative'),
437  ('triu', medium_2d, lambda t: [],),
438  ('triu', medium_2d_expanded, lambda t: [], 'zero_stride', types, True),
439  ('triu', medium_2d, lambda t: [2], 'positive'),
440  ('triu', medium_2d, lambda t: [-2], 'negative'),
441  ('unsqueeze', new_t(2, 3, 4), lambda t: [2],),
442  ('unsqueeze', new_t(2, 3, 4), lambda t: [-2], 'neg_dim'),
443  ('view', small_3d, lambda t: [100, 10], 'contiguous'),
444  ('view_as', small_3d, lambda t: [make_tensor(t, 100, 10)],),
445  ('zero', small_3d, lambda t: [],),
446  ('zeros', small_3d, lambda t: [1, 2, 3, 4],),
447  ('eye', small_2d, lambda t: [3, 4],),
448  ('flip', small_3d, lambda t: [0], 'd0', types, True),
449  ('flip', small_3d, lambda t: [0, 1, 2], 'd012', types, True),
450  ('flip', small_3d, lambda t: [0, 2], 'd02', types, True),
451  ('flip', small_3d, lambda t: [2, 0], 'd20', types, True),
452  ('flip', small_3d, lambda t: [-1], 'neg_d', types, True),
453  ('rot90', small_2d, lambda t: [1, [0, 1]], 'k1_d01', types, True),
454  ('rot90', small_3d, lambda t: [1, [1, 2]], 'k1_d12', types, True),
455  ('rot90', small_3d, lambda t: [1, [1, -1]], 'k1_neg_d', types, True),
456  ('rot90', small_3d, lambda t: [], 'default', types, True),
457  ('rsqrt', lambda t: constant_tensor_add(1, small_3d(t)), lambda t: [], None, float_types),
458  ('sinh', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types),
459  ('tan', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types),
460  ('__lshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(1, 5), t)),
461  lambda t: [2], None, signed_types),
462  ('__rshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(3, 7), t)),
463  lambda t: [2], None, signed_types),
464  # lapack tests
465  ('qr', small_2d_lapack, lambda t: [], 'square', float_types, False,
466  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
467  ('qr', small_2d_lapack_skinny, lambda t: [], 'skinny', float_types, False,
468  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
469  ('qr', small_2d_lapack_fat, lambda t: [], 'fat', float_types, False,
470  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
471  ('qr', large_2d_lapack, lambda t: [], 'big', float_types, False,
472  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
473  ('geqrf', new_t(20, 20), lambda t: [], None, float_types, False,
474  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
475  ('svd', new_t(10, 10), lambda t: [], 'square', float_types_no_half, False,
476  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
477  ('svd', lambda t: new_t(10, 10)(t).t(), lambda t: [True], 'square_col_maj',
478  float_types_no_half, False,
479  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
480  ('svd', new_t(20, 5), lambda t: [True], 'tall_some', float_types_no_half, False,
481  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
482  ('svd', new_t(20, 5), lambda t: [False], 'tall_all', float_types_no_half, False,
483  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
484  ('svd', lambda t: new_t(5, 20)(t).t(), lambda t: [True],
485  'tall_some_col_maj', float_types_no_half, False,
486  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
487  ('svd', lambda t: new_t(5, 20)(t).t(), lambda t: [False],
488  'tall_all_col_maj', float_types_no_half, False,
489  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
490  ('eig', new_t(10, 10), lambda t: [True], 'with_eigvec', float_types_no_half, False,
491  unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")),
492 ]
493 
494 # TODO: random functions, cat, gather, scatter, index*, masked*,
495 # resize, resizeAs, storage_offset, storage, stride, unfold
496 
497 custom_precision = {
498  'addbmm': 1e-4,
499  'addmm': 1e-4,
500  'addmv': 1e-4,
501  'addr': 1e-4,
502  'baddbmm': 1e-4,
503  'rsqrt': 1e-4,
504  'cumprod': 1e-4,
505  'qr': 3e-4,
506  'digamma': 1e0, # large values lead to large absolute error but small relative error
507 }
508 
509 custom_half_precision = {
510  'add': 1e-2,
511  'acos': 1e-3,
512  'addbmm': 1e-1,
513  'addcdiv': 1e-2,
514  'addcmul': 1e-2,
515  'addmm': 1e-1,
516  'addmv': 1e-2,
517  'addr': 1e-2,
518  'asin': 1e-3,
519  'atan2': 1e-3,
520  'atan': 1e-3,
521  'baddbmm': 1e-2,
522  'cos': 1e-3,
523  'cosh': 1e-2,
524  'cross': 1e-2,
525  'cumprod': 1e-2,
526  'cumsum': 1e-2,
527  'dist': 1e-2,
528  'div': 1e-3,
529  'dot': 1e-2,
530  'erf': 1e-3,
531  'erfc': 1e-3,
532  'erfinv': 1e-3,
533  'exp': 1e-2,
534  'expm1': 1e-2,
535  'fill': 1e-3,
536  'lerp': 1e-2,
537  'lgamma': 1e-2,
538  'log': 1e-2,
539  'log10': 1e-2,
540  'log1p': 1e-3,
541  'log2': 1e-2,
542  'mean': 1e-3,
543  'mul': 1e-2,
544  'norm': 1e-1,
545  'pow': 1e-1,
546  'prod': 1e-3,
547  'reciprocal': 1e-1,
548  'remainder': 1e-3,
549  'renorm': 1e-3,
550  'rsqrt': 1e-2,
551  'sigmoid': 1e-3,
552  'sin': 1e-3,
553  'sinh': 1e-3,
554  'sqrt': 1e-3,
555  'std': 1e-3,
556  'sub': 1e-2,
557  'sum': 1e-2,
558  'tan': 1e-3,
559  'tanh': 1e-3,
560  'trace': 1e-3,
561  'var': 1e-3,
562  '__lshift__': 1e-3,
563  '__rshift__': 1e-3,
564 }
565 
566 simple_pointwise = [
567  'abs',
568  'sign',
569 ]
570 for fn in simple_pointwise:
571  tests.append((fn, small_3d, lambda t: []))
572 
573 simple_pointwise_float = [
574  'log',
575  'log10',
576  'log1p',
577  'log2',
578  'sigmoid',
579  'sin',
580  'sqrt',
581  'tanh',
582  'acos',
583  'asin',
584  'atan',
585  'cos',
586  'cosh',
587  'erf',
588  'erfc',
589  'erfinv',
590  'exp',
591  'expm1',
592  'reciprocal',
593  'floor',
594  'frac',
595  'neg',
596  'round',
597  'trunc',
598  'ceil',
599  'lgamma',
600  'digamma',
601  'trigamma',
602 ]
603 
604 for fn in simple_pointwise_float:
605  tests.append((fn, small_3d, lambda t: [], None, float_types))
606 
607 _cycles_per_ms = None
608 
609 
def get_cycles_per_ms():
    """Approximate number of cycles per millisecond for torch.cuda._sleep

    The value is measured once by timing a fixed-length ``_sleep`` between two
    CUDA events and is then cached in the module-level ``_cycles_per_ms``.
    """
    global _cycles_per_ms
    if _cycles_per_ms is not None:
        return _cycles_per_ms

    probe_cycles = 1000000
    began = torch.cuda.Event(enable_timing=True)
    finished = torch.cuda.Event(enable_timing=True)
    began.record()
    torch.cuda._sleep(probe_cycles)
    finished.record()
    finished.synchronize()
    _cycles_per_ms = probe_cycles / began.elapsed_time(finished)
    return _cycles_per_ms
622 
623 
def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5):
    """Create a test method that runs tensor method ``fn`` on CPU and GPU and compares them.

    ``tensor_constructor(t)`` builds the receiver and ``arg_constructor(t)``
    the argument list; both are mirrored with ``to_gpu``. For half types the
    CPU side is computed in float. GPU failures whose message indicates an
    unsupported data type are turned into ``unittest.SkipTest``.
    """
    skip_message = 'unimplemented data type'
    unsupported_markers = ('only supports floating-point types',
                           'unimplemented data type',
                           'not implemented for')

    def tmp(self):
        cpu_tensor = tensor_constructor(t)
        gpu_tensor = to_gpu(cpu_tensor)
        cpu_args = arg_constructor(t)
        gpu_args = [to_gpu(arg) for arg in cpu_args]
        if is_half(t):
            cpu_tensor = cpu_tensor.float()
            widened = []
            for arg in cpu_args:
                if isinstance(arg, torch.Tensor) and is_half(arg):
                    arg = arg.float()
                widened.append(arg)
            cpu_args = widened
        cpu_result = getattr(cpu_tensor, fn)(*cpu_args)
        try:
            gpu_result = getattr(gpu_tensor, fn)(*gpu_args)
        except RuntimeError as err:
            message = err.args[0]
            for marker in unsupported_markers:
                if marker in message:
                    raise unittest.SkipTest(skip_message)
            raise
        except AttributeError as err:
            if 'object has no attribute' in err.args[0]:
                raise unittest.SkipTest(skip_message)
            raise
        # If one changes, another should change as well
        self.assertEqual(cpu_tensor, gpu_tensor, precision)
        self.assertEqual(cpu_args, gpu_args, precision)
        # Compare results
        half_element_size = fn == 'element_size' and t.__name__ == 'HalfTensor'
        if half_element_size:
            # Workaround since cpu_result is float
            self.assertEqual(2, gpu_result)
        else:
            self.assertEqual(cpu_result, gpu_result, precision)
    return tmp
659 
660 
661 class TestCuda(TestCase):
662  _do_cuda_memory_leak_check = True
663  FIFTY_MIL_CYCLES = 50000000
664 
665  @staticmethod
666  def _test_memory_stats_generator(self, device=None, N=35):
667  if device is None:
668  device = torch.cuda.current_device()
669 
670  m0 = torch.cuda.memory_allocated(device)
671  last_m_arr = [torch.cuda.memory_allocated(device)]
672  max_m_arr = [torch.cuda.max_memory_allocated(device)]
673  last_c_arr = [torch.cuda.memory_cached(device)]
674  max_c_arr = [torch.cuda.max_memory_cached(device)]
675 
676  def alloc(*size):
677  with torch.cuda.device(device):
678  # NOTE: do **not** use methods that can have additional
679  # memory overhead, e.g., inplace random sampling methods.
680  # they can leave some memory occupied even after being
681  # deallocated, e.g., initialized RNG state, causing some
682  # memory checks below to fail.
683  return torch.cuda.FloatTensor(*size)
684 
685  def assert_change(comp=1, empty_cache=False, reset_max_alloc=False, reset_max_cached=False):
686  # comp > 0: increased
687  # comp = 0: equal
688  # comp < 0: decreased
689  new_m = torch.cuda.memory_allocated(device)
690  new_max_m = torch.cuda.max_memory_allocated(device)
691  if comp > 0:
692  self.assertGreater(new_m, last_m_arr[0])
693  elif comp < 0:
694  self.assertLess(new_m, last_m_arr[0])
695  else:
696  self.assertEqual(new_m, last_m_arr[0])
697  self.assertLessEqual(new_m, new_max_m)
698  self.assertGreaterEqual(new_max_m, max_m_arr[0])
699  last_m_arr[0] = new_m
700  max_m_arr[0] = new_max_m
701 
702  new_c = torch.cuda.memory_cached(device)
703  new_max_c = torch.cuda.max_memory_cached(device)
704  # emptying cache may happen (due to allocation or empty_cache), so
705  # we can't assert new_c >= last_c
706  self.assertLessEqual(new_c, new_max_c)
707  self.assertGreaterEqual(new_max_c, max_c_arr[0])
708  last_c_arr[0] = new_c
709  max_c_arr[0] = new_max_c
710 
711  if empty_cache:
713  new_c = torch.cuda.memory_cached(device)
714  new_max_c = torch.cuda.max_memory_cached(device)
715  self.assertLessEqual(new_c, last_c_arr[0])
716  self.assertLessEqual(new_c, new_max_c)
717  self.assertEqual(new_max_c, max_c_arr[0])
718  last_c_arr[0] = new_c
719 
720  if reset_max_alloc:
722  self.assertEqual(torch.cuda.memory_allocated(device), last_m_arr[0])
723  self.assertEqual(torch.cuda.max_memory_allocated(device), last_m_arr[0])
724  max_m_arr[0] = last_m_arr[0]
725  self.assertEqual(torch.cuda.memory_cached(device), last_c_arr[0])
726  self.assertEqual(torch.cuda.max_memory_cached(device), max_c_arr[0])
727 
728  if reset_max_cached:
730  self.assertEqual(torch.cuda.memory_allocated(device), last_m_arr[0])
731  self.assertEqual(torch.cuda.max_memory_allocated(device), max_m_arr[0])
732  self.assertEqual(torch.cuda.memory_cached(device), last_c_arr[0])
733  self.assertEqual(torch.cuda.max_memory_cached(device), last_c_arr[0])
734  max_c_arr[0] = last_c_arr[0]
735 
736  assert_change(0)
737  assert_change(0, reset_max_alloc=True)
738  assert_change(0, empty_cache=True)
739  assert_change(0, reset_max_cached=True)
740  assert_change(0)
741  yield
742 
743  tensors1 = [alloc(1), alloc(10, 20), alloc(200, 300, 2000)]
744  m1 = torch.cuda.memory_allocated(device)
745  assert_change(1)
746  yield
747 
748  tensors2 = []
749 
750  for i in range(1, int(N / 2) + 1):
751  # small ones
752  tensors2.append(alloc(i, i * 4))
753  assert_change(1)
754  yield
755 
756  for i in range(5, int(N / 2) + 5):
757  # large ones
758  tensors2.append(alloc(i, i * 7, i * 9, i * 11))
759  assert_change(1, reset_max_alloc=(i % 2 == 0), reset_max_cached=(i % 2 == 1))
760  yield
761 
762  tensors2.append(alloc(0, 0, 0))
763  assert_change(0)
764  yield
765 
766  permute = []
767  for i in torch.randperm(len(tensors2)):
768  permute.append(tensors2[i])
769  assert_change(0)
770  yield
771 
772  del tensors2
773  assert_change(0)
774  yield
775  tensors2 = permute
776  assert_change(0)
777  yield
778  del permute
779  assert_change(0, reset_max_alloc=True)
780  yield
781 
782  for i in range(int(N / 2)):
783  x = tensors2[i].numel()
784  del tensors2[i]
785  assert_change(-x) # in case that tensors2[i] is empty
786  yield
787 
788  for i in range(2, int(2 * N / 3) + 2):
789  tensors2.append(alloc(i, i * 3, i * 8))
790  assert_change(1)
791  yield
792 
793  del tensors2
794  assert_change(-1, reset_max_cached=True)
795  assert_change(0)
796  self.assertEqual(torch.cuda.memory_allocated(device), m1)
797  yield True
798 
799  del tensors1
800  assert_change(-1, reset_max_alloc=True)
801  self.assertEqual(torch.cuda.memory_allocated(device), m0)
802 
803  # test empty_cache and reset_max_memory_*
804  assert_change(0, empty_cache=True)
805  assert_change(0, reset_max_cached=True)
806  assert_change(0, reset_max_alloc=True)
807 
808  def test_memory_stats(self):
810  for _ in self._test_memory_stats_generator(self):
811  pass
812 
813  def test_cuda_get_device_name(self):
814  # Testing the behaviour with None as an argument
815  current_device = torch.cuda.current_device()
816  current_device_name = torch.cuda.get_device_name(current_device)
817  device_name_None = torch.cuda.get_device_name(None)
818  self.assertEqual(current_device_name, device_name_None)
819 
820  # Testing the behaviour for No argument
821  device_name_no_argument = torch.cuda.get_device_name()
822  self.assertEqual(current_device_name, device_name_no_argument)
823 
824  def test_cuda_get_device_capability(self):
825  # Testing the behaviour with None as an argument
826  current_device = torch.cuda.current_device()
827  current_device_capability = torch.cuda.get_device_capability(current_device)
828  device_capability_None = torch.cuda.get_device_capability(None)
829  self.assertEqual(current_device_capability, device_capability_None)
830 
831  # Testing the behaviour for No argument
832  device_capability_no_argument = torch.cuda.get_device_capability()
833  self.assertEqual(current_device_capability, device_capability_no_argument)
834 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_memory_stats_multigpu(self):
    """Run the memory-statistics generator on two devices concurrently.

    Two generators (one per device) are advanced first in strict
    alternation and then in a randomised interleaving. The generators carry
    their own assertions; this method only schedules them.
    """
    def advance(gen, end):
        """Step ``gen`` once unless ``end`` is already set.

        Returns the updated end flag: ``True`` once the generator is
        exhausted, otherwise the value passed in.
        """
        if not end:
            try:
                next(gen)
            except StopIteration:
                end = True
        return end

    # interlace
    # NOTE(review): the empty_cache() calls in this method were missing from
    # the listing (embedded lines 847 and 856 are skipped); restored.
    torch.cuda.empty_cache()
    gen0 = self._test_memory_stats_generator(self, device='cuda:0', N=35)
    gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
    end0 = end1 = False
    while not (end0 and end1):
        end0 = advance(gen0, end0)
        end1 = advance(gen1, end1)

    # semi-random order
    torch.cuda.empty_cache()
    gen0 = self._test_memory_stats_generator(self, device=0, N=35)
    gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
    end0 = end1 = False

    while not (end0 and end1):
        end0 = advance(gen0, end0)
        if not end0:
            # advance gen1 zero, one or two times for each step of gen0
            gen1_max_times = torch.LongTensor(1).random_(0, 3)[0]
        else:
            # gen0 is finished: let gen1 run until it is exhausted
            gen1_max_times = inf
        t = 0
        while t < gen1_max_times and not end1:
            end1 = advance(gen1, end1)
            t += 1
871 
def test_out_of_memory(self):
    """An oversized allocation fails cleanly and later kernels still run."""
    survivor = torch.zeros(1024, device='cuda')

    # 80 * 2**30 one-byte elements, i.e. the 80 GiB named in the message
    oversized_numel = 1024 * 1024 * 1024 * 80
    with self.assertRaisesRegex(RuntimeError, "Tried to allocate 80.00 GiB"):
        torch.empty(oversized_numel, dtype=torch.int8, device='cuda')

    # ensure out of memory error doesn't disturb subsequent kernel
    survivor.fill_(1)
    all_ones = (survivor == 1).all()
    self.assertTrue(all_ones)
881 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_autogpu(self):
    """Tensor placement follows the current device, results follow operands.

    New ``.cuda()`` tensors land on the device selected by
    ``torch.cuda.device``, while the result of an operation stays on the
    device of its operands.
    """
    x = torch.randn(5, 5).cuda()
    y = torch.randn(5, 5).cuda()
    self.assertEqual(x.get_device(), 0)
    # Fixed: the original asserted on ``x`` twice and never checked ``y``.
    self.assertEqual(y.get_device(), 0)
    with torch.cuda.device(1):
        z = torch.randn(5, 5).cuda()
        self.assertEqual(z.get_device(), 1)
        # operands live on device 0, so the result does too
        q = x.add(y)
        self.assertEqual(q.get_device(), 0)
        w = torch.randn(5, 5).cuda()
        self.assertEqual(w.get_device(), 1)
        # a bare .cuda() moves an existing tensor to the current device
        self.assertEqual(y.cuda().get_device(), 1)
    # back on the default device, .cuda() brings z over to device 0
    z = z.cuda()
    self.assertEqual(z.get_device(), 0)
898 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_new(self):
    """``Tensor.new`` follows the source tensor unless ``device`` is given.

    The placement must be the same whether or not another device is
    currently selected.
    """
    x = torch.randn(3, 3).cuda()
    values = [0, 1, 2]

    def check_placement():
        self.assertEqual(x.new(values).get_device(), 0)
        self.assertEqual(x.new(values, device=1).get_device(), 1)

    check_placement()
    with torch.cuda.device(1):
        check_placement()
908 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_copy_device(self):
    """``.cuda()`` copies across devices and is a no-op on the same device.

    The same checks are run for a source on device 0 and a source on the
    CPU.
    """
    def make_gpu_source():
        return torch.randn(5, 5).cuda()

    def make_cpu_source():
        return torch.randn(5, 5)

    # Sources are built lazily so each is created just before it is used.
    for make_source in (make_gpu_source, make_cpu_source):
        x = make_source()
        with torch.cuda.device(1):
            on_dev1 = x.cuda()
            self.assertEqual(on_dev1.get_device(), 1)
            self.assertIs(on_dev1.cuda(), on_dev1)
            on_dev0 = on_dev1.cuda(0)
            self.assertEqual(on_dev0.get_device(), 0)
            self.assertIs(on_dev0.cuda(0), on_dev0)
928 
def _test_copy_sync_current_stream(self, x, y):
    """Check that ``copy_`` synchronizes on the current src and dst streams.

    ``x`` is the source and ``y`` the destination; they may be on the same
    device or on different ones. After each pair of copies ``y`` must equal
    ``x``, i.e. the second copy must have landed last.
    """
    x_incremented = x + 1
    src_stream_a = torch.cuda.Stream(device=x.device)
    dst_stream_a = torch.cuda.Stream(device=y.device)
    src_stream_b = torch.cuda.Stream(device=x.device)
    dst_stream_b = torch.cuda.Stream(device=y.device)

    # same dst stream different src streams
    with torch.cuda.stream(src_stream_a):
        torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES)
        with torch.cuda.stream(dst_stream_a):
            y.copy_(x_incremented)

    with torch.cuda.stream(src_stream_b), torch.cuda.stream(dst_stream_a):
        y.copy_(x)

    dst_stream_a.synchronize()
    # The copy() is synchronized on the current streams of both src and dst.
    # Above, the _sleep() on src_stream_a does not block the copy() issued
    # under src_stream_b, but both copies are synchronized on dst_stream_a
    # in the dst device. Hence x is copied to y after x_incremented is. If x
    # and y are on the same device, both copy() ops are synchronized on
    # dst_stream_a.
    self.assertEqual(y, x)

    # same src stream different dst streams
    with torch.cuda.stream(dst_stream_a):
        torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES)
        with torch.cuda.stream(src_stream_a):
            y.copy_(x_incremented)

    with torch.cuda.stream(dst_stream_b), torch.cuda.stream(src_stream_a):
        y.copy_(x)

    src_stream_a.synchronize()
    # Similarly, both copy() ops are synchronized on src_stream_a.
    self.assertEqual(y, x)
965 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
@skipIfRocm
def test_copy_streams(self):
    """Run the copy/stream synchronization check across and within devices."""
    dev0 = torch.device('cuda:0')
    source = torch.zeros(5, 5, device=dev0)

    # destination on a different device
    dev1 = torch.device('cuda:1')
    remote_dest = torch.zeros(5, 5, device=dev1)
    self._test_copy_sync_current_stream(source, remote_dest)

    # destination on the same device as the source
    local_dest = torch.zeros(5, 5, device=dev0)
    self._test_copy_sync_current_stream(source, local_dest)
978 
def test_copy_non_blocking(self):
    """``copy_(..., non_blocking=True)`` transfers data in both directions."""
    # GPU -> CPU
    gpu_src = torch.randn(5, 5).cuda()
    cpu_dst = torch.zeros(5, 5)
    cpu_dst.copy_(gpu_src, non_blocking=True)
    self.assertEqual(gpu_src, cpu_dst)

    # CPU -> GPU
    cpu_src = torch.randn(5, 5)
    gpu_dst = torch.zeros(5, 5).cuda()
    gpu_dst.copy_(cpu_src, non_blocking=True)
    self.assertEqual(cpu_src, gpu_dst)
989 
def test_serialization_array_with_storage(self):
    """Round-trip a list mixing CUDA tensors, a repeat, and a storage.

    After loading, repeated tensors must still alias each other and the
    storage must still alias the tensor it was taken from.
    """
    x = torch.randn(5, 5).cuda()
    y = torch.IntTensor(2, 5).fill_(0).cuda()
    q = [x, y, x, y.storage()]
    with tempfile.NamedTemporaryFile() as f:
        torch.save(q, f)
        f.seek(0)
        q_copy = torch.load(f)
    self.assertEqual(q_copy, q, 0)
    # entries 0 and 2 were the same tensor, so a write through one must be
    # visible through the other
    q_copy[0].fill_(5)
    self.assertEqual(q_copy[0], q_copy[2], 0)
    self.assertTrue(isinstance(q_copy[0], torch.cuda.DoubleTensor))
    self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor))
    self.assertTrue(isinstance(q_copy[2], torch.cuda.DoubleTensor))
    self.assertTrue(isinstance(q_copy[3], torch.cuda.IntStorage))
    q_copy[1].fill_(10)
    # Fixed: this was ``assertTrue(a, b)``, which uses ``b`` as the failure
    # message and only checks that ``a`` is truthy. Compare the loaded
    # storage against the expected contents instead.
    self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10))
1007 
def test_type_conversions(self):
    """Conversion chains on tensors and storages yield the expected classes."""
    x = torch.randn(5, 5)
    self.assertIsInstance(x.float(), torch.FloatTensor)
    tensor_gpu = x.cuda()
    self.assertIsInstance(tensor_gpu, torch.cuda.DoubleTensor)
    tensor_gpu_float = x.cuda().float()
    self.assertIsInstance(tensor_gpu_float, torch.cuda.FloatTensor)
    tensor_back = x.cuda().float().cpu()
    self.assertIsInstance(tensor_back, torch.FloatTensor)
    tensor_back_int = x.cuda().float().cpu().int()
    self.assertIsInstance(tensor_back_int, torch.IntTensor)

    # the same chain, applied to the underlying storage
    y = x.storage()
    self.assertIsInstance(y.float(), torch.FloatStorage)
    storage_gpu = y.cuda()
    self.assertIsInstance(storage_gpu, torch.cuda.DoubleStorage)
    storage_gpu_float = y.cuda().float()
    self.assertIsInstance(storage_gpu_float, torch.cuda.FloatStorage)
    storage_back = y.cuda().float().cpu()
    self.assertIsInstance(storage_back, torch.FloatStorage)
    storage_back_int = y.cuda().float().cpu().int()
    self.assertIsInstance(storage_back_int, torch.IntStorage)
1022 
def test_mul_intertype_scalar(self):
    """Multiply a floating scalar tensor by an int32 scalar tensor on CUDA.

    Out-of-place products work in either order; the in-place product only
    works when the floating tensor is the destination.
    """
    def check_mul(float_dtype):
        real = torch.tensor(1.5, dtype=float_dtype, device='cuda')
        whole = torch.tensor(3, dtype=torch.int32, device='cuda')

        self.assertEqual(real * whole, 4.5)
        self.assertEqual(whole * real, 4.5)
        with self.assertRaisesRegex(RuntimeError, "doesn't match the desired type"):
            whole *= real
        real *= whole
        self.assertEqual(real, 4.5)

    for float_dtype in (torch.float16, torch.float32, torch.float64):
        check_mul(float_dtype)
1038 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_type_conversions_same_gpu(self):
    """Dtype conversions keep the tensor on its original (non-default) GPU."""
    x = torch.randn(5, 5).cuda(1)
    as_int = x.int()
    self.assertEqual(as_int.get_device(), 1)
    via_type = x.type(torch.int)
    self.assertEqual(via_type.get_device(), 1)
    via_to = x.to(torch.int)
    self.assertEqual(via_to.get_device(), 1)
1045 
def test_neg(self):
    """Run the shared ``_test_neg`` checks with tensors moved to CUDA."""
    def to_cuda(t):
        return t.cuda()

    _TestTorchMixin._test_neg(self, to_cuda)
1048 
def test_isinf(self):
    """Run the shared ``_test_isinf`` checks with tensors moved to CUDA."""
    def to_cuda(t):
        return t.cuda()

    _TestTorchMixin._test_isinf(self, to_cuda)
1051 
def test_inplace_unary_mem_overlap(self):
    """Run the shared in-place unary memory-overlap checks on ``'cuda'``."""
    shared_check = _TestTorchMixin._test_inplace_unary_mem_overlap
    shared_check(self, device='cuda')
1054 
@unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory")
def test_arithmetic_large_tensor(self):
    """In-place scalar arithmetic on a 2**30-element CUDA tensor.

    Every element is set to 1 before each operation, so the expected sum
    is the element count times the per-element result.
    """
    numel = 2**30
    x = torch.empty(numel, device='cuda')

    # baseline: all ones
    x.fill_(1)
    self.assertEqual(x.sum(), numel)

    # addition (continues from the all-ones state above): 1 + 1
    x += 1
    self.assertEqual(x.sum(), 2**31)

    # subtraction: 1 - 0.5
    x.fill_(1)
    x -= 0.5
    self.assertEqual(x.sum(), 2**29)

    # multiplication: 1 * 2
    x.fill_(1)
    x *= 2
    self.assertEqual(x.sum(), 2**31)

    # division: 1 / 2
    x.fill_(1)
    x /= 2
    self.assertEqual(x.sum(), 2**29)
1076 
def _test_broadcast(self, input):
    """Broadcast ``input`` to devices 0 and 1 and validate every copy.

    Raises ``unittest.SkipTest`` when fewer than two GPUs are available.
    """
    if not TEST_MULTIGPU:
        raise unittest.SkipTest("only one GPU detected")
    copies = comm.broadcast(input, (0, 1))
    for device_index, replica in enumerate(copies):
        self.assertEqual(replica.get_device(), device_index)
        self.assertEqual(replica, input)
        # a CUDA input must be reused, not copied, on its own device
        already_there = input.is_cuda and input.get_device() == device_index
        if already_there:
            self.assertEqual(replica.data_ptr(), input.data_ptr())
1086 
def test_broadcast_cpu(self):
    """Broadcast a CPU tensor to both GPUs."""
    cpu_input = torch.randn(5, 5)
    self._test_broadcast(cpu_input)
1089 
def test_broadcast_gpu(self):
    """Broadcast a tensor already on the current CUDA device to both GPUs."""
    gpu_input = torch.randn(5, 5).cuda()
    self._test_broadcast(gpu_input)
1092 
def test_min_max_nan(self):
    """CUDA min/max reductions treat NaN the same way as the CPU ones.

    Both the NaN positions and the remaining finite values of each result
    are compared against the CPU result.
    """
    def full_min(x):
        return x.min()

    def full_max(x):
        return x.max()

    def min_along_dim0(x):
        return x.min(0)[0]

    def max_along_dim0(x):
        return x.max(0)[0]

    reductions = [(full_min, 'min'),
                  (full_max, 'max'),
                  (min_along_dim0, 'min_dim'),
                  (max_along_dim0, 'max_dim')]
    for reduce_fn, label in reductions:
        a = torch.arange(25.0).view(5, 5)
        a[2, 2] = nan
        actual = reduce_fn(a.cuda()).cpu()
        expected = reduce_fn(a).cpu()
        self.assertEqual(torch.isnan(actual), torch.isnan(expected), 'nans for {}'.format(label))
        finite_actual = actual[~torch.isnan(actual)]
        finite_expected = expected[~torch.isnan(expected)]
        self.assertEqual(finite_actual, finite_expected, 'nans for {}'.format(label))
1106 
@staticmethod
def _test_broadcast_coalesced(self, tensors, buffer_size):
    """Compare ``comm.broadcast_coalesced`` with per-tensor ``comm.broadcast``.

    ``self`` is passed explicitly because this helper is a staticmethod.
    ``tensors`` are broadcast to devices ``(0, 1)``; ``buffer_size`` is
    forwarded to ``broadcast_coalesced``.
    """
    devices = (0, 1)

    # reference result: broadcast every tensor on its own
    b_tensors = [comm.broadcast(t, devices) for t in tensors]
    for pair, t in zip(b_tensors, tensors):
        on_dev1 = pair[1]
        self.assertEqual(on_dev1.get_device(), 1)
        self.assertEqual(on_dev1, t)
        self.assertIsInstance(on_dev1, type(t))

    bc_tensors = comm.broadcast_coalesced(tensors, devices, buffer_size=buffer_size)
    # transpose from per-device lists to per-tensor tuples
    bc_tensors_t = list(zip(*bc_tensors))
    self.assertEqual(b_tensors, bc_tensors_t)
    for ref_pair, coalesced_pair in zip(b_tensors, bc_tensors_t):
        ref_dev1, coalesced_dev1 = ref_pair[1], coalesced_pair[1]
        self.assertEqual(ref_dev1.get_device(), coalesced_dev1.get_device())
        self.assertIsInstance(coalesced_dev1, type(ref_dev1))

    # check that tensors on device[0] are returned as-is
    for out_tensors in (b_tensors, bc_tensors_t):
        for inp_t, out_pair in zip(tensors, out_tensors):
            self.assertIs(inp_t, out_pair[0])

    # check that the tensors not on device[0] have different version counters
    # NOTE [ Version Counter in comm.*_coalesced ]
    # Versions are snapshotted before any tensor is modified.
    versions = [pair[1]._version for pair in bc_tensors_t]
    for old_version, pair in zip(versions, bc_tensors_t):
        on_dev1 = pair[1]
        self.assertEqual(on_dev1._version, old_version)
        on_dev1.zero_()
        self.assertEqual(on_dev1._version, old_version + 1)
1134 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
# Note: fails sometimes on the CI, passes on dual gfx906
@skipIfRocm
def test_broadcast_coalesced(self):
    """Coalesced broadcast over a mix of sparse and dense CUDA tensors."""
    numel = 5
    num_bytes = numel * 8

    def dense_long():
        return torch.randn(numel).long().cuda()

    def dense_default():
        return torch.randn(numel).cuda()

    sparse_double = torch.cuda.sparse.DoubleTensor
    sparse_long = torch.cuda.sparse.LongTensor
    sparse_float = torch.cuda.sparse.FloatTensor

    tensors = [
        make_sparse_tensor(sparse_double, 1, 2, 3),
        dense_long(),
        dense_default(),
        make_sparse_tensor(sparse_double, 10, 2, 3),
        make_sparse_tensor(sparse_double, 5, 2, 3),
        make_sparse_tensor(sparse_long, 7, 3, 3),
        make_sparse_tensor(sparse_float, 2, 2, 3),
        dense_long(),
        dense_long(),
        make_sparse_tensor(sparse_long, 3, 2, 7),
        torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
        dense_default(),
    ]
    buffer_size = num_bytes * 5 // 2
    self._test_broadcast_coalesced(self, tensors, buffer_size)
1156 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_broadcast_coalesced_dense_only(self):
    """Coalesced broadcast over dense CUDA tensors only."""
    numel = 5
    num_bytes = numel * 8

    def dense_long():
        return torch.randn(numel).long().cuda()

    def dense_default():
        return torch.randn(numel).cuda()

    tensors = [
        dense_long(),
        dense_default(),
        dense_long(),
        dense_long(),
        torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
        dense_default(),
    ]
    buffer_size = num_bytes * 5 // 2
    self._test_broadcast_coalesced(self, tensors, buffer_size)
1170 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_reduce_add(self):
    """``comm.reduce_add`` sums tensors from two GPUs onto device 0."""
    lhs = torch.randn(5, 5)
    rhs = torch.randn(5, 5)
    lhs_on_dev0 = lhs.cuda(0)
    rhs_on_dev1 = rhs.cuda(1)
    total = comm.reduce_add((lhs_on_dev0, rhs_on_dev1))
    self.assertEqual(total.get_device(), 0)
    self.assertEqual(total.cpu(), lhs + rhs)
1180 
@staticmethod
def _test_reduce_add_coalesced(self, tensors, buffer_size):
    """Compare ``comm.reduce_add_coalesced`` with per-tensor ``comm.reduce_add``.

    ``self`` is passed explicitly because this helper is a staticmethod.
    Each tensor is paired with a copy of itself on device 1, so every
    reduction should equal twice the original tensor.
    """
    on_dev1 = [t.cuda(1) for t in tensors]
    dup_tensors = [tensors, on_dev1]

    # reference result: reduce each (original, copy) pair on its own
    r_tensors = [comm.reduce_add(pair) for pair in zip(*dup_tensors)]
    for reduced, original in zip(r_tensors, tensors):
        self.assertEqual(reduced.get_device(), original.get_device())
        self.assertEqual(reduced, original * 2)
        self.assertEqual(reduced.type(), original.type())

    rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=buffer_size)
    self.assertEqual(r_tensors, rc_tensors)
    for reduced, coalesced in zip(r_tensors, rc_tensors):
        self.assertEqual(coalesced.get_device(), reduced.get_device())
        self.assertEqual(coalesced.type(), reduced.type())

    # Since we have both cuda:0 and cuda:1 inputs, the outputs must be new.
    # We can check that they have different version counters.
    # NOTE [ Version Counter in comm.*_coalesced ]
    # Versions are snapshotted before any tensor is modified.
    versions = [t._version for t in rc_tensors]
    for old_version, coalesced in zip(versions, rc_tensors):
        self.assertEqual(coalesced._version, old_version)
        coalesced.zero_()
        self.assertEqual(coalesced._version, old_version + 1)
1205 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_reduce_add_coalesced(self):
    """Coalesced reduce-add over a mix of sparse and dense CUDA tensors."""
    numel = 5
    num_bytes = numel * 8

    def dense_long():
        return torch.randn(numel).long().cuda()

    def dense_default():
        return torch.randn(numel).cuda()

    sparse_double = torch.cuda.sparse.DoubleTensor
    sparse_long = torch.cuda.sparse.LongTensor
    sparse_float = torch.cuda.sparse.FloatTensor

    tensors = [
        make_sparse_tensor(sparse_double, 1, 2, 3),
        dense_long(),
        dense_default(),
        make_sparse_tensor(sparse_double, 10, 2, 3),
        make_sparse_tensor(sparse_double, 5, 2, 3),
        make_sparse_tensor(sparse_long, 7, 3, 3),
        make_sparse_tensor(sparse_float, 2, 2, 3),
        dense_long(),
        dense_long(),
        make_sparse_tensor(sparse_long, 3, 2, 7),
        torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
        dense_default(),
    ]
    buffer_size = num_bytes * 5 // 2
    self._test_reduce_add_coalesced(self, tensors, buffer_size)
1225 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_reduce_add_coalesced_dense_only(self):
    """Coalesced reduce-add over dense CUDA tensors only."""
    numel = 5
    num_bytes = numel * 8

    def dense_long():
        return torch.randn(numel).long().cuda()

    def dense_default():
        return torch.randn(numel).cuda()

    tensors = [
        dense_long(),
        dense_default(),
        dense_long(),
        dense_long(),
        torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
        dense_default(),
    ]
    buffer_size = num_bytes * 5 // 2
    self._test_reduce_add_coalesced(self, tensors, buffer_size)
1239 
def _test_scatter(self, input, chunk_sizes=None, dim=0):
    """Scatter a 2-D ``input`` over devices 0 and 1 and check each chunk.

    When ``chunk_sizes`` is ``None`` the expected split is two equal
    halves along ``dim``. Raises ``unittest.SkipTest`` when fewer than two
    GPUs are available.
    """
    if not TEST_MULTIGPU:
        raise unittest.SkipTest("only one GPU detected")
    pieces = comm.scatter(input, (0, 1), chunk_sizes, dim)
    self.assertEqual(len(pieces), 2)
    if chunk_sizes is None:
        half = input.size(dim) // 2
        chunk_sizes = tuple(repeat(half, 2))
    start = 0
    for piece, size in zip(pieces, chunk_sizes):
        stop = start + size
        # select everything, then narrow the scattered dimension
        selector = [slice(None, None), slice(None, None)]
        selector[dim] = slice(start, stop)
        self.assertEqual(piece, input[tuple(selector)], 0)
        start = stop
1254 
def test_scatter_cpu(self):
    """Scatter a 4x4 CPU tensor along dim 0."""
    cpu_input = torch.randn(4, 4)
    self._test_scatter(cpu_input, dim=0)
1257 
def test_scatter_cpu_dim(self):
    """Scatter a 4x4 CPU tensor along dim 1."""
    cpu_input = torch.randn(4, 4)
    self._test_scatter(cpu_input, dim=1)
1260 
def test_scatter_cpu_neg_dim(self):
    """Scatter a 4x4 CPU tensor along dim -2."""
    cpu_input = torch.randn(4, 4)
    self._test_scatter(cpu_input, dim=-2)
1263 
def test_scatter_cpu_sizes(self):
    """Scatter a 6x4 CPU tensor into uneven chunks of 2 and 4 rows."""
    cpu_input = torch.randn(6, 4)
    self._test_scatter(cpu_input, chunk_sizes=(2, 4))
1266 
def test_scatter_gpu(self):
    """Scatter a 4x4 CUDA tensor along dim 0."""
    gpu_input = torch.randn(4, 4).cuda()
    self._test_scatter(gpu_input, dim=0)
1269 
# Note: This test fails on ROCm CI gfx900 but passes on gfx906
@skipIfRocm
def test_scatter_gpu_dim(self):
    """Scatter a 4x4 CUDA tensor along dim 1."""
    gpu_input = torch.randn(4, 4).cuda()
    self._test_scatter(gpu_input, dim=1)
1274 
def test_scatter_gpu_neg_dim(self):
    """Scatter a 4x4 CUDA tensor along dim -2."""
    gpu_input = torch.randn(4, 4).cuda()
    self._test_scatter(gpu_input, dim=-2)
1277 
def test_scatter_gpu_sizes(self):
    """Scatter a 6x4 CUDA tensor into uneven chunks of 2 and 4 rows."""
    gpu_input = torch.randn(6, 4).cuda()
    self._test_scatter(gpu_input, chunk_sizes=(2, 4))
1280 
def _test_gather(self, dim):
    """Gather two 2x5 tensors from devices 0 and 1 along ``dim``.

    The result must be on device 0, have the concatenated size, and hold
    the first input followed by the second along ``dim``. Raises
    ``unittest.SkipTest`` when fewer than two GPUs are available.
    """
    if not TEST_MULTIGPU:
        raise unittest.SkipTest("only one GPU detected")
    first = torch.randn(2, 5).cuda(0)
    second = torch.randn(2, 5).cuda(1)
    gathered = comm.gather((first, second), dim)

    dims = list(first.size())
    dims[dim] += second.size(dim)
    self.assertEqual(gathered.get_device(), 0)
    self.assertEqual(gathered.size(), torch.Size(dims))

    first_len = first.size(dim)
    second_len = second.size(dim)
    selector = [slice(None, None), slice(None, None)]
    # leading part of the result holds the first input
    selector[dim] = slice(0, first_len)
    self.assertEqual(gathered[tuple(selector)], first)
    # the remainder holds the second input
    selector[dim] = slice(first_len, first_len + second_len)
    self.assertEqual(gathered[tuple(selector)], second)
1299 
def test_gather(self):
    """Gather along dim 0."""
    gather_dim = 0
    self._test_gather(gather_dim)
1302 
@skipIfRocm
def test_gather_dim(self):
    """Gather along dim 1."""
    gather_dim = 1
    self._test_gather(gather_dim)
1306 
def test_from_sequence(self):
    """CUDA tensor constructors accept a nested Python list.

    A 5x4 list of consecutive integers must build the same tensor as
    ``arange(0, 20)`` reshaped to 5x4, for every type in ``types``.
    """
    rows, cols = 5, 4
    seq = []
    for row in range(rows):
        first = row * cols
        seq.append(list(range(first, first + cols)))
    reference = torch.arange(0, 20).resize_(5, 4)
    for cpu_type in types:
        gpu_constructor = get_gpu_type(cpu_type)
        self.assertEqual(gpu_constructor(seq), reference)
1313 
def test_torch_manual_seed_seeds_cuda_devices(self):
    """``torch.manual_seed`` also seeds the CUDA generator.

    Re-seeding with the same value must reproduce the same CUDA random
    draw, and ``torch.cuda.initial_seed()`` must report that seed.
    """
    seed = 2
    with freeze_rng_state():
        first_draw = torch.zeros(4, 4).float().cuda()
        torch.manual_seed(seed)
        self.assertEqual(torch.cuda.initial_seed(), 2)
        first_draw.uniform_()
        torch.manual_seed(seed)
        second_draw = first_draw.clone().uniform_()
        self.assertEqual(first_draw, second_draw)
        self.assertEqual(torch.cuda.initial_seed(), 2)
1324 
def test_manual_seed(self):
    """``torch.cuda.manual_seed`` makes CUDA random draws reproducible.

    After re-seeding with the same value, both ``uniform_`` and
    ``bernoulli`` must repeat their previous output.
    """
    seed = 2
    with freeze_rng_state():
        first_uniform = torch.zeros(4, 4).float().cuda()
        torch.cuda.manual_seed(seed)
        self.assertEqual(torch.cuda.initial_seed(), 2)
        first_uniform.uniform_()
        first_bernoulli = torch.bernoulli(torch.full_like(first_uniform, 0.5))
        torch.cuda.manual_seed(seed)
        second_uniform = first_uniform.clone().uniform_()
        second_bernoulli = torch.bernoulli(torch.full_like(first_uniform, 0.5))
        self.assertEqual(first_uniform, second_uniform)
        self.assertEqual(first_bernoulli, second_bernoulli)
        self.assertEqual(torch.cuda.initial_seed(), 2)
1338 
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_cat_autogpu(self):
    """``torch.cat`` of tensors on device 1 yields a result on device 1."""
    first = torch.randn(4, 4).cuda(1)
    second = torch.randn(4, 4).cuda(1)
    joined = torch.cat([first, second], 0)
    self.assertEqual(joined.get_device(), first.get_device())
1345 
def test_clamp(self):
    """Run the shared ``_test_clamp`` checks on ``'cuda'``."""
    shared_check = _TestTorchMixin._test_clamp
    shared_check(self, 'cuda')
1348 
def test_cat(self):
    """``torch.cat`` on CUDA tensors along every positive and negative dim."""
    SIZE = 10
    lengths = (13, 17, 19)
    for dim in range(-3, 3):
        # normalise negative dims into the 0..2 range of a 3-D tensor
        if dim >= 0:
            pos_dim = dim
        else:
            pos_dim = 3 + dim
        x = torch.rand(lengths[0], SIZE, SIZE).transpose(0, pos_dim).cuda()
        y = torch.rand(lengths[1], SIZE, SIZE).transpose(0, pos_dim).cuda()
        z = torch.rand(lengths[2], SIZE, SIZE).transpose(0, pos_dim).cuda()

        joined = torch.cat((x, y, z), dim)
        self.assertEqual(joined.narrow(pos_dim, 0, 13), x, 0)
        self.assertEqual(joined.narrow(pos_dim, 13, 17), y, 0)
        self.assertEqual(joined.narrow(pos_dim, 30, 19), z, 0)

    # cat must undo split and chunk
    whole = torch.randn(20, SIZE, SIZE).cuda()
    self.assertEqual(torch.cat(torch.split(whole, 7)), whole)
    self.assertEqual(torch.cat(torch.chunk(whole, 7)), whole)

    # the default dim concatenates along dimension 0
    extra = torch.randn(1, SIZE, SIZE).cuda()
    combined = torch.cat([whole, extra])
    self.assertEqual(combined.size(), (21, SIZE, SIZE))
1369 
def test_cat_empty_legacy(self):
    """Run the shared ``_test_cat_empty_legacy`` checks with ``use_cuda=True``."""
    shared_check = _TestTorchMixin._test_cat_empty_legacy
    shared_check(self, use_cuda=True)
1372 
def test_cat_empty(self):
    """Run the shared ``_test_cat_empty`` checks with ``use_cuda=True``."""
    shared_check = _TestTorchMixin._test_cat_empty
    shared_check(self, use_cuda=True)
1375 
def test_bernoulli(self):
    """Run the shared bernoulli checks on CUDA for several dtype pairs.

    The first dtype of each pair covers floating and integral types; the
    second is float64 or float16.
    """
    # integral first dtypes check that it works with integral tensors
    first_dtypes = (torch.float32, torch.float16, torch.uint8, torch.int64)
    second_dtypes = (torch.float64, torch.float16)
    for first_dtype in first_dtypes:
        for second_dtype in second_dtypes:
            _TestTorchMixin._test_bernoulli(self, first_dtype, second_dtype, 'cuda')
1386 
def test_cat_bad_input_sizes(self):
    """``torch.cat`` rejects CUDA inputs whose shapes are incompatible."""
    # mismatched number of dimensions
    x = torch.randn(2, 1).cuda()
    y = torch.randn(2, 1, 1).cuda()
    z = torch.randn(2, 1, 1).cuda()
    with self.assertRaises(RuntimeError):
        torch.cat([x, y, z])

    # same rank, but sizes disagree outside the cat dimension
    x = torch.randn(2, 1, 2).cuda()
    y = torch.randn(2, 1, 1).cuda()
    z = torch.randn(2, 2, 1).cuda()
    with self.assertRaises(RuntimeError):
        torch.cat([x, y, z], dim=1)
1397 
@unittest.skipIf(torch.cuda.device_count() >= 10, "Loading a cuda:9 tensor")
@unittest.skipIf(not PY3, "Tensor was serialized with Python 3")
def test_load_nonexistent_device(self):
    """Loading a tensor tagged with a missing CUDA device raises clearly."""
    # Setup: create a serialized file object with a 'cuda:9' restore location
    tensor = torch.randn(2, device='cuda')
    saved = io.BytesIO()
    torch.save(tensor, saved)
    # NB: this might not work in the future if serialization changes
    patched_bytes = saved.getvalue().replace(b'cuda:0', b'cuda:9')
    buf = io.BytesIO(patched_bytes)

    expected_message = r'Attempting to deserialize object on CUDA device 9'
    with self.assertRaisesRegex(RuntimeError, expected_message):
        torch.load(buf)
1411 
def test_serialization(self):
    """A CUDA tensor round-trips through ``torch.save`` / ``torch.load``.

    Values, Python type and device index must all be preserved.
    """
    source = torch.randn(4, 4).cuda()
    with tempfile.NamedTemporaryFile() as handle:
        torch.save(source, handle)
        handle.seek(0)
        restored = torch.load(handle)
    self.assertEqual(restored, source)
    self.assertIs(type(restored), type(source))
    self.assertEqual(restored.get_device(), source.get_device())
1421 
def test_serialization_array_with_empty(self):
    """A list with a populated and an empty CUDA tensor round-trips intact."""
    sources = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()]
    with tempfile.NamedTemporaryFile() as handle:
        torch.save(sources, handle)
        handle.seek(0)
        loaded = torch.load(handle)
    for source, restored in zip(sources, loaded):
        self.assertEqual(restored, source)
        self.assertIs(type(restored), type(source))
        self.assertEqual(restored.get_device(), source.get_device())
1432 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_multigpu_serialization(self):
    """Tensors on two GPUs are restored on their original devices."""
    sources = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
    with tempfile.NamedTemporaryFile() as handle:
        torch.save(sources, handle)
        handle.seek(0)
        loaded = torch.load(handle)
    for source, restored in zip(sources, loaded):
        self.assertEqual(restored, source)
        self.assertIs(type(restored), type(source))
        self.assertEqual(restored.get_device(), source.get_device())
1444 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_multigpu_serialization_remap(self):
    """A callable ``map_location`` can move ``cuda:1`` storages to device 0."""
    sources = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]

    def gpu_remap(storage, location):
        # Only 'cuda:1' is remapped; every other location returns None.
        if location != 'cuda:1':
            return None
        return storage.cuda(0)

    with tempfile.NamedTemporaryFile() as handle:
        torch.save(sources, handle)
        handle.seek(0)
        loaded = torch.load(handle, map_location=gpu_remap)

    for source, restored in zip(sources, loaded):
        self.assertEqual(restored, source)
        self.assertIs(type(restored), type(source))
        self.assertEqual(restored.get_device(), 0)
1462 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_multigpu_serialization_remap_dict(self):
    """A dict ``map_location`` can move ``cuda:1`` tensors to ``cuda:0``."""
    sources = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
    device_map = {'cuda:1': 'cuda:0'}
    with tempfile.NamedTemporaryFile() as handle:
        torch.save(sources, handle)
        handle.seek(0)
        loaded = torch.load(handle, map_location=device_map)
    for source, restored in zip(sources, loaded):
        self.assertEqual(restored, source)
        self.assertIs(type(restored), type(source))
        self.assertEqual(restored.get_device(), 0)
1474 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_multigpu_storage_clone(self):
    """Cloning or converting a ``cuda:1`` storage keeps it on that device."""
    source = torch.randn(4, 4, device='cuda:1').storage()
    duplicate = source.clone()
    self.assertEqual(source.get_device(), duplicate.get_device())
    conversions = ['byte', 'char', 'short', 'int', 'long', 'half', 'double']
    for method_name in conversions:
        converted = getattr(source, method_name)()
        self.assertEqual(converted.get_device(), source.get_device())
1482 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_cuda_set_device(self):
    """``set_device`` inside a ``torch.cuda.device`` block is undone on exit.

    The device used by ``.cuda()`` is checked after each change of the
    current device.
    """
    x = torch.randn(5, 5)
    with torch.cuda.device(1):
        self.assertEqual(x.cuda().get_device(), 1)
        # NOTE(review): the two set_device() calls were missing from the
        # listing (embedded lines 1488 and 1493 are skipped). They are
        # reconstructed from the assertions: the expected device flips from
        # 1 to 0 here with no other statement that could cause it.
        torch.cuda.set_device(0)
        self.assertEqual(x.cuda().get_device(), 0)
        with torch.cuda.device(1):
            self.assertEqual(x.cuda().get_device(), 1)
        self.assertEqual(x.cuda().get_device(), 0)
        # select device 1 again so that leaving the outer block, not this
        # call, is what brings the current device back to 0
        torch.cuda.set_device(1)
    self.assertEqual(x.cuda().get_device(), 0)
1495 
def test_is_tensor(self):
    """``torch.is_tensor`` accepts an empty tensor of every CUDA type."""
    for cpu_type in types:
        gpu_constructor = get_gpu_type(cpu_type)
        empty_tensor = gpu_constructor()
        self.assertTrue(torch.is_tensor(empty_tensor))
    # half is checked explicitly, in addition to the types listed above
    half_tensor = torch.cuda.HalfTensor()
    self.assertTrue(torch.is_tensor(half_tensor))
1501 
def test_cuda_synchronize(self):
    """Smoke test: ``torch.cuda.synchronize()`` can be called without error."""
    # NOTE(review): the body was missing from the listing (embedded line
    # 1503 is skipped), leaving a bare ``def``. Restored as the call the
    # test name refers to -- confirm against the upstream file.
    torch.cuda.synchronize()
1504 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@skipIfRocm
def test_current_stream(self):
    """``current_stream`` honours both the current device and ``device=``.

    Checked once with device 0 current and once inside
    ``torch.cuda.device(cuda:1)``; a CPU device must be rejected.
    """
    d0 = torch.device('cuda:0')
    d1 = torch.device('cuda:1')

    # NOTE(review): every ``s0 = ...`` and the second ``s1 = ...`` below
    # were missing from the listing (embedded lines 1511, 1521 and 1522 are
    # skipped). They are reconstructed from the assertions that follow.
    s0 = torch.cuda.current_stream()
    s1 = torch.cuda.current_stream(device=1)
    s2 = torch.cuda.current_stream(device=0)

    self.assertEqual(d0, s0.device)
    self.assertEqual(d1, s1.device)
    self.assertEqual(d0, s2.device)
    # no argument and device=0 must agree while device 0 is current
    self.assertEqual(s0, s2)

    with torch.cuda.device(d1):
        s0 = torch.cuda.current_stream()
        s1 = torch.cuda.current_stream(1)
        s2 = torch.cuda.current_stream(d0)

    self.assertEqual(d1, s0.device)
    self.assertEqual(d1, s1.device)
    self.assertEqual(d0, s2.device)
    # no argument and an explicit 1 must agree while device 1 is current
    self.assertEqual(s0, s1)

    with self.assertRaisesRegex(ValueError,
                                "Expected a cuda device, but got: cpu"):
        torch.cuda.current_stream(torch.device('cpu'))
1533 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@skipIfRocm
def test_default_stream(self):
    """``default_stream`` honours both the current device and ``device=``.

    The default stream of each device must also be that device's current
    stream when no other stream has been selected; a CPU device must be
    rejected.
    """
    d0 = torch.device('cuda:0')
    d1 = torch.device('cuda:1')

    # NOTE(review): both ``with`` bodies were missing from the listing
    # (embedded lines 1541 and 1544 are skipped). They are reconstructed
    # from the later ``s0 == s2`` and ``s1 == s3`` assertions.
    with torch.cuda.device(d0):
        s0 = torch.cuda.default_stream()

    with torch.cuda.device(d1):
        s1 = torch.cuda.default_stream()

    s2 = torch.cuda.default_stream(device=0)
    s3 = torch.cuda.default_stream(d1)

    self.assertEqual(d0, s0.device)
    self.assertEqual(d1, s1.device)
    self.assertEqual(d0, s2.device)
    self.assertEqual(d1, s3.device)
    self.assertEqual(s0, s2)
    self.assertEqual(s1, s3)

    with torch.cuda.device(d0):
        self.assertEqual(torch.cuda.current_stream(), s0)

    with torch.cuda.device(d1):
        self.assertEqual(torch.cuda.current_stream(), s1)

    with self.assertRaisesRegex(ValueError,
                                "Expected a cuda device, but got: cpu"):
        torch.cuda.default_stream(torch.device('cpu'))
1565 
def test_streams(self):
    """Basic stream identity, selection via context manager, and ``query()``."""
    initial_stream = torch.cuda.current_stream()
    side_stream = torch.cuda.Stream()
    self.assertEqual(torch.cuda.current_stream(), initial_stream)
    self.assertNotEqual(initial_stream, side_stream)
    # the initial stream has raw handle 0; a user-created one does not
    self.assertEqual(initial_stream.cuda_stream, 0)
    self.assertNotEqual(side_stream.cuda_stream, 0)
    with torch.cuda.stream(side_stream):
        self.assertEqual(torch.cuda.current_stream(), side_stream)
    self.assertTrue(side_stream.query())
    # copy 10 MB tensor from CPU-GPU which should take some time
    pinned_source = torch.ByteTensor(10000000).pin_memory()
    device_copy = pinned_source.cuda(non_blocking=True)  # noqa: F841
    self.assertFalse(initial_stream.query())
    initial_stream.synchronize()
    self.assertTrue(initial_stream.query())
1582 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@skipIfRocm
def test_stream_event_device(self):
    """Streams and events report the device they were created or recorded on.

    An event has no device until it is recorded.
    """
    d0 = torch.device('cuda:0')
    d1 = torch.device('cuda:1')
    e0 = torch.cuda.Event()

    self.assertEqual(None, e0.device)

    with torch.cuda.device(d0):
        # NOTE(review): this assignment was missing from the listing
        # (embedded line 1593 is skipped). Any stream on device 0 satisfies
        # the assertions below; the device's current stream is assumed.
        s0 = torch.cuda.current_stream()
        s0.record_event(e0)

    with torch.cuda.device(d1):
        s1 = torch.cuda.Stream()
        e1 = s1.record_event()

    self.assertEqual(s0.device, torch.device('cuda:0'))
    self.assertEqual(e0.device, torch.device('cuda:0'))
    self.assertEqual(s1.device, torch.device('cuda:1'))
    self.assertEqual(e1.device, torch.device('cuda:1'))
1604 
def test_stream_event_repr(self):
    """``repr`` of streams and events names their class.

    The event is checked both before and after it has been recorded.
    """
    # NOTE(review): this assignment was missing from the listing (embedded
    # line 1606 is skipped); the current stream is assumed.
    s = torch.cuda.current_stream()
    # assertIn reports both operands on failure, unlike assertTrue(a in b)
    self.assertIn("torch.cuda.Stream", repr(s))
    e = torch.cuda.Event()
    self.assertIn("torch.cuda.Event", repr(e))
    s.record_event(e)
    self.assertIn("torch.cuda.Event", repr(e))
1612 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
# Note: fails sometimes on the CI, passes on dual gfx906
@skipIfRocm
def test_stream_context(self):
    """Nested ``torch.cuda.stream`` blocks switch stream and device, then restore.

    Entering a stream's context makes that stream current and selects its
    device; leaving the context restores the previous stream and device,
    including the stream that was current on the other device.
    """
    # NOTE(review): this assignment was missing from the listing (embedded
    # line 1617 is skipped). It is reconstructed from the assertions that
    # compare ``current_stream()`` with ``s0`` outside any stream context.
    s0 = torch.cuda.current_stream()
    s1 = torch.cuda.Stream(device=1)
    s2 = torch.cuda.Stream(device=0)

    # remember what is current on device 1 so restoration can be checked
    with torch.cuda.device(s1.device):
        prev_stream_on_cuda1 = torch.cuda.current_stream()

    self.assertEqual(torch.cuda.current_stream(), s0)
    self.assertEqual(0, torch.cuda.current_device())
    with torch.cuda.stream(s1):
        self.assertEqual(torch.cuda.current_stream(), s1)
        self.assertEqual(1, torch.cuda.current_device())
        with torch.cuda.stream(s2):
            self.assertEqual(torch.cuda.current_stream(), s2)
            self.assertEqual(0, torch.cuda.current_device())
            with torch.cuda.stream(s0):
                self.assertEqual(torch.cuda.current_stream(), s0)
                self.assertEqual(0, torch.cuda.current_device())
            self.assertEqual(torch.cuda.current_stream(), s2)
            self.assertEqual(0, torch.cuda.current_device())
        self.assertEqual(torch.cuda.current_stream(), s1)
        self.assertEqual(1, torch.cuda.current_device())

    with torch.cuda.device(s1.device):
        self.assertEqual(prev_stream_on_cuda1, torch.cuda.current_stream())

    self.assertEqual(torch.cuda.current_stream(), s0)
    self.assertEqual(0, torch.cuda.current_device())
1645 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@skipIfRocm
def test_streams_multi_gpu(self):
    """Streams report their device; each device has its own current stream."""
    dev0 = torch.device('cuda:0')
    dev1 = torch.device('cuda:1')

    stream_on_dev0 = torch.cuda.current_stream()
    self.assertEqual(stream_on_dev0.device, dev0)
    created_on_dev1 = torch.cuda.Stream(device=1)
    self.assertEqual(created_on_dev1.device, dev1)
    with torch.cuda.device(1):
        self.assertEqual(torch.cuda.current_stream().device, dev1)
        self.assertNotEqual(torch.cuda.current_stream(), stream_on_dev0)
1657 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@skipIfRocm
def test_streams_multi_gpu_query(self):
    """``Stream.query``/``synchronize`` work whichever device is current.

    A spin kernel is queued on device 1 only. Its stream must report
    pending work, and device 0's stream none, from every device context,
    until the busy stream is synchronized.
    """
    d0 = torch.device('cuda:0')
    d1 = torch.device('cuda:1')

    # NOTE(review): both stream assignments were missing from the listing
    # (embedded lines 1665 and 1668 are skipped). ``s1`` is taken to be
    # device 1's current stream because the ``_sleep`` issued in that
    # context is what the later ``assertFalse(s1.query())`` relies on.
    with torch.cuda.device(d0):
        s0 = torch.cuda.current_stream()

    with torch.cuda.device(d1):
        s1 = torch.cuda.current_stream()
        torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES)

    self.assertTrue(s0.query())
    self.assertFalse(s1.query())

    with torch.cuda.device(d0):
        self.assertTrue(s0.query())
        self.assertFalse(s1.query())

    with torch.cuda.device(d1):
        self.assertTrue(s0.query())
        self.assertFalse(s1.query())

    # deliberately using a different device
    with torch.cuda.device(d0):
        s1.synchronize()

    self.assertTrue(s0.query())
    self.assertTrue(s1.query())

    with torch.cuda.device(d0):
        self.assertTrue(s0.query())
        self.assertTrue(s1.query())

    with torch.cuda.device(d1):
        self.assertTrue(s0.query())
        self.assertTrue(s1.query())
1696 
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_streams_multi_gpu_eq(self):
    """Stream equality and hashing depend on device and underlying stream.

    Two handles to the same stream on one device compare and hash equal;
    handles from different devices do not.
    """
    d0 = torch.device('cuda:0')
    d1 = torch.device('cuda:1')

    # NOTE(review): all four assignments were missing from the listing
    # (embedded lines 1703-1704 and 1707-1708 are skipped). They are
    # reconstructed from the assertions: s0/s1 and s2/s3 must share a
    # device and a ``cuda_stream`` value.
    with torch.cuda.device(d0):
        s0 = torch.cuda.current_stream()
        s1 = torch.cuda.current_stream()

    with torch.cuda.device(d1):
        s2 = torch.cuda.current_stream()
        s3 = torch.cuda.current_stream()

    # ``==`` is exercised directly because it is the operator under test
    self.assertTrue(s0 == s0)
    self.assertTrue(s0 == s1)
    self.assertTrue(s2 == s2)
    self.assertTrue(s2 == s3)
    self.assertFalse(s0 == s2)
    self.assertFalse(s1 == s3)

    self.assertEqual(s0.device, s1.device)
    self.assertEqual(s0.cuda_stream, s1.cuda_stream)
    self.assertEqual(s2.device, s3.device)
    self.assertEqual(s2.cuda_stream, s3.cuda_stream)
    self.assertNotEqual(s0.device, s3.device)

    self.assertEqual(hash(s0), hash(s1))
    self.assertEqual(hash(s2), hash(s3))
    self.assertNotEqual(hash(s0), hash(s3))
1726 
1727  @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
1728  @skipIfRocm
1729  def test_streams_priority(self):
1730  low, high = torch.cuda.Stream.priority_range()
1731  s0 = torch.cuda.Stream(device=0, priority=low)
1732 
1733  self.assertEqual(low, s0.priority)
1734  self.assertEqual(torch.device('cuda:0'), s0.device)
1735 
1736  s1 = torch.cuda.Stream(device=1, priority=high)
1737 
1738  self.assertEqual(high, s1.priority)
1739  self.assertEqual(torch.device('cuda:1'), s1.device)
1740 
1741  @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
1742  def test_tensor_device(self):
1743  self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 0)
1744  self.assertEqual(torch.cuda.FloatTensor(1, device=1).get_device(), 1)
1745  with torch.cuda.device(1):
1746  self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 1)
1747  self.assertEqual(torch.cuda.FloatTensor(1, device=0).get_device(), 0)
1748  self.assertEqual(torch.cuda.FloatTensor(1, device=None).get_device(), 1)
1749 
1750  @skipIfRocm
1751  def test_events(self):
1752  stream = torch.cuda.current_stream()
1753  event = torch.cuda.Event(enable_timing=True)
1754  self.assertTrue(event.query())
1755  start_event = torch.cuda.Event(enable_timing=True)
1756  stream.record_event(start_event)
1757  torch.cuda._sleep(int(50 * get_cycles_per_ms()))
1758  stream.record_event(event)
1759  self.assertFalse(event.query())
1760  event.synchronize()
1761  self.assertTrue(event.query())
1762  self.assertGreater(start_event.elapsed_time(event), 0)
1763 
1764  @staticmethod
1765  def _stream_synchronize(self, spin_time_cycles):
1767  e_tik = torch.cuda.Event(enable_timing=True)
1768  e_tok = torch.cuda.Event(enable_timing=True)
1769 
1770  e_tik.record(s)
1771  torch.cuda._sleep(spin_time_cycles)
1772  e_tok.record(s)
1773  s.synchronize()
1774 
1775  self.assertTrue(s.query())
1776 
1777  # not necessary to check e_tik and e_tok, as elapsed_time would throw
1778  # exception if otherwise.
1779  return e_tik.elapsed_time(e_tok)
1780 
1781  @staticmethod
1782  def _event_synchronize(self, spin_time_cycles):
1784  e_tik = torch.cuda.Event(enable_timing=True)
1785  e_tok = torch.cuda.Event(enable_timing=True)
1786 
1787  e_tik.record(s)
1788  torch.cuda._sleep(spin_time_cycles)
1789  s.record_event(e_tok)
1790  e_tok.synchronize()
1791 
1792  self.assertTrue(s.query())
1793 
1794  # not necessary to check e_tik and e_tok, as elapsed_time would throw
1795  # exception if otherwise.
1796  return e_tik.elapsed_time(e_tok)
1797 
1798  @staticmethod
1799  def _event_wait(self, spin_time_cycles):
1801  s1 = torch.cuda.Stream()
1802  e_tik = torch.cuda.Event(blocking=True, enable_timing=True)
1803  e_tok = torch.cuda.Event(blocking=True, enable_timing=True)
1804 
1805  e_tik.record(s0)
1806  torch.cuda._sleep(spin_time_cycles - 10)
1807  e_sync = torch.cuda.Event(blocking=True)
1808  e_sync.record()
1809  e_sync.wait(s1)
1810  with torch.cuda.stream(s1):
1811  torch.cuda._sleep(10)
1812  s1.synchronize()
1813  s1.record_event(e_tok)
1814 
1815  self.assertTrue(s0.query())
1816  self.assertTrue(s1.query())
1817  self.assertTrue(e_sync.query())
1818 
1819  # not necessary to check e_tik and e_tok, as elapsed_time would throw
1820  # exception if otherwise.
1821  return e_tik.elapsed_time(e_tok)
1822 
1823  @staticmethod
1824  def _test_stream_event_nogil(self, sync_func, p2c, c2p):
1825  with torch.cuda.device('cuda:1'):
1826  c2p.put(0)
1827  p2c.get()
1828  c2p.put(sync_func(self, TestCuda.FIFTY_MIL_CYCLES))
1829 
1830  @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
1831  @skipIfRocm
1832  def test_stream_event_nogil(self):
1833  for sync_func in [TestCuda._stream_synchronize,
1834  TestCuda._event_synchronize,
1835  TestCuda._event_wait]:
1836  p2c = queue.Queue()
1837  c2p = queue.Queue()
1838  e_tik = torch.cuda.Event(enable_timing=True)
1839  e_tok = torch.cuda.Event(enable_timing=True)
1840 
1841  t = threading.Thread(
1842  target=TestCuda._test_stream_event_nogil,
1843  args=(self, sync_func, p2c, c2p))
1844  t.daemon = True
1845  t.start()
1846 
1847  c2p.get()
1848  with torch.cuda.device('cuda:0'):
1849  e_tik.record()
1850  p2c.put(0)
1851  parent_time = sync_func(self, TestCuda.FIFTY_MIL_CYCLES)
1852  child_time = c2p.get()
1853  e_tok.record()
1854  e_tok.synchronize()
1855  total_time = e_tik.elapsed_time(e_tok)
1856 
1857  # Without GIL, synchronizations in parent and child threads can
1858  # overlap. The total execution time should be a little bit longer
1859  # than spinning fifty million cycles and much shorter than twice of
1860  # that. However, testing absolute execution time is not reliable as
1861  # it may vary on different hardware in different environments.
1862  # Therefore, this test uses relative comparisons, checking if the
1863  # sum of parent and child threads execution time is greater than the
1864  # real execution time by least 40%.
1865  self.assertGreater(parent_time + child_time, total_time * 1.4)
1866 
1867  @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
1868  @skipIfRocm
1869  def test_events_wait(self):
1870  d0 = torch.device('cuda:0')
1871  d1 = torch.device('cuda:1')
1872 
1873  with torch.cuda.device(d0):
1875  torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES)
1876  e0 = torch.cuda.Event()
1877  s0.record_event(e0)
1878 
1879  with torch.cuda.device(d1):
1881 
1882  self.assertFalse(s0.query())
1883  self.assertTrue(s1.query())
1884 
1885  s1.wait_event(e0)
1886  s1.synchronize()
1887 
1888  self.assertTrue(e0.query())
1889  self.assertTrue(s0.query())
1890  self.assertTrue(s1.query())
1891 
1892  @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
1893  @skipIfRocm
1894  def test_events_multi_gpu_query(self):
1895  d0 = torch.device('cuda:0')
1896  d1 = torch.device('cuda:1')
1897 
1898  with torch.cuda.device(d0):
1900  e0 = s0.record_event()
1901 
1902  with torch.cuda.device(d1):
1904  torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES)
1905  e1 = s1.record_event()
1906 
1907  self.assertTrue(e0.query())
1908  self.assertFalse(e1.query())
1909 
1910  with torch.cuda.device(d0):
1911  self.assertTrue(e0.query())
1912  self.assertFalse(e1.query())
1913 
1914  with torch.cuda.device(d1):
1915  self.assertTrue(e0.query())
1916  self.assertFalse(e1.query())
1917 
1918  # deliberately using a different device
1919  with torch.cuda.device(d0):
1920  e1.synchronize()
1921 
1922  self.assertTrue(e0.query())
1923  self.assertTrue(e1.query())
1924 
1925  with torch.cuda.device(d0):
1926  self.assertTrue(e0.query())
1927  self.assertTrue(e1.query())
1928 
1929  with torch.cuda.device(d1):
1930  self.assertTrue(e0.query())
1931  self.assertTrue(e1.query())
1932 
1933  @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
1934  @skipIfRocm
1935  def test_events_multi_gpu_elapsed_time(self):
1936  d0 = torch.device('cuda:0')
1937  d1 = torch.device('cuda:1')
1938 
1939  with torch.cuda.device(d0):
1941  e0 = torch.cuda.Event(enable_timing=True)
1942  torch.cuda._sleep(10)
1943  s0.record_event(e0)
1944 
1945  with torch.cuda.device(d1):
1947  e1 = torch.cuda.Event(enable_timing=True)
1948  torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES)
1949  s1.record_event(e1)
1950 
1951  e0.synchronize()
1952  e1.synchronize()
1953  with torch.cuda.device(d0):
1954  with self.assertRaises(RuntimeError):
1955  self.assertGreater(e0.elapsed_time(e1), 0)
1956 
1957  with torch.cuda.device(d1):
1958  with self.assertRaises(RuntimeError):
1959  self.assertGreater(e0.elapsed_time(e1), 0)
1960 
1961  with torch.cuda.device(d0):
1963  e2 = torch.cuda.Event(enable_timing=True)
1964  torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES)
1965  s0.record_event(e2)
1966  s0.synchronize()
1967 
1968  self.assertGreater(e0.elapsed_time(e2), 0)
1969 
1970  # deliberately calling from a different device
1971  with torch.cuda.device(d1):
1972  self.assertGreater(e0.elapsed_time(e2), 0)
1973 
1974  @skipIfRocm
1975  def test_record_stream(self):
1976  cycles_per_ms = get_cycles_per_ms()
1977 
1978  t = torch.FloatTensor([1, 2, 3, 4]).pin_memory()
1979  result = torch.cuda.FloatTensor(t.size())
1980  stream = torch.cuda.Stream()
1981  ptr = [None]
1982 
1983  # Performs the CPU->GPU copy in a background stream
1984  def perform_copy():
1985  with torch.cuda.stream(stream):
1986  tmp = t.cuda(non_blocking=True)
1987  ptr[0] = tmp.data_ptr()
1988  torch.cuda.current_stream().wait_stream(stream)
1989  tmp.record_stream(torch.cuda.current_stream())
1990  torch.cuda._sleep(int(50 * cycles_per_ms)) # delay the copy
1991  result.copy_(tmp)
1992 
1993  perform_copy()
1994  with torch.cuda.stream(stream):
1995  tmp2 = torch.cuda.FloatTensor(t.size())
1996  tmp2.zero_()
1997  self.assertNotEqual(tmp2.data_ptr(), ptr[0], 'allocation re-used to soon')
1998 
1999  self.assertEqual(result.tolist(), [1, 2, 3, 4])
2000 
2001  # Check that the block will be re-used after the main stream finishes
2002  torch.cuda.current_stream().synchronize()
2003  with torch.cuda.stream(stream):
2004  tmp3 = torch.cuda.FloatTensor(t.size())
2005  self.assertEqual(tmp3.data_ptr(), ptr[0], 'allocation not re-used')
2006 
2007  def test_noncontiguous_pinned_memory(self):
2008  # See issue #3266
2009  x = torch.arange(0, 10).view((2, 5))
2010  self.assertEqual(x.t(), x.t().pin_memory())
2011 
2012  @skipIfRocm
2013  def test_caching_pinned_memory(self):
2014  cycles_per_ms = get_cycles_per_ms()
2015 
2016  # check that allocations are re-used after deletion
2017  t = torch.FloatTensor([1]).pin_memory()
2018  ptr = t.data_ptr()
2019  del t
2020  t = torch.FloatTensor([1]).pin_memory()
2021  self.assertEqual(t.data_ptr(), ptr, 'allocation not reused')
2022 
2023  # check that the allocation is not re-used if it's in-use by a copy
2024  gpu_tensor = torch.cuda.FloatTensor([0])
2025  torch.cuda._sleep(int(50 * cycles_per_ms)) # delay the copy
2026  gpu_tensor.copy_(t, non_blocking=True)
2027  del t
2028  t = torch.FloatTensor([1]).pin_memory()
2029  self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
2030  self.assertEqual(list(gpu_tensor), [1])
2031 
2032  @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
2033  @skipIfRocm
2034  def test_caching_pinned_memory_multi_gpu(self):
2035  # checks that the events preventing pinned memory from being re-used
2036  # too early are recorded on the correct GPU
2037  cycles_per_ms = get_cycles_per_ms()
2038 
2039  t = torch.FloatTensor([1]).pin_memory()
2040  ptr = t.data_ptr()
2041  gpu_tensor0 = torch.cuda.FloatTensor([0], device=0)
2042  gpu_tensor1 = torch.cuda.FloatTensor([0], device=1)
2043 
2044  with torch.cuda.device(1):
2045  torch.cuda._sleep(int(50 * cycles_per_ms)) # delay the copy
2046  gpu_tensor1.copy_(t, non_blocking=True)
2047 
2048  del t
2049  t = torch.FloatTensor([2]).pin_memory()
2050  self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
2051 
2052  with torch.cuda.device(0):
2053  gpu_tensor0.copy_(t, non_blocking=True)
2054 
2055  self.assertEqual(gpu_tensor1[0], 1)
2056  self.assertEqual(gpu_tensor0[0], 2)
2057 
2058  def test_reduction_gpu_memory_accessing(self):
2059  x = torch.ones(512, 8, dtype=torch.float32, device='cuda')
2060  torch.sum(x, 0)
2061 
2062  def test_sum_cpu_gpu_mismatch(self):
2063  x = torch.randn(20, dtype=torch.float32, device='cuda')
2064  y = torch.randn(1, dtype=torch.float32)
2065  with self.assertRaisesRegex(RuntimeError, 'expected type'
2066  ' torch.FloatTensor but got'
2067  ' torch.cuda.FloatTensor'):
2068  torch.sum(x, dim=[0], dtype=torch.float32, out=y)
2069  # makeing sure half to float promotion is also properly working.
2070  x = x.half()
2071  with self.assertRaisesRegex(RuntimeError, 'expected type'
2072  ' torch.FloatTensor but got'
2073  ' torch.cuda.HalfTensor'):
2074  torch.sum(x, dim=[0], dtype=torch.float32, out=y)
2075 
2076  @skipIfRocm
2077  def test_sum_noncontig(self):
2078  x = torch.randn(1, 75, 57, 20, device='cuda').permute(0, 3, 1, 2)
2079  y = x.cpu()
2080  self.assertEqual(x.sum().cpu(), y.sum())
2081  self.assertEqual(x.sum(dim=(-1, -2)).cpu(), y.sum(dim=(-1, -2)))
2082  self.assertEqual(x.sum(dim=(1, 3)).cpu(), y.sum(dim=(1, 3)))
2083 
2084  def test_sum_fp16(self):
2085  x = torch.zeros(10, device='cuda', dtype=torch.float16)
2086  self.assertEqual(x.sum(), 0)
2087 
2088  x = torch.ones(65504, device='cuda', dtype=torch.float16)
2089  self.assertEqual(x.sum(), 65504)
2090  self.assertEqual(x.sum(dtype=torch.float32), 65504)
2091 
2092  x = torch.ones(65536, device='cuda', dtype=torch.float16)
2093  self.assertEqual(x.sum(dtype=torch.float32), 65536)
2094 
2095  a = torch.zeros(1203611).bernoulli_(0.0005)
2096  x = a.to(device='cuda', dtype=torch.float16)
2097  self.assertEqual(x.sum().item(), a.sum().item())
2098 
2099  a = torch.zeros(100, 121, 80).bernoulli_(0.0005)
2100  x = a.to(device='cuda', dtype=torch.float16)
2101  self.assertEqual(x.sum((0, 2)).float().cpu(), a.sum((0, 2)))
2102 
2103  def test_mean_fp16(self):
2104  x = torch.ones(65536, device='cuda', dtype=torch.float16)
2105  self.assertEqual(x.mean(), 1)
2106 
2107  x = torch.ones(65536, device='cuda', dtype=torch.float16)
2108  self.assertEqual(x.mean(dtype=torch.float32), 1)
2109 
2110  def test_prod_large(self):
2111  # tests global reduction (should_global_reduce = true) in case of non-zero identity element
2112  x = torch.ones(240000, device='cuda', dtype=torch.float32)
2113  self.assertEqual(x.prod(), 1)
2114 
2115  @staticmethod
2116  def _select_broadcastable_dims(dims_full=None):
2117  return _TestTorchMixin._select_broadcastable_dims(dims_full)
2118 
2119  @skipIfRocm
2120  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2121  def test_inverse(self):
2122  _TestTorchMixin._test_inverse(self, lambda t: t.cuda())
2123 
2124  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2125  def test_pinverse(self):
2126  _TestTorchMixin._test_pinverse(self, lambda t: t.cuda())
2127 
2128  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2129  def test_matrix_rank(self):
2130  _TestTorchMixin._test_matrix_rank(self, lambda x: x.cuda())
2131 
2132  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2133  def test_matrix_power(self):
2134  _TestTorchMixin._test_matrix_power(self, conv_fn=lambda t: t.cuda())
2135 
2136  def test_chain_matmul(self):
2137  _TestTorchMixin._test_chain_matmul(self, cast=lambda t: t.cuda())
2138 
2139  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2140  def test_det_logdet_slogdet(self):
2141  _TestTorchMixin._test_det_logdet_slogdet(self, lambda t: t.cuda())
2142 
2143  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2144  def test_solve(self):
2145  _TestTorchMixin._test_solve(self, lambda t: t.cuda())
2146 
2147  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2148  def test_solve_batched(self):
2149  _TestTorchMixin._test_solve_batched(self, lambda t: t.cuda())
2150 
2151  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2152  def test_solve_batched_dims(self):
2153  _TestTorchMixin._test_solve_batched_dims(self, lambda t: t.cuda())
2154 
2155  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2156  def test_cholesky_solve(self):
2157  _TestTorchMixin._test_cholesky_solve(self, lambda t: t.cuda())
2158 
2159  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2160  def test_cholesky_solve_batched(self):
2161  _TestTorchMixin._test_cholesky_solve_batched(self, lambda t: t.cuda())
2162 
2163  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2164  def test_cholesky_solve_batched_dims(self):
2165  _TestTorchMixin._test_cholesky_solve_batched_dims(self, lambda t: t.cuda())
2166 
2167  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2168  def test_cholesky(self):
2169  _TestTorchMixin._test_cholesky(self, lambda t: t.cuda())
2170 
2171  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2172  def test_cholesky_batched(self):
2173  _TestTorchMixin._test_cholesky_batched(self, lambda t: t.cuda())
2174 
2175  def test_view(self):
2176  _TestTorchMixin._test_view(self, lambda t: t.cuda())
2177 
2178  def test_flip(self):
2179  _TestTorchMixin._test_flip(self, use_cuda=True)
2180 
2181  def test_rot90(self):
2182  _TestTorchMixin._test_rot90(self, use_cuda=True)
2183 
2184  def test_signal_window_functions(self):
2185  _TestTorchMixin._test_signal_window_functions(self, device=torch.device('cuda'))
2186 
2187  @skipIfRocm
2188  def test_fft_ifft_rfft_irfft(self):
2189  _TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda'))
2190 
2191  @contextmanager
2192  def plan_cache_max_size(n):
2193  original = torch.backends.cuda.cufft_plan_cache.max_size
2194  torch.backends.cuda.cufft_plan_cache.max_size = n
2195  yield
2196  torch.backends.cuda.cufft_plan_cache.max_size = original
2197 
2198  with plan_cache_max_size(max(1, torch.backends.cuda.cufft_plan_cache.size - 10)):
2199  _TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda'))
2200 
2201  with plan_cache_max_size(0):
2202  _TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda'))
2203 
2204  torch.backends.cuda.cufft_plan_cache.clear()
2205 
2206  # check that stll works after clearing cache
2207  with plan_cache_max_size(10):
2208  _TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda'))
2209 
2210  with self.assertRaisesRegex(RuntimeError, r"must be non-negative"):
2211  torch.backends.cuda.cufft_plan_cache.max_size = -1
2212 
2213  with self.assertRaisesRegex(RuntimeError, r"read-only property"):
2214  torch.backends.cuda.cufft_plan_cache.size = -1
2215 
2216  def test_stft(self):
2217  _TestTorchMixin._test_stft(self, device=torch.device('cuda'))
2218 
2219  def test_multinomial(self):
2220  _TestTorchMixin._test_multinomial(self, torch.cuda.FloatTensor)
2221 
2222  # Test two corner cases from older PyTorch (Issue #4858)
2223  freqs = torch.cuda.FloatTensor([
2224  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
2225  0.03178183361887932, 0.027680952101945877, 0.033176131546497345,
2226  0.046052902936935425, 0.07742464542388916, 0.11543981730937958,
2227  0.14148041605949402, 0.15784293413162231, 0.13180233538150787,
2228  0.08271478116512299, 0.049702685326337814, 0.027557924389839172,
2229  0.018125897273421288, 0.011851548217236996, 0.010252203792333603,
2230  0.007422595750540495, 0.005372154992073774, 0.0045109698548913,
2231  0.0036087757907807827, 0.0035267581697553396, 0.0018864056328311563,
2232  0.0024605290964245796, 0.0022964938543736935, 0.0018453967059031129,
2233  0.0010662291897460818, 0.0009842115687206388, 0.00045109697384759784,
2234  0.0007791675161570311, 0.00020504408166743815, 0.00020504408166743815,
2235  0.00020504408166743815, 0.00012302644609007984, 0.0,
2236  0.00012302644609007984, 4.100881778867915e-05, 0.0, 0.0, 0.0, 0.0,
2237  0.0, 0.0])
2238 
2239  torch.cuda.manual_seed(11042)
2240  sample = torch.multinomial(freqs, 1000, True)
2241  self.assertNotEqual(freqs[sample].min(), 0)
2242 
2243  p = torch.zeros(3421, 2, device="cuda", dtype=torch.float)
2244  p[:, 1] = 1
2245  torch.cuda.manual_seed(5214)
2246  r = torch.multinomial(p, 1)
2247  self.assertNotEqual(r.min().item(), 0)
2248 
2249  # test corner case from Issue #13867
2250  torch.cuda.manual_seed(33)
2251  probs = torch.randn(1000000, device='cuda').clamp(min=0) * 3e-5
2252  samples = probs.multinomial(1000000, replacement=True)
2253  self.assertGreater(probs[samples].min().item(), 0)
2254 
2255  @staticmethod
2256  def mute():
2257  os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stderr.fileno())
2258 
2259  def _spawn_method(self, method, arg):
2260  ctx = mp.get_context("spawn")
2261  with ctx.Pool(1, initializer=self.mute) as pool:
2262  errors = pool.map(method, [arg])
2263  for e in errors:
2264  if 'device-side assert triggered' not in str(e):
2265  self.fail(e)
2266 
2267  @staticmethod
2268  def _test_multinomial_invalid_probs_cuda(probs):
2269  try:
2270  with torch.random.fork_rng(devices=[0]):
2271  torch.multinomial(probs.to('cuda'), 2)
2273  return False # Should not be reached
2274  except RuntimeError as e:
2275  return e
2276 
2277  @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \
2278  don't support multiprocessing with spawn start method")
2279  @unittest.skipIf(IS_WINDOWS, 'FIXME: CUDA OOM error on Windows')
2280  @unittest.skipIf(not PY3,
2281  "spawn start method is not supported in Python 2, \
2282  but we need it for creating another process with CUDA")
2283  def test_multinomial_invalid_probs_cuda(self):
2284  test_method = TestCuda._test_multinomial_invalid_probs_cuda
2285  self._spawn_method(test_method, torch.Tensor([1, -1, 1]))
2286  self._spawn_method(test_method, torch.Tensor([1, inf, 1]))
2287  self._spawn_method(test_method, torch.Tensor([1, -inf, 1]))
2288  self._spawn_method(test_method, torch.Tensor([1, 1, nan]))
2289  self._spawn_method(test_method, torch.Tensor([0, 1, 0]))
2290 
2291  def test_broadcast(self):
2292  _TestTorchMixin._test_broadcast(self, lambda t: t.cuda())
2293 
2294  def test_contiguous(self):
2295  _TestTorchMixin._test_contiguous(self, lambda t: t.cuda())
2296 
2297  def test_broadcast_fused_matmul(self):
2298  _TestTorchMixin._test_broadcast_fused_matmul(self, lambda t: t.cuda())
2299 
2300  def test_broadcast_batched_matmul(self):
2301  _TestTorchMixin._test_broadcast_batched_matmul(self, lambda t: t.cuda())
2302 
2303  def test_index(self):
2304  _TestTorchMixin._test_index(self, lambda t: t.cuda())
2305 
2306  def test_advancedindex(self):
2307  _TestTorchMixin._test_advancedindex(self, lambda t: t.cuda())
2308 
2309  def test_advancedindex_mixed_cpu_cuda(self):
2310  def test(x, ia, ib):
2311  # test getitem
2312  self.assertEqual(x[:, ia, None, ib, 0].cpu(),
2313  x.cpu()[:, ia.cpu(), None, ib.cpu(), 0])
2314  self.assertEqual(x[ia], x.cpu()[ia.cpu()])
2315  # test setitem
2316  x_clone1 = x.clone()
2317  x_clone2 = x.clone()
2318  first_shape = x[:, ia, None, ib, 0].shape
2319  second_shape = x[ia].shape
2320  x_clone1[:, ia, None, ib, 0] = torch.randn(first_shape).to(x_clone1)
2321  x_clone2[ia] = torch.randn(second_shape).to(x_clone2)
2322 
2323  cpu = torch.device('cpu')
2324  for device in ['cuda:0', 'cuda:1'] if torch.cuda.device_count() > 1 else ['cuda']:
2325  # Index cpu tensor with cuda tensor
2326  x = torch.randn(3, 4, 4, 4, 3)
2327  ia = torch.tensor([0, 2, 1]).to(device)
2328  ib = torch.tensor([0, 2, 1]).to(device)
2329  test(x, ia, ib)
2330 
2331  # Index cuda tensor with cpu tensor
2332  x = x.to(device)
2333  ia = ia.to(cpu)
2334  ib = ib.to(cpu)
2335  test(x, ia, ib)
2336 
2337  # Index cpu tensor with mixed cpu, cuda tensors
2338  x = x.to(cpu)
2339  ia = ia.to(cpu)
2340  ib = ib.to(device)
2341  test(x, ia, ib)
2342 
2343  # Index cuda tensor with mixed cpu, cuda tensors
2344  x = x.to(device)
2345  ia = ia.to(cpu)
2346  ib = ib.to(device)
2347  test(x, ia, ib)
2348 
2349  if torch.cuda.device_count() > 1:
2350  other_device = 'cuda:0' if device != 'cuda:0' else 'cuda:1'
2351  # Index cuda tensor with mixed cpu, cuda tensors on different devices
2352  x = x.to(device)
2353  ia = ia.to(cpu)
2354  ib = ib.to(other_device)
2355  test(x, ia, ib)
2356 
2357  def test_advancedindex_big(self):
2358  _TestTorchMixin._test_advancedindex_big(self, lambda t: t.cuda())
2359 
2360  def test_kthvalue(self):
2361  _TestTorchMixin._test_kthvalue(self, device='cuda')
2362 
2363  @skipIfRocm
2364  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2365  def test_btrifact(self):
2366  _TestTorchMixin._test_btrifact(self, lambda t: t.cuda())
2367 
2368  @skipIfRocm
2369  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2370  def test_btrisolve(self):
2371  _TestTorchMixin._test_btrisolve(self, lambda t: t.cuda())
2372 
2373  @skipIfRocm
2374  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2375  def test_btriunpack(self):
2376  _TestTorchMixin._test_btriunpack(self, lambda t: t.cuda())
2377 
2378  def test_dim_reduction(self):
2379  _TestTorchMixin._test_dim_reduction(self, lambda t: t.cuda())
2380 
2381  def test_tensor_gather(self):
2382  _TestTorchMixin._test_gather(self, lambda t: t.cuda(), False)
2383 
2384  def test_tensor_scatter(self):
2385  _TestTorchMixin._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', test_bounds=False)
2386 
2387  def test_tensor_scatterAdd(self):
2388  _TestTorchMixin._test_scatter_base(self, lambda t: t.cuda(), 'scatter_add_', test_bounds=False)
2389 
2390  def test_tensor_scatterFill(self):
2391  _TestTorchMixin._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', True, test_bounds=False)
2392 
2393  def test_min_max_inits(self):
2394  # Testing if THC_reduceAll received the correct index initialization.
2395  # This affects the result of THC_reduceAll operations at extreme values
2396  x = torch.cuda.ByteTensor([0])
2397  y = torch.cuda.ByteTensor([255])
2398  expected = torch.cuda.LongTensor([0])[0]
2399 
2400  _, v = x.max(dim=0)
2401  self.assertEqual(v, expected)
2402 
2403  _, v = y.min(dim=0)
2404  self.assertEqual(v, expected)
2405 
2406  def test_max_with_inf(self):
2407  _TestTorchMixin._test_max_with_inf(self, (torch.half, torch.float, torch.double), 'cuda')
2408 
2409  def test_min_with_inf(self):
2410  _TestTorchMixin._test_min_with_inf(self, (torch.half, torch.float, torch.double), 'cuda')
2411 
2412  def test_rpow(self):
2413  _TestTorchMixin._test_rpow(self, lambda x: x.cuda())
2414 
2415  def test_int_pow(self):
2416  _TestTorchMixin._test_int_pow(self, lambda x: x.cuda())
2417 
2418  def test_remainder_overflow(self):
2419  _TestTorchMixin._test_remainder_overflow(self, dtype=torch.int64, device='cuda')
2420 
2421  def test_var(self):
2422  cpu_tensor = torch.randn(2, 3, 3)
2423  gpu_tensor = cpu_tensor.cuda()
2424  self.assertEqual(gpu_tensor.var(), cpu_tensor.var())
2425  self.assertEqual(gpu_tensor.var(1), cpu_tensor.var(1))
2426  self.assertEqual(gpu_tensor.var(2), cpu_tensor.var(2))
2427  self.assertEqual(gpu_tensor.std(), cpu_tensor.std())
2428  self.assertEqual(gpu_tensor.std(1), cpu_tensor.std(1))
2429  self.assertEqual(gpu_tensor.var(2), cpu_tensor.var(2))
2430 
2431  cpu_tensor = torch.randn(100)
2432  gpu_tensor = cpu_tensor.cuda()
2433  self.assertEqual(gpu_tensor.var(), cpu_tensor.var())
2434 
2435  def test_var_unbiased(self):
2436  tensor = torch.randn(100).cuda()
2437  self.assertEqual(tensor.var(0), tensor.var(0, unbiased=True))
2438  self.assertEqual(tensor.var(), tensor.var(unbiased=True))
2439  self.assertEqual(tensor.var(unbiased=False), tensor.var(0, unbiased=False))
2440 
2441  tensor = torch.FloatTensor([1.0, 2.0]).cuda()
2442  self.assertEqual(tensor.var(unbiased=True), 0.5)
2443  self.assertEqual(tensor.var(unbiased=False), 0.25)
2444 
2445  tensor = torch.randn(100).cuda()
2446  self.assertEqual(tensor.std(0), tensor.std(0, unbiased=True))
2447  self.assertEqual(tensor.std(), tensor.std(unbiased=True))
2448  self.assertEqual(tensor.std(unbiased=False), tensor.std(0, unbiased=False))
2449 
2450  def test_var_large_input(self):
2451  # Large, not-nice input
2452  tensor_cpu = torch.randn(2 * 32 * 1024 + 1, 2, 67)
2453  tensor_cuda = tensor_cpu.cuda()
2454 
2455  self.assertEqual(tensor_cpu.var(2), tensor_cuda.var(2).cpu())
2456 
2457  def test_var_stability(self):
2458  tensor = torch.FloatTensor([2281.5, 2281.25]).cuda()
2459 
2460  # Stability for inner dim
2461  self.assertEqual(tensor.var(0), 0.03125)
2462 
2463  # General stability
2464  self.assertEqual(tensor.var(), 0.03125)
2465 
2466  # Stability for outer dimensions
2467  tensor = tensor.unsqueeze(1)
2468  self.assertEqual(tensor.var(0), 0.03125)
2469 
2470  @skipIfRocm
2471  def test_digamma(self):
2472  def test(use_double=False):
2473  cpu_tensor = torch.randn(10, 10, 10)
2474  gpu_tensor = cpu_tensor.cuda()
2475  zeros = torch.zeros(10, 10, 10)
2476  if (use_double):
2477  cpu_tensor = cpu_tensor.double()
2478  gpu_tensor = gpu_tensor.double()
2479  zeros = zeros.double()
2480  cpu_out = cpu_tensor.digamma()
2481  gpu_out = gpu_tensor.digamma()
2482  norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out
2483  self.assertEqual(norm_errors, zeros)
2484 
2485  test(True)
2486  test(False)
2487 
2488  # Test float32 behavior near and at poles.
2489  cpu_tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111,
2490  -100.99999994, -1931.99999994, 0.000000111,
2491  -0.000000111, 0, -1, -2, -931])
2492  expected_errors = torch.tensor([0, 0, 0, 0, 0, 0, 0, nan, nan, nan, nan])
2493  gpu_tensor = cpu_tensor.cuda()
2494  cpu_out = cpu_tensor.digamma()
2495  gpu_out = gpu_tensor.digamma()
2496  norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out
2497  self.assertEqual(norm_errors, expected_errors)
2498 
2499  @skipIfRocm
2500  def test_polygamma(self):
2501  def test(use_double=False):
2502  cpu_tensor = torch.randn(10, 10, 10)
2503  gpu_tensor = cpu_tensor.cuda()
2504  zeros = torch.zeros(10, 10, 10)
2505  if (use_double):
2506  cpu_tensor = cpu_tensor.double()
2507  gpu_tensor = gpu_tensor.double()
2508  zeros = zeros.double()
2509  for n in [0, 1]:
2510  cpu_out = cpu_tensor.polygamma(n)
2511  gpu_out = gpu_tensor.polygamma(n)
2512  norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out
2513  self.assertEqual(norm_errors, zeros)
2514 
2515  test(True)
2516  test(False)
2517 
2518  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2519  def test_symeig(self):
2520  _TestTorchMixin._test_symeig(self, lambda t: t.cuda())
2521 
2522  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2523  def test_svd_no_singularvectors(self):
2524  _TestTorchMixin._test_svd_no_singularvectors(self, lambda t: t.cuda())
2525 
2526  def test_arange(self):
2527  for t in ['IntTensor', 'LongTensor', 'FloatTensor', 'DoubleTensor']:
2528  a = torch.cuda.__dict__[t]()
2529  torch.arange(0, 10, out=a)
2530  b = torch.__dict__[t]()
2531  torch.arange(0, 10, out=b)
2532  self.assertEqual(a, b.cuda())
2533 
2534  def test_linspace(self):
2535  a = torch.linspace(0, 10, 10, device='cuda')
2536  b = torch.linspace(0, 10, 10)
2537  self.assertEqual(a, b.cuda())
2538 
2539  def test_logspace(self):
2540  a = torch.logspace(1, 10, 10, device='cuda')
2541  b = torch.logspace(1, 10, 10)
2542  self.assertEqual(a, b.cuda())
2543 
2544  def test_lerp(self):
2545  _TestTorchMixin._test_lerp(self, lambda t: t.cuda())
2546 
2547  def test_diagonal(self):
2548  _TestTorchMixin._test_diagonal(self, dtype=torch.float32, device='cuda')
2549 
2550  def test_diagflat(self):
2551  _TestTorchMixin._test_diagflat(self, dtype=torch.float32, device='cuda')
2552 
2553  @unittest.skipIf(not TEST_NUMPY, "NumPy not found")
2554  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2555  @skipIfRocm
2556  def test_norm(self):
2557  _TestTorchMixin._test_norm(self, device='cuda')
2558 
2559  def test_dist(self):
2560  _TestTorchMixin._test_dist(self, device='cuda')
2561 
2562  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2563  def test_geqrf(self):
2564  _TestTorchMixin._test_geqrf(self, lambda t: t.cuda())
2565 
2566  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2567  def test_trtrs(self):
2568  _TestTorchMixin._test_trtrs(self, lambda t: t.cuda())
2569 
2570  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2571  def test_trtrs_batched(self):
2572  _TestTorchMixin._test_trtrs_batched(self, lambda t: t.cuda())
2573 
2574  @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")
2575  def test_trtrs_batched_dims(self):
2576  _TestTorchMixin._test_trtrs_batched_dims(self, lambda t: t.cuda())
2577 
2578  @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_get_set_rng_state_all(self):
    """Check that restoring the states from ``get_rng_state_all`` makes
    ``normal_()`` draws on devices 0 and 1 repeat exactly."""
    device_ids = (0, 1)

    def draw_on_each_device():
        # One 100-element normal sample per device, in device order.
        return [torch.cuda.FloatTensor(100, device=index).normal_()
                for index in device_ids]

    saved_states = torch.cuda.get_rng_state_all()
    first_draws = draw_on_each_device()
    torch.cuda.set_rng_state_all(saved_states)
    second_draws = draw_on_each_device()

    for before, after in zip(first_draws, second_draws):
        self.assertEqual(before, after, 0)
2588 
2589  @skipIfRocm
def test_nvtx(self):
    """Smoke-test the NVTX helpers by calling each one once.

    Nothing is asserted; the test passes when ``range_push``, ``mark`` and
    ``range_pop`` can all be resolved and called.
    """
    # Just making sure we can see the symbols
    torch.cuda.nvtx.range_push("foo")
    torch.cuda.nvtx.mark("bar")
    # Close the range opened above so push/pop stay balanced.
    torch.cuda.nvtx.range_pop()
2595 
def test_randperm_cuda(self):
    """Check that ``torch.randperm`` on CUDA gives the same permutation
    whether it returns a new tensor or writes into ``out=``, and that a
    zero-length permutation is empty."""
    cuda = torch.device('cuda:0')

    # (length, extra kwargs for the returning call, constructor of the out tensor)
    # For small inputs, randperm is offloaded to CPU instead
    cases = (
        (100, {}, torch.cuda.LongTensor),
        (100000, {}, torch.cuda.LongTensor),
        (100, {'dtype': torch.half}, torch.cuda.HalfTensor),
        (50000, {'dtype': torch.half}, torch.cuda.HalfTensor),
    )
    for length, extra_kwargs, make_out in cases:
        # fork_rng puts the RNG state back on exit, so the out= call below
        # starts from the same state as the call inside the block.
        with torch.random.fork_rng(devices=[0]):
            returned = torch.randperm(length, device=cuda, **extra_kwargs)
        written = make_out()
        torch.randperm(length, out=written, device=cuda)
        self.assertEqual(returned, written, 0)

    # randperm of 0 elements is an empty tensor
    empty_returned = torch.randperm(0, device=cuda)
    empty_written = torch.cuda.LongTensor(5)
    torch.randperm(0, out=empty_written, device=cuda)
    self.assertEqual(empty_returned.numel(), 0)
    self.assertEqual(empty_written.numel(), 0)
2630 
def test_random_neg_values(self):
    """Run the shared ``_TestTorchMixin._test_random_neg_values`` check with
    ``use_cuda=True``."""
    shared_check = _TestTorchMixin._test_random_neg_values
    shared_check(self, use_cuda=True)
2633 
def test_bincount_cuda(self):
    """Run the shared bincount check on 'cuda', then compare CPU and CUDA
    ``bincount`` (with and without weights) for three value ranges."""
    _TestTorchMixin._test_bincount(self, device='cuda')
    # ensure CUDA code coverage
    input_size = (5000,)
    weights = torch.randn(input_size, device='cuda')
    weights_cpu = weights.cpu()

    # (exclusive upper bound for randint, dtype of the sampled values)
    cases = (
        # test shared memory impl
        (50, torch.int8),
        # test multi block memory impl
        # see `THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM` in SummaryOps.cu
        (500, torch.int64),
        # test global memory impl
        # see `THRESH_NUMBER_BINS_FOR_GLOBAL_MEM` in SummaryOps.cu
        (2000, torch.int64),
    )
    for upper_bound, value_dtype in cases:
        samples = torch.randint(upper_bound, input_size, dtype=value_dtype, device='cuda')
        self.assertEqual(samples.cpu().bincount(), samples.bincount())
        self.assertEqual(samples.cpu().bincount(weights_cpu), samples.bincount(weights))
2654 
def test_histc_cuda(self):
    """Run the shared ``_TestTorchMixin._test_histc`` check on the 'cuda' device."""
    shared_check = _TestTorchMixin._test_histc
    shared_check(self, device='cuda')
2657 
def test_tiny_half_norm_(self):
    """Check that the norm of a half tensor holding very small values
    (0..24 divided by 1e8) is still greater than zero."""
    tiny = torch.arange(25).cuda().float()
    tiny.div_(100000000)
    norm_value = tiny.half().norm().item()
    self.assertGreater(norm_value, 0)
2663 
def test_norm_type_conversion(self):
    """Check that the p=0 norm of 65536 half-precision ones, computed with
    ``dtype=torch.float32``, equals 65536."""
    count = 65536
    half_ones = torch.ones(count).cuda().half()
    zero_norm = half_ones.norm(p=0, dtype=torch.float32)
    self.assertEqual(zero_norm, count)
2667 
2668  # Note: This test fails on ROCm CI gfx900 but passes on gfx906
2669  @skipIfRocm
2670  # Test that wrap_with_cuda_memory_check successfully detects leak
def test_cuda_memory_leak_detection(self):
    """Check that ``wrap_with_cuda_memory_check`` passes a no-op function
    and raises ``AssertionError`` naming the device when a wrapped
    function leaves a CUDA tensor referenced."""
    # Tensors appended here stay referenced after the wrapped call returns.
    retained = []

    def no_leak():
        pass

    checked_no_leak = self.wrap_with_cuda_memory_check(no_leak)

    def leak_gpu0():
        retained.append(torch.tensor(10, device=torch.device("cuda:0")))

    checked_leak_gpu0 = self.wrap_with_cuda_memory_check(leak_gpu0)

    checked_no_leak()

    with self.assertRaisesRegex(AssertionError, r"leaked \d+ bytes CUDA memory on device 0"):
        checked_leak_gpu0()

    if not TEST_MULTIGPU:
        return

    def leak_gpu1():
        retained.append(torch.tensor(10, device=torch.device("cuda:1")))

    checked_leak_gpu1 = self.wrap_with_cuda_memory_check(leak_gpu1)

    with self.assertRaisesRegex(AssertionError, r"leaked \d+ bytes CUDA memory on device 1"):
        checked_leak_gpu1()
2694 
def test_cuda_memory_leak_detection_propagates_errors(self):
    """Check that a ``RuntimeError`` raised inside
    ``assertLeaksNoCudaTensors`` reaches the caller unchanged."""
    expected_message = r"The size of tensor a \(3\) must match"
    # The leak check is the inner context, so the error has to pass through it.
    with self.assertRaisesRegex(RuntimeError, expected_message), \
            self.assertLeaksNoCudaTensors():
        lhs = torch.randn(3, 1, device='cuda')
        rhs = torch.randn(2, 1, device='cuda')
        # Shapes (3, 1) and (2, 1) do not broadcast, so this addition raises.
        _ = lhs + rhs
2701 
def test_trilu_indices(self):
    """Run ``_compare_trilu_indices`` for every entry of ``tri_tests_args``
    on 'cuda', then check the default-option results of
    ``tril_indices``/``triu_indices`` for a 3x3 matrix."""
    for case_args in tri_tests_args:
        _compare_trilu_indices(self, *case_args, device='cuda')

    # test default options
    ones = torch.ones(
        3, 3, dtype=torch.long, device='cuda', layout=torch.strided)

    lower_expected = ones.tril(0).nonzero().transpose(0, 1)
    lower_actual = torch.tril_indices(3, 3, device='cuda')
    self.assertEqual(lower_expected, lower_actual)

    upper_expected = ones.triu(0).nonzero().transpose(0, 1)
    upper_actual = torch.triu_indices(3, 3, device='cuda')
    self.assertEqual(upper_expected, upper_actual)
2715 
def test_large_trilu_indices(self):
    """Run ``_compare_large_trilu_indices`` on 'cuda' for every entry of
    ``tri_large_tests_args``."""
    for case_args in tri_large_tests_args:
        _compare_large_trilu_indices(self, *case_args, device='cuda')
2719 
def test_triu_tril(self):
    """Run the shared ``_TestTorchMixin._test_triu_tril`` check with a cast
    that moves each tensor to CUDA."""
    def to_cuda(tensor):
        return tensor.cuda()

    _TestTorchMixin._test_triu_tril(self, to_cuda)
2722 
def test_cuda_round(self):
    """Check that ``round()`` on CUDA half, float and double tensors matches
    the expected values listed below."""
    # test half-to-even
    inputs = [-5.8, -3.5, -2.3, -1.5, -0.5, 0.5, 1.5, 2.3, 3.5, 5.8]
    expected_values = [-6., -4., -2., -2., 0., 0., 2., 2., 4., 6.]

    for tensor_type in (torch.HalfTensor, torch.FloatTensor, torch.DoubleTensor):
        rounded_on_gpu = tensor_type(inputs).cuda().round().cpu()
        expected = tensor_type(expected_values).cpu()
        self.assertEqual(rounded_on_gpu, expected)
2737 
2738 
def load_ignore_file():
    """Fill the module-level ``ignores`` set from ``data/test_cuda_ignores.txt``.

    The file is looked up relative to this module's directory. Every line
    that does not start with ``#`` is kept verbatim (blank lines included).
    """
    global ignores
    ignore_path = os.path.join(
        os.path.dirname(__file__), 'data', 'test_cuda_ignores.txt')
    with open(ignore_path, 'r') as handle:
        entries = handle.read().splitlines()
    ignores = set(filter(lambda entry: not entry.startswith('#'), entries))
2745 
2746 
def generate_tests():
    """Attach generated CPU-vs-GPU comparison tests to ``TestCuda``.

    For every declaration in the module-level ``tests`` list and every tensor
    type in ``types``, one test is built with ``compare_cpu_gpu`` for the
    in-place variant (``name + '_'``) and one for the out-of-place variant,
    then stored on ``TestCuda`` as ``test_<TypeName>_<name>[_<desc>]``.

    A declaration is a sequence of 3 to 7 fields::

        (name, constr, arg_constr[, desc[, type_subset[, no_inplace[, decorator]]]])

    ``decorator`` may be a callable, which is applied to the generated test,
    or a string. A string is only interpreted when ``TEST_WITH_ROCM`` is set:
    the text after the first ``":"`` is read as a comma-separated list of
    tensor type names for which the test is wrapped in ``skipIfRocm``.
    Otherwise a string decorator is dropped.

    Raises:
        ValueError: if a declaration does not have between 3 and 7 fields.
        AssertionError: if a generated test name already exists on ``TestCuda``.
    """
    # Type names a string decorator is allowed to request a ROCm skip for.
    rocm_skippable_names = frozenset((
        "ByteTensor", "CharTensor", "DoubleTensor", "FloatTensor",
        "HalfTensor", "IntTensor", "LongTensor", "ShortTensor"))
    # Defaults for the optional fields: desc, type_subset, no_inplace, decorator.
    optional_defaults = ('', types, False, None)

    for decl in tests:
        field_count = len(decl)
        if not 3 <= field_count <= 7:
            # Without this check the fields of the previous declaration would
            # be silently reused for a malformed one.
            raise ValueError(
                "test declaration must have 3 to 7 fields, got {}: {!r}".format(
                    field_count, decl))

        name, constr, arg_constr = decl[:3]
        desc, type_subset, no_inplace, decl_decorator = \
            tuple(decl[3:]) + optional_defaults[field_count - 3:]

        for t in types:
            if t not in type_subset:
                continue
            tensor = t()

            # Resolve the decorator per tensor type; string decorators never
            # reach the generated test as-is.
            decorator = decl_decorator
            if isinstance(decorator, str):
                if TEST_WITH_ROCM:
                    tensor_type_name = str(t.__name__)
                    skip_type_list = decorator.split(":")[1].split(",")
                    if tensor_type_name in rocm_skippable_names \
                            and tensor_type_name in skip_type_list:
                        decorator = skipIfRocm
                    else:
                        decorator = None
                else:
                    decorator = None

            precision = custom_precision.get(name, TestCuda.precision)
            if is_half(t):
                precision = custom_half_precision.get(name, precision)

            for inplace in (True, False):
                if inplace and no_inplace:
                    continue
                name_inner = name + '_' if inplace else name

                if t != torch.HalfTensor and not hasattr(tensor, name_inner):
                    # torch.HalfTensor doesn't support most operations,
                    # but we use torch.FloatTensor as cpu baseline
                    continue
                full_name = '{}.{}'.format(tensor.type(), name_inner)
                if full_name in ignores:
                    continue

                test_name = 'test_' + t.__name__ + '_' + name_inner
                if desc:
                    test_name += '_' + desc

                # Raised explicitly so the duplicate check also runs under -O.
                if hasattr(TestCuda, test_name):
                    raise AssertionError("Duplicated test name: " + test_name)

                test_fn = compare_cpu_gpu(constr, arg_constr, name_inner, t, precision)

                if decorator is not None:
                    test_fn = decorator(test_fn)

                setattr(TestCuda, test_name, test_fn)
2822 
2823 
if __name__ == '__main__':
    # The generated tests are only set up when CUDA is available: first the
    # ignore list is loaded, then the tests are attached to TestCuda.
    if TEST_CUDA:
        for prepare in (load_ignore_file, generate_tests):
            prepare()

    run_tests()
def reset_max_memory_cached(device=None)
Definition: __init__.py:510
def range_pop()
Definition: nvtx.py:56
def max_memory_allocated(device=None)
Definition: __init__.py:428
def set_device(device)
Definition: __init__.py:253
Module caffe2.python.layers.split.
def get_device_properties(device)
Definition: __init__.py:297
Definition: test.py:1
def is_available()
Definition: __init__.py:45
def device_count()
Definition: __init__.py:341
def max_memory_cached(device=None)
Definition: __init__.py:487
def _sleep(cycles)
Definition: __init__.py:53
def fork_rng(devices=None, enabled=True, _caller="fork_rng", _devices_kw="devices")
Definition: random.py:49
def synchronize()
Definition: __init__.py:355
def is_tensor(obj)
Definition: __init__.py:114
def memory_cached(device=None)
Definition: __init__.py:470
def memory_allocated(device=None)
Definition: __init__.py:409
def mark(msg)
Definition: nvtx.py:66
def stream(stream)
Definition: __init__.py:307
def get_device_capability(device=None)
Definition: __init__.py:280
def default_stream(device=None)
Definition: __init__.py:375
def range_push(msg)
Definition: nvtx.py:43
def empty_cache()
Definition: __init__.py:395
def current_device()
Definition: __init__.py:349
def get_device_name(device=None)
Definition: __init__.py:268
def reset_max_memory_allocated(device=None)
Definition: __init__.py:451
def current_stream(device=None)
Definition: __init__.py:361