Caffe2 - C++ API
A deep learning, cross platform ML framework
cuda_stream_test.cpp
1 #include <gtest/gtest.h>
2 
3 #include <ATen/cuda/CUDAContext.h>
4 #include <c10/cuda/CUDAGuard.h>
5 #include <ATen/cuda/CUDAMultiStreamGuard.h>
6 #include <ATen/cuda/CUDAEvent.h>
7 
8 #include <cuda_runtime.h>
9 
10 #include <functional>
11 #include <thread>
12 #include <unordered_set>
13 
14 #define ASSERT_EQ_CUDA(X, Y) \
15  { \
16  bool isTRUE = X == Y; \
17  ASSERT_TRUE(isTRUE); \
18  }
19 
20 #define ASSERT_NE_CUDA(X, Y) \
21  { \
22  bool isFALSE = X == Y; \
23  ASSERT_FALSE(isFALSE); \
24  }
25 
26 /*
27  Tests related to ATen streams.
28  */
29 // Verifies streams are live through copying and moving
30 TEST(TestStream, CopyAndMoveTest) {
31  if (!at::cuda::is_available()) return;
32  int32_t device = -1;
33  cudaStream_t cuda_stream;
34 
35  // Tests that copying works as expected and preserves the stream
36  at::cuda::CUDAStream copyStream = at::cuda::getStreamFromPool();
37  {
38  auto s = at::cuda::getStreamFromPool();
39  device = s.device_index();
40  cuda_stream = s.stream();
41 
42  copyStream = s;
43 
44  ASSERT_EQ_CUDA(copyStream.device_index(), device);
45  ASSERT_EQ_CUDA(copyStream.stream(), cuda_stream);
46  }
47 
48  ASSERT_EQ_CUDA(copyStream.device_index(), device);
49  ASSERT_EQ_CUDA(copyStream.stream(), cuda_stream);
50 
51  // Tests that moving works as expected and preserves the stream
52  at::cuda::CUDAStream moveStream = at::cuda::getStreamFromPool();
53  {
54  auto s = at::cuda::getStreamFromPool();
55  device = s.device_index();
56  cuda_stream = s.stream();
57 
58  moveStream = std::move(s);
59 
60  ASSERT_EQ_CUDA(moveStream.device_index(), device);
61  ASSERT_EQ_CUDA(moveStream.stream(), cuda_stream);
62  }
63 
64  ASSERT_EQ_CUDA(moveStream.device_index(), device);
65  ASSERT_EQ_CUDA(moveStream.stream(), cuda_stream);
66 }
67 
68 // Verifies streams are set properly
69 TEST(TestStream, GetAndSetTest) {
70  if (!at::cuda::is_available()) return;
71  at::cuda::CUDAStream myStream = at::cuda::getStreamFromPool();
72 
73  // Sets and gets
74  at::cuda::setCurrentCUDAStream(myStream);
75  at::cuda::CUDAStream curStream = at::cuda::getCurrentCUDAStream();
76 
77  ASSERT_EQ_CUDA(myStream, curStream);
78 
79  // Gets, sets, and gets default stream
80  at::cuda::CUDAStream defaultStream = at::cuda::getDefaultCUDAStream();
81  at::cuda::setCurrentCUDAStream(defaultStream);
82  curStream = at::cuda::getCurrentCUDAStream();
83 
84  ASSERT_NE_CUDA(defaultStream, myStream);
85  ASSERT_EQ_CUDA(curStream, defaultStream);
86 }
87 
88 void thread_fun(at::optional<at::cuda::CUDAStream>& cur_thread_stream) {
89  auto new_stream = at::cuda::getStreamFromPool();
90  at::cuda::setCurrentCUDAStream(new_stream);
91  cur_thread_stream = {at::cuda::getCurrentCUDAStream()};
92  ASSERT_EQ_CUDA(*cur_thread_stream, new_stream);
93 }
94 
95 // Ensures streams are thread local
96 TEST(TestStream, MultithreadGetAndSetTest) {
97  if (!at::cuda::is_available()) return;
99 
100  std::thread t0{thread_fun, std::ref(s0)};
101  std::thread t1{thread_fun, std::ref(s1)};
102  t0.join();
103  t1.join();
104 
105  at::cuda::CUDAStream cur_stream = at::cuda::getCurrentCUDAStream();
106  at::cuda::CUDAStream default_stream = at::cuda::getDefaultCUDAStream();
107 
108  ASSERT_EQ_CUDA(cur_stream, default_stream);
109  ASSERT_NE_CUDA(cur_stream, *s0);
110  ASSERT_NE_CUDA(cur_stream, *s1);
111  ASSERT_NE_CUDA(s0, s1);
112 }
113 
114 // CUDA Guard
115 TEST(TestStream, CUDAGuardTest) {
116  if (!at::cuda::is_available()) return;
117  if (at::cuda::getNumGPUs() < 2) {
118  return;
119  }
120 
121  // -- begin setup
122 
123  ASSERT_EQ_CUDA(at::cuda::current_device(), 0);
124  std::vector<at::cuda::CUDAStream> streams0 = {
125  at::cuda::getDefaultCUDAStream(), at::cuda::getStreamFromPool()};
126  ASSERT_EQ_CUDA(streams0[0].device_index(), 0);
127  ASSERT_EQ_CUDA(streams0[1].device_index(), 0);
128  at::cuda::setCurrentCUDAStream(streams0[0]);
129 
130  std::vector<at::cuda::CUDAStream> streams1;
131  {
132  at::cuda::CUDAGuard device_guard(1);
133  streams1.push_back(at::cuda::getDefaultCUDAStream());
134  streams1.push_back(at::cuda::getStreamFromPool());
135  }
136  ASSERT_EQ_CUDA(streams1[0].device_index(), 1);
137  ASSERT_EQ_CUDA(streams1[1].device_index(), 1);
138  at::cuda::setCurrentCUDAStream(streams1[0]);
139 
140  ASSERT_EQ_CUDA(at::cuda::current_device(), 0);
141 
142  // -- end setup
143 
144  // Test that all original streams are recorded.
145  {
147  ASSERT_EQ_CUDA(guard.original_streams().size(), at::cuda::getNumGPUs());
148  ASSERT_EQ_CUDA(guard.original_streams()[0], streams0[0]);
149  ASSERT_EQ_CUDA(guard.original_streams()[1], streams1[0]);
150  }
151 
152  // Setting a stream changes the current device and the stream on that device
153  {
154  at::cuda::CUDAStreamGuard guard(streams1[1]);
155  ASSERT_EQ_CUDA(guard.current_device(), at::Device(at::kCUDA, 1));
156  ASSERT_EQ_CUDA(at::cuda::current_device(), 1);
157  ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[1]);
158  }
159 
160  // Device and stream are now reset
161  ASSERT_EQ_CUDA(at::cuda::current_device(), 0);
162  ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]);
163 
164  // Setting only the device changes only the current device and not the stream
165  {
166  at::cuda::CUDAGuard guard(/*device=*/1);
167  ASSERT_EQ_CUDA(guard.current_device(), at::Device(at::kCUDA, 1));
168  ASSERT_EQ_CUDA(at::cuda::current_device(), 1);
169  ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(1), streams1[0]);
170  }
171 
172  ASSERT_EQ_CUDA(at::cuda::current_device(), 0);
173  ASSERT_EQ_CUDA(at::cuda::getCurrentCUDAStream(0), streams0[0]);
174 }
175 
176 // Streampool Round Robin
177 TEST(TestStream, StreamPoolTest) {
178  if (!at::cuda::is_available()) return;
179  std::vector<at::cuda::CUDAStream> streams{};
180  for (int i = 0; i < 200; ++i) {
181  streams.emplace_back(at::cuda::getStreamFromPool());
182  }
183 
184  std::unordered_set<cudaStream_t> stream_set{};
185  bool hasDuplicates = false;
186  for (auto i = decltype(streams.size()){0}; i < streams.size(); ++i) {
187  cudaStream_t cuda_stream = streams[i];
188  auto result_pair = stream_set.insert(cuda_stream);
189  if (!result_pair.second)
190  hasDuplicates = true;
191  }
192 
193  ASSERT_TRUE(hasDuplicates);
194 }
195 
196 // Multi-GPU
197 TEST(TestStream, MultiGPUTest) {
198  if (!at::cuda::is_available()) return;
199  if (at::cuda::getNumGPUs() < 2)
200  return;
201 
202  at::cuda::CUDAStream s0 = at::cuda::getStreamFromPool(true, 0);
203  at::cuda::CUDAStream s1 = at::cuda::getStreamFromPool(false, 1);
204 
205  at::cuda::setCurrentCUDAStream(s0);
206  at::cuda::setCurrentCUDAStream(s1);
207 
208  ASSERT_EQ_CUDA(s0, at::cuda::getCurrentCUDAStream());
209 
210  at::cuda::CUDAGuard device_guard{1};
211  ASSERT_EQ_CUDA(s1, at::cuda::getCurrentCUDAStream());
212 }
213 
214 // CUDAEvent Syncs
215 TEST(TestStream, CUDAEventSyncTest) {
216  if (!at::cuda::is_available()) return;
217  const auto stream = at::cuda::getStreamFromPool();
218  at::cuda::CUDAEvent event;
219 
220  ASSERT_TRUE(event.query());
221 
222  event.recordOnce(stream);
223 
224  const auto wait_stream0 = at::cuda::getStreamFromPool();
225  const auto wait_stream1 = at::cuda::getStreamFromPool();
226 
227  event.block(wait_stream0);
228  event.block(wait_stream1);
229 
230  cudaStreamSynchronize(wait_stream0);
231  ASSERT_TRUE(event.query());
232 }
233 
234 // Cross-Device Events
235 TEST(TestStream, CrossDeviceTest) {
236  if (!at::cuda::is_available()) return;
237  if (at::cuda::getNumGPUs() < 2)
238  return;
239 
240  const auto stream0 = at::cuda::getStreamFromPool();
241  at::cuda::CUDAEvent event0;
242 
243  at::cuda::set_device(1);
244  const auto stream1 = at::cuda::getStreamFromPool();
245  at::cuda::CUDAEvent event1;
246 
247  event0.record(stream0);
248  event1.record(stream1);
249 
250  event0 = std::move(event1);
251 
252  ASSERT_EQ_CUDA(event0.device(), at::Device(at::kCUDA, 1));
253 
254  event0.block(stream0);
255 
256  cudaStreamSynchronize(stream0);
257  ASSERT_TRUE(event0.query());
258 }
Represents a a compute device on which a tensor is located.
Definition: Device.h:30
A variant of StreamGuard that is specialized for CUDA.
Definition: CUDAGuard.h:117
A variant of DeviceGuard that is specialized for CUDA.
Definition: CUDAGuard.h:20
cudaStream_t stream() const
Explicit conversion to cudaStream_t.
Definition: CUDAStream.cpp:318
DeviceIndex device_index() const
Get the CUDA device index that this stream is associated with.
Definition: CUDAStream.h:95