Caffe2 - C++ API
A deep-learning, cross-platform ML framework
core_overhead_benchmark_gpu.cc
17 #include "benchmark/benchmark.h"
18 
19 #include "caffe2/core/context.h"
20 #include "caffe2/core/context_gpu.h"
21 #include "caffe2/core/operator.h"
22 
// Skip the current benchmark when no CUDA device is usable, recording an
// explanatory error on the benchmark state. Must be the first statement of a
// GPU benchmark body; it relies on a `benchmark::State& state` parameter named
// exactly `state` being in scope, and returns early from the enclosing
// benchmark function.
#define CAFFE2_SKIP_IF_NO_GPU \
  if (!caffe2::NumCudaDevices()) { \
    state.SkipWithError("No CUDA available, skipping benchmark."); \
    return; \
  }

// File-scope using-directive: this is a standalone benchmark .cc, and every
// benchmark below uses caffe2 types (CUDAContext, Tensor, CUDA_ENFORCE, ...).
using namespace caffe2;
30 
31 static void BM_CUDAContextCreation(benchmark::State& state) {
32  CAFFE2_SKIP_IF_NO_GPU;
33  volatile CUDAContext context_so_we_do_initialization_work;
34  while (state.KeepRunning()) {
35  volatile CUDAContext context;
36  }
37 }
38 BENCHMARK(BM_CUDAContextCreation);
39 
40 static void BM_CUDAContextStreamAccess(benchmark::State& state) {
41  CAFFE2_SKIP_IF_NO_GPU;
42  CUDAContext context;
43  while (state.KeepRunning()) {
44  volatile cudaStream_t stream = context.cuda_stream();
45  }
46 }
47 BENCHMARK(BM_CUDAContextStreamAccess);
48 
49 static void BM_cudaGetDevice(benchmark::State& state) {
50  CAFFE2_SKIP_IF_NO_GPU;
51  int id;
52  while (state.KeepRunning()) {
53  CUDA_ENFORCE(cudaGetDevice(&id));
54  }
55 }
56 BENCHMARK(BM_cudaGetDevice);
57 
58 static void BM_cudaSetDevice(benchmark::State& state) {
59  CAFFE2_SKIP_IF_NO_GPU;
60  int total = NumCudaDevices();
61  int i = 0;
62  while (state.KeepRunning()) {
63  CUDA_ENFORCE(cudaSetDevice((i++) % total));
64  }
65 }
66 BENCHMARK(BM_cudaSetDevice);
67 
68 static void BM_cudaSetAndGetDevice(benchmark::State& state) {
69  CAFFE2_SKIP_IF_NO_GPU;
70  int total = NumCudaDevices();
71  int i = 0;
72  int id;
73  while (state.KeepRunning()) {
74  CUDA_ENFORCE(cudaSetDevice((i++) % total));
75  CUDA_ENFORCE(cudaGetDevice(&id));
76  }
77 }
78 BENCHMARK(BM_cudaSetAndGetDevice);
79 
80 static void BM_cudaSetSameDevice(benchmark::State& state) {
81  CAFFE2_SKIP_IF_NO_GPU;
82  while (state.KeepRunning()) {
83  CUDA_ENFORCE(cudaSetDevice(0));
84  }
85 }
86 BENCHMARK(BM_cudaSetSameDevice);
87 
88 static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) {
89  CAFFE2_SKIP_IF_NO_GPU;
90  cudaStream_t stream;
91  while (state.KeepRunning()) {
92  CUDA_ENFORCE(cudaStreamCreate(&stream));
93  CUDA_ENFORCE(cudaStreamSynchronize(stream));
94  CUDA_ENFORCE(cudaStreamDestroy(stream));
95  }
96 }
97 BENCHMARK(BM_cudaStreamCreateSyncDelete);
98 
99 static void BM_cudaStreamSynchronize(benchmark::State& state) {
100  CAFFE2_SKIP_IF_NO_GPU;
101  cudaStream_t stream;
102  CUDA_ENFORCE(cudaStreamCreate(&stream));
103  while (state.KeepRunning()) {
104  CUDA_ENFORCE(cudaStreamSynchronize(stream));
105  }
106 }
107 BENCHMARK(BM_cudaStreamSynchronize);
108 
109 static void BM_cudaEventRecord(benchmark::State& state) {
110  CAFFE2_SKIP_IF_NO_GPU;
111  cudaStream_t stream;
112  cudaEvent_t event;
113  CUDA_ENFORCE(cudaStreamCreate(&stream));
114  CUDA_ENFORCE(cudaEventCreateWithFlags(
115  &event, cudaEventDefault | cudaEventDisableTiming));
116  while (state.KeepRunning()) {
117  CUDA_ENFORCE(cudaEventRecord(event, stream));
118  }
119 }
120 BENCHMARK(BM_cudaEventRecord);
121 
122 static void BM_cudaStreamWaitEventThenStreamSynchronize(
123  benchmark::State& state) {
124  CAFFE2_SKIP_IF_NO_GPU;
125  cudaStream_t stream;
126  cudaEvent_t event;
127  CUDA_ENFORCE(cudaStreamCreate(&stream));
128  CUDA_ENFORCE(cudaEventCreateWithFlags(
129  &event, cudaEventDefault | cudaEventDisableTiming));
130  CUDA_ENFORCE(cudaEventRecord(event, stream));
131  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
132  CUDA_ENFORCE(cudaStreamSynchronize(stream));
133  while (state.KeepRunning()) {
134  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
135  CUDA_ENFORCE(cudaStreamSynchronize(stream));
136  }
137 }
138 BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
139 
140 static void BM_CudaPointerAffinity(benchmark::State& state) {
141  CAFFE2_SKIP_IF_NO_GPU;
142  Tensor tensor(vector<int64_t>{1, 2, 3, 4}, CUDA);
143  float* ptr = tensor.mutable_data<float>();
144  while (state.KeepRunning()) {
145  volatile int id = GetGPUIDForPointer(ptr);
146  }
147 }
148 BENCHMARK(BM_CudaPointerAffinity);
149 
namespace {
// Minimal no-op operator used only to benchmark operator *creation* cost:
// RunOnDevice does no work and always reports success.
template <class Context>
class DummyEmptyOp : public Operator<Context> {
 public:
  DummyEmptyOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}

  bool RunOnDevice() final { return true; }
};

// Register under the type name "DummyEmpty" for both backends so that
// CreateOperator(def, ...) below can instantiate it on CPU and CUDA.
REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
// Unconstrained schema; the benchmarks pass no inputs or outputs.
OPERATOR_SCHEMA(DummyEmpty);
} // namespace
164 
165 static void BM_OperatorCreationCPU(benchmark::State& state) {
166  std::unique_ptr<OperatorBase> op;
167  OperatorDef def;
168  Workspace ws;
169  def.set_type("DummyEmpty");
170  def.mutable_device_option()->set_device_type(PROTO_CPU);
171  while (state.KeepRunning()) {
172  op = CreateOperator(def, &ws);
173  }
174 }
175 BENCHMARK(BM_OperatorCreationCPU);
176 
177 static void BM_OperatorCreationCUDA(benchmark::State& state) {
178  CAFFE2_SKIP_IF_NO_GPU;
179  std::unique_ptr<OperatorBase> op;
180  OperatorDef def;
181  Workspace ws;
182  def.set_type("DummyEmpty");
183  def.mutable_device_option()->set_device_type(PROTO_CUDA);
184  while (state.KeepRunning()) {
185  op = CreateOperator(def, &ws);
186  }
187 }
188 BENCHMARK(BM_OperatorCreationCUDA);
189 
190 static void BM_RawAllocDeallocCPU(benchmark::State& state) {
191  while (state.KeepRunning()) {
192  // Allocating only 1 byte in order to measure the overhead.
193  auto data_ptr = GetCPUAllocator()->allocate(1);
194  // Deallocated when it's out of scope
195  }
196 }
197 BENCHMARK(BM_RawAllocDeallocCPU);
198 
199 static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
200  Tensor tensor(CPU);
201  // small allocation
202  tensor.Resize(32, 32);
203  while (state.KeepRunning()) {
204  CHECK(tensor.mutable_data<float>());
205  tensor.FreeMemory();
206  }
207 }
208 BENCHMARK(BM_TensorAllocDeallocCPU);
209 
210 static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
211  CAFFE2_SKIP_IF_NO_GPU;
212  Tensor tensor(CUDA);
213  // small allocation
214  tensor.Resize(32, 32);
215  while (state.KeepRunning()) {
216  CHECK(tensor.mutable_data<float>());
217  tensor.FreeMemory();
218  }
219 }
220 BENCHMARK(BM_TensorAllocDeallocCUDA);
221 
// Expands to a main() that parses benchmark flags and runs every BENCHMARK().
BENCHMARK_MAIN();
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime environment.
Definition: blob.h:13
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:106
int NumCudaDevices()
Returns the number of CUDA devices.
Definition: common_gpu.cc:15