#include "benchmark/benchmark.h"

#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"

#define CAFFE2_SKIP_IF_NO_GPU                                      \
  if (!caffe2::NumCudaDevices()) {                                 \
    state.SkipWithError("No CUDA available, skipping benchmark."); \
    return;                                                        \
  }

using namespace caffe2;

// Measures the cost of constructing a CUDAContext once CUDA initialization
// has already been paid for by the first (static) context.
static void BM_CUDAContextCreation(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  volatile CUDAContext context_so_we_do_initialization_work;
  while (state.KeepRunning()) {
    volatile CUDAContext context;
  }
}
BENCHMARK(BM_CUDAContextCreation);
// Measures the cost of fetching the CUDA stream from an existing context.
static void BM_CUDAContextStreamAccess(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  CUDAContext context;
  while (state.KeepRunning()) {
    volatile cudaStream_t stream = context.cuda_stream();
  }
}
BENCHMARK(BM_CUDAContextStreamAccess);
// Measures the latency of a cudaGetDevice call.
static void BM_cudaGetDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  int id;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaGetDevice(&id));
  }
}
BENCHMARK(BM_cudaGetDevice);
// Measures the latency of cudaSetDevice, cycling through all devices.
static void BM_cudaSetDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  int total = NumCudaDevices();
  int i = 0;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaSetDevice((i++) % total));
  }
}
BENCHMARK(BM_cudaSetDevice);
// Measures the combined latency of cudaSetDevice followed by cudaGetDevice.
static void BM_cudaSetAndGetDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  int total = NumCudaDevices();
  int i = 0;
  int id;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaSetDevice((i++) % total));
    CUDA_ENFORCE(cudaGetDevice(&id));
  }
}
BENCHMARK(BM_cudaSetAndGetDevice);
// Measures cudaSetDevice when the target device does not change.
static void BM_cudaSetSameDevice(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaSetDevice(0));
  }
}
BENCHMARK(BM_cudaSetSameDevice);
// Measures the full stream lifecycle: create, synchronize, destroy.
static void BM_cudaStreamCreateSyncDelete(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamCreate(&stream));
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
    CUDA_ENFORCE(cudaStreamDestroy(stream));
  }
}
BENCHMARK(BM_cudaStreamCreateSyncDelete);
// Measures cudaStreamSynchronize on an idle stream.
static void BM_cudaStreamSynchronize(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
  }
}
BENCHMARK(BM_cudaStreamSynchronize);
// Measures the cost of recording a (timing-disabled) event on a stream.
static void BM_cudaEventRecord(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  cudaEvent_t event;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  CUDA_ENFORCE(cudaEventCreateWithFlags(
      &event, cudaEventDefault | cudaEventDisableTiming));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaEventRecord(event, stream));
  }
}
BENCHMARK(BM_cudaEventRecord);
// Measures cudaStreamWaitEvent followed by cudaStreamSynchronize on an
// already-completed event.
static void BM_cudaStreamWaitEventThenStreamSynchronize(
    benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  cudaStream_t stream;
  cudaEvent_t event;
  CUDA_ENFORCE(cudaStreamCreate(&stream));
  CUDA_ENFORCE(cudaEventCreateWithFlags(
      &event, cudaEventDefault | cudaEventDisableTiming));
  CUDA_ENFORCE(cudaEventRecord(event, stream));
  CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
  CUDA_ENFORCE(cudaStreamSynchronize(stream));
  while (state.KeepRunning()) {
    CUDA_ENFORCE(cudaStreamWaitEvent(stream, event, 0));
    CUDA_ENFORCE(cudaStreamSynchronize(stream));
  }
}
BENCHMARK(BM_cudaStreamWaitEventThenStreamSynchronize);
// Measures GetGPUIDForPointer, i.e. looking up which device owns a pointer.
static void BM_CudaPointerAffinity(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  Tensor tensor(vector<int64_t>{1, 2, 3, 4}, CUDA);
  float* ptr = tensor.mutable_data<float>();
  while (state.KeepRunning()) {
    volatile int id = GetGPUIDForPointer(ptr);
  }
}
BENCHMARK(BM_CudaPointerAffinity);
// A no-op operator used to measure pure operator-creation overhead.
template <class Context>
class DummyEmptyOp : public Operator<Context> {
 public:
  DummyEmptyOp(const OperatorDef& def, Workspace* ws)
      : Operator<Context>(def, ws) {}

  bool RunOnDevice() final { return true; }
};

REGISTER_CPU_OPERATOR(DummyEmpty, DummyEmptyOp<CPUContext>);
REGISTER_CUDA_OPERATOR(DummyEmpty, DummyEmptyOp<CUDAContext>);
OPERATOR_SCHEMA(DummyEmpty);
// Measures operator creation time for a trivial CPU operator.
static void BM_OperatorCreationCPU(benchmark::State& state) {
  std::unique_ptr<OperatorBase> op;
  OperatorDef def;
  Workspace ws;
  def.set_type("DummyEmpty");
  def.mutable_device_option()->set_device_type(PROTO_CPU);
  while (state.KeepRunning()) {
    op = CreateOperator(def, &ws);
  }
}
BENCHMARK(BM_OperatorCreationCPU);
// Measures operator creation time for the same operator on CUDA.
static void BM_OperatorCreationCUDA(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  std::unique_ptr<OperatorBase> op;
  OperatorDef def;
  Workspace ws;
  def.set_type("DummyEmpty");
  def.mutable_device_option()->set_device_type(PROTO_CUDA);
  while (state.KeepRunning()) {
    op = CreateOperator(def, &ws);
  }
}
BENCHMARK(BM_OperatorCreationCUDA);
// Measures raw CPU allocator overhead.
static void BM_RawAllocDeallocCPU(benchmark::State& state) {
  while (state.KeepRunning()) {
    // Allocating only 1 byte in order to measure the overhead.
    auto data_ptr = GetCPUAllocator()->allocate(1);
    // Deallocated when data_ptr goes out of scope.
  }
}
BENCHMARK(BM_RawAllocDeallocCPU);
// Measures Tensor data allocation and deallocation on CPU.
static void BM_TensorAllocDeallocCPU(benchmark::State& state) {
  Tensor tensor(CPU);
  // A small allocation: 32 * 32 floats.
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
    CHECK(tensor.mutable_data<float>());
    tensor.FreeMemory();
  }
}
BENCHMARK(BM_TensorAllocDeallocCPU);
// Measures Tensor data allocation and deallocation on CUDA.
static void BM_TensorAllocDeallocCUDA(benchmark::State& state) {
  CAFFE2_SKIP_IF_NO_GPU;
  Tensor tensor(CUDA);
  // A small allocation: 32 * 32 floats.
  tensor.Resize(32, 32);
  while (state.KeepRunning()) {
    CHECK(tensor.mutable_data<float>());
    tensor.FreeMemory();
  }
}
BENCHMARK(BM_TensorAllocDeallocCUDA);
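// The benchmark entry point is not part of the excerpt above. A minimal
// sketch of a driver, assuming the standard Google Benchmark API and
// caffe2::GlobalInit, could look like this:
int main(int argc, char** argv) {
  // Initialize Caffe2 (registries, flags) before running any benchmark.
  caffe2::GlobalInit(&argc, &argv);
  // Parse benchmark flags and run everything registered via BENCHMARK().
  ::benchmark::Initialize(&argc, argv);
  ::benchmark::RunSpecifiedBenchmarks();
  return 0;
}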