Caffe2 — C++ API
A deep-learning, cross-platform ML framework.
File: nnapi_benchmark.cc
17 #include "caffe2/core/init.h"
18 #include "caffe2/core/operator.h"
19 #include "caffe2/core/tensor.h"
20 #include "caffe2/core/timer.h"
21 #include "caffe2/utils/math.h"
22 #include "caffe2/utils/proto_utils.h"
23 #include "nnapi.h"
24 
25 namespace caffe2 {
26 
27 namespace {
28 
29 static double benchmark_conv_caffe2(
30  Workspace* ws,
31  int N,
32  int C,
33  int H,
34  int W,
35  int K,
36  int kernel,
37  int group,
38  int warmup = 5,
39  int run = 10,
40  std::string engine = "NNPACK") {
41  caffe2::Workspace localWs;
42  if (!ws) {
43  ws = &localWs;
44  }
45  {
46  auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
47  t->Resize(N, C, H, W);
48  CPUContext ctx;
49  math::RandGaussian<float, CPUContext>(
50  t->size(), 0, 30, t->mutable_data<float>(), &ctx);
51  }
52  {
53  auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
54  if (group == 1) {
55  t->Resize(K, C, kernel, kernel);
56  } else {
57  t->Resize(K, 1, kernel, kernel);
58  }
59  CPUContext ctx;
60  math::RandGaussian<float, CPUContext>(
61  t->size(), 0, 30, t->mutable_data<float>(), &ctx);
62  }
63  {
64  auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
65  t->Resize(K);
66  CPUContext ctx;
67  math::RandGaussian<float, CPUContext>(
68  t->size(), 0, 30, t->mutable_data<float>(), &ctx);
69  }
70 
71  OperatorDef op;
72  {
73  op.set_type("Conv");
74  op.add_input("X_cpu");
75  op.add_input("W");
76  op.add_input("B");
77  op.add_output("Y_cpu");
78  op.set_engine(engine);
79  {
80  auto& arg = *(op.add_arg());
81  arg.set_name("order");
82  arg.set_s("NCHW");
83  }
84  {
85  auto& arg = *(op.add_arg());
86  arg.set_name("convolution_transform_strategy");
87  arg.set_s("PRECOMPUTE");
88  }
89  {
90  auto& arg = *(op.add_arg());
91  arg.set_name("kernel");
92  arg.set_i(kernel);
93  }
94  {
95  auto& arg = *(op.add_arg());
96  arg.set_name("group");
97  arg.set_i(group);
98  }
99  }
100 
101  // NNPack
102  std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(op, ws));
103 
104  Timer timer;
105  CAFFE_ENFORCE(op1->Run());
106  for (int i = 0; i < warmup; i++) {
107  op1->Run();
108  }
109  timer.Start();
110  for (int i = 0; i < run; i++) {
111  op1->Run();
112  }
113  return double(timer.MilliSeconds()) / run;
114 }
115 
116 static double benchmark_conv_nnapi(
117  Workspace* ws,
118  int N,
119  int C,
120  int H,
121  int W,
122  int K,
123  int kernel,
124  int group,
125  int warmup = 5,
126  int run = 10) {
127  caffe2::Workspace localWs;
128  if (!ws) {
129  ws = &localWs;
130  }
131  {
132  auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
133  t->Resize(N, H, W, C);
134  CPUContext ctx;
135  math::RandGaussian<float, CPUContext>(
136  t->size(), 0, 30, t->mutable_data<float>(), &ctx);
137  }
138  {
139  auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
140  if (group > 1) {
141  CAFFE_ENFORCE_EQ(C, group);
142  t->Resize(1, kernel, kernel, C);
143  } else {
144  t->Resize(K, kernel, kernel, C);
145  }
146  CPUContext ctx;
147  math::RandGaussian<float, CPUContext>(
148  t->size(), 0, 30, t->mutable_data<float>(), &ctx);
149  }
150  {
151  auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
152  t->Resize(K);
153  CPUContext ctx;
154  math::RandGaussian<float, CPUContext>(
155  t->size(), 0, 30, t->mutable_data<float>(), &ctx);
156  }
157 
158  NetDef netdef;
159  {
160  {
161  auto& op = *(netdef.add_op());
162  op.set_type("Conv");
163  op.add_input("X_cpu");
164  op.add_input("W");
165  op.add_input("B");
166  op.add_output("Y_cpu");
167  {
168  auto& arg = *(op.add_arg());
169  arg.set_name("order");
170  arg.set_s("NHWC");
171  }
172  {
173  auto& arg = *(op.add_arg());
174  arg.set_name("kernel");
175  arg.set_i(kernel);
176  }
177  {
178  auto& arg = *(op.add_arg());
179  arg.set_name("group");
180  arg.set_i(group);
181  }
182  }
183  netdef.add_external_input("X_cpu");
184  netdef.add_external_input("W");
185  netdef.add_external_input("B");
186  netdef.add_external_output("Y_cpu");
187  }
188 
189  // NN API
190  NetDef initNet;
191  NNApi model(initNet, netdef, ws);
192  std::vector<TensorCPU*> inputs, outputs;
193  inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
194  CAFFE_ENFORCE(model.run(inputs, &outputs));
195 
196  for (int i = 0; i < warmup; i++) {
197  model.run(inputs, &outputs);
198  }
199  Timer timer;
200  timer.Start();
201  for (int i = 0; i < run; i++) {
202  model.run(inputs, &outputs);
203  }
204  return double(timer.MilliSeconds()) / run;
205 }
206 
207 static double benchmark_conv_nnapi_int8(
208  Workspace* ws,
209  int N,
210  int C,
211  int H,
212  int W,
213  int K,
214  int kernel,
215  int group,
216  int warmup = 5,
217  int run = 10) {
218  caffe2::Workspace localWs;
219  if (!ws) {
220  ws = &localWs;
221  }
222  {
223  auto* t = ws->CreateBlob("X_cpu")->GetMutable<TensorCPU>();
224  t->Resize(N, H, W, C);
225  for (int i = 0; i < t->size(); i++) {
226  t->mutable_data<uint8_t>()[i] = rand() % 10;
227  }
228  }
229  {
230  auto* t = ws->CreateBlob("W")->GetMutable<TensorCPU>();
231  if (group > 1) {
232  CAFFE_ENFORCE_EQ(C, group);
233  t->Resize(1, kernel, kernel, C);
234  } else {
235  t->Resize(K, kernel, kernel, C);
236  }
237  for (int i = 0; i < t->size(); i++) {
238  t->mutable_data<uint8_t>()[i] = rand() % 10;
239  }
240  }
241 
242  // For input tensor of ANEURALNETWORKS_TENSOR_QUANT8_ASYMM type, the bias
243  // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 and
244  // bias_scale == input_scale * filter_scale.
245  {
246  auto* t = ws->CreateBlob("B")->GetMutable<TensorCPU>();
247  t->Resize(K);
248  for (int i = 0; i < t->size(); i++) {
249  t->mutable_data<int32_t>()[i] = rand() % 10;
250  }
251  }
252 
253  NetDef netdef;
254  {
255  {
256  auto& op = *(netdef.add_op());
257  op.set_type("Conv");
258  op.add_input("X_cpu");
259  op.add_input("W");
260  op.add_input("B");
261  op.add_output("Y_cpu");
262  {
263  auto& arg = *(op.add_arg());
264  arg.set_name("order");
265  arg.set_s("NHWC");
266  }
267  {
268  auto& arg = *(op.add_arg());
269  arg.set_name("kernel");
270  arg.set_i(kernel);
271  }
272  {
273  auto& arg = *(op.add_arg());
274  arg.set_name("group");
275  arg.set_i(group);
276  }
277  // Hack
278  // for weight tensor
279  {
280  auto& arg = *(op.add_arg());
281  arg.set_name("weight_scale");
282  arg.set_f(1.0);
283  }
284  {
285  auto& arg = *(op.add_arg());
286  arg.set_name("weight_zero_point");
287  arg.set_i(0);
288  }
289  // for output tensor
290  // For output tensor of ANEURALNETWORKS_TENSOR_QUANT8_ASYMM type, the
291  // following condition must be satisfied: output_scale > input_scale *
292  // filter_scale
293  {
294  auto& arg = *(op.add_arg());
295  arg.set_name("output_scale");
296  arg.set_f(2.0);
297  }
298  {
299  auto& arg = *(op.add_arg());
300  arg.set_name("output_zero_point");
301  arg.set_i(0);
302  }
303  }
304  netdef.add_external_input("X_cpu");
305  netdef.add_external_input("W");
306  netdef.add_external_input("B");
307  netdef.add_external_output("Y_cpu");
308  // scale and zero_point for the input tensor
309  {
310  auto& arg = *(netdef.add_arg());
311  arg.set_name("scale");
312  arg.set_f(1.0);
313  }
314  {
315  auto& arg = *(netdef.add_arg());
316  arg.set_name("zero_point");
317  arg.set_i(0);
318  }
319  }
320 
321  // NN API
322  NetDef initNet;
323  NNApi model(initNet, netdef, ws);
324  std::vector<TensorCPU*> inputs, outputs;
325  inputs.push_back(ws->GetBlob("X_cpu")->GetMutable<TensorCPU>());
326  CAFFE_ENFORCE(model.run(inputs, &outputs));
327 
328  for (int i = 0; i < warmup; i++) {
329  model.run(inputs, &outputs);
330  }
331  Timer timer;
332  timer.Start();
333  for (int i = 0; i < run; i++) {
334  model.run(inputs, &outputs);
335  }
336  return double(timer.MilliSeconds()) / run;
337 }
338 
339 } // namespace
340 
341 } // namespace caffe2
342 
343 int main(int argc, char** argv) {
345  ws.GetThreadPool()->setMinWorkSize(0);
346 
347  int warmup = 2, mainrun = 10;
348  // float32
349  for (int space : {14, 26, 52, 104}) {
350  for (int input_channel : {64, 128, 256, 512}) {
351  for (int kernel : {1, 3}) {
352  int output_channel = input_channel;
353  const double cpu_time = caffe2::benchmark_conv_caffe2(
354  &ws,
355  1,
356  input_channel,
357  space,
358  space,
359  output_channel,
360  kernel,
361  1,
362  warmup,
363  mainrun,
364  "NNPACK");
365  const double nn_time_fp32 = caffe2::benchmark_conv_nnapi(
366  &ws,
367  1,
368  input_channel,
369  space,
370  space,
371  output_channel,
372  kernel,
373  1,
374  warmup,
375  mainrun);
376  const double nn_time_int8 = caffe2::benchmark_conv_nnapi_int8(
377  &ws,
378  1,
379  input_channel,
380  space,
381  space,
382  output_channel,
383  kernel,
384  1,
385  warmup,
386  mainrun);
387  const double flops = double(input_channel) * output_channel * kernel *
388  kernel * (kernel == 1 ? space : space - 2) *
389  (kernel == 1 ? space : space - 2) * 2;
390  printf(
391  "Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t32b"
392  "NNPACK GFLOPS: %.2f\t32b"
393  "NN-API GFLOPS: %.2f\t8b"
394  "NN-API GOPS: %.2f\n",
395  space,
396  space,
397  input_channel,
398  output_channel,
399  kernel,
400  kernel,
401  flops / cpu_time / 1E6,
402  flops / nn_time_fp32 / 1E6,
403  flops / nn_time_int8 / 1E6);
404  }
405  }
406  }
407  fflush(stdout);
408 
409  // depthwise
410  for (int space : {14, 26, 52, 104}) {
411  for (int channel : {64, 128, 256, 512}) {
412  for (int kernel : {3}) {
413  const double cpu_time = caffe2::benchmark_conv_caffe2(
414  &ws,
415  1,
416  channel,
417  space,
418  space,
419  channel,
420  kernel,
421  channel,
422  warmup,
423  mainrun,
424  "DEPTHWISE_3x3");
425  const double nn_time_fp32_dwise = caffe2::benchmark_conv_nnapi(
426  &ws,
427  1,
428  channel,
429  space,
430  space,
431  channel,
432  kernel,
433  channel,
434  warmup,
435  mainrun);
436  const double nn_time_int8_dwise = caffe2::benchmark_conv_nnapi_int8(
437  &ws,
438  1,
439  channel,
440  space,
441  space,
442  channel,
443  kernel,
444  channel,
445  warmup,
446  mainrun);
447  const double dwise_bandwidth = sizeof(float) * double(channel) *
448  (2 * (space - 2) * (space - 2) + kernel * kernel);
449  printf(
450  "Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t32b"
451  "Caffe2 Dwise GB/s: %.2f\t32b"
452  "NN-API Dwise GB/s: %.2f\t8b"
453  "NN-API Dwise GB/s: %.2f\n",
454  space,
455  space,
456  channel,
457  channel,
458  kernel,
459  kernel,
460  dwise_bandwidth / cpu_time / 1E6,
461  dwise_bandwidth / nn_time_fp32_dwise / 1E6,
462  dwise_bandwidth / sizeof(float) / nn_time_int8_dwise / 1E6);
463  }
464  }
465  }
466 }
Reference notes (Doxygen cross-references):
- Blob* Workspace::CreateBlob(const string& name) — creates a blob of the given name. Definition: workspace.cc:120
- Workspace — a class that holds all the related objects created during runtime: (1) all blobs, etc. Definition: workspace.h:63
- void Tensor::Resize(Ts... dim_source) — resizes a tensor. Definition: tensor.h:304
- T* Blob::GetMutable(bool* is_new_object = nullptr) — gets a mutable pointer to the stored object. Definition: blob.h:117
Copyright (c) 2016-present, Facebook, Inc.