// benchmark_conv_caffe2: builds input/weight/bias blobs in a workspace, creates
// one operator via CreateOperator(), runs it, and returns
// timer.MilliSeconds() divided by `run`.
// NOTE(review): this listing elides many original lines (the embedded line
// numbers jump), so the full parameter list, the operator type, the remaining
// inputs, and the point where the timer is started are not visible here. The
// comments below describe only what is shown.
17 #include "caffe2/core/init.h" 18 #include "caffe2/core/operator.h" 19 #include "caffe2/core/tensor.h" 20 #include "caffe2/core/timer.h" 21 #include "caffe2/utils/math.h" 22 #include "caffe2/utils/proto_utils.h" 29 static double benchmark_conv_caffe2(
// `engine` defaults to "NNPACK" and is forwarded to op.set_engine() below.
40 std::string engine =
"NNPACK") {
// Blob "X_cpu": CPU tensor resized with dimensions passed in (N, C, H, W)
// order, then filled by RandGaussian called with 0 and 30 (presumably mean and
// std -- confirm against caffe2/utils/math.h).
46 auto* t = BlobGetMutableTensor(ws->CreateBlob(
"X_cpu"), CPU);
47 t->Resize(N, C, H, W);
49 math::RandGaussian<float, CPUContext>(
50 t->size(), 0, 30, t->mutable_data<
float>(), &ctx);
// Blob "W": two alternative shapes appear, (K, C, kernel, kernel) and
// (K, 1, kernel, kernel). The condition that selects between them is elided
// from this listing.
53 auto* t = BlobGetMutableTensor(ws->CreateBlob(
"W"), CPU);
55 t->Resize(K, C, kernel, kernel);
57 t->Resize(K, 1, kernel, kernel);
60 math::RandGaussian<float, CPUContext>(
61 t->size(), 0, 30, t->mutable_data<
float>(), &ctx);
// Blob "B": filled the same way; its Resize call is not visible here.
64 auto* t = BlobGetMutableTensor(ws->CreateBlob(
"B"), CPU);
67 math::RandGaussian<float, CPUContext>(
68 t->size(), 0, 30, t->mutable_data<
float>(), &ctx);
// Operator definition: reads "X_cpu", writes "Y_cpu", and uses the requested
// engine.
74 op.add_input(
"X_cpu");
77 op.add_output(
"Y_cpu");
78 op.set_engine(engine);
// Operator arguments. Only the names are visible for "order", "kernel" and
// "group"; their values are elided.
80 auto& arg = *(op.add_arg());
81 arg.set_name(
"order");
// "convolution_transform_strategy" is set to the string "PRECOMPUTE".
85 auto& arg = *(op.add_arg());
86 arg.set_name(
"convolution_transform_strategy");
87 arg.set_s(
"PRECOMPUTE");
90 auto& arg = *(op.add_arg());
91 arg.set_name(
"kernel");
95 auto& arg = *(op.add_arg());
96 arg.set_name(
"group");
// The operator is owned by a unique_ptr, and one checked Run() happens before
// the loops: CAFFE_ENFORCE fails if Run() returns false.
102 std::unique_ptr<caffe2::OperatorBase> op1(CreateOperator(op, ws));
105 CAFFE_ENFORCE(op1->Run());
// Warmup loop (`warmup` iterations); its body is elided in this listing.
106 for (
int i = 0; i < warmup; i++) {
// Measured loop (`run` iterations); its body is elided in this listing.
110 for (
int i = 0; i < run; i++) {
// Timer reading in milliseconds divided by the number of measured iterations.
113 return double(timer.MilliSeconds()) / run;
// benchmark_conv_nnapi: builds float blobs and a one-op NetDef, wraps them in
// an NNApi model, runs it, and returns timer.MilliSeconds() divided by `run`.
// NOTE(review): this listing elides many original lines, so the parameter
// list, the operator type, the argument values, and the timer start are not
// visible. The comments below describe only what is shown.
116 static double benchmark_conv_nnapi(
// Blob "X_cpu": CPU tensor resized with dimensions passed in (N, H, W, C)
// order, then filled by RandGaussian called with 0 and 30 (presumably mean and
// std -- confirm against caffe2/utils/math.h).
132 auto* t = BlobGetMutableTensor(ws->CreateBlob(
"X_cpu"), CPU);
133 t->Resize(N, H, W, C);
135 math::RandGaussian<float, CPUContext>(
136 t->size(), 0, 30, t->mutable_data<
float>(), &ctx);
// Blob "W": two alternative shapes appear. The (1, kernel, kernel, C) shape is
// preceded by a check that C equals group; the other shape is
// (K, kernel, kernel, C). The condition that selects between them is elided.
139 auto* t = BlobGetMutableTensor(ws->CreateBlob(
"W"), CPU);
141 CAFFE_ENFORCE_EQ(C, group);
142 t->Resize(1, kernel, kernel, C);
144 t->Resize(K, kernel, kernel, C);
147 math::RandGaussian<float, CPUContext>(
148 t->size(), 0, 30, t->mutable_data<
float>(), &ctx);
// Blob "B": filled the same way; its Resize call is not visible here.
151 auto* t = BlobGetMutableTensor(ws->CreateBlob(
"B"), CPU);
154 math::RandGaussian<float, CPUContext>(
155 t->size(), 0, 30, t->mutable_data<
float>(), &ctx);
// A single op is appended to `netdef`; it reads "X_cpu" and writes "Y_cpu".
161 auto& op = *(netdef.add_op());
163 op.add_input(
"X_cpu");
166 op.add_output(
"Y_cpu");
// Operator arguments "order", "kernel" and "group"; their values are elided.
168 auto& arg = *(op.add_arg());
169 arg.set_name(
"order");
173 auto& arg = *(op.add_arg());
174 arg.set_name(
"kernel");
178 auto& arg = *(op.add_arg());
179 arg.set_name(
"group");
// The net declares "X_cpu", "W" and "B" as external inputs and "Y_cpu" as its
// external output.
183 netdef.add_external_input(
"X_cpu");
184 netdef.add_external_input(
"W");
185 netdef.add_external_input(
"B");
186 netdef.add_external_output(
"Y_cpu");
// The model is built from initNet, netdef and the workspace. Only the "X_cpu"
// tensor is passed as a run-time input, and the first run is checked with
// CAFFE_ENFORCE.
191 NNApi model(initNet, netdef, ws);
192 std::vector<TensorCPU*> inputs, outputs;
193 inputs.push_back(BlobGetMutableTensor(ws->GetBlob(
"X_cpu"), CPU));
194 CAFFE_ENFORCE(model.run(inputs, &outputs));
// NOTE(review): unlike the checked call above, the return value of model.run()
// is ignored in the warmup and timed loops -- confirm that a failing run cannot
// silently produce a timing.
196 for (
int i = 0; i < warmup; i++) {
197 model.run(inputs, &outputs);
201 for (
int i = 0; i < run; i++) {
202 model.run(inputs, &outputs);
// Timer reading in milliseconds divided by the number of measured iterations.
204 return double(timer.MilliSeconds()) / run;
// benchmark_conv_nnapi_int8: quantized counterpart of the NNAPI benchmark.
// Builds uint8_t input/weight blobs and an int32_t bias blob, a one-op NetDef
// with quantization arguments, wraps them in an NNApi model, runs it, and
// returns timer.MilliSeconds() divided by `run`.
// NOTE(review): this listing elides many original lines, so the parameter
// list, the operator type, all argument values, and the timer start are not
// visible. The comments below describe only what is shown.
207 static double benchmark_conv_nnapi_int8(
// Blob "X_cpu": dimensions passed in (N, H, W, C) order; every element is set
// to rand() % 10 stored as uint8_t. No srand() call is visible in this listing.
// NOTE(review): `int i` is compared against t->size(), whose type is not shown
// here -- check for a signed/unsigned or narrowing comparison.
223 auto* t = BlobGetMutableTensor(ws->CreateBlob(
"X_cpu"), CPU);
224 t->Resize(N, H, W, C);
225 for (
int i = 0; i < t->size(); i++) {
226 t->mutable_data<uint8_t>()[i] = rand() % 10;
// Blob "W": two alternative shapes appear. The (1, kernel, kernel, C) shape is
// preceded by a check that C equals group; the other shape is
// (K, kernel, kernel, C). The condition that selects between them is elided.
// Elements are rand() % 10 stored as uint8_t.
230 auto* t = BlobGetMutableTensor(ws->CreateBlob(
"W"), CPU);
232 CAFFE_ENFORCE_EQ(C, group);
233 t->Resize(1, kernel, kernel, C);
235 t->Resize(K, kernel, kernel, C);
237 for (
int i = 0; i < t->size(); i++) {
238 t->mutable_data<uint8_t>()[i] = rand() % 10;
// Blob "B": elements are rand() % 10 stored as int32_t; its Resize call is not
// visible here.
246 auto* t = BlobGetMutableTensor(ws->CreateBlob(
"B"), CPU);
248 for (
int i = 0; i < t->size(); i++) {
249 t->mutable_data<int32_t>()[i] = rand() % 10;
// A single op is appended to `netdef`; it reads "X_cpu" and writes "Y_cpu".
256 auto& op = *(netdef.add_op());
258 op.add_input(
"X_cpu");
261 op.add_output(
"Y_cpu");
// Operator arguments "order", "kernel" and "group"; their values are elided.
263 auto& arg = *(op.add_arg());
264 arg.set_name(
"order");
268 auto& arg = *(op.add_arg());
269 arg.set_name(
"kernel");
273 auto& arg = *(op.add_arg());
274 arg.set_name(
"group");
// Quantization arguments attached to the op: weight scale / zero point and
// output scale / zero point. Their values are elided.
280 auto& arg = *(op.add_arg());
281 arg.set_name(
"weight_scale");
285 auto& arg = *(op.add_arg());
286 arg.set_name(
"weight_zero_point");
294 auto& arg = *(op.add_arg());
295 arg.set_name(
"output_scale");
299 auto& arg = *(op.add_arg());
300 arg.set_name(
"output_zero_point");
// The net declares "X_cpu", "W" and "B" as external inputs and "Y_cpu" as its
// external output.
304 netdef.add_external_input(
"X_cpu");
305 netdef.add_external_input(
"W");
306 netdef.add_external_input(
"B");
307 netdef.add_external_output(
"Y_cpu");
// "scale" and "zero_point" are attached to the net itself (netdef.add_arg),
// not to the op. Presumably they describe the input quantization -- confirm
// against the NNApi wrapper.
310 auto& arg = *(netdef.add_arg());
311 arg.set_name(
"scale");
315 auto& arg = *(netdef.add_arg());
316 arg.set_name(
"zero_point");
// The model is built from initNet, netdef and the workspace. Only the "X_cpu"
// tensor is passed as a run-time input, and the first run is checked with
// CAFFE_ENFORCE.
323 NNApi model(initNet, netdef, ws);
324 std::vector<TensorCPU*> inputs, outputs;
325 inputs.push_back(BlobGetMutableTensor(ws->GetBlob(
"X_cpu"), CPU));
326 CAFFE_ENFORCE(model.run(inputs, &outputs));
// NOTE(review): unlike the checked call above, the return value of model.run()
// is ignored in the warmup and timed loops -- confirm that a failing run cannot
// silently produce a timing.
328 for (
int i = 0; i < warmup; i++) {
329 model.run(inputs, &outputs);
333 for (
int i = 0; i < run; i++) {
334 model.run(inputs, &outputs);
// Timer reading in milliseconds divided by the number of measured iterations.
336 return double(timer.MilliSeconds()) / run;
// main: sweeps spatial size, channel count and kernel size, timing each
// configuration with the three benchmark functions and printing throughput.
// NOTE(review): this listing elides many original lines, so the arguments
// passed to each benchmark call, the print calls themselves, and the closing
// braces are not visible. The comments below describe only what is shown.
343 int main(
int argc,
char** argv) {
// Sets the thread pool's minimum work size to 0.
345 ws.GetThreadPool()->setMinWorkSize(0);
// Iteration counts declared for the sweeps: 2 warmup and 10 measured.
347 int warmup = 2, mainrun = 10;
// Sweep 1: spatial sizes x input channel counts x kernel sizes {1, 3}, with
// output channels equal to input channels.
349 for (
int space : {14, 26, 52, 104}) {
350 for (
int input_channel : {64, 128, 256, 512}) {
351 for (
int kernel : {1, 3}) {
352 int output_channel = input_channel;
353 const double cpu_time = caffe2::benchmark_conv_caffe2(
365 const double nn_time_fp32 = caffe2::benchmark_conv_nnapi(
376 const double nn_time_int8 = caffe2::benchmark_conv_nnapi_int8(
// Operation count: in_channels * out_channels * kernel^2 * output_side^2 * 2,
// where output_side is `space` for a 1x1 kernel and `space - 2` otherwise. The
// trailing `* 2` presumably counts the multiply and the add separately.
387 const double flops = double(input_channel) * output_channel * kernel *
388 kernel * (kernel == 1 ? space : space - 2) *
389 (kernel == 1 ? space : space - 2) * 2;
// NOTE(review): assuming the embedded numbers are listing artifacts, the
// adjacent string literals concatenate with no separator, so the output reads
// "32bNNPACK GFLOPS", "32bNN-API GFLOPS" and "8bNN-API GOPS" -- confirm whether
// a space after "32b"/"8b" was intended.
391 "Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t32b" 392 "NNPACK GFLOPS: %.2f\t32b" 393 "NN-API GFLOPS: %.2f\t8b" 394 "NN-API GOPS: %.2f\n",
// Times are in milliseconds, so ops / ms / 1E6 is ops per second divided by
// 1e9.
401 flops / cpu_time / 1E6,
402 flops / nn_time_fp32 / 1E6,
403 flops / nn_time_int8 / 1E6);
// Sweep 2: same spatial sizes and channel counts, with kernel size 3 only.
410 for (
int space : {14, 26, 52, 104}) {
411 for (
int channel : {64, 128, 256, 512}) {
412 for (
int kernel : {3}) {
413 const double cpu_time = caffe2::benchmark_conv_caffe2(
425 const double nn_time_fp32_dwise = caffe2::benchmark_conv_nnapi(
436 const double nn_time_int8_dwise = caffe2::benchmark_conv_nnapi_int8(
// NOTE(review): `+` binds tighter than `==`, so the ternary condition below is
// (space * space + kernel) == 1, not kernel == 1. It is never true for the
// swept values, so the visible `:` branch is always taken and the leading
// `space * space` term never contributes. The `?` branch is elided in this
// listing. Confirm the intended formula; it was probably meant to be
// space * space + (kernel == 1 ? ... : ...).
447 const double dwise_bandwidth =
sizeof(float) *
double(channel) *
448 (space * space + kernel == 1
450 : (space - 2) * (space - 2) + kernel * kernel);
// NOTE(review): assuming the embedded numbers are listing artifacts, the same
// literal concatenation applies here, so the output reads
// "32bCaffe2 Dwise GB/s" and so on.
452 "Conv: X: %ix%i \tC: %i -> %i\tK: %ix%i\t32b" 453 "Caffe2 Dwise GB/s: %.2f\t32b" 454 "NN-API Dwise GB/s: %.2f\t8b" 455 "NN-API Dwise GB/s: %.2f\n",
// Times are in milliseconds, so bytes / ms / 1E6 is bytes per second divided
// by 1e9.
462 dwise_bandwidth / cpu_time / 1E6,
463 dwise_bandwidth / nn_time_fp32_dwise / 1E6,
// The int8 figure divides out the sizeof(float) factor baked into
// dwise_bandwidth, presumably because int8 elements occupy one byte each.
464 dwise_bandwidth /
sizeof(
float) / nn_time_int8_dwise / 1E6);
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...