// Caffe2 - C++ API
// A deep learning, cross-platform ML framework.
// File: benchmark_helper.cc
17 #include <chrono>
18 #include <fstream>
19 #include <string>
20 #include <thread>
21 
22 #include <binaries/benchmark_helper.h>
23 #include "caffe2/core/blob_serialization.h"
24 #ifdef __CUDA_ARCH__
25 #include "caffe2/core/context_gpu.h"
26 #endif
27 #include "caffe2/core/init.h"
28 #include "caffe2/core/logging.h"
29 #include "caffe2/core/net.h"
30 #include "caffe2/core/operator.h"
31 #include "caffe2/core/tensor_int8.h"
32 #include "caffe2/utils/bench_utils.h"
33 #include "caffe2/utils/string_utils.h"
34 #include <observers/net_observer_reporter_print.h>
35 #include <observers/observer_config.h>
36 #include <observers/perf_observer.h>
37 
38 using std::map;
39 using std::shared_ptr;
40 using std::string;
41 using std::unique_ptr;
42 using std::vector;
43 
// Installs the perf observers used for benchmarking: clears any previously
// registered global net observers, registers a factory that attaches a
// PerfNetObserver to every created net, and routes observer reports through
// the printing reporter.
void observerConfig() {
  caffe2::ClearGlobalNetObservers();
  caffe2::AddGlobalNetObserverCreator([](caffe2::NetBase* subject) {
    return caffe2::make_unique<caffe2::PerfNetObserver>(subject);
  });
  caffe2::ObserverConfig::setReporter(
      caffe2::make_unique<caffe2::NetObserverReporterPrint>());
}
52 
53 bool backendCudaSet(const string& backend) {
54  bool run_on_gpu = false;
55  if (backend == "cuda") {
56 #ifdef __CUDA_ARCH__
57  if (caffe2::HasCudaGPU()) {
58  run_on_gpu = true;
59  } else {
60  CAFFE_THROW("NO GPU support on this host machine");
61  }
62 #else
63  CAFFE_THROW("NO GPU support");
64 #endif
65  }
66  return run_on_gpu;
67 }
68 
69 void setDeviceType(caffe2::NetDef* net_def, caffe2::DeviceType& run_dev) {
70  for (int j = 0; j < net_def->op_size(); j++) {
71  caffe2::OperatorDef* op = net_def->mutable_op(j);
72  op->mutable_device_option()->set_device_type(caffe2::TypeToProto(run_dev));
73  }
74 }
75 
76 void setOperatorEngine(caffe2::NetDef* net_def, const string& backend) {
77  if (backend != "builtin") {
78  string engine = backend == "nnpack"
79  ? "NNPACK"
80  : backend == "eigen" ? "EIGEN"
81  : backend == "mkl" ? "MKLDNN"
82  : backend == "cuda"
83  ? "CUDA"
84  : backend == "dnnlowp" ? "DNNLOWP"
85  : backend == "dnnlowp_acc16"
86  ? "DNNLOWP_ACC16"
87  : backend == "default" ? "" : "NONE";
88  CAFFE_ENFORCE(engine != "NONE", "Backend is not supported");
89  for (int i = 0; i < net_def->op_size(); i++) {
90  caffe2::OperatorDef* op_def = net_def->mutable_op(i);
91  op_def->set_engine(engine);
92  }
93  }
94 }
95 
96 int loadInput(
97  shared_ptr<caffe2::Workspace> workspace,
98  const bool run_on_gpu,
99  map<string, caffe2::TensorProtos>& tensor_protos_map,
100  const string& input,
101  const string& input_file,
102  const string& input_dims,
103  const string& input_type) {
104  // How many input blobs are in the inputs
105  int blob_num = 1;
106  // Load input.
107  if (input.size()) {
108  vector<string> input_names = caffe2::split(',', input);
109  if (input_file.size()) {
110  vector<string> input_files = caffe2::split(',', input_file);
111  CAFFE_ENFORCE_EQ(
112  input_names.size(),
113  input_files.size(),
114  "Input name and file should have the same number.");
115  for (int i = 0; i < input_names.size(); ++i) {
116  caffe2::TensorProtos tensor_protos;
117  CAFFE_ENFORCE(
118  caffe2::ReadProtoFromFile(input_files[i], &tensor_protos));
119  workspace->CreateBlob(input_names[i]);
120  tensor_protos_map.insert(std::make_pair(input_names[i], tensor_protos));
121  }
122  // Check that all blobs have the same number of entries
123  blob_num = tensor_protos_map[input_names[0]].protos_size();
124  for (int i = 1; i < input_names.size(); ++i) {
125  int bnum = tensor_protos_map[input_names[i]].protos_size();
126  CAFFE_ENFORCE_EQ(
127  blob_num,
128  bnum,
129  "Number of blobs are not the same for all inputs");
130  }
131  } else if (input_dims.size() || input_type.size()) {
132  CAFFE_ENFORCE_GE(
133  input_dims.size(),
134  0,
135  "Input dims must be specified when input tensors are used.");
136  CAFFE_ENFORCE_GE(
137  input_type.size(),
138  0,
139  "Input type must be specified when input tensors are used.");
140 
141  vector<string> input_dims_list = caffe2::split(';', input_dims);
142  CAFFE_ENFORCE_EQ(
143  input_names.size(),
144  input_dims_list.size(),
145  "Input name and dims should have the same number of items.");
146  vector<string> input_type_list = caffe2::split(';', input_type);
147  CAFFE_ENFORCE_EQ(
148  input_names.size(),
149  input_type_list.size(),
150  "Input name and type should have the same number of items.");
151  for (size_t i = 0; i < input_names.size(); ++i) {
152  vector<string> input_dims_str = caffe2::split(',', input_dims_list[i]);
153  vector<int> input_dims;
154  for (const string& s : input_dims_str) {
155  input_dims.push_back(c10::stoi(s));
156  }
157  caffe2::Blob* blob = workspace->GetBlob(input_names[i]);
158  if (blob == nullptr) {
159  blob = workspace->CreateBlob(input_names[i]);
160  }
161  if (run_on_gpu) {
162  LOG(INFO) << "Running on GPU.";
163 #ifdef __CUDA_ARCH__
165  CHECK_NOTNULL(tensor);
166  tensor->Resize(input_dims);
167  if (input_type_list[i] == "uint8_t") {
168  tensor->mutable_data<uint8_t>();
169  } else if (input_type_list[i] == "float") {
170  tensor->mutable_data<float>();
171  } else {
172  CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
173  }
174 #else
175  CAFFE_THROW("Not support GPU on mobile.");
176 #endif
177  } else {
178  if (input_type_list[i] == "uint8_t") {
181  CHECK_NOTNULL(tensor);
182  tensor->t.Resize(input_dims);
183  tensor->t.mutable_data<uint8_t>();
184  } else if (input_type_list[i] == "float") {
185  caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
186  CHECK_NOTNULL(tensor);
187  tensor->Resize(input_dims);
188  tensor->mutable_data<float>();
189  } else if (input_type_list[i] == "int") {
190  caffe2::TensorCPU* tensor = BlobGetMutableTensor(blob, caffe2::CPU);
191  CHECK_NOTNULL(tensor);
192  tensor->Resize(input_dims);
193  tensor->mutable_data<int>();
194  } else {
195  CAFFE_THROW("Unsupported input type: ", input_type_list[i]);
196  }
197  }
198  }
199  } else {
200  CAFFE_THROW(
201  "You requested input tensors, but neither input_file nor "
202  "input_dims is set.");
203  }
204  }
205  return blob_num;
206 }
207 
208 void fillInputBlob(
209  shared_ptr<caffe2::Workspace> workspace,
210  map<string, caffe2::TensorProtos>& tensor_protos_map,
211  int iteration) {
212  if (tensor_protos_map.empty()) {
213  return;
214  }
215  static caffe2::TensorDeserializer deserializer;
216  for (auto& tensor_kv : tensor_protos_map) {
217  caffe2::Blob* blob = workspace->GetBlob(tensor_kv.first);
218  if (blob == nullptr) {
219  blob = workspace->CreateBlob(tensor_kv.first);
220  }
221  // todo: support gpu and make this function a tempalte
222  int protos_size = tensor_kv.second.protos_size();
223  if (protos_size == 1 && iteration > 0) {
224  // Do not override the input data if there is only one input data,
225  // since it will clear all caches. Rely on wipe_cache to
226  // clear caches
227  continue;
228  }
229  caffe2::TensorProto* tensor_proto =
230  tensor_kv.second.mutable_protos(iteration % protos_size);
231  BlobSetTensor(blob, deserializer.Deserialize(*tensor_proto));
232  // todo: for other types
233  }
234 }
235 
// Drives the benchmark for one net, in phases:
//   1. `warmup` untimed runs;
//   2. `iter` whole-net runs (per-operator observation off), writing outputs
//      via writeOutput after each run;
//   3. if run_individual, `iter` more runs with per-operator observation on.
// fillInputBlob refreshes the inputs before every run; wipe_cache and the
// sleep_* flags insert cache wipes / pauses between phases and iterations.
void runNetwork(
    shared_ptr<caffe2::Workspace> workspace,
    caffe2::NetDef& net_def,
    map<string, caffe2::TensorProtos>& tensor_protos_map,
    const bool wipe_cache,
    const bool run_individual,
    const bool run_on_gpu,
    const bool text_output,
    const int warmup,
    const int iter,
    const int num_blobs,
    const int sleep_before_run,
    const int sleep_between_iteration,
    const int sleep_between_net_and_operator,
    const std::string& output,
    const std::string& output_folder) {

  if (!net_def.has_name()) {
    net_def.set_name("benchmark");
  }

  caffe2::NetBase* net = workspace->CreateNet(net_def);
  CHECK_NOTNULL(net);

  LOG(INFO) << "Starting benchmark.";
  caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup);
  LOG(INFO) << "Running warmup runs.";
  for (int i = 0; i < warmup; ++i) {
    fillInputBlob(workspace, tensor_protos_map, i);
    CAFFE_ENFORCE(net->Run(), "Warmup run ", i, " has failed.");
  }

  if (wipe_cache) {
    caffe2::wipe_cache();
  }
  if (sleep_before_run > 0) {
    std::this_thread::sleep_for(std::chrono::seconds(sleep_before_run));
  }
  LOG(INFO) << "Main runs.";
  CAFFE_ENFORCE(
      iter >= 0,
      "Number of main runs should be non negative, provided ",
      iter,
      ".");
  LOG(INFO) << "net runs.";
  for (int i = 0; i < iter; ++i) {
    // Whole-net timing: per-operator observation disabled (4th arg 0).
    caffe2::ObserverConfig::initSampleRate(1, 1, 1, 0, warmup);
    fillInputBlob(workspace, tensor_protos_map, i);
    if (wipe_cache) {
      caffe2::wipe_cache();
    }
    CAFFE_ENFORCE(net->Run(), "Main run ", i, " has failed.");
    // Write the output for the first num_blobs times
    writeOutput(
        workspace,
        run_on_gpu,
        output,
        output_folder,
        text_output,
        i,
        num_blobs);
    if (wipe_cache) {
      caffe2::wipe_cache();
    }
    if (sleep_between_iteration > 0) {
      std::this_thread::sleep_for(
          std::chrono::seconds(sleep_between_iteration));
    }
  }
  if (run_individual) {
    LOG(INFO) << "operator runs.";
    if (sleep_between_net_and_operator > 0) {
      std::this_thread::sleep_for(
          std::chrono::seconds(sleep_between_net_and_operator));
    }
    for (int i = 0; i < iter; ++i) {
      // Per-operator observation enabled (4th arg 1) for this phase.
      caffe2::ObserverConfig::initSampleRate(1, 1, 1, 1, warmup);
      fillInputBlob(workspace, tensor_protos_map, i);
      CAFFE_ENFORCE(net->Run(), "Main run ", i, " with operator has failed.");
      if (wipe_cache) {
        caffe2::wipe_cache();
      }
      if (sleep_between_iteration > 0) {
        std::this_thread::sleep_for(
            std::chrono::seconds(sleep_between_iteration));
      }
    }
  }
}
325 
326 void writeOutput(
327  shared_ptr<caffe2::Workspace> workspace,
328  const bool run_on_gpu,
329  const string& output,
330  const string& output_folder,
331  const bool text_output,
332  const int index,
333  const int num_blobs) {
334  if (output.size() == 0) {
335  return;
336  }
337  string output_prefix = output_folder.size() ? output_folder + "/" : "";
338  vector<string> output_names = caffe2::split(',', output);
339  if (output == "*") {
340  output_names = workspace->Blobs();
341  }
342  for (const string& name : output_names) {
343  CAFFE_ENFORCE(
344  workspace->HasBlob(name),
345  "You requested a non-existing blob: ",
346  name);
347  if (text_output) {
348  if (run_on_gpu) {
349 #ifdef __CUDA_ARCH__
350  writeTextOutput<caffe2::CUDAContext, caffe2::TensorCUDA>(
351  workspace->GetBlob(name)->GetMutable<caffe2::TensorCUDA>(),
352  output_prefix,
353  name,
354  index,
355  num_blobs);
356 #else
357  CAFFE_THROW("Not support GPU.");
358 #endif
359  } else {
360  writeTextOutput<caffe2::CPUContext, caffe2::TensorCPU>(
361  BlobGetMutableTensor(workspace->GetBlob(name), caffe2::CPU),
362  output_prefix,
363  name,
364  index,
365  num_blobs);
366  }
367  } else {
368  // Do not support multiple entries per blob.
369  CAFFE_ENFORCE(
370  index == 0,
371  "Binary file only support one output.");
372  string serialized = SerializeBlob(*workspace->GetBlob(name), name);
373  string output_filename = output_prefix + name;
374  caffe2::WriteStringToFile(serialized, output_filename.c_str());
375  }
376  }
377 }
378 
379 int benchmark(
380  int argc,
381  char* argv[],
382  const string& FLAGS_backend,
383  const string& FLAGS_init_net,
384  const string& FLAGS_input,
385  const string& FLAGS_input_dims,
386  const string& FLAGS_input_file,
387  const string& FLAGS_input_type,
388  int FLAGS_iter,
389  const string& FLAGS_net,
390  const string& FLAGS_output,
391  const string& FLAGS_output_folder,
392  bool FLAGS_run_individual,
393  int FLAGS_sleep_before_run,
394  int FLAGS_sleep_between_iteration,
395  int FLAGS_sleep_between_net_and_operator,
396  bool FLAGS_text_output,
397  int FLAGS_warmup,
398  bool FLAGS_wipe_cache) {
399  // Check arguments to be correct
400  {
401  // Need to check whether file exists, as the file reader does not assert if
402  // file does not exist
403  std::ifstream net_file(FLAGS_net);
404  CAFFE_ENFORCE(net_file.good());
405  net_file.close();
406 
407  std::ifstream init_net_file(FLAGS_init_net);
408  CAFFE_ENFORCE(init_net_file.good());
409  init_net_file.close();
410 
411  if (FLAGS_input_file.size() > 0) {
412  vector<string> input_files = caffe2::split(',', FLAGS_input_file);
413  for (auto input_file : input_files) {
414  std::ifstream ifile(input_file);
415  CAFFE_ENFORCE(ifile.good());
416  ifile.close();
417  }
418  }
419  }
420 
421  observerConfig();
422  caffe2::ShowLogInfoToStderr();
423 
424  auto workspace = std::make_shared<caffe2::Workspace>(new caffe2::Workspace());
425  bool run_on_gpu = backendCudaSet(FLAGS_backend);
426  // Run initialization network.
427  caffe2::NetDef init_net_def;
428  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def));
429  setOperatorEngine(&init_net_def, FLAGS_backend);
430  CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));
431 
432  // Run main network.
433  caffe2::NetDef net_def;
434  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
435  setOperatorEngine(&net_def, FLAGS_backend);
436 
437  map<string, caffe2::TensorProtos> tensor_protos_map;
438 
439  int num_blobs = loadInput(
440  workspace,
441  run_on_gpu,
442  tensor_protos_map,
443  FLAGS_input,
444  FLAGS_input_file,
445  FLAGS_input_dims,
446  FLAGS_input_type);
447 
448  runNetwork(
449  workspace,
450  net_def,
451  tensor_protos_map,
452  FLAGS_wipe_cache,
453  FLAGS_run_individual,
454  run_on_gpu,
455  FLAGS_text_output,
456  FLAGS_warmup,
457  FLAGS_iter,
458  num_blobs,
459  FLAGS_sleep_before_run,
460  FLAGS_sleep_between_iteration,
461  FLAGS_sleep_between_net_and_operator,
462  FLAGS_output,
463  FLAGS_output_folder);
464 
465  return 0;
466 }
// Doxygen cross-reference notes (extraction residue, kept as comments):
// - Blob: a general container that hosts a typed pointer. (blob.h:24)
// - bool HasCudaGPU(): checks if the current running session has a CUDA GPU
//   present. (common_gpu.h:149)
// - Workspace: a class that holds all the related objects created during
//   runtime, including all blobs. (workspace.h:47)
// - TensorDeserializer: the deserializer for Tensors.
// - T* GetMutable(): gets a mutable pointer to the stored object. (blob.h:100)
// - void SerializeBlob(const Blob& blob, const string& name,
//   BlobSerializerBase::SerializationAcceptor acceptor, int chunk_size):
//   serializes the given blob, if possible.