Caffe2 - C++ API
A deep learning, cross platform ML framework
net_simple.cc
1 #include "caffe2/core/net_simple.h"
2 #include "caffe2/core/net.h"
3 
4 #include <iostream>
5 #include <set>
6 #include <unordered_map>
7 #include <unordered_set>
8 
9 #include "caffe2/core/operator.h"
10 #include "caffe2/core/static_tracepoint.h"
11 #include "caffe2/core/timer.h"
12 #include "caffe2/proto/caffe2_pb.h"
13 #include "caffe2/utils/proto_utils.h"
14 
// Command-line flag consulted by SimpleNet::TEST_Benchmark. When true (the
// default) the benchmark times whole-net passes and puts the resulting
// per-iteration time at the front of the returned vector; when false that
// phase is skipped and only the optional per-operator timings are produced.
C10_DEFINE_bool(
    caffe2_simple_net_benchmark_run_whole_net,
    true,
    "If false, whole net passes won't be performed");
19 
20 namespace caffe2 {
21 
22 SimpleNet::SimpleNet(
23  const std::shared_ptr<const NetDef>& net_def,
24  Workspace* ws)
25  : NetBase(net_def, ws) {
26  VLOG(1) << "Constructing SimpleNet " << net_def->name();
27  const bool net_def_has_device_option = net_def->has_device_option();
28  // Initialize the operators
29  for (int idx = 0; idx < net_def->op_size(); ++idx) {
30  const auto& operator_def = net_def->op(idx);
31  VLOG(1) << "Creating operator " << operator_def.name() << ": "
32  << operator_def.type();
33  std::unique_ptr<OperatorBase> op{nullptr};
34  if (!operator_def.has_device_option() && net_def_has_device_option) {
35  // In the case that the operator def does not specify a device option but
36  // the net def has a default option, we copy the device option over to the
37  // operator def.
38  OperatorDef temp_def(operator_def);
39  temp_def.mutable_device_option()->CopyFrom(net_def->device_option());
40  op = CreateOperator(temp_def, ws, idx);
41  } else {
42  op = CreateOperator(operator_def, ws, idx);
43  op->set_debug_def(
44  std::shared_ptr<const OperatorDef>{net_def, &(net_def->op(idx))});
45  }
46  operators_.emplace_back(std::move(op));
47  }
48 }
49 
50 bool SimpleNet::Run() {
51  StartAllObservers();
52  VLOG(1) << "Running net " << name_;
53  for (auto& op : operators_) {
54  VLOG(1) << "Running operator " << op->debug_def().name() << "("
55  << op->debug_def().type() << ").";
56 #ifdef CAFFE2_ENABLE_SDT
57  const auto& op_name = op->debug_def().name().c_str();
58  const auto& op_type = op->debug_def().type().c_str();
59  auto* op_ptr = op.get();
60  const auto& net_name = name_.c_str();
61  CAFFE_SDT(operator_start, net_name, op_name, op_type, op_ptr);
62 #endif
63  bool res = op->Run();
64 #ifdef CAFFE2_ENABLE_SDT
65  CAFFE_SDT(operator_done, net_name, op_name, op_type, op_ptr);
66 #endif
67  if (!res) {
68  LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
69  return false;
70  }
71  }
72  StopAllObservers();
73  return true;
74 }
75 
76 bool SimpleNet::RunAsync() {
77  return Run();
78 }
79 
namespace {
/**
 * Ordering predicate that compares two pairs by their second member only.
 *
 * @return true when x.second is strictly greater than y.second, so using it
 *         as a sort comparator orders pairs by descending second member.
 */
template <typename A, typename B>
bool PairLargerThan(const std::pair<A, B>& x, const std::pair<A, B>& y) {
  const B& left_value = x.second;
  const B& right_value = y.second;
  return left_value > right_value;
}
} // namespace
86 
88  const int warmup_runs,
89  const int main_runs,
90  const bool run_individual) {
91  /* Use std::cout because logging may be disabled */
92  std::cout << "Starting benchmark." << std::endl;
93  std::cout << "Running warmup runs." << std::endl;
94  CAFFE_ENFORCE(
95  warmup_runs >= 0,
96  "Number of warm up runs should be non negative, provided ",
97  warmup_runs,
98  ".");
99  for (int i = 0; i < warmup_runs; ++i) {
100  CAFFE_ENFORCE(Run(), "Warmup run ", i, " has failed.");
101  }
102 
103  std::cout << "Main runs." << std::endl;
104  CAFFE_ENFORCE(
105  main_runs >= 0,
106  "Number of main runs should be non negative, provided ",
107  main_runs,
108  ".");
109  Timer timer;
110  auto millis = timer.MilliSeconds();
111  if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
112  for (int i = 0; i < main_runs; ++i) {
113  CAFFE_ENFORCE(Run(), "Main run ", i, " has failed.");
114  }
115  millis = timer.MilliSeconds();
116  std::cout << "Main run finished. Milliseconds per iter: "
117  << millis / main_runs
118  << ". Iters per second: " << 1000.0 * main_runs / millis
119  << std::endl;
120  }
121  vector<float> time_per_op(operators_.size(), 0);
122  vector<uint64_t> flops_per_op;
123  vector<uint64_t> memory_bytes_read_per_op;
124  vector<uint64_t> memory_bytes_written_per_op;
125  vector<uint64_t> param_bytes_per_op;
126  CaffeMap<string, float> time_per_op_type;
127  CaffeMap<string, float> flops_per_op_type;
128  CaffeMap<string, float> memory_bytes_read_per_op_type;
129  CaffeMap<string, float> memory_bytes_written_per_op_type;
130  CaffeMap<string, float> param_bytes_per_op_type;
131  if (run_individual) {
132  for (int i = 0; i < main_runs; ++i) {
133  for (auto& op : operators_) {
134  op->ResetEvent();
135  }
136  int idx = 0;
137  for (auto& op : operators_) {
138  const string& op_type = op->debug_def().type();
139  if (i == 0) { // Gather flops on the first run.
140  auto* schema = OpSchemaRegistry::Schema(op_type);
141  if (schema && schema->HasCostInferenceFunction()) {
142  vector<TensorShape> shapes = op->InputTensorShapes();
143 
144  auto all_good_shapes = std::accumulate(
145  shapes.begin(),
146  shapes.end(),
147  true,
148  [](bool acc, const TensorShape& shape) {
149  return acc && !shape.unknown_shape();
150  });
151  OpSchema::Cost cost;
152  if (all_good_shapes) {
153  cost = schema->InferCost(op->debug_def(), shapes);
154  }
155 
156  flops_per_op.emplace_back(cost.flops);
157  memory_bytes_read_per_op.emplace_back(cost.bytes_read);
158  memory_bytes_written_per_op.emplace_back(cost.bytes_written);
159  param_bytes_per_op.emplace_back(cost.params_bytes);
160 
161  flops_per_op_type[op_type] += cost.flops;
162  memory_bytes_read_per_op_type[op_type] += cost.bytes_read;
163  memory_bytes_written_per_op_type[op_type] += cost.bytes_written;
164  param_bytes_per_op_type[op_type] += cost.params_bytes;
165  } else {
166  flops_per_op.emplace_back(0);
167  memory_bytes_read_per_op.emplace_back(0);
168  memory_bytes_written_per_op.emplace_back(0);
169  param_bytes_per_op.emplace_back(0);
170  }
171  }
172  timer.Start();
173  CAFFE_ENFORCE(
174  op->Run(),
175  "operator ",
176  op->debug_def().name(),
177  "(",
178  op_type,
179  ") has failed.");
180  float spent = timer.MilliSeconds();
181  time_per_op[idx] += spent;
182  time_per_op_type[op_type] += spent;
183  ++idx;
184  }
185  }
186  size_t idx = 0;
187  for (auto& op : operators_) {
188  const string& op_type = op->debug_def().type();
189  const string& print_name =
190  (op->debug_def().name().size()
191  ? op->debug_def().name()
192  : (op->debug_def().output_size() ? op->debug_def().output(0)
193  : "NO_OUTPUT"));
194  std::stringstream flops_str;
195  if (idx < flops_per_op.size() && flops_per_op[idx]) {
196  flops_str << " (" << to_string(1.0e-9 * flops_per_op[idx]) << " GFLOP, "
197  << to_string(
198  1.0e-6 * flops_per_op[idx] / time_per_op[idx] *
199  main_runs)
200  << " GFLOPS)";
201  }
202  std::stringstream memory_bytes_read_str;
203  if (idx < memory_bytes_read_per_op.size() &&
204  memory_bytes_read_per_op[idx]) {
205  memory_bytes_read_str
206  << " (" << to_string(1.0e-6 * memory_bytes_read_per_op[idx])
207  << " MB)";
208  }
209  std::stringstream memory_bytes_written_str;
210  if (idx < memory_bytes_written_per_op.size() &&
211  memory_bytes_written_per_op[idx]) {
212  memory_bytes_written_str
213  << " (" << to_string(1.0e-6 * memory_bytes_written_per_op[idx])
214  << " MB)";
215  }
216  std::stringstream param_bytes_str;
217  if (idx < param_bytes_per_op.size() && param_bytes_per_op[idx]) {
218  param_bytes_str << " (" << to_string(1.0e-6 * param_bytes_per_op[idx])
219  << " MB)";
220  }
221  std::cout << "Operator #" << idx << " (" << print_name << ", " << op_type
222  << ") " << time_per_op[idx] / main_runs << " ms/iter"
223  << flops_str.str() << memory_bytes_written_str.str()
224  << param_bytes_str.str() << std::endl;
225  ++idx;
226  }
227  const std::vector<string> metric({"Time",
228  "FLOP",
229  "Feature Memory Read",
230  "Feature Memory Written",
231  "Parameter Memory"});
232  const std::vector<double> normalizer(
233  {1.0 / main_runs, 1.0e-9, 1.0e-6, 1.0e-6, 1.0e-6});
234  const std::vector<string> unit({"ms", "GFLOP", "MB", "MB", "MB"});
235 
236  std::vector<CaffeMap<string, float>*> metric_per_op_type_vec_vec;
237  metric_per_op_type_vec_vec.emplace_back(&time_per_op_type);
238  metric_per_op_type_vec_vec.emplace_back(&flops_per_op_type);
239  metric_per_op_type_vec_vec.emplace_back(&memory_bytes_read_per_op_type);
240  metric_per_op_type_vec_vec.emplace_back(&memory_bytes_written_per_op_type);
241  metric_per_op_type_vec_vec.emplace_back(&param_bytes_per_op_type);
242  for (size_t i = 0; i < metric_per_op_type_vec_vec.size(); ++i) {
243  std::cout << metric[i] << " per operator type:" << std::endl;
244  auto* item = metric_per_op_type_vec_vec[i];
245  std::vector<std::pair<string, float>> metric_per_op_type_vec(
246  (*item).begin(), (*item).end());
247  std::sort(
248  metric_per_op_type_vec.begin(),
249  metric_per_op_type_vec.end(),
250  PairLargerThan<string, float>);
251  float total_metric = 0.;
252  for (const auto& op_item : metric_per_op_type_vec) {
253  total_metric += op_item.second * normalizer[i];
254  }
255  for (const auto& op_item : metric_per_op_type_vec) {
256  float percent = 0.;
257  if (total_metric > 0.) {
258  percent = (100.0 * op_item.second * normalizer[i] / total_metric);
259  }
260  std::cout << std::setw(15) << std::setfill(' ')
261  << op_item.second * normalizer[i] << " " << unit[i] << ". "
262  << std::setw(10) << std::setfill(' ') << percent << "%. "
263  << op_item.first << std::endl;
264  }
265  std::cout << std::setw(15) << std::setfill(' ') << total_metric << " "
266  << unit[i] << " in Total" << std::endl;
267  }
268  }
269  // We will reuse time_per_op to return the result of BenchmarkNet.
270  for (size_t i = 0; i < time_per_op.size(); ++i) {
271  time_per_op[i] /= main_runs;
272  }
273  if (FLAGS_caffe2_simple_net_benchmark_run_whole_net) {
274  time_per_op.insert(time_per_op.begin(), millis / main_runs);
275  }
276  return time_per_op;
277 }
278 
// Registers SimpleNet under the key `simple`.
// NOTE(review): presumably this is what lets a NetDef whose type is "simple"
// be instantiated as a SimpleNet - confirm against the REGISTER_NET macro.
REGISTER_NET(simple, SimpleNet);
280 
281 } // namespace caffe2
void Start()
Starts a timer.
Definition: timer.h:24
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13
float MilliSeconds()
Returns the elapsed time in milliseconds.
Definition: timer.h:32
vector< float > TEST_Benchmark(const int warmup_runs, const int main_runs, const bool run_individual) override
Benchmarks a network.
Definition: net_simple.cc:87
A simple timer object for measuring time.
Definition: timer.h:16