Caffe2 - C++ API
A deep learning, cross-platform ML framework
memonger.cc
#include "caffe2/core/memonger.h"

#include <set>
#include <unordered_set>

#include "caffe2/utils/proto_utils.h"
#include "google/protobuf/text_format.h"

namespace caffe2 {
namespace memonger {
NetDef optimize_inference_net(
    const NetDef& net,
    const std::set<string>& static_blobs) {
  if (net.type() != "" && net.type() != "simple") {
    LOG(INFO) << "Cannot optimize memory for nets of type: " << net.type();
    return net;
  }

  std::vector<OperatorDef> ops;
  for (auto& op : net.op()) {
    if (op.type() == "RecurrentNetwork") {
      // NOTE: for subtleties of RNN op memonger, see memonger.py on how
      // to deal with the forward/backward links etc.
      LOG(INFO) << "Memonger does not support RecurrentNetwork yet";
      return net;
    }
    ops.push_back(op);
  }
  // Step 1: record the first and last operator that touches each blob,
  // i.e. each blob's live range.
  std::unordered_set<std::string> all_blobs;
  std::unordered_map<std::string, std::pair<int, int>> ranges;
  for (int i = 0; i < ops.size(); i++) {
    for (auto& inp : ops[i].input()) {
      if (ranges.find(inp) != ranges.end()) {
        ranges[inp].second = i;
      }
      all_blobs.insert(inp);
    }
    for (auto& outp : ops[i].output()) {
      all_blobs.insert(outp);
      if (static_blobs.find(outp) != static_blobs.end()) {
        continue;
      }
      if (ranges.find(outp) == ranges.end()) {
        ranges[outp] = std::make_pair(i, i);
      }
    }
  }
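
  // Illustrative note (not in the original source): for a simple chain
  //   op0: data -> conv1, op1: conv1 -> relu1, op2: relu1 -> out
  // this yields ranges = {conv1: (0, 1), relu1: (1, 2), out: (2, 2)}.
  // "data" is never produced by an op here, so it gets no range and is
  // therefore never recycled.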

  // Step 2: pass over ops and recycle
  std::vector<std::string> free_blobs;
  std::unordered_map<std::string, std::string> renaming;
  std::unordered_map<std::string, std::string> mapping;

  for (int i = 0; i < ops.size(); i++) {
    auto& op = ops[i];
    std::unordered_set<std::string> new_free_blobs;

    // Check if some input is used for the last time, and release it.
    for (auto& inp : op.input()) {
      auto rit = ranges.find(inp);
      if (rit != ranges.end() && rit->second.second == i) {
        if (mapping.find(inp) == mapping.end()) {
          new_free_blobs.insert(inp);
          mapping[inp] = inp;

          // Safety check to prevent double-memongering nets.
          string shared_blob =
              "__m" + caffe2::to_string(renaming.size()) + "_shared";
          if (all_blobs.find(shared_blob) != all_blobs.end()) {
            LOG(INFO) << "Net was already memongered!";
            return net;
          }
          renaming[inp] = shared_blob;
        } else {
          new_free_blobs.insert(mapping[inp]);
        }
      }
    }

    // Check if some output appears for the first time, and see if we can
    // replace it with a recycled blob.
    for (auto& outp : op.output()) {
      if (!free_blobs.empty()) {
        // first use?
        auto rit = ranges.find(outp);
        if (rit != ranges.end() && rit->second.first == i) {
          std::string recycled = free_blobs.back();
          free_blobs.pop_back();
          mapping[outp] = recycled;
        }
      }
    }

    // Add blobs released by this op to the pool.
    for (auto& b : new_free_blobs) {
      free_blobs.push_back(b);
    }
  }
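
  // Continuing the illustrative chain above: op1 consumes conv1 for the last
  // time, so conv1 enters the free pool and is renamed "__m0_shared"; op2's
  // first-use output "out" is then mapped onto conv1 and ends up writing into
  // "__m0_shared" instead of allocating a fresh buffer.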

  // Step 3: rename inputs and outputs, and create the new net.
  NetDef optim_net = net;
  optim_net.mutable_op()->Clear();
  for (auto op : ops) {
    for (int i = 0; i < op.input_size(); i++) {
      auto& inp = op.input(i);
      if (mapping.find(inp) != mapping.end()) {
        op.set_input(i, renaming[mapping[inp]]);
      }
    }
    for (int i = 0; i < op.output_size(); i++) {
      auto& outp = op.output(i);
      if (mapping.find(outp) != mapping.end()) {
        op.set_output(i, renaming[mapping[outp]]);
      }
    }
    auto* ao = optim_net.add_op();
    ao->CopyFrom(op);
  }

  LOG(INFO) << "optimized net using " << renaming.size() << " shared blobs";
  return optim_net;
}
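
// Usage sketch (illustrative, not part of the original source; the net and
// blob names are hypothetical). External inputs and outputs are listed as
// static_blobs so they keep their identity and are never recycled:
//
//   NetDef predict_net = LoadPredictNet();  // hypothetical loader
//   std::set<string> static_blobs = {"data", "prob"};
//   NetDef optimized =
//       caffe2::memonger::optimize_inference_net(predict_net, static_blobs);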

class ComputeBlobRecyclingForDag {
 public:
  explicit ComputeBlobRecyclingForDag(const int size)
      : op_inputs_(size),
        op_visited_count_(size),
        op_token_deposit_(size),
        op_visited_(size, false) {}
  NetDef OptimizeNet(
      const NetDef& net,
      const std::vector<string>& heads,
      const std::vector<int>& op_indices,
      const std::unordered_set<string>& shareable_blob_names,
      const string& namescope,
      const std::unordered_set<string>& dont_share_blob_names,
      const std::unordered_map<string, vector<int>>& blob_shapes) {
    // Construct the set of input blobs.
    std::unordered_set<string> heads_blobs_set(heads.begin(), heads.end());

    // Construct the set of output blobs we want to optimize.
    for (const int op_index : op_indices) {
      for (const auto& output : net.op(op_index).output()) {
        optim_op_outputs_.insert(output);
      }
    }

    // Compute each operator's in-degree (op_inputs_) and initialize how many
    // ops share each input blob (share_counts_).
    // Note: we have to handle the cases where output blobs are shared.
    std::unordered_map<string, int> blob_seen;
    for (const int op_index : op_indices) {
      for (const auto& input : net.op(op_index).input()) {
        if (has_key(shareable_blob_names, input) ||
            has_key(heads_blobs_set, input)) {
          if (has_key(optim_op_outputs_, input)) {
            CAFFE_ENFORCE(
                blob_seen.find(input) != blob_seen.end(),
                "Input ",
                input,
                " was not output by an op before");
            op_inputs_[op_index] += blob_seen[input];
          } else {
            share_counts_[input] = 1;
          }
          blob_to_ops_[input].push_back(op_index);
        }
      }
      for (const auto& output : net.op(op_index).output()) {
        blob_seen[output] += 1;
        blob_device_[output] = net.op(op_index).device_option();
        // Exception for CopyGPUToCPU, which has a CUDA device option but
        // whose inputs/outputs are on CPU.
        if (net.op(op_index).type() == "CopyGPUToCPU") {
          blob_device_[output].set_device_type(0);
          blob_device_[output].set_cuda_gpu_id(0);
        }
      }
    }

    // The main recursive call: start a DFS in the operator graph from the
    // input blobs.
    for (const auto& input_blob : heads) {
      for (const int op_index : blob_to_ops_[input_blob]) {
        if (!op_visited_[op_index]) {
          vector<std::pair<int, string>> free_blobs;
          std::unordered_set<int> tokens{tokens_counter_++};
          process_op(
              net,
              shareable_blob_names,
              namescope,
              dont_share_blob_names,
              blob_shapes,
              op_index,
              &free_blobs,
              &tokens);
        }
      }
    }

    // Rename mapped blobs.
    std::unordered_map<string, string> renamed;
    int name_idx = 0;
    std::unordered_set<string> mapped_blobs_set;
    for (const auto& mapped_blob : mapping_) {
      mapped_blobs_set.insert(mapped_blob.second);
      if (has_key(optim_op_outputs_, mapped_blob.second)) {
        if (renamed.find(mapped_blob.second) == renamed.end()) {
          renamed.insert(
              {mapped_blob.second,
               namescope + "__m" + caffe2::to_string(name_idx++) + "_shared"});
        }
      } else {
        renamed.insert({mapped_blob.second, mapped_blob.second});
      }
    }

    // Recursively rename mapped blobs, following chains of mappings until a
    // fixed point is reached.
    mapping_.insert(renamed.begin(), renamed.end());
    bool had_changes = true;
    while (had_changes) {
      had_changes = false;
      for (const auto mapped_blob : mapping_) {
        if (has_key(renamed, mapped_blob.second) &&
            renamed[mapped_blob.second] != mapped_blob.second) {
          renamed[mapped_blob.first] = renamed[mapped_blob.second];
          mapping_[mapped_blob.first] = renamed[mapped_blob.first];
          had_changes = true;
        }
      }
    }

    NetDef optimized_net = apply_assignments(net);
    LOG(INFO) << "Remapping " << mapping_.size() << " blobs, using "
              << mapped_blobs_set.size() << " shared blobs.";
    if (floats_saved_ > 0) {
      LOG(INFO) << "Memonger saved approximately "
                << (floats_saved_ * 4.0 / 1024.0 / 1024.0) << " MB.";
    }

    return optimized_net;
  }

 private:
  NetDef apply_assignments(const NetDef& net) {
    NetDef optimized_net = net;
    // Rename optimized_net blobs.
    for (int i = 0; i < optimized_net.op_size(); ++i) {
      // Special handling for RNNs, which have internal nets that
      // can refer to memongered blobs.
      if (optimized_net.op(i).type().find("RecurrentNetwork") == 0) {
        apply_recurrent_blob_assignments(optimized_net.mutable_op(i));
      }

      for (int j = 0; j < optimized_net.op(i).input_size(); ++j) {
        const string& input_name =
            get_blob_or_mapped_blob(optimized_net.op(i).input(j));
        optimized_net.mutable_op(i)->set_input(j, input_name);
      }

      for (int j = 0; j < optimized_net.op(i).output_size(); ++j) {
        auto output_name =
            get_blob_or_mapped_blob(optimized_net.op(i).output(j));
        optimized_net.mutable_op(i)->set_output(j, output_name);
      }
    }
    return optimized_net;
  }

  void apply_recurrent_blob_assignments(OperatorDef* op) {
    // Recursively map step nets in RecurrentNetworks, and
    // attach a mapping table.
    for (int i = 0; i < op->arg_size(); i++) {
      Argument* arg = op->mutable_arg(i);
      const string& name = arg->name();
      if (name == "step_net" || name == "backward_step_net") {
        if (arg->has_n()) {
          NetDef* step_net_ref = arg->mutable_n();
          CAFFE_ENFORCE(
              !arg->has_s(),
              "Invalid definition for ",
              name,
              ". Only one of NetDef and string should be present");
          NetDef optimized_net = apply_assignments(*step_net_ref);
          step_net_ref->CopyFrom(optimized_net);
        } else {
          NetDef step_net;
          CAFFE_ENFORCE(
              google::protobuf::TextFormat::ParseFromString(
                  arg->s(), &step_net),
              "Could not parse step net: ",
              name);
          step_net = apply_assignments(step_net);
          arg->set_s(ProtoDebugString(step_net));
        }
      }
    }

    // Store renamings.
    vector<string> inputs_outputs(op->input().begin(), op->input().end());
    inputs_outputs.insert(
        inputs_outputs.end(), op->output().begin(), op->output().end());

    for (auto& b : inputs_outputs) {
      string mapped = get_blob_or_mapped_blob(b);
      if (b != mapped) {
        Argument* map_arg = op->add_arg();
        map_arg->set_name(b + ".rename");
        map_arg->set_s(mapped);
      }
    }
  }

  template <typename K, typename V>
  inline bool has_key(const std::unordered_map<K, V>& in_map, const K& key) {
    return in_map.find(key) != in_map.end();
  }

  template <typename K>
  inline bool has_key(const std::unordered_set<K>& in_set, const K& key) {
    return in_set.find(key) != in_set.end();
  }

  void process_op(
      const NetDef& net,
      const std::unordered_set<string>& shareable_blob_names,
      const string& namescope,
      const std::unordered_set<string>& dont_share_blob_names,
      const std::unordered_map<string, vector<int>>& blob_shapes,
      int op_index,
      std::vector<std::pair<int, string>>* free_blobs,
      std::unordered_set<int>* tokens) {
    // The tokens we hold now are the union of the tokens this operator is
    // already holding and the tokens pushed down from its parents.
    tokens->insert(
        op_token_deposit_[op_index].begin(), op_token_deposit_[op_index].end());
    op_token_deposit_[op_index].clear();
    CAFFE_ENFORCE(!op_visited_[op_index]);
    op_visited_[op_index] = true;

    const OperatorDef& current_op = net.op(op_index);

    // The set of input blobs freed by processing the current op.
    std::vector<std::pair<int, string>> new_free_blobs;
    std::unordered_set<string> new_free_blobs_set;

    // Now update blob tokens.
    for (const auto& input : current_op.input()) {
      const auto& actual_blob = get_blob_or_mapped_blob(input);
      req_tokens_[actual_blob].insert(tokens->begin(), tokens->end());
      if (actual_blob != input) {
        req_tokens_[input].insert(tokens->begin(), tokens->end());
      }
    }
    for (const auto& output : current_op.output()) {
      const auto& actual_blob = get_blob_or_mapped_blob(output);
      req_tokens_[actual_blob].insert(tokens->begin(), tokens->end());
      if (actual_blob != output) {
        req_tokens_[output].insert(tokens->begin(), tokens->end());
      }
    }

    // Increment per-blob use counts and check if we can free input blobs.
    for (const auto& input : current_op.input()) {
      if (has_key(shareable_blob_names, input)) {
        blob_input_count_[input]++;
        if (blob_input_count_[input] == blob_to_ops_[input].size()) {
          const string& actual_blob = get_blob_or_mapped_blob(input);
          if (!has_key(dont_share_blob_names, actual_blob)) {
            new_free_blobs.emplace_back(
                -share_counts_[actual_blob], actual_blob);
            new_free_blobs_set.insert(actual_blob);
          }
        }
      }
    }

    // Check if we can recycle a free blob and use it as an output blob.
    for (const auto& output : current_op.output()) {
      if (has_key(shareable_blob_names, output) &&
          !has_key(processed_output_blobs_, output) &&
          !has_key(new_free_blobs_set, output)) {
        const string freed_blob = get_free_blob(
            output, blob_shapes, tokens, free_blobs, blob_device_[output]);
        if (freed_blob != "") {
          req_tokens_[freed_blob].insert(tokens->begin(), tokens->end());
          share_counts_[freed_blob]++;
          mapping_[output] = freed_blob;
        }
        processed_output_blobs_.insert(output);
      }
    }

    // Insert newly freed blobs into the pool.
    std::unordered_set<string> free_blob_set;
    for (const auto& free_blob : *free_blobs) {
      free_blob_set.insert(free_blob.second);
    }
    for (const auto& new_free_blob : new_free_blobs) {
      if (!has_key(free_blob_set, new_free_blob.second)) {
        free_blobs->push_back(new_free_blob);
        if (blob_shapes.size() > 0) {
          if (!has_key(blob_sizes_, new_free_blob.second)) {
            blob_sizes_.insert(
                {new_free_blob.second,
                 infer_blob_size(new_free_blob.second, blob_shapes)});
          }
        }
        std::push_heap(
            free_blobs->begin(),
            free_blobs->end(),
            std::greater<std::pair<int, string>>());
      }
    }

    int num_branches = 0;
    for (const auto& output : current_op.output()) {
      num_branches += blob_to_ops_[output].size();
    }

    for (const auto& output : current_op.output()) {
      for (const auto& input_op_index : blob_to_ops_[output]) {
        op_visited_count_[input_op_index]++;
        if (op_visited_count_[input_op_index] == op_inputs_[input_op_index]) {
          std::unordered_set<int> new_tokens;
          new_tokens.insert(tokens->begin(), tokens->end());
          if (num_branches > 1) {
            new_tokens.insert(tokens_counter_++);
          }
          process_op(
              net,
              shareable_blob_names,
              namescope,
              dont_share_blob_names,
              blob_shapes,
              input_op_index,
              free_blobs,
              &new_tokens);
        } else {
          if (!op_visited_[input_op_index]) {
            op_token_deposit_[input_op_index].insert(
                tokens->begin(), tokens->end());
          }
        }
      }
    }
  }
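
  // Note on tokens (explanatory, not from the original source): each DFS
  // root and each branch point mints a fresh token, and every blob records
  // the tokens that were live whenever it was touched (req_tokens_). A free
  // blob may only be recycled by an op that holds all of the blob's recorded
  // tokens (see can_use_blob), which ensures the op runs downstream of every
  // op that used the blob, so reuse cannot clobber data a parallel branch
  // still needs.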

  inline int infer_blob_size(
      const string& blob_name,
      const std::unordered_map<string, vector<int>>& blob_shapes) {
    const auto& blob_shapes_iter = blob_shapes.find(blob_name);
    if (blob_shapes_iter == blob_shapes.end()) {
      return 0;
    }
    int size = 1;
    for (int i = 0; i < blob_shapes_iter->second.size(); ++i) {
      size *= blob_shapes_iter->second[i];
    }
    return size;
  }
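
  // For example, a blob of shape {32, 3, 224, 224} has
  // 32 * 3 * 224 * 224 = 4816896 elements, while a blob with no known shape
  // reports size 0.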

  inline string get_blob_or_mapped_blob(const string& blob_name) {
    auto mapped_blob = mapping_.find(blob_name);
    if (mapped_blob == mapping_.end()) {
      return blob_name;
    } else {
      return mapped_blob->second;
    }
  }

  // Returns true if blob_name can be reused by an op holding `tokens`: the
  // devices must match, and the op must hold every token recorded for the
  // blob.
  inline bool can_use_blob(
      const string& blob_name,
      std::unordered_set<int>* tokens,
      const DeviceOption& device_option) {
    const DeviceOption& blob_device = blob_device_[blob_name];
    if (device_option.device_type() != blob_device.device_type() ||
        device_option.cuda_gpu_id() != blob_device.cuda_gpu_id()) {
      return false;
    }
    for (const int token : req_tokens_[blob_name]) {
      if (tokens->find(token) == tokens->end()) {
        return false;
      }
    }
    return true;
  }

  // Returns the name of the free blob that blob_name will be mapped onto,
  // or "" if no suitable free blob is available.
  inline string get_free_blob(
      const string& blob_name,
      const std::unordered_map<string, vector<int>>& blob_shapes,
      std::unordered_set<int>* tokens,
      std::vector<std::pair<int, string>>* free_blobs,
      const DeviceOption& device) {
    string freed_blob = "";
    if (blob_shapes.size() == 0) {
      std::vector<std::pair<int, string>> cant_use_blobs;
      while (free_blobs->size() > 0) {
        std::pop_heap(
            free_blobs->begin(),
            free_blobs->end(),
            std::greater<std::pair<int, string>>());
        const auto cand_free_blob = free_blobs->back();
        free_blobs->pop_back();
        if (can_use_blob(cand_free_blob.second, tokens, device)) {
          freed_blob = cand_free_blob.second;
          break;
        } else {
          cant_use_blobs.push_back(cand_free_blob);
        }
      }
      for (const auto& cant_use_blob : cant_use_blobs) {
        free_blobs->push_back(cant_use_blob);
        std::push_heap(
            free_blobs->begin(),
            free_blobs->end(),
            std::greater<std::pair<int, string>>());
      }
    } else {
      // Heuristic: track the largest usable free blob seen so far, and stop
      // growing once a candidate at least as large as blob_size has been
      // found.
      const int blob_size = infer_blob_size(blob_name, blob_shapes);
      int best_size = -1;
      int free_blob_index = -1;
      for (int i = 0; i < free_blobs->size(); ++i) {
        const string& cb_name = (*free_blobs)[i].second;
        if (can_use_blob(cb_name, tokens, device)) {
          // Check before operator[], which would otherwise insert a
          // default-constructed entry and mask a missing size.
          CAFFE_ENFORCE(blob_sizes_.find(cb_name) != blob_sizes_.end());
          const int cand_bz = blob_sizes_[cb_name];
          if (cand_bz >= best_size) {
            if (best_size < blob_size || best_size >= cand_bz) {
              best_size = cand_bz;
              free_blob_index = i;
            }
          }
        }
      }
      if (free_blob_index != -1) {
        floats_saved_ += best_size;
        freed_blob = (*free_blobs)[free_blob_index].second;
        free_blobs->erase(free_blobs->begin() + free_blob_index);
      }
    }
    return freed_blob;
  }
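
  // Worked example of the shape-aware branch (illustrative, not from the
  // original source): with blob_size = 100 and usable free blobs scanned in
  // the order {40, 90, 120, 300}, best_size grows 40 -> 90 -> 120; 300 is
  // then rejected because 120 already covers the request, so the 120-element
  // blob is reused and floats_saved_ increases by 120.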

  int tokens_counter_ = 1;
  int floats_saved_ = 0;
  // blob_name -> op edges.
  std::unordered_map<string, std::vector<int>> blob_to_ops_;
  // How many consumers of each blob have been visited so far.
  std::unordered_map<string, int> blob_input_count_;
  // Op in-degree.
  std::vector<int> op_inputs_;
  // Current op visit counts.
  std::vector<int> op_visited_count_;
  std::unordered_map<string, int> share_counts_;
  std::unordered_map<string, int> blob_sizes_;
  std::unordered_map<string, std::unordered_set<int>> req_tokens_;
  std::vector<std::unordered_set<int>> op_token_deposit_;
  std::unordered_set<string> optim_op_outputs_;
  std::unordered_map<string, string> mapping_;
  std::unordered_map<string, DeviceOption> blob_device_;
  // The set of output blobs we have already processed.
  std::unordered_set<string> processed_output_blobs_;
  std::vector<bool> op_visited_;
};

NetDef compute_blob_recycling_for_dag(
    const NetDef& net,
    const std::vector<string>& heads,
    const std::vector<int>& op_indices,
    const std::unordered_set<string>& shareable_blob_names,
    const string& namescope,
    const std::unordered_set<string>& dont_share_blob_names,
    const std::unordered_map<string, vector<int>>& blob_shapes) {
  ComputeBlobRecyclingForDag memonger(net.op_size());
  return memonger.OptimizeNet(
      net,
      heads,
      op_indices,
      shareable_blob_names,
      namescope,
      dont_share_blob_names,
      blob_shapes);
}
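
// Usage sketch (illustrative, not part of the original source; the net,
// blob names, and shape map are hypothetical). Optimize every op of a
// training net, sharing all activations except the external output:
//
//   #include <numeric>  // std::iota
//   std::vector<string> heads = {"data"};
//   std::vector<int> op_indices(train_net.op_size());
//   std::iota(op_indices.begin(), op_indices.end(), 0);
//   std::unordered_set<string> dont_share = {"loss"};
//   NetDef optimized = caffe2::memonger::compute_blob_recycling_for_dag(
//       train_net, heads, op_indices, shareable_activations,
//       "" /* namescope */, dont_share, blob_shapes);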

} // namespace memonger
} // namespace caffe2