Caffe2 - C++ API
A deep learning, cross-platform ML framework
memonger.cc
#include "caffe2/core/memonger.h"

#include <set>
#include <unordered_set>

#include "caffe2/utils/proto_utils.h"
#include "google/protobuf/text_format.h"

namespace caffe2 {
namespace memonger {
NetDef optimize_inference_net(
    const NetDef& net,
    const std::set<string>& static_blobs) {
  if (net.type() != "" && net.type() != "simple") {
    LOG(INFO) << "Cannot optimize memory for nets of type: " << net.type();
    return net;
  }

  std::vector<OperatorDef> ops;
  for (auto& op : net.op()) {
    if (op.type() == "RecurrentNetwork") {
      // NOTE: for subtleties of RNN op memonger, see memonger.py on how
      // to deal with the forward/backward links etc.
      LOG(INFO) << "Memonger does not support RecurrentNetwork yet";
      return net;
    }
    ops.push_back(op);
  }
  // Step 1: record the first and last operator that touches each blob,
  // i.e. each blob's live range.
  std::unordered_set<std::string> all_blobs;
  std::unordered_map<std::string, std::pair<int, int>> ranges;
  for (int i = 0; i < ops.size(); i++) {
    for (auto& inp : ops[i].input()) {
      if (ranges.find(inp) != ranges.end()) {
        ranges[inp].second = i;
      }
      all_blobs.insert(inp);
    }
    for (auto& outp : ops[i].output()) {
      all_blobs.insert(outp);
      if (static_blobs.find(outp) != static_blobs.end()) {
        continue;
      }
      if (ranges.find(outp) == ranges.end()) {
        ranges[outp] = std::make_pair(i, i);
      }
    }
  }
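
  // Illustrative note (not in the original source): for a simple chain
  //   op0: data -> conv1, op1: conv1 -> relu1, op2: relu1 -> out
  // this yields ranges = {conv1: (0, 1), relu1: (1, 2), out: (2, 2)}.
  // "data" is never produced by an op here, so it gets no range and is
  // therefore never recycled.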

  // Step 2: pass over ops and recycle
  std::vector<std::string> free_blobs;
  std::unordered_map<std::string, std::string> renaming;
  std::unordered_map<std::string, std::string> mapping;

  for (int i = 0; i < ops.size(); i++) {
    auto& op = ops[i];
    std::unordered_set<std::string> new_free_blobs;

    // Check if some input is used for the last time, and release it.
    for (auto& inp : op.input()) {
      auto rit = ranges.find(inp);
      if (rit != ranges.end() && rit->second.second == i) {
        if (mapping.find(inp) == mapping.end()) {
          new_free_blobs.insert(inp);
          mapping[inp] = inp;

          // Safety check to prevent double-memongering nets.
          string shared_blob =
              "__m" + caffe2::to_string(renaming.size()) + "_shared";
          if (all_blobs.find(shared_blob) != all_blobs.end()) {
            LOG(INFO) << "Net was already memongered!";
            return net;
          }
          renaming[inp] = shared_blob;
        } else {
          new_free_blobs.insert(mapping[inp]);
        }
      }
    }

    // Check if some output appears for the first time, and see if we can
    // replace it with a recycled blob.
    for (auto& outp : op.output()) {
      if (!free_blobs.empty()) {
        // first use?
        auto rit = ranges.find(outp);
        if (rit != ranges.end() && rit->second.first == i) {
          std::string recycled = free_blobs.back();
          free_blobs.pop_back();
          mapping[outp] = recycled;
        }
      }
    }

    // Add blobs released by this op to the pool.
    for (auto& b : new_free_blobs) {
      free_blobs.push_back(b);
    }
  }
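
  // Continuing the illustrative chain above: op1 consumes conv1 for the last
  // time, so conv1 enters the free pool and is renamed "__m0_shared"; op2's
  // first-use output "out" is then mapped onto conv1 and ends up writing into
  // "__m0_shared" instead of allocating a fresh buffer.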

  // Step 3: rename inputs and outputs, and create the new net.
  NetDef optim_net = net;
  optim_net.mutable_op()->Clear();
  for (auto op : ops) {
    for (int i = 0; i < op.input_size(); i++) {
      auto& inp = op.input(i);
      if (mapping.find(inp) != mapping.end()) {
        op.set_input(i, renaming[mapping[inp]]);
      }
    }
    for (int i = 0; i < op.output_size(); i++) {
      auto& outp = op.output(i);
      if (mapping.find(outp) != mapping.end()) {
        op.set_output(i, renaming[mapping[outp]]);
      }
    }
    auto* ao = optim_net.add_op();
    ao->CopyFrom(op);
  }

  LOG(INFO) << "optimized net using " << renaming.size() << " shared blobs";
  return optim_net;
}
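
// Usage sketch (illustrative, not part of the original source; the net and
// blob names are hypothetical). External inputs and outputs are listed as
// static_blobs so they keep their identity and are never recycled:
//
//   NetDef predict_net = LoadPredictNet();  // hypothetical loader
//   std::set<string> static_blobs = {"data", "prob"};
//   NetDef optimized =
//       caffe2::memonger::optimize_inference_net(predict_net, static_blobs);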

class ComputeBlobRecyclingForDag {
 public:
  explicit ComputeBlobRecyclingForDag(const int size)
      : op_inputs_(size),
        op_visited_count_(size),
        op_token_deposit_(size),
        op_visited_(size, false) {}
  NetDef OptimizeNet(
      const NetDef& net,
      const std::vector<string>& heads,
      const std::vector<int>& op_indices,
      const std::unordered_set<string>& shareable_blob_names,
      const string& namescope,
      const std::unordered_set<string>& dont_share_blob_names,
      const std::unordered_map<string, vector<int>>& blob_shapes) {
    // Construct the set of input blobs.
    std::unordered_set<string> heads_blobs_set(heads.begin(), heads.end());

    // Construct the set of output blobs we want to optimize.
    for (const int op_index : op_indices) {
      for (const auto& output : net.op(op_index).output()) {
        optim_op_outputs_.insert(output);
      }
    }

    // Compute each operator's in-degree (op_inputs_) and initialize how many
    // ops share each input blob (share_counts_).
    // Note: we have to handle the cases where output blobs are shared.
    std::unordered_map<string, int> blob_seen;
    for (const int op_index : op_indices) {
      for (const auto& input : net.op(op_index).input()) {
        if (has_key(shareable_blob_names, input) ||
            has_key(heads_blobs_set, input)) {
          if (has_key(optim_op_outputs_, input)) {
            CAFFE_ENFORCE(
                blob_seen.find(input) != blob_seen.end(),
                "Input ",
                input,
                " was not output by an op before");
            op_inputs_[op_index] += blob_seen[input];
          } else {
            share_counts_[input] = 1;
          }
          blob_to_ops_[input].push_back(op_index);
        }
      }
      for (const auto& output : net.op(op_index).output()) {
        blob_seen[output] += 1;
        blob_device_[output] = net.op(op_index).device_option();
        // Exception for CopyGPUToCPU, which has a CUDA device option but
        // whose inputs/outputs are on CPU.
        if (net.op(op_index).type() == "CopyGPUToCPU") {
          blob_device_[output].set_device_type(0);
          blob_device_[output].set_cuda_gpu_id(0);
        }
      }
    }

    // The main recursive call: start a DFS in the operator graph from the
    // input blobs.
    for (const auto& input_blob : heads) {
      for (const int op_index : blob_to_ops_[input_blob]) {
        if (!op_visited_[op_index]) {
          vector<std::pair<int, string>> free_blobs;
          std::unordered_set<int> tokens{tokens_counter_++};
          process_op(
              net,
              shareable_blob_names,
              namescope,
              dont_share_blob_names,
              blob_shapes,
              op_index,
              &free_blobs,
              &tokens);
        }
      }
    }

    // Rename mapped blobs.
    std::unordered_map<string, string> renamed;
    int name_idx = 0;
    std::unordered_set<string> mapped_blobs_set;
    for (const auto& mapped_blob : mapping_) {
      mapped_blobs_set.insert(mapped_blob.second);
      if (has_key(optim_op_outputs_, mapped_blob.second)) {
        if (renamed.find(mapped_blob.second) == renamed.end()) {
          renamed.insert(
              {mapped_blob.second,
               namescope + "__m" + caffe2::to_string(name_idx++) + "_shared"});
        }
      } else {
        renamed.insert({mapped_blob.second, mapped_blob.second});
      }
    }

    // Recursively rename mapped blobs, following chains of mappings until a
    // fixed point is reached.
    mapping_.insert(renamed.begin(), renamed.end());
    bool had_changes = true;
    while (had_changes) {
      had_changes = false;
      for (const auto mapped_blob : mapping_) {
        if (has_key(renamed, mapped_blob.second) &&
            renamed[mapped_blob.second] != mapped_blob.second) {
          renamed[mapped_blob.first] = renamed[mapped_blob.second];
          mapping_[mapped_blob.first] = renamed[mapped_blob.first];
          had_changes = true;
        }
      }
    }

    NetDef optimized_net = apply_assignments(net);
    LOG(INFO) << "Remapping " << mapping_.size() << " blobs, using "
              << mapped_blobs_set.size() << " shared blobs.";
    if (floats_saved_ > 0) {
      LOG(INFO) << "Memonger saved approximately "
                << (floats_saved_ * 4.0 / 1024.0 / 1024.0) << " MB.";
    }

    return optimized_net;
  }

 private:
  NetDef apply_assignments(const NetDef& net) {
    NetDef optimized_net = net;
    // Rename optimized_net blobs.
    for (int i = 0; i < optimized_net.op_size(); ++i) {
      // Special handling for RNNs, which have internal nets that
      // can refer to memongered blobs.
      if (optimized_net.op(i).type().find("RecurrentNetwork") == 0) {
        apply_recurrent_blob_assignments(optimized_net.mutable_op(i));
      }

      for (int j = 0; j < optimized_net.op(i).input_size(); ++j) {
        const string& input_name =
            get_blob_or_mapped_blob(optimized_net.op(i).input(j));
        optimized_net.mutable_op(i)->set_input(j, input_name);
      }

      for (int j = 0; j < optimized_net.op(i).output_size(); ++j) {
        auto output_name =
            get_blob_or_mapped_blob(optimized_net.op(i).output(j));
        optimized_net.mutable_op(i)->set_output(j, output_name);
      }
    }
    return optimized_net;
  }

  void apply_recurrent_blob_assignments(OperatorDef* op) {
    // Recursively map step nets in RecurrentNetworks, and
    // attach a mapping table.
    for (int i = 0; i < op->arg_size(); i++) {
      Argument* arg = op->mutable_arg(i);
      const string& name = arg->name();
      if (name == "step_net" || name == "backward_step_net") {
        if (arg->has_n()) {
          NetDef* step_net_ref = arg->mutable_n();
          CAFFE_ENFORCE(
              !arg->has_s(),
              "Invalid definition for ",
              name,
              ". Only one of NetDef and string should be present");
          NetDef optimized_net = apply_assignments(*step_net_ref);
          step_net_ref->CopyFrom(optimized_net);
        } else {
          NetDef step_net;
          CAFFE_ENFORCE(
              google::protobuf::TextFormat::ParseFromString(
                  arg->s(), &step_net),
              "Could not parse step net: ",
              name);
          step_net = apply_assignments(step_net);
          arg->set_s(ProtoDebugString(step_net));
        }
      }
    }

    // Store renamings.
    vector<string> inputs_outputs(op->input().begin(), op->input().end());
    inputs_outputs.insert(
        inputs_outputs.end(), op->output().begin(), op->output().end());

    for (auto& b : inputs_outputs) {
      string mapped = get_blob_or_mapped_blob(b);
      if (b != mapped) {
        Argument* map_arg = op->add_arg();
        map_arg->set_name(b + ".rename");
        map_arg->set_s(mapped);
      }
    }
  }

  template <typename K, typename V>
  inline bool has_key(const std::unordered_map<K, V>& in_map, const K& key) {
    return in_map.find(key) != in_map.end();
  }

  template <typename K>
  inline bool has_key(const std::unordered_set<K>& in_set, const K& key) {
    return in_set.find(key) != in_set.end();
  }

  void process_op(
      const NetDef& net,
      const std::unordered_set<string>& shareable_blob_names,
      const string& namescope,
      const std::unordered_set<string>& dont_share_blob_names,
      const std::unordered_map<string, vector<int>>& blob_shapes,
      int op_index,
      std::vector<std::pair<int, string>>* free_blobs,
      std::unordered_set<int>* tokens) {
    // The tokens we hold now are the union of the tokens this operator is
    // already holding and the tokens pushed down from its parents.
    tokens->insert(
        op_token_deposit_[op_index].begin(), op_token_deposit_[op_index].end());
    op_token_deposit_[op_index].clear();
    CAFFE_ENFORCE(!op_visited_[op_index]);
    op_visited_[op_index] = true;

    const OperatorDef& current_op = net.op(op_index);

    // The set of input blobs freed by processing the current op.
    std::vector<std::pair<int, string>> new_free_blobs;
    std::unordered_set<string> new_free_blobs_set;

    // Now update blob tokens.
    for (const auto& input : current_op.input()) {
      const auto& actual_blob = get_blob_or_mapped_blob(input);
      req_tokens_[actual_blob].insert(tokens->begin(), tokens->end());
      if (actual_blob != input) {
        req_tokens_[input].insert(tokens->begin(), tokens->end());
      }
    }
    for (const auto& output : current_op.output()) {
      const auto& actual_blob = get_blob_or_mapped_blob(output);
      req_tokens_[actual_blob].insert(tokens->begin(), tokens->end());
      if (actual_blob != output) {
        req_tokens_[output].insert(tokens->begin(), tokens->end());
      }
    }

    // Increment per-blob use counts and check if we can free input blobs.
    for (const auto& input : current_op.input()) {
      if (has_key(shareable_blob_names, input)) {
        blob_input_count_[input]++;
        if (blob_input_count_[input] == blob_to_ops_[input].size()) {
          const string& actual_blob = get_blob_or_mapped_blob(input);
          if (!has_key(dont_share_blob_names, actual_blob)) {
            new_free_blobs.emplace_back(
                -share_counts_[actual_blob], actual_blob);
            new_free_blobs_set.insert(actual_blob);
          }
        }
      }
    }

    // Check if we can recycle a free blob and use it as an output blob.
    for (const auto& output : current_op.output()) {
      if (has_key(shareable_blob_names, output) &&
          !has_key(processed_output_blobs_, output) &&
          !has_key(new_free_blobs_set, output)) {
        const string freed_blob = get_free_blob(
            output, blob_shapes, tokens, free_blobs, blob_device_[output]);
        if (freed_blob != "") {
          req_tokens_[freed_blob].insert(tokens->begin(), tokens->end());
          share_counts_[freed_blob]++;
          mapping_[output] = freed_blob;
        }
        processed_output_blobs_.insert(output);
      }
    }

    // Insert newly freed blobs into the pool.
    std::unordered_set<string> free_blob_set;
    for (const auto& free_blob : *free_blobs) {
      free_blob_set.insert(free_blob.second);
    }
    for (const auto& new_free_blob : new_free_blobs) {
      if (!has_key(free_blob_set, new_free_blob.second)) {
        free_blobs->push_back(new_free_blob);
        if (blob_shapes.size() > 0) {
          if (!has_key(blob_sizes_, new_free_blob.second)) {
            blob_sizes_.insert(
                {new_free_blob.second,
                 infer_blob_size(new_free_blob.second, blob_shapes)});
          }
        }
        std::push_heap(
            free_blobs->begin(),
            free_blobs->end(),
            std::greater<std::pair<int, string>>());
      }
    }

    int num_branches = 0;
    for (const auto& output : current_op.output()) {
      num_branches += blob_to_ops_[output].size();
    }

    for (const auto& output : current_op.output()) {
      for (const auto& input_op_index : blob_to_ops_[output]) {
        op_visited_count_[input_op_index]++;
        if (op_visited_count_[input_op_index] == op_inputs_[input_op_index]) {
          std::unordered_set<int> new_tokens;
          new_tokens.insert(tokens->begin(), tokens->end());
          if (num_branches > 1) {
            new_tokens.insert(tokens_counter_++);
          }
          process_op(
              net,
              shareable_blob_names,
              namescope,
              dont_share_blob_names,
              blob_shapes,
              input_op_index,
              free_blobs,
              &new_tokens);
        } else {
          if (!op_visited_[input_op_index]) {
            op_token_deposit_[input_op_index].insert(
                tokens->begin(), tokens->end());
          }
        }
      }
    }
  }
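
  // Note on tokens (explanatory, not from the original source): each DFS
  // root and each branch point mints a fresh token, and every blob records
  // the tokens that were live whenever it was touched (req_tokens_). A free
  // blob may only be recycled by an op that holds all of the blob's recorded
  // tokens (see can_use_blob), which ensures the op runs downstream of every
  // op that used the blob, so reuse cannot clobber data a parallel branch
  // still needs.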

  inline int infer_blob_size(
      const string& blob_name,
      const std::unordered_map<string, vector<int>>& blob_shapes) {
    const auto& blob_shapes_iter = blob_shapes.find(blob_name);
    if (blob_shapes_iter == blob_shapes.end()) {
      return 0;
    }
    int size = 1;
    for (int i = 0; i < blob_shapes_iter->second.size(); ++i) {
      size *= blob_shapes_iter->second[i];
    }
    return size;
  }
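
  // For example, a blob of shape {32, 3, 224, 224} has
  // 32 * 3 * 224 * 224 = 4816896 elements, while a blob with no known shape
  // reports size 0.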

  inline string get_blob_or_mapped_blob(const string& blob_name) {
    auto mapped_blob = mapping_.find(blob_name);
    if (mapped_blob == mapping_.end()) {
      return blob_name;
    } else {
      return mapped_blob->second;
    }
  }

  // Returns true if blob_name can be reused by an op holding `tokens`: the
  // devices must match, and the op must hold every token recorded for the
  // blob.
  inline bool can_use_blob(
      const string& blob_name,
      std::unordered_set<int>* tokens,
      const DeviceOption& device_option) {
    const DeviceOption& blob_device = blob_device_[blob_name];
    if (device_option.device_type() != blob_device.device_type() ||
        device_option.cuda_gpu_id() != blob_device.cuda_gpu_id()) {
      return false;
    }
    for (const int token : req_tokens_[blob_name]) {
      if (tokens->find(token) == tokens->end()) {
        return false;
      }
    }
    return true;
  }

  // Returns the name of the free blob that blob_name will be mapped onto,
  // or "" if no suitable free blob is available.
  inline string get_free_blob(
      const string& blob_name,
      const std::unordered_map<string, vector<int>>& blob_shapes,
      std::unordered_set<int>* tokens,
      std::vector<std::pair<int, string>>* free_blobs,
      const DeviceOption& device) {
    string freed_blob = "";
    if (blob_shapes.size() == 0) {
      std::vector<std::pair<int, string>> cant_use_blobs;
      while (free_blobs->size() > 0) {
        std::pop_heap(
            free_blobs->begin(),
            free_blobs->end(),
            std::greater<std::pair<int, string>>());
        const auto cand_free_blob = free_blobs->back();
        free_blobs->pop_back();
        if (can_use_blob(cand_free_blob.second, tokens, device)) {
          freed_blob = cand_free_blob.second;
          break;
        } else {
          cant_use_blobs.push_back(cand_free_blob);
        }
      }
      for (const auto& cant_use_blob : cant_use_blobs) {
        free_blobs->push_back(cant_use_blob);
        std::push_heap(
            free_blobs->begin(),
            free_blobs->end(),
            std::greater<std::pair<int, string>>());
      }
    } else {
      // Heuristic: track the largest usable free blob seen so far, and stop
      // growing once a candidate at least as large as blob_size has been
      // found.
      const int blob_size = infer_blob_size(blob_name, blob_shapes);
      int best_size = -1;
      int free_blob_index = -1;
      for (int i = 0; i < free_blobs->size(); ++i) {
        const string& cb_name = (*free_blobs)[i].second;
        if (can_use_blob(cb_name, tokens, device)) {
          // Check before operator[], which would otherwise insert a
          // default-constructed entry and mask a missing size.
          CAFFE_ENFORCE(blob_sizes_.find(cb_name) != blob_sizes_.end());
          const int cand_bz = blob_sizes_[cb_name];
          if (cand_bz >= best_size) {
            if (best_size < blob_size || best_size >= cand_bz) {
              best_size = cand_bz;
              free_blob_index = i;
            }
          }
        }
      }
      if (free_blob_index != -1) {
        floats_saved_ += best_size;
        freed_blob = (*free_blobs)[free_blob_index].second;
        free_blobs->erase(free_blobs->begin() + free_blob_index);
      }
    }
    return freed_blob;
  }
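
  // Worked example of the shape-aware branch (illustrative, not from the
  // original source): with blob_size = 100 and usable free blobs scanned in
  // the order {40, 90, 120, 300}, best_size grows 40 -> 90 -> 120; 300 is
  // then rejected because 120 already covers the request, so the 120-element
  // blob is reused and floats_saved_ increases by 120.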

  int tokens_counter_ = 1;
  int floats_saved_ = 0;
  // blob_name -> op edges.
  std::unordered_map<string, std::vector<int>> blob_to_ops_;
  // How many consumers of each blob have been visited so far.
  std::unordered_map<string, int> blob_input_count_;
  // Op in-degree.
  std::vector<int> op_inputs_;
  // Current op visit counts.
  std::vector<int> op_visited_count_;
  std::unordered_map<string, int> share_counts_;
  std::unordered_map<string, int> blob_sizes_;
  std::unordered_map<string, std::unordered_set<int>> req_tokens_;
  std::vector<std::unordered_set<int>> op_token_deposit_;
  std::unordered_set<string> optim_op_outputs_;
  std::unordered_map<string, string> mapping_;
  std::unordered_map<string, DeviceOption> blob_device_;
  // The set of output blobs we have already processed.
  std::unordered_set<string> processed_output_blobs_;
  std::vector<bool> op_visited_;
};

NetDef compute_blob_recycling_for_dag(
    const NetDef& net,
    const std::vector<string>& heads,
    const std::vector<int>& op_indices,
    const std::unordered_set<string>& shareable_blob_names,
    const string& namescope,
    const std::unordered_set<string>& dont_share_blob_names,
    const std::unordered_map<string, vector<int>>& blob_shapes) {
  ComputeBlobRecyclingForDag memonger(net.op_size());
  return memonger.OptimizeNet(
      net,
      heads,
      op_indices,
      shareable_blob_names,
      namescope,
      dont_share_blob_names,
      blob_shapes);
}
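
// Usage sketch (illustrative, not part of the original source; the net,
// blob names, and shape map are hypothetical). Optimize every op of a
// training net, sharing all activations except the external output:
//
//   #include <numeric>  // std::iota
//   std::vector<string> heads = {"data"};
//   std::vector<int> op_indices(train_net.op_size());
//   std::iota(op_indices.begin(), op_indices.end(), 0);
//   std::unordered_set<string> dont_share = {"loss"};
//   NetDef optimized = caffe2::memonger::compute_blob_recycling_for_dag(
//       train_net, heads, op_indices, shareable_activations,
//       "" /* namescope */, dont_share, blob_shapes);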

} // namespace memonger
} // namespace caffe2