Caffe2 - C++ API
A deep learning, cross-platform ML framework
optimize_ideep.cc
#include "caffe2/opt/optimize_ideep.h"
#include "caffe2/opt/converter.h"
#include "caffe2/opt/fusion.h"

#ifdef CAFFE2_USE_MKLDNN
#include "caffe2/ideep/ideep_utils.h"
#endif

namespace caffe2 {
namespace opt {

using namespace nom;

#ifndef CAFFE2_USE_MKLDNN
void OptimizeForIdeep(
    repr::NNModule* nn,
    caffe2::Workspace* ws,
    bool training_mode) {
  LOG(WARNING) << "IDEEP optimizations are unavailable: built without MKLDNN";
}

#else
USE_IDEEP_DEF_ALIASES();

Blob* getBlob(repr::NNGraph::NodeRef node, caffe2::Workspace* ws) {
  auto tensor = repr::nn::get<repr::Tensor>(node);
  CAFFE_ENFORCE(ws->HasBlob(tensor->getName()), "Blob not in workspace");
  return ws->GetBlob(tensor->getName());
}

template <class T>
T* getTensor(Blob* blob) {
  CAFFE_ENFORCE(blob, "Blob is invalid");
  if (blob && blob->template IsType<T>()) {
    return blob->template GetMutable<T>();
  }
  return nullptr;
}

const caffe2::OperatorDef& getOpDef(const repr::NeuralNetOperator& nnOp) {
  auto annotation = nnOp.getAnnotation();
  if (annotation == nullptr) {
    CAFFE_THROW("Cannot get Operator annotation");
  }
  return dyn_cast<Caffe2Annotation>(annotation)->getOperatorDef();
}

caffe2::OperatorDef* getMutableOpDef(repr::NeuralNetOperator& nnOp) {
  auto annotation = nnOp.getMutableAnnotation();
  if (annotation == nullptr) {
    CAFFE_THROW("Cannot get Operator annotation");
  }
  return dyn_cast<Caffe2Annotation>(annotation)->getMutableOperatorDef();
}

bool isOpType(const repr::NNGraph::NodeRef& nodeRef, string typeName) {
  if (!repr::nn::is<repr::NeuralNetOperator>(nodeRef)) {
    return false;
  }
  auto op = repr::nn::get<repr::NeuralNetOperator>(nodeRef);
  auto opDef = getOpDef(*op);
  return opDef.type() == typeName;
}

bool isOnIdeepDevice(const repr::NeuralNetOperator& nnOp) {
  // We only want to fuse for IDEEP convs
  const auto& op = getOpDef(nnOp);
  return op.device_option().device_type() == DeviceTypeProto::PROTO_IDEEP;
}

bool shouldFuseConv(const repr::Conv& conv) {
  return isOnIdeepDevice(conv) && conv.getGroup() <= 1;
}

void removeStopGradientForInference(repr::NNModule* nn) {
  auto isStopGradientNode = [](const repr::NNGraph::NodeRef& node) {
    if (!repr::nn::is<repr::NeuralNetOperator>(node)) {
      return false;
    }
    auto maybeStopGrad = repr::nn::get<repr::NeuralNetOperator>(node);
    auto maybeStopGradDef = getOpDef(*maybeStopGrad);
    return maybeStopGradDef.type() == "StopGradient";
  };

  auto allNodes = nn->dataFlow.getMutableNodes();
  for (int i = 0; i < allNodes.size(); ++i) {
    auto node = allNodes[i];
    if (!isStopGradientNode(node)) {
      continue;
    }

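    // Only drop a StopGradient that runs in place (input and output share a
    // name), so rerouting its consumers to the input cannot change results.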
    auto stopGradInput = repr::nn::getInputs(node).front();
    auto stopGradOutput = repr::nn::getOutputs(node).front();
    auto inputName = repr::nn::get<repr::Tensor>(stopGradInput)->getName();
    auto outputName = repr::nn::get<repr::Tensor>(stopGradOutput)->getName();
    if (inputName == outputName) {
      nn->dataFlow.replaceNode(stopGradOutput, stopGradInput);
      nn->dataFlow.deleteNode(node);
    }
  }
}

void resetConvForFusion(repr::NNGraph::NodeRef convNode, int fusion_type) {
  // Fusion types:
  //   FUSION_CONV_RELU = 1
  //   FUSION_CONV_SUM = 2
  //   FUSION_CONV_SUM_RELU = 3
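  // A Conv that is already a ConvFusion may only be upgraded from
  // FUSION_CONV_SUM to FUSION_CONV_SUM_RELU by folding in a trailing Relu;
  // any other nested fusion is rejected below.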
  auto conv = repr::nn::get<repr::Conv>(convNode);
  auto annotation = conv->getMutableAnnotation();
  if (!annotation || !isa<Caffe2Annotation>(annotation)) {
    return;
  }

  auto* op = getMutableOpDef(*conv);
  if (op == nullptr) {
    return;
  }

  if (op->type() == "ConvFusion") {
    CAFFE_ENFORCE(fusion_type == 1, "Invalid nested fusion");
    for (auto& arg : *op->mutable_arg()) {
      if (arg.name() == "fusion_type") {
        // Only from FUSION_CONV_SUM to FUSION_CONV_SUM_RELU
        CAFFE_ENFORCE(arg.i() == 2, "Invalid nested fusion");
        arg.set_i(3);
        return;
      }
    }
    return;
  }

  CAFFE_ENFORCE(fusion_type < 3, "Invalid fusion type");
  op->set_type("ConvFusion");
  auto* arg = op->add_arg();
  arg->set_name("fusion_type");
  arg->set_i(fusion_type);
}

bool fuseConvBNAndAffChHelperForIdeep(
    repr::NNModule* nn,
    caffe2::Workspace* ws) {
  auto isAffineChannelNode = [](const repr::NNGraph::NodeRef& node) {
    if (!repr::nn::is<repr::NeuralNetOperator>(node)) {
      return false;
    }
    auto maybeAffCh = repr::nn::get<repr::NeuralNetOperator>(node);
    auto maybeAffChDef = getOpDef(*maybeAffCh);
    return maybeAffChDef.type() == "AffineChannel";
  };

  for (auto node_pair : repr::nn::dataIterator<repr::Conv>(nn->dataFlow)) {
    bool no_bias = false;
    repr::NNGraph::NodeRef convNode;
    repr::Conv* conv;
    std::tie(conv, convNode) = node_pair;

    if (!isOnIdeepDevice(*conv)) {
      LOG(WARNING) << "Not an IDEEP operator";
      continue;
    }

    const auto& op = getOpDef(*conv);
    if (op.type() == "ConvFusion") {
      continue;
    }

    // Fuse only if the conv output feeds the BN/AffineChannel exclusively.
    auto convOutput = repr::nn::getOutputs(convNode).front();
    auto consumers = repr::nn::getConsumers(convOutput);
    if (consumers.size() != 1) {
      continue;
    }

    bool isBN;
    auto consumer = consumers.front();
    if (repr::nn::is<repr::BatchNormalization>(consumer)) {
      isBN = true;
    } else if (isAffineChannelNode(consumer)) {
      isBN = false;
    } else {
      continue;
    }

    auto bnOrAffChNode = consumer;
    auto bn =
        isBN ? repr::nn::get<repr::BatchNormalization>(bnOrAffChNode) : nullptr;
    auto bnOrAffChOutput = repr::nn::getOutputs(bnOrAffChNode).front();

    auto convInputs = repr::nn::getInputs(convNode);
    if (convInputs.size() < 2) {
      LOG(WARNING) << "Invalid convolution input size";
      continue;
    }

    auto bnOrAffChInputs = repr::nn::getInputs(bnOrAffChNode);
    const size_t numInputs = isBN ? 5 : 3;
    if (bnOrAffChInputs.size() < numInputs) {
      LOG(WARNING) << "Invalid input size: " << bnOrAffChInputs.size()
                   << ", expected " << numInputs;
      continue;
    }

    // When the conv has no bias, borrow the BN/AffineChannel bias input.
    if (convInputs.size() < 3) {
      no_bias = true;
      nn->dataFlow.createEdge(bnOrAffChInputs[2], convNode);
      convInputs = repr::nn::getInputs(convNode);
    }

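// Expands, for the IDEEP tensor held at nodes[index], to: a pointer `name`,
// a public-format copy `name##Tensor`, and raw float storage `name##Data`.
// When need_init is false the variables are declared but left unset. Note
// that the embedded `continue` targets the enclosing per-Conv loop.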
#define EXPOSE_TENSOR_DATA(name, index, nodes, need_init)                   \
  itensor* name = nullptr;                                                  \
  itensor name##Tensor;                                                     \
  float* name##Data = nullptr;                                              \
  if (need_init) {                                                          \
    name = getTensor<itensor>(getBlob(nodes[index], ws));                   \
    if (name == nullptr) {                                                  \
      LOG(WARNING) << #name " is not an IDEEP tensor";                      \
      continue;                                                             \
    }                                                                       \
    name##Tensor.resize(name->get_dims(), name->get_data_type());           \
    name##Tensor.reorder_from(*name);                                       \
    CAFFE_ENFORCE(                                                          \
        name##Tensor.is_public_format(), #name " is not in public format"); \
    name##Data = static_cast<float*>(name##Tensor.get_data_handle());       \
  }

    EXPOSE_TENSOR_DATA(filter, 1, convInputs, true);
    EXPOSE_TENSOR_DATA(biasConv, 2, convInputs, true);

    EXPOSE_TENSOR_DATA(scale, 1, bnOrAffChInputs, true);
    EXPOSE_TENSOR_DATA(biasBNOrAffCh, 2, bnOrAffChInputs, true);
    EXPOSE_TENSOR_DATA(mean, 3, bnOrAffChInputs, isBN);
    EXPOSE_TENSOR_DATA(variance, 4, bnOrAffChInputs, isBN);

#undef EXPOSE_TENSOR_DATA

    // Assume filter layout is M{CHW,HWC} (output channels first).
    auto chwDim = filterTensor.get_dim(1) * filterTensor.get_dim(2) *
        filterTensor.get_dim(3);
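    // Folding math: BN computes y = scale * (x - mean) / sqrt(var + eps) + b.
    // Substituting the conv output for x gives an equivalent conv with
    //   filter' = filter * coeff,  coeff = scale / sqrt(var + eps)
    //   bias'   = b + (bias - mean) * coeff
    // AffineChannel is the degenerate case mean = 0, var = 1.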
    for (auto c = 0; c < filterTensor.get_dim(0); ++c) {
      float mean_val = 0;
      float variance_val = 1;
      if (isBN) {
        mean_val = meanData[c];
        variance_val = std::sqrt(varianceData[c] + bn->getEpsilon());
      }
      float coeff = scaleData[c] / variance_val;
      for (auto i = 0; i < chwDim; ++i) {
        filterData[c * chwDim + i] *= coeff;
      }

      if (no_bias) {
        biasConvData[c] = biasBNOrAffChData[c] - mean_val * coeff;
      } else {
        biasConvData[c] =
            biasBNOrAffChData[c] + (biasConvData[c] - mean_val) * coeff;
      }
    }

    filter->reorder_from(filterTensor);
    biasConv->reorder_from(biasConvTensor);
    nn->dataFlow.replaceNode(convOutput, bnOrAffChOutput);

    nn->dataFlow.deleteNode(bnOrAffChNode);
    nn->dataFlow.deleteNode(convOutput);

    return true;
  }

  return false;
}

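// The helper returns right after its first successful fusion because the
// graph mutation invalidates its traversal; iterate here to a fixed point.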
void fuseConvBNAndAffChForIdeep(repr::NNModule* nn, caffe2::Workspace* ws) {
  while (fuseConvBNAndAffChHelperForIdeep(nn, ws)) {
  }
}

void fuseConvSumForIdeep(repr::NNModule* nn, caffe2::Workspace* ws) {
  // Assume the node order from getMutableNodes follows the original
  // topological order of the operators.
  auto allNodes = nn->dataFlow.getMutableNodes();
  for (int i = 0; i < allNodes.size(); i++) {
    auto sumNode = allNodes[i];
    if (!repr::nn::hasInputs(sumNode)) {
      continue;
    }

    // CAUTION: So far the IDEEP device only supports the element-wise Add
    // operator, which matches an element-wise Sum as long as no scalar
    // broadcast is involved.
    if (!repr::nn::is<repr::Sum>(sumNode) && !isOpType(sumNode, "Add")) {
      continue;
    }

    auto sum = repr::nn::get<repr::NeuralNetOperator>(sumNode);
    if (!isOnIdeepDevice(*sum)) {
      LOG(WARNING) << "Not an IDEEP operator";
      continue;
    }

    auto sumInputs = repr::nn::getInputs(sumNode);
    if (sumInputs.size() != 2) {
      continue;
    }

    // The sum inputs must not be referenced by any later ops.
    bool should_fuse = true;
    for (auto input : sumInputs) {
      auto consumer = repr::nn::getConsumers(input).back();
      if (consumer != sumNode) {
        should_fuse = false;
        break;
      }
    }
    if (!should_fuse) {
      continue;
    }

    // Scan backwards past tensor nodes for the nearest preceding operator;
    // fusion applies only when that operator is a Conv.
    repr::NNGraph::NodeRef convNode = nullptr;
    for (int j = i - 1; j >= 0; --j) {
      if (repr::nn::is<repr::NeuralNetOperator>(allNodes[j])) {
        if (repr::nn::is<repr::Conv>(allNodes[j])) {
          convNode = allNodes[j];
        }
        break;
      }
    }
    if (convNode == nullptr) {
      continue;
    }

    auto conv = repr::nn::get<repr::Conv>(convNode);
    if (!shouldFuseConv(*conv)) {
      LOG(WARNING) << "Not a fusible IDEEP Conv";
      continue;
    }

    auto convOutput = repr::nn::getOutputs(convNode).front();
    repr::NNGraph::NodeRef sumInputX =
        (sumInputs[0] == convOutput ? sumInputs[1] : sumInputs[0]);
    CAFFE_ENFORCE(sumInputX != nullptr, "Invalid sum inputs");

    auto preNode = repr::nn::getProducer(sumInputX);
    if (preNode == nullptr || !repr::nn::is<repr::NeuralNetOperator>(preNode)) {
      LOG(WARNING) << "Cannot fuse Conv and Sum";
      continue;
    }

    auto newOutputName = repr::nn::get<repr::Tensor>(sumInputX)->getName();
    auto newOutputTensor = util::make_unique<repr::Tensor>(newOutputName);
    auto newOutput = nn->dataFlow.createNode(
        unique_dyn_cast<repr::NeuralNetData>(newOutputTensor));

    auto sumOutput = repr::nn::getOutputs(sumNode).front();
    nn->dataFlow.replaceNode(sumOutput, newOutput);

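    // Rewire so the Conv accumulates into sumInputX: the Conv gains sumInputX
    // as an extra input and writes to a fresh tensor node reusing sumInputX's
    // name, then the Sum and the now-dead tensor nodes are removed.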
    resetConvForFusion(convNode, 2); // 2 == FUSION_CONV_SUM
    nn->dataFlow.createEdge(sumInputX, convNode);
    nn->dataFlow.createEdge(convNode, newOutput);

    nn->dataFlow.deleteNode(sumNode);
    nn->dataFlow.deleteNode(sumOutput);
    nn->dataFlow.deleteNode(convOutput);
  }
}

void fuseActivationForIdeep(repr::NNModule* nn) {
  // Conv + Relu fusion
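  // The generic fuseActivation pass (from caffe2/opt/fusion.h) folds each
  // Relu that directly consumes a Conv output into that Conv, gated by
  // should_fuse; postprocess then tags the Conv, here as FUSION_CONV_RELU (1).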
  auto should_fuse = shouldFuseConv;
  auto postprocess = std::bind(resetConvForFusion, std::placeholders::_1, 1);
  fuseActivation<repr::Conv, repr::Relu>(nn, should_fuse, postprocess);
}

void enforceFusionInplaceForIdeep(repr::NNModule* nn) {
  // For Conv+Sum and Conv+Sum+ReLU fusions, the last input and the output
  // must share a buffer (run in place). Re-check the whole graph here and
  // correct any ConvFusion ops that violate this.
  for (auto node_pair : repr::nn::dataIterator<repr::Conv>(nn->dataFlow)) {
    repr::NNGraph::NodeRef convNode;
    repr::Conv* conv;
    std::tie(conv, convNode) = node_pair;

    if (!isOnIdeepDevice(*conv)) {
      LOG(WARNING) << "Not an IDEEP operator";
      continue;
    }

    const auto& op = getOpDef(*conv);
    if (op.type() != "ConvFusion") {
      continue;
    }

    bool enforce_inplace = false;
    for (const auto& arg : op.arg()) {
      // Only FUSION_CONV_SUM (2) and FUSION_CONV_SUM_RELU (3) need this.
      if (arg.name() == "fusion_type" && (arg.i() == 2 || arg.i() == 3)) {
        enforce_inplace = true;
        break;
      }
    }

    if (!enforce_inplace) {
      continue;
    }

    auto convInput = repr::nn::getInputs(convNode).back();
    auto inputName = repr::nn::get<repr::Tensor>(convInput)->getName();
    auto convOutput = repr::nn::getOutputs(convNode).front();
    auto outputName = repr::nn::get<repr::Tensor>(convOutput)->getName();
    if (inputName == outputName) {
      continue;
    }

    auto consumer = repr::nn::getConsumers(convInput).back();
    if (consumer != convNode) {
      LOG(ERROR) << "Cannot enforce in-place output for fusion";
      return;
    }

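    // Swap in a fresh output tensor carrying the (last) input's name so the
    // fused sum accumulates in place.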
    auto newOutputTensor = util::make_unique<repr::Tensor>(inputName);
    auto newOutput = nn->dataFlow.createNode(
        unique_dyn_cast<repr::NeuralNetData>(newOutputTensor));
    nn->dataFlow.replaceNode(convOutput, newOutput);

    nn->dataFlow.deleteNode(convOutput);
  }
}

void setPoolingInferenceMode(repr::NNModule* nn) {
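  // Pin each IDEEP MaxPool to its inference path by forcing the
  // training_mode argument to 0, adding the argument if it is absent.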
  for (auto node_pair : repr::nn::dataIterator<repr::MaxPool>(nn->dataFlow)) {
    repr::NNGraph::NodeRef maxPoolNode;
    repr::MaxPool* maxPool;
    std::tie(maxPool, maxPoolNode) = node_pair;

    if (!isOnIdeepDevice(*maxPool)) {
      LOG(WARNING) << "Not an IDEEP operator";
      continue;
    }

    auto* op = getMutableOpDef(*maxPool);
    bool found_training_mode = false;
    for (auto& arg : *op->mutable_arg()) {
      if (arg.name() == "training_mode") {
        arg.set_i(0);
        found_training_mode = true;
        break;
      }
    }

    if (!found_training_mode) {
      auto* arg = op->add_arg();
      arg->set_name("training_mode");
      arg->set_i(0);
    }
  }
}

void OptimizeForIdeep(
    repr::NNModule* nn,
    caffe2::Workspace* ws,
    bool training_mode) {
  if (training_mode) {
    // Only inference is supported so far.
    return;
  }

  removeStopGradientForInference(nn);

  fuseConvBNAndAffChForIdeep(nn, ws);

  fuseConvSumForIdeep(nn, ws);

  fuseActivationForIdeep(nn);

  enforceFusionInplaceForIdeep(nn);

  setPoolingInferenceMode(nn);
}

#endif // CAFFE2_USE_MKLDNN

} // namespace opt
} // namespace caffe2
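A minimal sketch of how this pass is typically driven, assuming the converter
helpers declared in caffe2/opt/converter.h; the wrapper name
OptimizeNetForIdeep is hypothetical:

#include "caffe2/opt/converter.h"
#include "caffe2/opt/optimize_ideep.h"
#include "caffe2/core/workspace.h"

// Convert a NetDef into an NNModule, run the IDEEP passes on it,
// then convert the optimized module back into a NetDef.
caffe2::NetDef OptimizeNetForIdeep(
    const caffe2::NetDef& net, caffe2::Workspace* ws) {
  auto nn = caffe2::convertToNNModule(net);
  caffe2::opt::OptimizeForIdeep(&nn, ws, /*training_mode=*/false);
  // The second argument preserves metadata from the original net.
  return caffe2::convertToCaffe2Proto(nn, net);
}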