Caffe2 - C++ API
A deep learning, cross platform ML framework
graph_executor.cpp
1 #include <torch/csrc/jit/graph_executor.h>
2 
3 #include <ATen/core/ivalue.h>
4 #include <c10/util/Exception.h>
5 #include <torch/csrc/autograd/grad_mode.h>
6 #include <torch/csrc/jit/argument_spec.h>
7 #include <torch/csrc/jit/autodiff.h>
8 #include <torch/csrc/jit/custom_operator.h>
9 #include <torch/csrc/jit/interpreter.h>
10 #include <torch/csrc/jit/ir.h>
11 #include <torch/csrc/jit/resource_guard.h>
13 #include <torch/csrc/jit/passes/batch_mm.h>
14 #include <torch/csrc/jit/passes/canonicalize_ops.h>
15 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
16 #include <torch/csrc/jit/passes/constant_pooling.h>
17 #include <torch/csrc/jit/passes/constant_propagation.h>
18 #include <torch/csrc/jit/passes/create_autodiff_subgraphs.h>
19 #include <torch/csrc/jit/passes/dead_code_elimination.h>
20 #include <torch/csrc/jit/passes/graph_fuser.h>
21 #include <torch/csrc/jit/passes/inline_autodiff_subgraphs.h>
22 #include <torch/csrc/jit/passes/inplace_check.h>
23 #include <torch/csrc/jit/passes/loop_unrolling.h>
24 #include <torch/csrc/jit/passes/lower_grad_of.h>
25 #include <torch/csrc/jit/passes/peephole.h>
26 #include <torch/csrc/jit/passes/remove_expands.h>
27 #include <torch/csrc/jit/passes/requires_grad_analysis.h>
28 #include <torch/csrc/jit/passes/shape_analysis.h>
29 #include <torch/csrc/jit/passes/specialize_autogradzero.h>
30 #include <torch/csrc/jit/symbolic_variable.h>
31 #include <torch/csrc/jit/tracer.h>
32 
33 #include <torch/csrc/autograd/edge.h>
34 #include <torch/csrc/autograd/function.h>
35 #include <torch/csrc/jit/script/compiler.h>
36 
37 #include <cstdint>
38 #include <iterator>
39 #include <memory>
40 #include <mutex>
41 #include <unordered_map>
42 #include <utility>
43 #include <vector>
44 
45 namespace torch {
46 namespace jit {
47 
48 namespace {
49 
50 using tensor_list = std::vector<at::Tensor>;
51 using Variable = autograd::Variable;
52 using autograd::variable_list;
53 
54 struct ExecutionPlan {
55  ExecutionPlan() = default;
56  ExecutionPlan(std::shared_ptr<Graph> graph)
57  : code(graph), graph(std::move(graph)) {}
58 
59  void run(Stack& stack) const {
60  return InterpreterState(code).run(stack);
61  }
62 
63  operator bool() const {
64  return static_cast<bool>(graph);
65  }
66 
67  ExecutionPlanState getDebugState() {
68  ExecutionPlanState state;
69  state.code = &code;
70  state.graph = graph.get();
71  return state;
72  }
73 
74  Code code;
75  std::shared_ptr<Graph> graph;
76 };
77 
78 struct DifferentiableGraphBackward : public autograd::Function {
79  DifferentiableGraphBackward(GraphExecutor executor, size_t capture_size)
80  : executor(std::move(executor)) {
81  is_var_capture.reserve(capture_size);
82  var_captures.reserve(capture_size);
83  ivalue_captures.reserve(capture_size);
84  }
85 
86  variable_list apply(variable_list&& inputs) override {
87  Stack stack;
88  stack.reserve(is_var_capture.size() + inputs.size());
89  stack.insert(
90  stack.end(),
91  std::make_move_iterator(inputs.begin()),
92  std::make_move_iterator(inputs.end()));
93  auto var_capture_it = var_captures.begin();
94  auto ivalue_capture_it = ivalue_captures.begin();
95  for (bool is_var : is_var_capture) {
96  if (is_var) {
97  stack.emplace_back(var_capture_it->unpack(this->shared_from_this()));
98  ++var_capture_it;
99  } else {
100  stack.push_back(*ivalue_capture_it);
101  ++ivalue_capture_it;
102  }
103  }
104 
105  executor.run(stack);
106  AT_ASSERT(stack.size() == num_outputs());
107 
108  variable_list outputs;
109  outputs.reserve(num_outputs());
110  for (size_t i = 0; i < num_outputs(); ++i) {
111  // Input grad can also be None even if it requires grad
112  // Example: `other` in expand_as(self, other)
113  if (should_compute_output(i) && !stack[i].isNone()) {
114  auto output = std::move(stack[i]).toTensor();
115  const auto& edge = next_edge(i);
116  if (output.defined()) {
117  outputs.emplace_back(std::move(output));
118  } else if (edge.is_valid()) {
119  outputs.emplace_back(
120  edge.function->input_metadata(edge.input_nr).zeros_like());
121  } else {
122  outputs.emplace_back();
123  }
124  } else {
125  outputs.emplace_back();
126  }
127  }
128  return outputs;
129  }
130 
131  void capture(const IValue& val, bool is_output) {
132  const bool is_tensor = val.isTensor();
133  is_var_capture.push_back(is_tensor);
134  if (is_tensor) {
135  var_captures.emplace_back(Variable(val.toTensor()), is_output);
136  } else {
137  ivalue_captures.push_back(val);
138  }
139  }
140 
141  private:
142  friend struct ExecutionPlan;
143  GraphExecutor executor;
144 
145  // INVARIANT: is_var_capture.size() == var_captures.size() +
146  // ivalue_captures.size()
147  std::vector<bool> is_var_capture;
148  std::vector<autograd::SavedVariable> var_captures;
149  std::vector<IValue> ivalue_captures;
150 };
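// Illustrative sketch (editor-added, not part of the original source):
// capturing the value sequence {Tensor a, int64_t n, Tensor b} through
// capture() records
//   is_var_capture  = {true, false, true}
//   var_captures    = {SavedVariable(a), SavedVariable(b)}
//   ivalue_captures = {n}
// and apply() walks is_var_capture to interleave the two capture vectors
// back onto the interpreter stack in their original order, which is why the
// INVARIANT above must hold.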
151 
152 // An optimized way of executing the subgraph, computing directly on
153 // tensors rather than Variables.
154 // This will unwrap Variables, run the plan, and re-wrap the results.
155 // It can optionally also have a gradient, which is hooked up
156 // to the output Variables if present. (A call sketch follows this struct.)
157 struct DifferentiableGraphOp {
158  DifferentiableGraphOp(Gradient grad)
159  : f(grad.f),
160  grad(std::move(grad)),
161  grad_executor(this->grad.df),
162  num_inputs(this->grad.f->inputs().size()),
163  num_outputs(this->grad.f->outputs().size()) {}
164 
165  // XXX: keep in mind that stack can be larger than the inputs we need!
166  int operator()(Stack& stack) const {
167  auto grad_fn = std::make_shared<DifferentiableGraphBackward>(
168  grad_executor,
169  grad.df_input_captured_inputs.size() +
170  grad.df_input_captured_outputs.size());
171 
172  {
173  auto inputs = last(stack, num_inputs);
174  // hook up the outputs of df to the gradient functions of the inputs that
175  // require gradients
176  for (auto idx : grad.df_output_vjps) {
177  auto v = Variable(inputs[idx].toTensor());
178  grad_fn->add_next_edge(
179  v.defined() ? v.gradient_edge() : autograd::Edge{});
180  }
181  captureInputs(*grad_fn, inputs);
182  }
183 
184  detachVariables(stack);
185  InterpreterState(f).run(stack);
186 
187  {
188  auto outputs = last(stack, num_outputs);
189  // hook up the gradients for the output tensors that require gradients
190  // to the inputs to our gradient function df
191  // TODO - XXX - if any output is the same tensor multiple times, views
192  // have to be set up here. We need to refactor autograd until it is safe
193  // for tensors to be constructed without all the viewing infrastructure.
194  // this is currently intentionally not done here so we can get an idea of
195  // our perf before introducing overhead for correctness
196  for (auto idx : grad.df_input_vjps) {
197  // Note: we have to set this up in place, or we have to throw away and
198  // reallocate variables that were already created in wrapTensors. We
199  // should add an API for this.
200 
201  // XXX: undefined tensor syntax in autograd
202  Variable output;
203  if (!outputs[idx].isNone()) {
204  output = outputs[idx].toTensor();
205  }
206  // NB: since our requires_grad setting is only a heuristic we might end
207  // up wanting to differentiate through integral tensors, which is
208  // generally a hard error in autograd.
209  if (at::isFloatingType(output.scalar_type())) {
210  autograd::create_gradient_edge(output, grad_fn);
211  output.set_requires_grad(true);
212  } else {
213  grad_fn->add_input_metadata(autograd::Function::undefined_input{});
214  }
215  }
216  captureOutputs(*grad_fn, outputs);
217  // drop the temporary outputs so that we return the same number of
218  // outputs as if we were not also calculating gradient
219  const size_t num_temporary_outputs = num_outputs - grad.f_real_outputs;
220  stack.erase(stack.end() - num_temporary_outputs, stack.end());
221  }
222  return 0;
223  }
224 
225  private:
226  friend GraphExecutor* detail::getGradExecutor(Operation& op);
227 
228  void detachVariables(Stack& stack) const {
229  // It would be nice to use an ArrayRef here, but unfortunately those can
230  // only return const references, so we need to do a bunch of indexing
231  // ourselves.
232  const int64_t stack_size = stack.size();
233  const int64_t stack_offset = stack_size - num_inputs;
234  for (int64_t i = stack_offset; i < stack_size; ++i) {
235  auto& v = stack[i];
236  if (!v.isTensor())
237  continue;
238  auto t = std::move(v).toTensor();
239  v = IValue{t.defined() ? autograd::as_variable_ref(t).detach()
240  : std::move(t)};
241  }
242  }
243  // Capture (save) inputs that would be required to subsequently run backwards
244  void captureInputs(
245  DifferentiableGraphBackward& grad_fn,
246  at::ArrayRef<IValue> inputs) const {
247  for (size_t offset : grad.df_input_captured_inputs) {
248  grad_fn.capture(inputs[offset], /*is_output*/ false);
249  }
250  }
251  void captureOutputs(
252  DifferentiableGraphBackward& grad_fn,
253  at::ArrayRef<IValue> outputs) const {
254  for (size_t offset : grad.df_input_captured_outputs) {
255  grad_fn.capture(outputs[offset], /*is_output*/ true);
256  }
257  }
258 
259  Code f;
260  Gradient grad;
261  GraphExecutor grad_executor;
262 
263  const size_t num_inputs;
264  const size_t num_outputs;
265 };
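// Illustrative call sketch (editor-added; the node and tensor names are
// hypothetical): for a DifferentiableGraph node whose forward subgraph f
// computes d = a * b, execution roughly proceeds as
//
//   Stack stack = {IValue(a), IValue(b)};      // pushed by the interpreter
//   DifferentiableGraphOp op(getGradient(node));
//   op(stack);   // detaches a and b, runs f, and wires the result's grad_fn
//                // to a DifferentiableGraphBackward that will later run df
//   at::Tensor d = stack.back().toTensor();
//
// Any temporary outputs produced by f only for df's benefit are erased from
// the stack before operator() returns, so callers see exactly f_real_outputs
// values.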
266 
267 void packGradient(Gradient gradient, Node* dnode) {
268  AT_ASSERT(dnode->kind() == prim::DifferentiableGraph);
269  dnode->g_(attr::Subgraph, gradient.f)
270  ->g_(attr::ReverseSubgraph, gradient.df)
271  ->i_(attr::f_real_outputs, gradient.f_real_outputs)
272  ->is_(attr::df_input_vjps, fmap<int64_t>(gradient.df_input_vjps))
273  ->is_(
274  attr::df_input_captured_inputs,
275  fmap<int64_t>(gradient.df_input_captured_inputs))
276  ->is_(
277  attr::df_input_captured_outputs,
278  fmap<int64_t>(gradient.df_input_captured_outputs))
279  ->is_(attr::df_output_vjps, fmap<int64_t>(gradient.df_output_vjps));
280 }
281 
282 Gradient getGradient(const Node* n) {
283  AT_ASSERT(n->kind() == prim::DifferentiableGraph);
284  Gradient grad;
285  grad.f = n->g(attr::Subgraph);
286  grad.df = n->g(attr::ReverseSubgraph);
287  grad.f_real_outputs = n->i(attr::f_real_outputs);
288  grad.df_input_vjps = fmap<size_t>(n->is(attr::df_input_vjps));
289  grad.df_input_captured_inputs =
290  fmap<size_t>(n->is(attr::df_input_captured_inputs));
291  grad.df_input_captured_outputs =
292  fmap<size_t>(n->is(attr::df_input_captured_outputs));
293  grad.df_output_vjps = fmap<size_t>(n->is(attr::df_output_vjps));
294  return grad;
295 }
296 
297 } // anonymous namespace
298 
299 RegisterOperators reg_graph_executor_ops(
300  {Operator(prim::DifferentiableGraph, [](const Node* n) -> Operation {
301  return DifferentiableGraphOp(getGradient(n));
302  })});
303 
304 namespace detail {
305 
306 GraphExecutor* getGradExecutor(Operation& op) {
307  if (auto diff_op = op.target<DifferentiableGraphOp>()) {
308  return &diff_op->grad_executor;
309  }
310  return nullptr;
311 }
312 
313 } // namespace detail
314 
315 // A Graph can be created via tracing, or via a language-based frontend;
316 // GraphExecutor runs it. It can run the same graph on many different sizes
317 // and different requires_grad states, and handles specializations for each
318 // situation. GraphExecutor is completely unaware of tracing or module
319 // parameters, to keep the tracing concerns separated. (Usage sketch after the listing.)
320 struct GraphExecutorImpl {
321  static std::shared_ptr<Graph> prepareGraph(std::shared_ptr<Graph>& graph) {
322  auto copy = graph->copy();
323  EraseShapeInformation(copy);
324  return copy;
325  }
326 
327  static size_t countFlatInputs(const TypePtr& ptr) {
328  if (auto optional_type = ptr->cast<OptionalType>()) {
329  return countFlatInputs(optional_type->getElementType());
330  }
331  if (auto tuple_type = ptr->cast<TupleType>()) {
332  size_t total = 0;
333  for (auto& elem : tuple_type->elements()) {
334  total += countFlatInputs(elem);
335  }
336  return total;
337  }
338  return 1;
339  }
340 
341  static size_t countFlatInputs(const std::shared_ptr<Graph>& graph) {
342  size_t total = 0;
343  for (Value* input : graph->inputs()) {
344  total += countFlatInputs(input->type());
345  }
346  return total;
347  }
348 
349  inline bool hasMutableOperators(Block* block) {
350  for (auto n : block->nodes()) {
351  if (n->kind().is_aten() && n->schema().is_mutable())
352  return true;
353  for (auto b : n->blocks()) {
354  if (hasMutableOperators(b))
355  return true;
356  }
357  }
358  return false;
359  }
360 
361  GraphExecutorImpl(std::shared_ptr<Graph> graph, bool optimize)
362  : graph(prepareGraph(graph)),
363  // until we have correct alias analysis any use of mutable operators
364  // disables all optimization
365  optimize(optimize),
366  num_inputs(this->graph->inputs().size()),
367  num_flat_inputs(countFlatInputs(graph)),
368  num_outputs(this->graph->outputs().size()) {}
369 
370  // entry point where execution begins
371  void run(Stack& stack) {
372  AT_CHECK(
373  stack.size() >= num_inputs,
374  "expected ",
375  num_inputs,
376  " inputs, but got only ",
377  stack.size());
378 
379  if (tracer::isTracing()) {
380  return runTraced(stack);
381  }
382 
383  auto& execution_plan =
384  optimize ? getOrCompile(stack) : getOrCompileFallback();
385  return execution_plan.run(stack);
386  }
387 
388  std::shared_ptr<Graph> graphFor(const Stack& stack) const {
389  AT_ASSERT(stack.size() >= num_inputs);
390  auto inputs = last(stack, num_inputs);
391  ArgumentSpec spec(
392  autograd::GradMode::is_enabled(), inputs, num_flat_inputs);
393 
394  if (!optimize) {
395  AT_CHECK(fallback, "No graph found for given inputs");
396  return fallback.graph;
397  }
398 
399  auto it = plan_cache.find(spec);
400  AT_CHECK(it != plan_cache.end(), "No graph found for given inputs");
401  return it->second.graph;
402  }
403 
404  GraphExecutorState getDebugState() {
405  GraphExecutorState state;
406  state.graph = graph.get();
407  if (fallback) {
408  state.fallback = fallback.getDebugState();
409  }
410  for (auto& entry : plan_cache) {
411  state.execution_plans.emplace(entry.first, entry.second.getDebugState());
412  }
413  return state;
414  }
415 
416  // This function should be used only for testing purposes
417  void debugDisableAutodiffSubgraphInlining() {
418  // Allow single-node autodiff subgraphs
419  autodiffSubgraphNodeThreshold = 1;
420  // Don't inline autodiff subgraphs into autograd functions
421  autodiffSubgraphInlineThreshold = 1;
422  }
423 
424  private:
425  friend struct GraphExecutor;
426 
427  const ExecutionPlan& getOrCompileFallback() {
428  std::lock_guard<std::mutex> lock(compile_mutex);
429  if (!fallback) {
430  auto graph_ = graph->copy();
431  runRequiredPasses(graph_);
432  fallback = ExecutionPlan(graph_);
433  }
434  return fallback;
435  }
436 
437  const ExecutionPlan& getOrCompile(const Stack& stack) {
438  // outside lock guard, to minimize the time holding the lock on the fast
439  // path. ArgumentSpec even computes its hashCode here.
440  ArgumentSpec spec(
441  autograd::GradMode::is_enabled(),
442  last(stack, num_inputs),
443  num_flat_inputs);
444  {
445  std::lock_guard<std::mutex> lock(compile_mutex);
446  auto it = plan_cache.find(spec);
447  if (it != plan_cache.end())
448  return it->second;
449  auto plan = compileSpec(spec);
450  auto r = plan_cache.emplace(std::move(spec), std::move(plan));
451  return r.first->second;
452  }
453  }
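  // Illustrative sketch (editor-added; `executor` and the stacks are
  // hypothetical): inputs that differ in any property the ArgumentSpec
  // records (e.g. dtype, device, definedness, requires_grad, or the current
  // GradMode) hash to different specs, so each configuration gets its own
  // compiled ExecutionPlan:
  //
  //   executor.run(stack_of_float_cuda_tensors);   // compiles plan #1
  //   executor.run(stack_of_double_cpu_tensors);   // compiles plan #2
  //   executor.run(stack_of_float_cuda_tensors);   // cache hit on plan #1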
454 
455  ExecutionPlan compileSpec(const ArgumentSpec& spec) {
456  auto opt_graph = graph->copy();
457  setInputTypes(*opt_graph, spec);
458 
459  // Phase 1. Specialize to input definedness (this is very important for
460  // gradient graphs), and run required passes to bring the graph
461  // to an executable form.
462  runRequiredPasses(opt_graph);
463 
464  // Phase 2. Propagate detailed information about the spec through the
465  // graph (enables more specializations in later passes).
466  // Shape propagation sometimes depends on certain arguments being
467  // constants, and constant propagation doesn't need shape
468  // information anyway, so it's better to run it first.
469  ConstantPropagation(opt_graph);
470  PropagateInputShapes(opt_graph);
471  PropagateRequiresGrad(opt_graph);
472 
473  // Phase 3. Run differentiable optimizations (i.e. simple graph rewrites
474  // that
475  // we can still execute using autograd).
476  runOptimization(opt_graph, spec);
477 
478  // Phase 4. If this graph will be differentiated, we need to slice out the
479  // symbolically differentiable subgraphs for further optimizations.
480  // Phase 5. Apply non-differentiable optimizations to the graphs we've found
481  // (or the whole graph if we know we won't need its derivative).
482  if (needsGradient(opt_graph)) {
483  auto diff_nodes =
484  CreateAutodiffSubgraphs(opt_graph, autodiffSubgraphNodeThreshold);
485  for (Node* dnode : diff_nodes) {
486  auto diff_graph = std::move(dnode->g(attr::Subgraph));
487  Gradient gradient = differentiate(diff_graph);
488  runNondiffOptimization(gradient.f);
489  packGradient(gradient, dnode);
490  }
491  InlineAutodiffSubgraphs(opt_graph, autodiffSubgraphInlineThreshold);
492  } else {
493  runNondiffOptimization(opt_graph);
494  }
495  // Make sure there are no leftovers from any passes.
496  EliminateDeadCode(opt_graph);
497  return ExecutionPlan(opt_graph);
498  }
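  // Illustrative sketch (editor-added; the IR fragment is hypothetical and
  // the exact result depends on the passes' heuristics): for a graph
  // containing
  //   %y = aten::mul(%a, %b)
  // where %a requires grad, compileSpec() specializes the inputs to the
  // ArgumentSpec, and because needsGradient() is true it groups the
  // differentiable work into a prim::DifferentiableGraph node whose
  // attr::Subgraph / attr::ReverseSubgraph attributes hold the forward and
  // backward graphs produced by differentiate().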
499 
500  void runOptimization(
501  std::shared_ptr<Graph>& graph,
502  const ArgumentSpec& spec) {
503  // Basic graph preprocessing to eliminate noise.
504  EliminateDeadCode(graph);
505  EliminateCommonSubexpression(graph);
506  ConstantPooling(graph);
507 
508  PeepholeOptimize(graph);
509 
510  // Unroll small loops, and eliminate expressions that are the same at every
511  // iteration.
512  UnrollLoops(graph);
513  EliminateCommonSubexpression(graph);
514 
515  // Rewrite subgraphs with many MMs into expressions that batch them.
516  BatchMM(graph);
517 
518  CheckInplace(graph);
519  }
520 
521  void runNondiffOptimization(std::shared_ptr<Graph>& graph) {
522  FuseGraph(graph);
523  }
524 
525  static bool needsGradient(const std::shared_ptr<const Graph>& graph) {
526  if (!autograd::GradMode::is_enabled())
527  return false;
528  if (mayIntroduceGradient(graph->block()))
529  return true;
530  for (const Value* input : graph->inputs()) {
531  if (input->type()->requires_grad())
532  return true;
533  }
534  return false;
535  }
536 
537  static bool mayIntroduceGradient(const Block* b) {
538  for (const Node* n : b->nodes()) {
539  if (n->kind() == prim::PythonOp)
540  return true;
541  for (const Block* bb : n->blocks()) {
542  if (mayIntroduceGradient(bb))
543  return true;
544  }
545  }
546  return false;
547  }
548 
549  void runTraced(Stack& stack) {
550  const auto& state = tracer::getTracingState();
551  auto inputs = last(stack, num_inputs);
552  auto input_values = fmap(
553  inputs, [](const IValue& v) { return tracer::getNestedValueTrace(v); });
554 
555  ArgumentSpec spec(
556  autograd::GradMode::is_enabled(), inputs, num_flat_inputs);
557  // NB: we could just run the fallback in here and call it a day, but that
558  // would lose all the control flow information we have in the graph. Thus,
559  // we run the fallback to get the correct output values, but we will
560  // override the tracing states later.
561  {
562  // No need to trace a script module.
563  ResourceGuard guard(tracer::pauseTracing());
564  getOrCompileFallback().run(stack);
565  }
566 
567  // Traces always have types propagated through them, so we make sure to
568  // also propagate types through the graph we are inserting here.
569  // However, this->graph itself may already have been generated with
570  // tracing and so we only do the type propagation if no concrete types have
571  // been set.
572  auto local_graph = this->graph->copy();
573  setInputTypes(*local_graph, spec);
574  PropagateInputShapes(local_graph);
575  auto output_values =
576  inlineCallTo(*state->graph, *local_graph, input_values);
577 
578  auto outputs = last(stack, num_outputs);
579  for (size_t i = 0; i < outputs.size(); ++i) {
580  tracer::setValueTrace(outputs[i], output_values[i]);
581  }
582  }
583 
584  // The unoptimized starting graph. This field is effectively const, but we
585  // can't make it so because Graph::copy() is not const (and making it const is
586  // not that easy at this point).
587  std::shared_ptr<Graph> graph;
588 
589  // If false, we'll run the graph as we get it, without any optimizations.
590  // Useful for debugging.
591  const bool optimize;
592  const size_t num_inputs;
593  const size_t num_flat_inputs; // Number of inputs, assuming all tuples would
594  // be flattened.
595  const size_t num_outputs;
596 
597  // Populated only when optimize is false (and in that case plan_cache will be
598  // unused). The compiled version of graph.
599  ExecutionPlan fallback;
600 
601  // Mapping from argument configurations to optimized versions of the graph
602  // that are specialized to the spec.
603  std::unordered_map<ArgumentSpec, ExecutionPlan> plan_cache;
604 
605  // GraphExecutors can be accessed from multiple threads, so this mutex needs
606  // to be held every time we access the fallback or plan_cache.
607  std::mutex compile_mutex;
608 
609  // Some tunable parameters
610  size_t autodiffSubgraphNodeThreshold = 2;
611  size_t autodiffSubgraphInlineThreshold = 5;
612 };
613 
614 GraphExecutor::GraphExecutor(std::shared_ptr<Graph> graph, bool optimize)
615  : pImpl(new GraphExecutorImpl(std::move(graph), optimize)) {}
616 
617 void GraphExecutor::run(Stack& inputs) {
618  return pImpl->run(inputs);
619 }
620 
621 std::shared_ptr<Graph> GraphExecutor::graph() const {
622  return pImpl->graph;
623 }
624 
625 std::shared_ptr<Graph> GraphExecutor::graphFor(const Stack& inputs) const {
626  return pImpl->graphFor(inputs);
627 }
628 
629 GraphExecutorState GraphExecutor::getDebugState() {
630  return pImpl->getDebugState();
631 }
632 
633 void GraphExecutor::debugDisableAutodiffSubgraphInlining() {
634  return pImpl->debugDisableAutodiffSubgraphInlining();
635 }
636 
637 void runRequiredPasses(const std::shared_ptr<Graph>& g) {
638  specializeAutogradZero(*g);
639  LowerGradOf(*g);
640  // Implicitly inserted expand nodes are not necessarily always valid
641  // when used inside script methods that might have unstable shapes.
642  // We remove the implicitly created ones, and have shape analysis
643  // add valid expand nodes when the shapes are stable.
644  RemoveExpands(g);
645  CanonicalizeOps(g);
646  EliminateDeadCode(g);
647 }
648 
649 } // namespace jit
650 } // namespace torch
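
Usage sketch (editor-added; not part of graph_executor.cpp). GraphExecutor is
the public entry point exposed by this file: it takes a Graph produced by
tracing or by the script frontend and runs it on a Stack of IValues,
compiling a specialized ExecutionPlan per input configuration when optimize
is true. The helper below is hypothetical (the function and variable names
are illustrative), but it only uses the API visible above.

#include <torch/csrc/jit/graph_executor.h>
#include <torch/csrc/jit/ir.h>

using namespace torch::jit;

at::Tensor run_once(std::shared_ptr<Graph> graph, at::Tensor a, at::Tensor b) {
  // Build the executor once; optimize=true enables the ArgumentSpec-keyed
  // plan cache maintained by GraphExecutorImpl::getOrCompile.
  GraphExecutor executor(graph, /*optimize=*/true);

  // Inputs and outputs travel through a Stack (a vector of IValues).
  Stack stack;
  stack.emplace_back(a);
  stack.emplace_back(b);

  executor.run(stack);  // pops the inputs, pushes the graph's outputs

  // Inspect the specialized graph that was used for these inputs; this
  // throws if no plan has been compiled for the corresponding spec yet.
  std::shared_ptr<Graph> specialized = executor.graphFor({IValue(a), IValue(b)});
  (void)specialized;

  return stack.back().toTensor();
}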