Caffe2 - C++ API
A deep learning, cross platform ML framework
graph_executor.cpp
1 #include <torch/csrc/jit/graph_executor.h>
2 
3 #include <ATen/core/ivalue.h>
4 #include <c10/util/Exception.h>
5 #include <torch/csrc/autograd/grad_mode.h>
6 #include <torch/csrc/jit/argument_spec.h>
7 #include <torch/csrc/jit/autodiff.h>
8 #include <torch/csrc/jit/custom_operator.h>
9 #include <torch/csrc/jit/interpreter.h>
10 #include <torch/csrc/jit/ir.h>
11 #include <torch/csrc/jit/resource_guard.h>
13 #include <torch/csrc/jit/passes/batch_mm.h>
14 #include <torch/csrc/jit/passes/canonicalize_ops.h>
15 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
16 #include <torch/csrc/jit/passes/constant_pooling.h>
17 #include <torch/csrc/jit/passes/constant_propagation.h>
18 #include <torch/csrc/jit/passes/create_autodiff_subgraphs.h>
19 #include <torch/csrc/jit/passes/dead_code_elimination.h>
20 #include <torch/csrc/jit/passes/graph_fuser.h>
21 #include <torch/csrc/jit/passes/inline_autodiff_subgraphs.h>
22 #include <torch/csrc/jit/passes/inplace_check.h>
23 #include <torch/csrc/jit/passes/loop_unrolling.h>
24 #include <torch/csrc/jit/passes/lower_grad_of.h>
25 #include <torch/csrc/jit/passes/peephole.h>
26 #include <torch/csrc/jit/passes/remove_expands.h>
27 #include <torch/csrc/jit/passes/requires_grad_analysis.h>
28 #include <torch/csrc/jit/passes/shape_analysis.h>
29 #include <torch/csrc/jit/passes/specialize_autogradzero.h>
30 #include <torch/csrc/jit/symbolic_variable.h>
31 #include <torch/csrc/jit/tracer.h>
32 
33 #include <torch/csrc/autograd/edge.h>
34 #include <torch/csrc/autograd/function.h>
35 #include <torch/csrc/jit/script/compiler.h>
36 
37 #include <cstdint>
38 #include <iterator>
39 #include <memory>
40 #include <mutex>
41 #include <unordered_map>
42 #include <utility>
43 #include <vector>
44 
45 namespace torch {
46 namespace jit {
47 
48 namespace {
49 
50 using tensor_list = std::vector<at::Tensor>;
51 using Variable = autograd::Variable;
52 using autograd::variable_list;
53 
54 struct ExecutionPlan {
55  ExecutionPlan() = default;
56  ExecutionPlan(std::shared_ptr<Graph> graph)
57  : code(graph), graph(std::move(graph)) {}
58 
59  void run(Stack& stack) const {
60  return InterpreterState(code).run(stack);
61  }
62 
63  operator bool() const {
64  return static_cast<bool>(graph);
65  }
66 
67  ExecutionPlanState getDebugState() {
68  ExecutionPlanState state;
69  state.code = &code;
70  state.graph = graph.get();
71  return state;
72  }
73 
74  Code code;
75  std::shared_ptr<Graph> graph;
76 };
77 
78 struct DifferentiableGraphBackward : public autograd::Function {
79  DifferentiableGraphBackward(GraphExecutor executor, size_t capture_size)
80  : executor(std::move(executor)) {
81  is_var_capture.reserve(capture_size);
82  var_captures.reserve(capture_size);
83  ivalue_captures.reserve(capture_size);
84  }
85 
86  variable_list apply(variable_list&& inputs) override {
87  Stack stack;
88  stack.reserve(is_var_capture.size() + inputs.size());
89  stack.insert(
90  stack.end(),
91  std::make_move_iterator(inputs.begin()),
92  std::make_move_iterator(inputs.end()));
93  auto var_capture_it = var_captures.begin();
94  auto ivalue_capture_it = ivalue_captures.begin();
95  for (bool is_var : is_var_capture) {
96  if (is_var) {
97  stack.emplace_back(var_capture_it->unpack(this->shared_from_this()));
98  ++var_capture_it;
99  } else {
100  stack.push_back(*ivalue_capture_it);
101  ++ivalue_capture_it;
102  }
103  }
104 
105  executor.run(stack);
106  AT_ASSERT(stack.size() == num_outputs());
107 
108  variable_list outputs;
109  outputs.reserve(num_outputs());
110  for (size_t i = 0; i < num_outputs(); ++i) {
111  // Input grad can also be None even if it requires grad
112  // Example: `other` in expand_as(self, other)
113  if (should_compute_output(i) && !stack[i].isNone()) {
114  auto output = std::move(stack[i]).toTensor();
115  const auto& edge = next_edge(i);
116  if (output.defined()) {
117  outputs.emplace_back(std::move(output));
118  } else if (edge.is_valid()) {
119  outputs.emplace_back(
120  edge.function->input_metadata(edge.input_nr).zeros_like());
121  } else {
122  outputs.emplace_back();
123  }
124  } else {
125  outputs.emplace_back();
126  }
127  }
128  return outputs;
129  }
130 
131  void capture(const IValue& val, bool is_output) {
132  const bool is_tensor = val.isTensor();
133  is_var_capture.push_back(is_tensor);
134  if (is_tensor) {
135  var_captures.emplace_back(Variable(val.toTensor()), is_output);
136  } else {
137  ivalue_captures.push_back(val);
138  }
139  }
140 
141  private:
142  friend struct ExecutionPlan;
143  GraphExecutor executor;
144 
145  // INVARIANT: is_var_capture.size() == var_captures.size() +
146  // ivalue_captures.size()
147  std::vector<bool> is_var_capture;
148  std::vector<autograd::SavedVariable> var_captures;
149  std::vector<IValue> ivalue_captures;
150 };
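// Illustrative sketch (editor-added, not part of the original source):
// capturing the value sequence {Tensor a, int64_t n, Tensor b} through
// capture() records
//   is_var_capture  = {true, false, true}
//   var_captures    = {SavedVariable(a), SavedVariable(b)}
//   ivalue_captures = {n}
// and apply() walks is_var_capture to interleave the two capture vectors
// back onto the interpreter stack in their original order, which is why the
// INVARIANT above must hold.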
151 
152 // An optimized way of executing the subgraph, computing directly on
153 // tensors rather than Variables.
154 // This will unwrap Variables, run the plan, and re-wrap the results.
155 // It can optionally also have a gradient, which is hooked up
156 // to the output Variables if present. (A call sketch follows this struct.)
157 struct DifferentiableGraphOp {
158  DifferentiableGraphOp(Gradient grad)
159  : f(grad.f),
160  grad(std::move(grad)),
161  grad_executor(this->grad.df),
162  num_inputs(this->grad.f->inputs().size()),
163  num_outputs(this->grad.f->outputs().size()) {}
164 
165  // XXX: keep in mind that stack can be larger than the inputs we need!
166  int operator()(Stack& stack) const {
167  auto grad_fn = std::make_shared<DifferentiableGraphBackward>(
168  grad_executor,
169  grad.df_input_captured_inputs.size() +
170  grad.df_input_captured_outputs.size());
171 
172  {
173  auto inputs = last(stack, num_inputs);
174  // hook up the outputs of df to the gradient functions of the inputs that
175  // require gradients
176  for (auto idx : grad.df_output_vjps) {
177  auto v = Variable(inputs[idx].toTensor());
178  grad_fn->add_next_edge(
179  v.defined() ? v.gradient_edge() : autograd::Edge{});
180  }
181  captureInputs(*grad_fn, inputs);
182  }
183 
184  detachVariables(stack);
185  InterpreterState(f).run(stack);
186 
187  {
188  auto outputs = last(stack, num_outputs);
189  // hook up the gradients for the output tensors that require gradients
190  // to the inputs to our gradient function df
191  // TODO - XXX - if any output is the same tensor multiple times, views
192  // have to be set up here. We need to refactor autograd until it is safe
193  // for tensors to be constructed without all the viewing infrastructure.
194  // this is currently intentionally not done here so we can get an idea of
195  // our perf before introducing overhead for correctness
196  for (auto idx : grad.df_input_vjps) {
197  // Note: we have to set this up in place, or we have to throw away and
198  // reallocate variables that were already created in wrapTensors. We
199  // should add an API for this.
200 
201  // XXX: undefined tensor syntax in autograd
202  Variable output;
203  if (!outputs[idx].isNone()) {
204  output = outputs[idx].toTensor();
205  }
206  // NB: since our requires_grad setting is only a heuristic we might end
207  // up wanting to differentiate through integral tensors, which is
208  // generally a hard error in autograd.
209  if (at::isFloatingType(output.scalar_type())) {
210  autograd::create_gradient_edge(output, grad_fn);
211  output.set_requires_grad(true);
212  } else {
213  grad_fn->add_input_metadata(autograd::Function::undefined_input{});
214  }
215  }
216  captureOutputs(*grad_fn, outputs);
217  // drop the temporary outputs so that we return the same number of
218  // outputs as if we were not also calculating gradient
219  const size_t num_temporary_outputs = num_outputs - grad.f_real_outputs;
220  stack.erase(stack.end() - num_temporary_outputs, stack.end());
221  }
222  return 0;
223  }
224 
225  private:
226  friend GraphExecutor* detail::getGradExecutor(Operation& op);
227 
228  void detachVariables(Stack& stack) const {
229  // It would be nice to use an ArrayRef here, but unfortunately those can
230  // only return const references, so we need to do a bunch of indexing
231  // ourselves.
232  const int64_t stack_size = stack.size();
233  const int64_t stack_offset = stack_size - num_inputs;
234  for (int64_t i = stack_offset; i < stack_size; ++i) {
235  auto& v = stack[i];
236  if (!v.isTensor())
237  continue;
238  auto t = std::move(v).toTensor();
239  v = IValue{t.defined() ? autograd::as_variable_ref(t).detach()
240  : std::move(t)};
241  }
242  }
243  // Capture (save) inputs that would be required to subsequently run backwards
244  void captureInputs(
245  DifferentiableGraphBackward& grad_fn,
246  at::ArrayRef<IValue> inputs) const {
247  for (size_t offset : grad.df_input_captured_inputs) {
248  grad_fn.capture(inputs[offset], /*is_output*/ false);
249  }
250  }
251  void captureOutputs(
252  DifferentiableGraphBackward& grad_fn,
253  at::ArrayRef<IValue> outputs) const {
254  for (size_t offset : grad.df_input_captured_outputs) {
255  grad_fn.capture(outputs[offset], /*is_output*/ true);
256  }
257  }
258 
259  Code f;
260  Gradient grad;
261  GraphExecutor grad_executor;
262 
263  const size_t num_inputs;
264  const size_t num_outputs;
265 };
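// Illustrative call sketch (editor-added; the node and tensor names are
// hypothetical): for a DifferentiableGraph node whose forward subgraph f
// computes d = a * b, execution roughly proceeds as
//
//   Stack stack = {IValue(a), IValue(b)};      // pushed by the interpreter
//   DifferentiableGraphOp op(getGradient(node));
//   op(stack);   // detaches a and b, runs f, and wires the result's grad_fn
//                // to a DifferentiableGraphBackward that will later run df
//   at::Tensor d = stack.back().toTensor();
//
// Any temporary outputs produced by f only for df's benefit are erased from
// the stack before operator() returns, so callers see exactly f_real_outputs
// values.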
266 
267 void packGradient(Gradient gradient, Node* dnode) {
268  AT_ASSERT(dnode->kind() == prim::DifferentiableGraph);
269  dnode->g_(attr::Subgraph, gradient.f)
270  ->g_(attr::ReverseSubgraph, gradient.df)
271  ->i_(attr::f_real_outputs, gradient.f_real_outputs)
272  ->is_(attr::df_input_vjps, fmap<int64_t>(gradient.df_input_vjps))
273  ->is_(
274  attr::df_input_captured_inputs,
275  fmap<int64_t>(gradient.df_input_captured_inputs))
276  ->is_(
277  attr::df_input_captured_outputs,
278  fmap<int64_t>(gradient.df_input_captured_outputs))
279  ->is_(attr::df_output_vjps, fmap<int64_t>(gradient.df_output_vjps));
280 }
281 
282 Gradient getGradient(const Node* n) {
283  AT_ASSERT(n->kind() == prim::DifferentiableGraph);
284  Gradient grad;
285  grad.f = n->g(attr::Subgraph);
286  grad.df = n->g(attr::ReverseSubgraph);
287  grad.f_real_outputs = n->i(attr::f_real_outputs);
288  grad.df_input_vjps = fmap<size_t>(n->is(attr::df_input_vjps));
289  grad.df_input_captured_inputs =
290  fmap<size_t>(n->is(attr::df_input_captured_inputs));
291  grad.df_input_captured_outputs =
292  fmap<size_t>(n->is(attr::df_input_captured_outputs));
293  grad.df_output_vjps = fmap<size_t>(n->is(attr::df_output_vjps));
294  return grad;
295 }
296 
297 } // anonymous namespace
298 
299 RegisterOperators reg_graph_executor_ops(
300  {Operator(prim::DifferentiableGraph, [](const Node* n) -> Operation {
301  return DifferentiableGraphOp(getGradient(n));
302  })});
303 
304 namespace detail {
305 
306 GraphExecutor* getGradExecutor(Operation& op) {
307  if (auto diff_op = op.target<DifferentiableGraphOp>()) {
308  return &diff_op->grad_executor;
309  }
310  return nullptr;
311 }
312 
313 } // namespace detail
314 
315 // A Graph can be created via tracing, or via a language-based frontend;
316 // GraphExecutor runs it. It can run the same graph on many different sizes
317 // and different requires_grad states, and handles specializations for each
318 // situation. GraphExecutor is completely unaware of tracing or module
319 // parameters, to keep the tracing concerns separated. (Usage sketch after the listing.)
320 struct GraphExecutorImpl {
321  static std::shared_ptr<Graph> prepareGraph(std::shared_ptr<Graph>& graph) {
322  auto copy = graph->copy();
323  EraseShapeInformation(copy);
324  return copy;
325  }
326 
327  static size_t countFlatInputs(const TypePtr& ptr) {
328  if (auto optional_type = ptr->cast<OptionalType>()) {
329  return countFlatInputs(optional_type->getElementType());
330  }
331  if (auto tuple_type = ptr->cast<TupleType>()) {
332  size_t total = 0;
333  for (auto& elem : tuple_type->elements()) {
334  total += countFlatInputs(elem);
335  }
336  return total;
337  }
338  return 1;
339  }
340 
341  static size_t countFlatInputs(const std::shared_ptr<Graph>& graph) {
342  size_t total = 0;
343  for (Value* input : graph->inputs()) {
344  total += countFlatInputs(input->type());
345  }
346  return total;
347  }
348 
349  inline bool hasMutableOperators(Block* block) {
350  for (auto n : block->nodes()) {
351  if (n->kind().is_aten() && n->schema().is_mutable())
352  return true;
353  for (auto b : n->blocks()) {
354  if (hasMutableOperators(b))
355  return true;
356  }
357  }
358  return false;
359  }
360 
361  GraphExecutorImpl(std::shared_ptr<Graph> graph, bool optimize)
362  : graph(prepareGraph(graph)),
363  // until we have correct alias analysis any use of mutable operators
364  // disables all optimization
365  optimize(optimize),
366  num_inputs(this->graph->inputs().size()),
367  num_flat_inputs(countFlatInputs(graph)),
368  num_outputs(this->graph->outputs().size()) {}
369 
370  // entry point where execution begins
371  void run(Stack& stack) {
372  AT_CHECK(
373  stack.size() >= num_inputs,
374  "expected ",
375  num_inputs,
376  " inputs, but got only ",
377  stack.size());
378 
379  if (tracer::isTracing()) {
380  return runTraced(stack);
381  }
382 
383  auto& execution_plan =
384  optimize ? getOrCompile(stack) : getOrCompileFallback();
385  return execution_plan.run(stack);
386  }
387 
388  std::shared_ptr<Graph> graphFor(const Stack& stack) const {
389  AT_ASSERT(stack.size() >= num_inputs);
390  auto inputs = last(stack, num_inputs);
391  ArgumentSpec spec(
392  autograd::GradMode::is_enabled(), inputs, num_flat_inputs);
393 
394  if (!optimize) {
395  AT_CHECK(fallback, "No graph found for given inputs");
396  return fallback.graph;
397  }
398 
399  auto it = plan_cache.find(spec);
400  AT_CHECK(it != plan_cache.end(), "No graph found for given inputs");
401  return it->second.graph;
402  }
403 
404  GraphExecutorState getDebugState() {
405  GraphExecutorState state;
406  state.graph = graph.get();
407  if (fallback) {
408  state.fallback = fallback.getDebugState();
409  }
410  for (auto& entry : plan_cache) {
411  state.execution_plans.emplace(entry.first, entry.second.getDebugState());
412  }
413  return state;
414  }
415 
416  // This function should be used only for testing purposes
417  void debugDisableAutodiffSubgraphInlining() {
418  // Allow single-node autodiff subgraphs
419  autodiffSubgraphNodeThreshold = 1;
420  // Don't inline autodiff subgraphs into autograd functions
421  autodiffSubgraphInlineThreshold = 1;
422  }
423 
424  private:
425  friend struct GraphExecutor;
426 
427  const ExecutionPlan& getOrCompileFallback() {
428  std::lock_guard<std::mutex> lock(compile_mutex);
429  if (!fallback) {
430  auto graph_ = graph->copy();
431  runRequiredPasses(graph_);
432  fallback = ExecutionPlan(graph_);
433  }
434  return fallback;
435  }
436 
437  const ExecutionPlan& getOrCompile(const Stack& stack) {
438  // outside lock guard, to minimize the time holding the lock on the fast
439  // path. ArgumentSpec even computes its hashCode here.
440  ArgumentSpec spec(
441  autograd::GradMode::is_enabled(),
442  last(stack, num_inputs),
443  num_flat_inputs);
444  {
445  std::lock_guard<std::mutex> lock(compile_mutex);
446  auto it = plan_cache.find(spec);
447  if (it != plan_cache.end())
448  return it->second;
449  auto plan = compileSpec(spec);
450  auto r = plan_cache.emplace(std::move(spec), std::move(plan));
451  return r.first->second;
452  }
453  }
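  // Illustrative sketch (editor-added; `executor` and the stacks are
  // hypothetical): inputs that differ in any property the ArgumentSpec
  // records (e.g. dtype, device, definedness, requires_grad, or the current
  // GradMode) hash to different specs, so each configuration gets its own
  // compiled ExecutionPlan:
  //
  //   executor.run(stack_of_float_cuda_tensors);   // compiles plan #1
  //   executor.run(stack_of_double_cpu_tensors);   // compiles plan #2
  //   executor.run(stack_of_float_cuda_tensors);   // cache hit on plan #1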
454 
455  ExecutionPlan compileSpec(const ArgumentSpec& spec) {
456  auto opt_graph = graph->copy();
457  setInputTypes(*opt_graph, spec);
458 
459  // Phase 1. Specialize to input definedness (this is very important for
460  // gradient graphs), and run required passes to bring the graph
461  // to an executable form.
462  runRequiredPasses(opt_graph);
463 
464  // Phase 2. Propagate detailed information about the spec through the
465  // graph (enables more specializations in later passes).
466  // Shape propagation sometimes depends on certain arguments being
467  // constants, and constant propagation doesn't need shape
468  // information anyway, so it's better to run it first.
469  ConstantPropagation(opt_graph);
470  PropagateInputShapes(opt_graph);
471  PropagateRequiresGrad(opt_graph);
472 
473  // Phase 3. Run differentiable optimizations (i.e. simple graph rewrites
474  // that
475  // we can still execute using autograd).
476  runOptimization(opt_graph, spec);
477 
478  // Phase 4. If this graph will be differentiated, we need to slice out the
479  // symbolically differentiable subgraphs for further optimizations.
480  // Phase 5. Apply non-differentiable optimizations to the graphs we've found
481  // (or the whole graph if we know we won't need its derivative).
482  if (needsGradient(opt_graph)) {
483  auto diff_nodes =
484  CreateAutodiffSubgraphs(opt_graph, autodiffSubgraphNodeThreshold);
485  for (Node* dnode : diff_nodes) {
486  auto diff_graph = std::move(dnode->g(attr::Subgraph));
487  Gradient gradient = differentiate(diff_graph);
488  runNondiffOptimization(gradient.f);
489  packGradient(gradient, dnode);
490  }
491  InlineAutodiffSubgraphs(opt_graph, autodiffSubgraphInlineThreshold);
492  } else {
493  runNondiffOptimization(opt_graph);
494  }
495  // Make sure there are no leftovers from any passes.
496  EliminateDeadCode(opt_graph);
497  return ExecutionPlan(opt_graph);
498  }
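  // Illustrative sketch (editor-added; the IR fragment is hypothetical and
  // the exact result depends on the passes' heuristics): for a graph
  // containing
  //   %y = aten::mul(%a, %b)
  // where %a requires grad, compileSpec() specializes the inputs to the
  // ArgumentSpec, and because needsGradient() is true it groups the
  // differentiable work into a prim::DifferentiableGraph node whose
  // attr::Subgraph / attr::ReverseSubgraph attributes hold the forward and
  // backward graphs produced by differentiate().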
499 
500  void runOptimization(
501  std::shared_ptr<Graph>& graph,
502  const ArgumentSpec& spec) {
503  // Basic graph preprocessing to eliminate noise.
504  EliminateDeadCode(graph);
505  EliminateCommonSubexpression(graph);
506  ConstantPooling(graph);
507 
508  PeepholeOptimize(graph);
509 
510  // Unroll small loops, and eliminate expressions that are the same at every
511  // iteration.
512  UnrollLoops(graph);
513  EliminateCommonSubexpression(graph);
514 
515  // Rewrite subgraphs with many MMs into expressions that batch them.
516  BatchMM(graph);
517 
518  CheckInplace(graph);
519  }
520 
521  void runNondiffOptimization(std::shared_ptr<Graph>& graph) {
522  FuseGraph(graph);
523  }
524 
525  static bool needsGradient(const std::shared_ptr<const Graph>& graph) {
526  if (!autograd::GradMode::is_enabled())
527  return false;
528  if (mayIntroduceGradient(graph->block()))
529  return true;
530  for (const Value* input : graph->inputs()) {
531  if (input->type()->requires_grad())
532  return true;
533  }
534  return false;
535  }
536 
537  static bool mayIntroduceGradient(const Block* b) {
538  for (const Node* n : b->nodes()) {
539  if (n->kind() == prim::PythonOp)
540  return true;
541  for (const Block* bb : n->blocks()) {
542  if (mayIntroduceGradient(bb))
543  return true;
544  }
545  }
546  return false;
547  }
548 
549  void runTraced(Stack& stack) {
550  const auto& state = tracer::getTracingState();
551  auto inputs = last(stack, num_inputs);
552  auto input_values = fmap(
553  inputs, [](const IValue& v) { return tracer::getNestedValueTrace(v); });
554 
555  ArgumentSpec spec(
556  autograd::GradMode::is_enabled(), inputs, num_flat_inputs);
557  // NB: we could just run the fallback in here and call it a day, but that
558  // would lose all the control flow information we have in the graph. Thus,
559  // we run the fallback to get the correct output values, but we will
560  // override the tracing states later.
561  {
562  // No need to trace a script module.
563  ResourceGuard guard(tracer::pauseTracing());
564  getOrCompileFallback().run(stack);
565  }
566 
567  // Traces always have types propagated through them, so we make sure to
568  // also propagate types through the graph we are inserting here.
569  // However, this->graph itself may already have been generated with
570  // tracing and so we only do the type propagation if no concrete types have
571  // been set.
572  auto local_graph = this->graph->copy();
573  setInputTypes(*local_graph, spec);
574  PropagateInputShapes(local_graph);
575  auto output_values =
576  inlineCallTo(*state->graph, *local_graph, input_values);
577 
578  auto outputs = last(stack, num_outputs);
579  for (size_t i = 0; i < outputs.size(); ++i) {
580  tracer::setValueTrace(outputs[i], output_values[i]);
581  }
582  }
583 
584  // The unoptimized starting graph. This field is effectively const, but we
585  // can't make it so because Graph::copy() is not const (and making it const is
586  // not that easy at this point).
587  std::shared_ptr<Graph> graph;
588 
589  // If false, we'll run the graph as we get it, without any optimizations.
590  // Useful for debugging.
591  const bool optimize;
592  const size_t num_inputs;
593  const size_t num_flat_inputs; // Number of inputs, assuming all tuples would
594  // be flattened.
595  const size_t num_outputs;
596 
597  // Populated only when optimize is false (and in that case plan_cache will be
598  // unused). The compiled version of graph.
599  ExecutionPlan fallback;
600 
601  // Mapping from argument configurations to optimized versions of the graph
602  // that are specialized to the spec.
603  std::unordered_map<ArgumentSpec, ExecutionPlan> plan_cache;
604 
605  // GraphExecutors can be accessed from multiple threads, so this mutex needs
606  // to be held every time we access the fallback or plan_cache.
607  std::mutex compile_mutex;
608 
609  // Some tunable parameters
610  size_t autodiffSubgraphNodeThreshold = 2;
611  size_t autodiffSubgraphInlineThreshold = 5;
612 };
613 
614 GraphExecutor::GraphExecutor(std::shared_ptr<Graph> graph, bool optimize)
615  : pImpl(new GraphExecutorImpl(std::move(graph), optimize)) {}
616 
617 void GraphExecutor::run(Stack& inputs) {
618  return pImpl->run(inputs);
619 }
620 
621 std::shared_ptr<Graph> GraphExecutor::graph() const {
622  return pImpl->graph;
623 }
624 
625 std::shared_ptr<Graph> GraphExecutor::graphFor(const Stack& inputs) const {
626  return pImpl->graphFor(inputs);
627 }
628 
629 GraphExecutorState GraphExecutor::getDebugState() {
630  return pImpl->getDebugState();
631 }
632 
633 void GraphExecutor::debugDisableAutodiffSubgraphInlining() {
634  return pImpl->debugDisableAutodiffSubgraphInlining();
635 }
636 
637 void runRequiredPasses(const std::shared_ptr<Graph>& g) {
638  specializeAutogradZero(*g);
639  LowerGradOf(*g);
640  // Implicitly inserted expand nodes are not necessarily always valid
641  // when used inside script methods that might have unstable shapes.
642  // We remove the implicitly created ones, and have shape analysis
643  // add valid expand nodes when the shapes are stable.
644  RemoveExpands(g);
645  CanonicalizeOps(g);
646  EliminateDeadCode(g);
647 }
648 
649 } // namespace jit
650 } // namespace torch
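
Usage sketch (editor-added; not part of graph_executor.cpp). GraphExecutor is
the public entry point exposed by this file: it takes a Graph produced by
tracing or by the script frontend and runs it on a Stack of IValues,
compiling a specialized ExecutionPlan per input configuration when optimize
is true. The helper below is hypothetical (the function and variable names
are illustrative), but it only uses the API visible above.

#include <torch/csrc/jit/graph_executor.h>
#include <torch/csrc/jit/ir.h>

using namespace torch::jit;

at::Tensor run_once(std::shared_ptr<Graph> graph, at::Tensor a, at::Tensor b) {
  // Build the executor once; optimize=true enables the ArgumentSpec-keyed
  // plan cache maintained by GraphExecutorImpl::getOrCompile.
  GraphExecutor executor(graph, /*optimize=*/true);

  // Inputs and outputs travel through a Stack (a vector of IValues).
  Stack stack;
  stack.emplace_back(a);
  stack.emplace_back(b);

  executor.run(stack);  // pops the inputs, pushes the graph's outputs

  // Inspect the specialized graph that was used for these inputs; this
  // throws if no plan has been compiled for the corresponding spec yet.
  std::shared_ptr<Graph> specialized = executor.graphFor({IValue(a), IValue(b)});
  (void)specialized;

  return stack.back().toTensor();
}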