Caffe2 - C++ API
A deep learning, cross-platform ML framework
c10_operator.h
1 #pragma once
2 
3 #include <vector>
4 #include <ATen/core/dispatch/OpSchemaRegistration.h>
5 #include <ATen/core/dispatch/KernelRegistration.h>
6 #include <ATen/core/function_schema.h>
7 
8 namespace caffe2 {
9 namespace detail {
10 
// Name of the extra, always-last argument that make_function_schema_for_c10
// appends to every exported caffe2 schema: an optional TensorList through
// which callers may pass preallocated output tensors (consumed in
// _call_caffe2_op_from_c10).
constexpr const char* PREALLOCATED_OUTPUT_ARGNAME =
    "_caffe2_preallocated_outputs";
13 
// Signature of the type-erased trampoline that invokes a caffe2 operator.
// _call_caffe2_op_from_c10 receives a plain function pointer of this type so
// that its non-templated body can be shared between all operators instead of
// being instantiated once per operator (see the binary-size comment below).
using _CallCaffe2OpFunc = std::vector<at::Tensor>(
    const c10::FunctionSchema& schema,
    std::vector<c10::IValue>&& inputs,
    std::vector<at::Tensor>&& outputs);
18 
19 template <class Caffe2Operator>
20 inline std::vector<at::Tensor> _call_caffe2_op(
21  const c10::FunctionSchema& schema,
22  std::vector<c10::IValue>&& inputs,
23  std::vector<at::Tensor>&& outputs) {
24  Caffe2Operator op(schema, std::move(inputs), std::move(outputs));
25  op.Run();
26  return std::move(op).move_newstyle_outputs();
27 }
28 
// Pops a caffe2 operator's arguments off the c10 dispatcher stack, invokes
// the operator through `call_op`, and pushes its results back onto the stack.
//
// This function is inline in the hope that compilers optimizing for speed will
// inline it into call_caffe2_op_from_c10, allowing call_op to be inlined and
// avoiding the function pointer indirection, while compilers optimizing for
// binary size will keep it a separate function instead of inlining it into
// a template and will reuse the binary code of this function between ops.
// We measured and confirmed that binary size of the instagram ios app is
// reduced when having _call_caffe2_op_from_c10 separate from the templated
// call_caffe2_op_from_c10.
inline void _call_caffe2_op_from_c10(
    c10::Stack* stack,
    const c10::FunctionSchema& schema,
    _CallCaffe2OpFunc* call_op) {
  // precondition: on the stack, there's one IValue for each argument of the
  // c10 schema. The last argument is an optional tensor list that
  // (if not ivalue::None) contains a preallocated output tensor for each
  // operator output.

  // The schema built by make_function_schema_for_c10 always appends the
  // optional preallocated-outputs argument, so it must be the last argument.
  AT_ASSERT(
      schema.arguments().size() != 0 &&
      schema.arguments().back().type()->isSubtypeOf(
          OptionalType::create(ListType::ofTensors())));
  // Pop the preallocated-outputs argument first: it sits on top of the stack.
  IValue preallocated_outputs = torch::jit::pop(*stack);

  const size_t num_outputs = schema.returns().size();
  const size_t num_inputs = schema.arguments().size() -
      1; // -1 because the last argument is the list of preallocated tensors

  std::vector<at::Tensor> outputs;
  if (preallocated_outputs.isNone()) {
    // either the schema doesn't support preallocated outputs or it does but
    // they haven't been passed in. Pass a list of uninitialized tensors to
    // the caffe2 operator as preallocated outputs.
    outputs.resize(num_outputs);
  } else {
    AT_ASSERT(preallocated_outputs.isTensorList());
    // Move the tensor list's elements out of the (consumed) IValue.
    outputs =
        std::move(*std::move(preallocated_outputs).toTensorList()).elements();
  }

  // TODO Avoid vector allocation. One idea would be to keep the std::vector
  // instances in the cache.
  std::vector<IValue> inputs = torch::jit::pop(*stack, num_inputs);

  // Run the actual operator via the type-erased trampoline; it returns the
  // (possibly reused) output tensors.
  outputs = (*call_op)(schema, std::move(inputs), std::move(outputs));

  // Push outputs in schema order; each tensor is moved onto the stack.
  for (auto&& output : std::move(outputs)) {
    torch::jit::push(*stack, std::move(output));
  }

  // postcondition: All inputs are cleared from the stack, there's now one
  // IValue for each output which holds the result. This
  // might reuse one of the preallocated tensors but doesn't have to.
}
82 
// Kernel entry point registered with the c10 dispatcher (one instantiation
// per operator, see the C10_REGISTER_CAFFE2_OPERATOR_* macros). It only
// forwards to the shared, non-templated _call_caffe2_op_from_c10 together
// with the op-specific schema and trampoline, keeping per-op binary size low.
template <const c10::OperatorHandle& (*OpHandle)(), class Caffe2Operator>
void call_caffe2_op_from_c10(
    c10::Stack* stack,
    c10::KernelCache* cache) { // TODO Pass in correct cache type
  _call_caffe2_op_from_c10(
      stack, OpHandle().schema(), &_call_caffe2_op<Caffe2Operator>);
}
90 
91 inline c10::FunctionSchema make_function_schema_for_c10(const char* OperatorName, std::vector<c10::Argument> inputs, std::vector<c10::Argument> outputs) {
92  // actual_inputs is the real inputs plus an optional tensor list argument
93  // for preallocated outputs
94  std::vector<c10::Argument> actual_inputs = std::move(inputs);
95  actual_inputs.emplace_back(
96  PREALLOCATED_OUTPUT_ARGNAME,
97  c10::OptionalType::create(c10::ListType::ofTensors()),
98  nullopt,
99  IValue());
100 
101  return c10::FunctionSchema(
102  std::string("_caffe2::") + OperatorName,
103  "",
104  std::move(actual_inputs),
105  std::move(outputs));
106 }
107 
108 }
109 }
110 
111 
#ifndef C10_MOBILE
// Declares the c10 op schema handle for a caffe2 operator. Put this in a
// header; pair it with exactly one C10_REGISTER_CAFFE2_OPERATOR_* macro in a
// translation unit.
#define C10_DECLARE_CAFFE2_OPERATOR(OperatorName) \
  namespace caffe2 {                              \
  namespace _c10_ops {                            \
  C10_DECLARE_OP_SCHEMA(OperatorName);            \
  }                                               \
  }

// Defines the "_caffe2::OperatorName" schema (built via
// make_function_schema_for_c10, which appends the optional preallocated
// outputs argument) and registers the CPU kernel with the c10 dispatcher.
// TODO This macro should take a JIT schema string instead of a vector of inputs and outputs.
#define C10_REGISTER_CAFFE2_OPERATOR_CPU(                                    \
    OperatorName, Inputs, Outputs, OperatorClass)                            \
  /* Register the op schema with the c10 dispatcher */                       \
  namespace caffe2 {                                                         \
  namespace _c10_ops {                                                       \
  C10_DEFINE_OP_SCHEMA(                                                      \
      OperatorName,                                                          \
      caffe2::detail::make_function_schema_for_c10(                          \
          #OperatorName,                                                     \
          Inputs,                                                            \
          Outputs));                                                         \
  }                                                                          \
  }                                                                          \
  /* Register call_caffe2_op_from_c10 as a kernel with the c10 dispatcher */ \
  namespace c10 {                                                            \
  C10_REGISTER_KERNEL(caffe2::_c10_ops::OperatorName) /*.withCache<Cache>()*/\
      .kernel<&caffe2::detail::call_caffe2_op_from_c10<                      \
          ::caffe2::_c10_ops::OperatorName,                                  \
          OperatorClass>>()                                                  \
      .dispatchKey(CPUTensorId());                                           \
  }

// Registers the CUDA kernel for an operator whose schema was already defined
// by C10_REGISTER_CAFFE2_OPERATOR_CPU.
#define C10_REGISTER_CAFFE2_OPERATOR_CUDA(OperatorName, OperatorClass)       \
  namespace c10 {                                                            \
  C10_REGISTER_KERNEL(caffe2::_c10_ops::OperatorName) /*.withCache<Cache>()*/\
      .kernel<&caffe2::detail::call_caffe2_op_from_c10<                      \
          ::caffe2::_c10_ops::OperatorName,                                  \
          OperatorClass>>()                                                  \
      .dispatchKey(CUDATensorId());                                          \
  }

// You should never manually call the C10_REGISTER_CAFFE2_OPERATOR_HIP macro.
// The C10_REGISTER_CAFFE2_OPERATOR_CUDA macro from above will be automatically
// rewritten to C10_REGISTER_CAFFE2_OPERATOR_HIP by hipify.
#define C10_REGISTER_CAFFE2_OPERATOR_HIP(OperatorName, OperatorClass)        \
  namespace c10 {                                                            \
  C10_REGISTER_KERNEL(caffe2::_c10_ops::OperatorName) /*.withCache<Cache>()*/\
      .kernel<&caffe2::detail::call_caffe2_op_from_c10<                      \
          ::caffe2::_c10_ops::OperatorName,                                  \
          OperatorClass>>()                                                  \
      .dispatchKey(HIPTensorId());                                           \
  }

#else
// Don't use c10 dispatcher on mobile because of binary size
// (the macros expand to nothing, so declare/register calls compile away).
#define C10_DECLARE_CAFFE2_OPERATOR(OperatorName)
#define C10_REGISTER_CAFFE2_OPERATOR_CPU(OperatorName, Inputs, Outputs, OperatorClass)
#define C10_REGISTER_CAFFE2_OPERATOR_CUDA(OperatorName, OperatorClass)
#define C10_REGISTER_CAFFE2_OPERATOR_HIP(OperatorName, OperatorClass)
#endif
A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime.
Definition: blob.h:13
A kernel can keep around a cache to have better performance when it's called multiple times.
Definition: KernelCache.h:15