Caffe2 - C++ API
A deep learning, cross platform ML framework
operator_c10wrapper.h
#pragma once

#include <ATen/core/dispatch/Dispatcher.h>
#include "caffe2/core/operator.h"
#include <c10/util/ArrayRef.h>
#include <c10/util/Metaprogramming.h>
#include <ATen/core/ivalue.h>

namespace caffe2 {

namespace detail {
template <class Context>
class C10OperatorWrapper final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  C10OperatorWrapper(
      const c10::OperatorHandle& op,
      const OperatorDef& operator_def,
      Workspace* ws)
      : Operator<Context>(operator_def, ws),
        op_(op),
        kernel_(at::nullopt),
        has_preallocated_outputs_(
            op_.schema().arguments().size() != 0 &&
            op_.schema().arguments().back().name() ==
                detail::PREALLOCATED_OUTPUT_ARGNAME) {
    AT_ASSERT(
        !has_preallocated_outputs_ ||
        op_.schema().arguments().back().type()->isSubtypeOf(
            OptionalType::create(ListType::ofTensors())));

    AT_ASSERT(operator_def.output_size() == op_.schema().returns().size());
    AT_ASSERT(
        operator_def.input_size() + (has_preallocated_outputs_ ? 1 : 0) <=
        op_.schema().arguments().size()); // '<=' because there might be
                                          // caffe2 nontensor arguments
  }

  bool RunOnDevice() override {
    // Because stack_ is cached as a member, concurrent calls are not allowed.
    // TODO Making stack_ thread_local might fix this.
    std::lock_guard<std::mutex> lock(mutex_);

    pushInputs_();
    callKernel_();
    popOutputs_();

    return true;
  }

 private:
  void pushInputs_() {
    AT_ASSERT(stack_.size() == 0);
    stack_.reserve(
        op_.schema().arguments().size() + (has_preallocated_outputs_ ? 1 : 0));

    size_t input_tensor_index = 0;

    for (const auto& argument : op_.schema().arguments()) {
      if (argument.name() == detail::PREALLOCATED_OUTPUT_ARGNAME) {
        // note: if detail::PREALLOCATED_OUTPUT_ARGNAME was at the end of the
        // argument list, then has_preallocated_outputs_ would be true.
        AT_ASSERTM(
            has_preallocated_outputs_,
            "Error in caffe2->c10 wrapper: Operator schema has a parameter named ",
            detail::PREALLOCATED_OUTPUT_ARGNAME,
            ", but it's not at the end of the argument list");

        AT_ASSERTM(
            argument.type()->isSubtypeOf(
                OptionalType::create(ListType::ofTensors())),
            "Error in caffe2->c10 wrapper: Operator schema has a parameter named ",
            detail::PREALLOCATED_OUTPUT_ARGNAME,
            ", but it's not of type TensorList?");
        stack_.emplace_back(preallocated_outputs_());

      } else if (argument.type()->isSubtypeOf(TensorType::get())) {
        AT_ASSERTM(
            input_tensor_index < InputSize(),
            "Error in caffe2->c10 wrapper: Too few tensor arguments given (",
            InputSize(),
            "), operator schema expected more.");
        stack_.emplace_back(at::Tensor(Input(input_tensor_index++)));

      } else if (argument.type()->isSubtypeOf(ListType::ofTensors())) {
        AT_ASSERTM(
            input_tensor_index == 0,
            "Error in caffe2->c10 wrapper: Schema can only have either one or more Tensor inputs or one TensorList input.");
        stack_.emplace_back(ivalue::TensorList::create(array_inputs_()));
        input_tensor_index = InputSize();

      } else {
        stack_.emplace_back(get_nontensor_argument_(argument));
      }
    }

    // After all schema arguments have been pushed, every caffe2 tensor input
    // must have been consumed.
    AT_ASSERTM(
        input_tensor_index == InputSize(),
        "Error in caffe2->c10 wrapper: Number of caffe2 operator inputs (",
        InputSize(),
        ") doesn't match number of tensor arguments (",
        input_tensor_index,
        ") in the c10 operator schema.");
  }

  void callKernel_() {
    AT_ASSERT(stack_.size() == op_.schema().arguments().size());
    if (!kernel_.has_value()) {
      // TODO if kernel is already set, try re-dispatch to assert it goes to
      // the same kernel
      kernel_ = c10::Dispatcher::singleton().lookup(op_, &stack_);
    }
    kernel_->call(&stack_);
  }

  void popOutputs_() {
    AT_ASSERT(stack_.size() == op_.schema().returns().size());
    for (size_t i = 0; i < op_.schema().returns().size(); ++i) {
      OperatorBase::SetOutputTensor(
          i, Tensor(C10Tensor(std::move(stack_[i]).toTensor())));
    }
    stack_.clear();
  }

  std::vector<at::Tensor> array_inputs_() {
    std::vector<at::Tensor> result;
    result.reserve(InputSize());
    for (size_t i = 0; i < InputSize(); ++i) {
      result.emplace_back(Input(i));
    }
    return result;
  }

  std::vector<at::Tensor> preallocated_outputs_() {
    std::vector<at::Tensor> result;
    result.reserve(OutputSize());
    for (size_t i = 0; i < OutputSize(); ++i) {
      result.emplace_back(OperatorBase::OutputTensorOrUndefined(i));
    }
    return result;
  }

  IValue get_nontensor_argument_(const c10::Argument& argument) {
    if (argument.type()->isSubtypeOf(IntType::get())) {
      return get_nontensor_argument_<int>(
          argument.name(), argument.default_value());
    } else if (argument.type()->isSubtypeOf(FloatType::get())) {
      return get_nontensor_argument_<double>(
          argument.name(), argument.default_value());
    } else if (argument.type()->isSubtypeOf(BoolType::get())) {
      return get_nontensor_argument_<bool>(
          argument.name(), argument.default_value());
    } else {
      // TODO Support more types
      AT_ERROR(
          "Error in caffe2->c10 wrapper: Unsupported argument type ",
          argument.type()->str(),
          " in c10 operator schema");
    }
  }

  template <class T>
  IValue get_nontensor_argument_(
      const std::string& name,
      const c10::optional<IValue>& default_value) {
    if (default_value.has_value()) {
      return this->template GetSingleArgument<T>(name, default_value->to<T>());
    } else {
      AT_CHECK(
          this->template HasSingleArgumentOfType<T>(name),
          "Error in caffe2->c10 wrapper: Expected argument '",
          name,
          "' missing or wrong type.");
      return this->template GetSingleArgument<T>(name, 0);
    }
  }

  c10::OperatorHandle op_;
  c10::optional<OpKernel> kernel_;

  // has_preallocated_outputs_ is true iff the operator schema has a last
  // argument that is a TensorList with a name equal to
  // detail::PREALLOCATED_OUTPUT_ARGNAME. This argument is then used to pass
  // in preallocated output tensors to the caffe2 operator.
  bool has_preallocated_outputs_;

  // stack_ is stored as a member here to avoid having to re-allocate a stack
  // for each call. Between kernel calls, stack_.size() == 0, but its capacity
  // should not need to grow anymore after the first call.
  std::vector<IValue> stack_;
  std::mutex mutex_;
};

template <class Context>
inline std::function<
    std::unique_ptr<OperatorBase>(const OperatorDef&, Workspace*)>
createC10OperatorWrapper(const c10::OperatorHandle& op_handle) {
  return [op_handle](const OperatorDef& op_def, Workspace* ws) {
    return c10::guts::make_unique<C10OperatorWrapper<Context>>(
        op_handle, op_def, ws);
  };
}

} // namespace detail

// TODO Also register c10 operators on mobile
#ifndef C10_MOBILE
// TODO Currently we only register the CPU variant. This is going to be fixed
// once the tensor detemplatization lands.
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU(OperatorHandle, Name) \
  REGISTER_CPU_OPERATOR_CREATOR(                                            \
      Name, detail::createC10OperatorWrapper<CPUContext>(OperatorHandle))
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CUDA(OperatorHandle, Name) \
  REGISTER_CUDA_OPERATOR_CREATOR(                                            \
      Name, detail::createC10OperatorWrapper<CUDAContext>(OperatorHandle))
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_HIP(OperatorHandle, Name) \
  REGISTER_HIP_OPERATOR_CREATOR(                                            \
      Name, detail::createC10OperatorWrapper<HIPContext>(OperatorHandle))
#else
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU(OperatorHandle, Name)
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CUDA(OperatorHandle, Name)
#define REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_HIP(OperatorHandle, Name)
#endif
} // namespace caffe2
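
The registration macros above are the intended entry point: given the c10::OperatorHandle of an operator that is already registered with the c10 dispatcher, they register a creator that wraps the operator in C10OperatorWrapper so caffe2 nets can call it by name. The following is a minimal, hedged sketch of such a registration; GetMyAddOperatorHandle() and the caffe2 operator name C10MyAdd are hypothetical placeholders and are not defined by this header.

// Hypothetical usage sketch (not part of this header), placed in a .cc file
// that is compiled into the caffe2 build.
#include "caffe2/core/operator_c10wrapper.h"

namespace caffe2 {

// Assumption: defined elsewhere; returns the c10::OperatorHandle obtained
// when the operator was registered with the c10 dispatcher.
const c10::OperatorHandle& GetMyAddOperatorHandle();

// Expose the c10 operator to caffe2 NetDefs under the name "C10MyAdd".
// C10OperatorWrapper translates the caffe2 inputs and arguments into an
// IValue stack, dispatches through c10, and copies the returned tensors
// back into the caffe2 outputs.
REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU(
    GetMyAddOperatorHandle(), C10MyAdd)

} // namespace caffe2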