Caffe2 - C++ API
A deep learning, cross platform ML framework
segment_reduction_op.h
1 #ifndef CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
2 #define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
3 
4 #include "caffe2/core/context.h"
5 #include "caffe2/core/logging.h"
6 #include "caffe2/core/operator.h"
7 #include "caffe2/operators/reducer_functors.h"
8 
9 namespace caffe2 {
10 
11 template <typename TData>
12 class BaseInputAccessor {
13  public:
14  BaseInputAccessor() {}
15 
16  bool observeInput(const Tensor& dataInput) {
17  data_ = dataInput.raw_data();
18  return dataInput.template IsType<TData>();
19  }
20 
21  inline const TData*
22  getBlockPtr(int64_t in_block_size, int64_t idx, int64_t /* blocks */ = 1) {
23  return static_cast<const TData*>(data_) + in_block_size * idx;
24  }
25 
26  protected:
27  const void* data_ = nullptr;
28 };
29 
31 // Range reducer ops: leverage the fact that input segments are contiguous and allow
32 // reducer functors to do something special
33 // Note: for now there are no real use cases for it yet :)
34 // Also, doesn't support additional arguments for now
36 
43 template <
44  typename T,
45  typename SIndex,
46  class Context,
47  class RangeReducer,
48  class InputAccessor = BaseInputAccessor<T>>
49 class AbstractSortedSegmentRangeOp : public Operator<Context> {
50  public:
51  USE_OPERATOR_CONTEXT_FUNCTIONS;
52  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeOp);
53 
54  bool RunOnDevice() override {
55  auto& dataInput = Input(DATA);
56  auto& segment_ids = Input(SEGMENT_IDS);
57 
58  CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
59  auto N = segment_ids.size(0);
60  CAFFE_ENFORCE_EQ(
61  N,
62  dataInput.size(0),
63  "SEGMENT_IDS must have the same length as outer dimension of DATA");
64 
65  OPERATOR_NEEDS_FEATURE(
66  inputAccessor_.observeInput(dataInput),
67  "Unsupported input type: ",
68  dataInput.dtype().name(),
69  ".");
70 
71  const SIndex* s_ids = segment_ids.template data<SIndex>();
72 
73  const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
74  auto shape = dataInput.sizes().vec();
75  shape[0] = K;
76  auto* output = Output(0, shape, at::dtype<T>());
77 
78  T* out = output->template mutable_data<T>();
79 
80  if (N == 0) {
81  return true;
82  }
83 
84  int64_t block_size = dataInput.numel() / N;
85 
86  // Assume the segments are sorted and there are no gaps
87  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
88  for (int64_t i = 0; i < N;) {
89  int64_t start = i;
90  for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
91  ;
92 
93  RangeReducer()(
94  block_size,
95  i - start,
96  inputAccessor_.getBlockPtr(block_size, start, i - start),
97  out + block_size * s_ids[start],
98  &context_);
99 
100  // check correctness of the next segment
101  if (i < N) {
102  CAFFE_ENFORCE_EQ(
103  s_ids[start] + 1,
104  s_ids[i],
105  "Indices must be sorted and not have gaps");
106  }
107  }
108  return true;
109  }
110 
111  static constexpr int kNumInputs = 2;
112  INPUT_TAGS(DATA, SEGMENT_IDS);
113 
114  private:
115  InputAccessor inputAccessor_;
116 };
117 
118 template <
119  typename T,
120  typename SIndex,
121  class Context,
122  class RangeReducerGradient>
123 class AbstractSortedSegmentRangeGradientOp : public Operator<Context> {
124  public:
125  USE_OPERATOR_CONTEXT_FUNCTIONS;
126  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeGradientOp);
127 
128  bool RunOnDevice() override {
129  // TODO(azzolini): avoid using input/output if not used by a particular op
130  auto& data_in = Input(DATA_IN);
131  auto& data_out = Input(DATA_OUT);
132  auto& segment_grads = Input(SEGMENT_GRADS);
133  auto& segment_ids = Input(SEGMENT_IDS);
134 
135  CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
136  int64_t N = segment_ids.size(0);
137 
138  const SIndex* s_ids = segment_ids.template data<SIndex>();
139  const T* s_grads = segment_grads.template data<T>();
140  const T* d_in = data_in.template data<T>();
141  const T* d_out = data_out.template data<T>();
142 
143  auto shape = segment_grads.sizes().vec();
144  shape[0] = N;
145  auto* data_grads = Output(0, shape, at::dtype<T>());
146 
147  const SIndex K = segment_grads.size(0);
148  T* out = data_grads->template mutable_data<T>();
149 
150  if (N == 0) {
151  return true;
152  }
153 
154  int64_t block_size = segment_grads.size_from_dim(1);
155 
156  // Assume the segments are sorted and there are no gaps
157  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
158  // repeat the check from forward op
159  CAFFE_ENFORCE_EQ(
160  K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
161  for (int64_t i = 0; i < N;) {
162  int64_t start = i;
163  for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
164  ;
165 
166  auto expanded_idx = block_size * start;
167  auto reduced_idx = block_size * s_ids[start];
168  RangeReducerGradient()(
169  block_size,
170  i - start,
171  s_grads + reduced_idx,
172  out + expanded_idx,
173  d_in + expanded_idx,
174  d_out + reduced_idx,
175  &context_);
176 
177  // check correctness of the next segment
178  if (i < N) {
179  CAFFE_ENFORCE_EQ(
180  s_ids[start] + 1,
181  s_ids[i],
182  "Indices must be sorted and not have gaps");
183  }
184  }
185  return true;
186  }
187 
188  static constexpr int kNumInputs = 4;
189  INPUT_TAGS(DATA_IN, DATA_OUT, SEGMENT_GRADS, SEGMENT_IDS);
190 };
191 
192 template <typename T, typename SIndex, typename Context, typename ReducerDef>
193 struct AbstractSortedSegmentRangeDef {
194  using OpDef = ReducerDef;
195  static constexpr const char* basename = "SortedSegmentRange";
196  static constexpr const char* doc = R"DOC(
197 Applies '{op}' to each segment of input tensor. In order to allow for more
198 efficient implementation of '{op}', the input segments have to be contiguous
199 and non-empty.
200 
201 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
202 DATA to a particular group (segment). Values belonging to the same segment are
203 aggregated together.
204 
205 The first dimension of the output is equal to the number of input segments,
206 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
207 
208 {op_doc}
209  )DOC";
210  static void PopulateSchema(OpSchema& schema) {
211  schema.Input(0, "DATA", "Input tensor to be aggregated");
212  schema.Input(
213  1,
214  "SEGMENT_IDS",
215  "Vector with the same length as the first dimension of DATA "
216  "and values in the range 0..K-1 and in increasing order that "
217  "maps each slice of DATA to one of the segments");
218  schema.Output(
219  0,
220  "OUTPUT",
221  "Aggregated tensor with the first dimension of K and the "
222  "other dimensions inherited from DATA");
223  }
224  using ForwardOp = AbstractSortedSegmentRangeOp<
225  T,
226  SIndex,
227  Context,
228  typename ReducerDef::template Reducer<T, Context>>;
229  using BackwardOp = AbstractSortedSegmentRangeGradientOp<
230  T,
231  SIndex,
232  Context,
233  typename ReducerDef::template ReducerGradient<T, Context>>;
234  struct GetGradient : public GradientMakerBase {
235  using GradientMakerBase::GradientMakerBase;
236  vector<OperatorDef> GetGradientDefs() override {
237  return SingleGradientDef(
238  string(basename) + ReducerDef::name + "Gradient",
239  "",
240  vector<string>{I(0), O(0), GO(0), I(1)},
241  // no gradient on segment_ids!
242  vector<string>{GI(0)});
243  }
244  };
245 };
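
The doc string above is easier to follow with a concrete shape in mind. Below is a minimal, standalone sketch (plain C++ with std::vector rather than caffe2::Tensor; the function name is invented for illustration) of what a SortedSegmentRange reduction with a sum reducer computes: sorted, gap-free SEGMENT_IDS group contiguous rows of DATA, and the output has SEGMENT_IDS[N-1]+1 rows.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sum variant of the contiguous-segment reduction described above. Rows of
// `data` are grouped by sorted, gap-free `segment_ids`; output row k is the
// sum of the rows whose id equals k.
std::vector<std::vector<float>> SortedSegmentRangeSumSketch(
    const std::vector<std::vector<float>>& data,
    const std::vector<int64_t>& segment_ids) {
  assert(data.size() == segment_ids.size());
  if (data.empty()) {
    return {};
  }
  const int64_t num_segments = segment_ids.back() + 1;
  const std::size_t block_size = data[0].size();
  std::vector<std::vector<float>> out(
      num_segments, std::vector<float>(block_size, 0.f));
  for (std::size_t i = 0; i < data.size(); ++i) {
    for (std::size_t j = 0; j < block_size; ++j) {
      out[segment_ids[i]][j] += data[i][j];
    }
  }
  return out;
}

// Example: DATA = {{1,2},{3,4},{5,6}}, SEGMENT_IDS = {0,0,1}
// -> OUTPUT = {{4,6},{5,6}}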
246 
248 // Incremental reducer ops: assume that reducer consumes pieces of data one by
249 // one. Also, supports additional arguments passed to the reducer, e.g. scalars for
250 // weighted sum.
251 //
252 // Note: in current implementation additional inputs are considered auxiliary
253 // constants and have limitations:
254 // - there is no gradient computation for auxiliary inputs
255 // - auxiliary inputs aren't affected by fused embedding lookup in operations
256 // like sparse_sorted_segment
258 
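
Before the operator templates below, it may help to see the general shape of the "incremental reducer" contract they rely on. The sketch is illustrative only: the real reducers live in caffe2/operators/reducer_functors.h and additionally carry a Meta object, a Context*, and FixedSize template parameters, all omitted here; the class name is invented.

#include <cstdint>

// Minimal shape of an incremental reducer: constructed once per output slot,
// fed one input block at a time via process(), then finalized with finish().
class SumReducerSketch {
 public:
  SumReducerSketch(float* out, int64_t block_size)
      : out_(out), block_size_(block_size) {
    for (int64_t j = 0; j < block_size_; ++j) {
      out_[j] = 0.f;
    }
  }
  // Consume one row (block) of DATA.
  void process(const float* block) {
    for (int64_t j = 0; j < block_size_; ++j) {
      out_[j] += block[j];
    }
  }
  // Nothing to finalize for a sum; a mean reducer would divide by the count.
  void finish() {}

 private:
  float* out_;
  int64_t block_size_;
};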
275 template <
276  typename T,
277  class Context,
278  class Reducer,
279  bool FirstDim,
280  class InputAccessor = BaseInputAccessor<T>>
281 class AbstractReduceFrontOrBackOp : public Operator<Context> {
282  public:
283  USE_OPERATOR_CONTEXT_FUNCTIONS;
284 
285  template <class... Args>
286  explicit AbstractReduceFrontOrBackOp(Args&&... args)
287  : Operator<Context>(std::forward<Args>(args)...),
288  OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
289 
290  bool RunOnDevice() override {
291  auto& data = Input(0);
292  // If more complicated fixed size logic becomes necessary, it can be moved
293  // to the reducer class
294  int64_t in_block_size = FirstDim
295  ? data.size_from_dim(num_reduce_dims_)
296  : data.size_to_dim(data.dim() - num_reduce_dims_);
297  return DispatchHelper<typename Reducer::FixedDispatch>::call(
298  this, in_block_size);
299  }
300 
301  template <int FixedSize>
302  bool DoRunWithValue() {
303  auto& data = Input(0);
304 
305  CAFFE_ENFORCE_LE(num_reduce_dims_, data.dim());
306 
307  typename Reducer::Meta ctx(FirstDim);
308  ctx.observeInput(0, data, num_reduce_dims_);
309  for (int i = 1; i < Reducer::kInputCount; ++i) {
310  auto& aux_in = Input(i);
311  ctx.observeInput(i, aux_in, num_reduce_dims_);
312  }
313 
314  OPERATOR_NEEDS_FEATURE(
315  inputAccessor_.observeInput(data),
316  "Unsupported input type: ",
317  data.dtype().name(),
318  ".");
319 
320  vector<int64_t> shape;
321  ctx.appendOutputShape(&shape);
322  auto* output = Output(0, shape, at::dtype<T>());
323 
324  T* out = output->template mutable_data<T>();
325 
326  const int block_size = FirstDim
327  ? data.size_from_dim(num_reduce_dims_)
328  : data.size_from_dim(data.dim() - num_reduce_dims_);
329 
330  const int num_blocks = block_size > 0 ? data.numel() / block_size : 0;
331 
332  Reducer r(ctx, out, &context_);
333  for (int64_t i = 0; i < num_blocks; ++i) {
334  r.template process<FixedSize>(
335  ctx, inputAccessor_.getBlockPtr(block_size, i), i, &context_);
336  }
337  r.template finish<FixedSize>(ctx, &context_);
338  return true;
339  }
340 
341  static constexpr int kNumInputs = Reducer::kInputCount;
342 
343  private:
344  int num_reduce_dims_;
345  InputAccessor inputAccessor_;
346 };
347 
348 template <
349  typename T,
350  class Context,
351  class ReducerGradient,
352  bool FirstDim = true>
353 class AbstractReduceFrontOrBackGradientOp : public Operator<Context> {
354  public:
355  USE_OPERATOR_CONTEXT_FUNCTIONS;
356 
357  template <class... Args>
358  explicit AbstractReduceFrontOrBackGradientOp(Args&&... args)
359  : Operator<Context>(std::forward<Args>(args)...),
360  OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
361 
362  bool RunOnDevice() override {
363  // If more complicated fixed size logic becomes necessary, it can be moved
364  // to the reducer class
365  int64_t grad_block_size = Input(REDUCTION_GRAD).numel();
366  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
367  this, grad_block_size);
368  }
369 
370  template <int FixedSize>
371  bool DoRunWithValue() {
372  auto& reduction_grad = Input(REDUCTION_GRAD);
373  auto& source_shape = this->template Input<Tensor>(SOURCE_SHAPE, CPU);
374 
375  typename ReducerGradient::Meta ctx(reduction_grad, 0, FirstDim);
376  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
377  auto& aux_in = Input(i);
378  ctx.observeOriginalInput(
379  ReducerGradient::originalInputs()[i],
380  aux_in,
381  nullptr, /*no grad*/
382  num_reduce_dims_);
383  }
384 
385  const T* r_grad = reduction_grad.template data<T>();
386 
387  CAFFE_ENFORCE_LE(num_reduce_dims_, source_shape.numel());
388 
389  vector<int64_t> shape(
390  source_shape.template data<int64_t>(),
391  source_shape.template data<int64_t>() + source_shape.numel());
392 
393  auto* data_grads = Output(0, shape, at::dtype<T>());
394 
395  int64_t block_size = FirstDim
396  ? data_grads->size_from_dim(num_reduce_dims_)
397  : data_grads->size_from_dim(data_grads->dim() - num_reduce_dims_);
398  int64_t block_num = block_size > 0 ? data_grads->numel() / block_size : 0;
399 
400  T* out = data_grads->template mutable_data<T>();
401 
402  ReducerGradient r(ctx, r_grad, &context_);
403  for (int64_t i = 0; i < block_num; ++i) {
404  r.template fillGrad<FixedSize>(
405  ctx,
406  out + block_size * i,
407  i,
408  &context_,
409  FirstDim ? block_num : block_size);
410  }
411  return true;
412  }
413 
414  static constexpr int kNumInputs =
415  ReducerGradient::originalInputs().size() + 2;
416  enum _InputTags {
417  REDUCTION_GRAD = ReducerGradient::originalInputs().size(),
418  SOURCE_SHAPE
419  };
420 
421  private:
422  int num_reduce_dims_;
423 };
424 
425 template <typename T, typename Context, typename ReducerDef>
426 struct AbstractReduceFrontDef {
427  using OpDef = ReducerDef;
428  static constexpr const char* basename = "ReduceFront";
429  static constexpr const char* doc = R"DOC(
430 Reduces the input tensor along the first dimension of the input tensor by
431 applying '{op}'. This op acts in a similar way to SortedSegment{op} and
432 UnsortedSegment{op} but as if all input slices belong to a single segment.
433 
434 {op_doc}
435  )DOC";
436  static void PopulateSchema(OpSchema& schema) {
437  schema.Input(
438  0, "DATA", "Input tensor to be reduced on the first dimension");
439  schema.TensorInferenceFunction([](const OperatorDef& def,
440  const vector<TensorShape>& in) {
441  CAFFE_ENFORCE_EQ(1, in.size());
442  ArgumentHelper helper(def);
443  int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
444  typename ReducerDef::template Reducer<T, Context>::Meta ctx(true);
445  vector<int64_t> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
446  return vector<TensorShape>{
447  CreateTensorShape(out_dims, in[0].data_type())};
448  });
449  ReducerDef::PopulateSchema(schema);
450  }
451  using ReducerGradient =
452  typename ReducerDef::template ReducerGradient<T, Context>;
453  using ForwardOp = AbstractReduceFrontOrBackOp<
454  T,
455  Context,
456  typename ReducerDef::template Reducer<T, Context>,
457  true>;
458  using BackwardOp =
459  AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, true>;
460  struct GetGradient : public GradientMakerBase {
461  using GradientMakerBase::GradientMakerBase;
462  vector<OperatorDef> GetGradientDefs() override {
463  // Have utility function generating these names?
464  string tmp_dims = "_" + O(0) + "_dims";
465 
466  vector<string> grad_ins;
467  for (const int i : ReducerGradient::originalInputs()) {
468  grad_ins.push_back(I(i));
469  }
470  grad_ins.push_back(GO(0));
471  grad_ins.push_back(tmp_dims);
472 
473  vector<Argument> args;
474  if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
475  args.push_back(GetArgument(def_, "num_reduce_dim"));
476  }
477  // FIXME: pass in num_reduce_dims?!
478  return vector<OperatorDef>{
479  CreateOperatorDef(
480  "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
481  CreateOperatorDef(
482  string(basename) + ReducerDef::name + "Gradient",
483  "",
484  grad_ins,
485  // no gradient on auxiliary inputs for now
486  vector<string>{GI(0)}),
487  };
488  }
489  };
490 };
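
As a rough illustration of the indexing scheme used by the ReduceFront/ReduceBack ops above (not part of the header; plain C++ with an invented function name and a sum reducer assumed): the first num_reduce_dim dimensions are folded into num_blocks, and each of the remaining block_size positions is reduced across those blocks.

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// ReduceFront with a sum reducer over a contiguous, row-major buffer.
// `dims` describes the full shape of `data`; the first num_reduce_dim
// dimensions are collapsed.
std::vector<float> ReduceFrontSumSketch(
    const std::vector<float>& data,
    const std::vector<int64_t>& dims,
    int num_reduce_dim) {
  const int64_t block_size = std::accumulate(
      dims.begin() + num_reduce_dim,
      dims.end(),
      int64_t{1},
      std::multiplies<int64_t>());
  const int64_t num_blocks =
      block_size > 0 ? static_cast<int64_t>(data.size()) / block_size : 0;
  std::vector<float> out(block_size, 0.f);
  for (int64_t i = 0; i < num_blocks; ++i) {
    for (int64_t j = 0; j < block_size; ++j) {
      out[j] += data[i * block_size + j];
    }
  }
  // Output shape is dims[num_reduce_dim:], e.g. (2,3,4) with num_reduce_dim=1
  // gives (3,4). ReduceBack is analogous but folds the trailing dimensions.
  return out;
}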
491 
492 template <typename T, typename Context, typename ReducerDef>
493 struct AbstractReduceBackDef {
494  using OpDef = ReducerDef;
495  static constexpr const char* basename = "ReduceBack";
496  static constexpr const char* doc = R"DOC(
497 Reduces the input tensor along the last dimension of the input tensor by
498 applying '{op}'. This op acts in a similar way to SortedSegment{op} and
499 UnsortedSegment{op} but as if all input slices belong to a single segment.
500 
501 {op_doc}
502  )DOC";
503  static void PopulateSchema(OpSchema& schema) {
504  schema.Input(
505  0, "DATA", "Input tensor to be reduced on the first dimension");
506  schema.TensorInferenceFunction([](const OperatorDef& def,
507  const vector<TensorShape>& in) {
508  CAFFE_ENFORCE_EQ(1, in.size());
509  ArgumentHelper helper(def);
510  int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
511  typename ReducerDef::template Reducer<T, Context>::Meta ctx(false);
512  vector<int64_t> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
513  return vector<TensorShape>{
514  CreateTensorShape(out_dims, in[0].data_type())};
515  });
516  ReducerDef::PopulateSchema(schema);
517  }
518  using ReducerGradient =
519  typename ReducerDef::template ReducerGradient<T, Context>;
520  using ForwardOp = AbstractReduceFrontOrBackOp<
521  T,
522  Context,
523  typename ReducerDef::template Reducer<T, Context>,
524  false>;
525  using BackwardOp =
526  AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, false>;
527  struct GetGradient : public GradientMakerBase {
528  using GradientMakerBase::GradientMakerBase;
529  vector<OperatorDef> GetGradientDefs() override {
530  // Have utility function generating these names?
531  string tmp_dims = "_" + O(0) + "_dims";
532 
533  vector<string> grad_ins;
534  for (const int i : ReducerGradient::originalInputs()) {
535  grad_ins.push_back(I(i));
536  }
537  grad_ins.push_back(GO(0));
538  grad_ins.push_back(tmp_dims);
539 
540  vector<Argument> args;
541  if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
542  args.push_back(GetArgument(def_, "num_reduce_dim"));
543  }
544  // FIXME: pass in num_reduce_dims?!
545  return vector<OperatorDef>{
546  CreateOperatorDef(
547  "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
548  CreateOperatorDef(
549  string(basename) + ReducerDef::name + "Gradient",
550  "",
551  grad_ins,
552  // no gradient on auxiliary inputs for now
553  vector<string>{GI(0)}),
554  };
555  }
556  };
557 };
558 
581 template <
582  typename T,
583  typename SIndex,
584  class Context,
585  class Reducer,
586  bool SparseFused = true,
587  class InputAccessor = BaseInputAccessor<T>>
588 class AbstractSortedSegmentOp : public Operator<Context> {
589  public:
590  USE_OPERATOR_CONTEXT_FUNCTIONS;
591  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentOp);
592 
593  bool RunOnDevice() override {
594  if (SparseFused) {
595  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
596  this, Input(INDICES));
597  } else {
598  // type doesn't matter
599  return DoRunWithType<int64_t>();
600  }
601  }
602 
603  template <typename IndexType>
604  bool DoRunWithType() {
605  // If more complicated fixed size logic becomes necessary, it can be moved
606  // to the reducer class
607  int64_t in_block_size = Input(0).size_from_dim(1);
608  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
609  this, in_block_size);
610  }
611 
612  template <typename IndexType, int FixedSize>
613  bool DoRunWithValue() {
614  auto& dataInput = Input(0);
615  auto& segment_ids = Input(SEGMENT_IDS);
616 
617  CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
618  int64_t N = segment_ids.size(0);
619  const int64_t M = dataInput.size(0);
620 
621  const IndexType* idxs;
622  if (SparseFused) { // static if
623  auto& indices = Input(INDICES);
624  CAFFE_ENFORCE_EQ(1, indices.dim(), "INDICES must be a vector");
625  CAFFE_ENFORCE_EQ(
626  N,
627  indices.size(0),
628  "SEGMENT_IDS must have the same length as INDICES");
629  idxs = indices.template data<IndexType>();
630  } else {
631  CAFFE_ENFORCE_EQ(
632  N, M, "DATA must have the same first dimension as SEGMENT_IDS");
633  }
634 
635  // It would probably look nicer with varargs templates but it's too much
636  // metaprogramming
637  typename Reducer::Meta ctx;
638  ctx.observeInput(0, dataInput, 1);
639  for (int i = 1; i < Reducer::kInputCount; ++i) {
640  auto& aux_in = Input(i);
641  CAFFE_ENFORCE_EQ(
642  N,
643  aux_in.size(0),
644  "Input ",
645  i,
646  " must have the same first dim as SEGMENT_IDS");
647  ctx.observeInput(i, aux_in, 1);
648  }
649 
650  OPERATOR_NEEDS_FEATURE(
651  inputAccessor_.observeInput(dataInput),
652  "Unsupported input type: ",
653  dataInput.dtype().name(),
654  ".");
655 
656  const SIndex* s_ids = segment_ids.template data<SIndex>();
657 
658  const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
659  vector<int64_t> shape;
660  shape.push_back(K);
661  ctx.appendOutputShape(&shape);
662  auto* output = Output(0, shape, at::dtype<T>());
663 
664  T* out = output->template mutable_data<T>();
665  if (N == 0) {
666  return true;
667  }
668  int64_t in_block_size = dataInput.size_from_dim(1);
669  int64_t out_block_size = output->size_from_dim(1);
670 
671  // Assume the segments are sorted and there are no gaps
672  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
673  for (int64_t i = 0; i < N;) {
674  int64_t start = i;
675 
676  Reducer r(ctx, out + out_block_size * s_ids[start], &context_);
677  for (; i < N && s_ids[start] == s_ids[i]; ++i) {
678  IndexType idx;
679  if (SparseFused) { // static if
680  CAFFE_ENFORCE(
681  0 <= idxs[i] && idxs[i] < M,
682  "Index out of bounds: ",
683  idxs[i],
684  ", range 0 to ",
685  M);
686  idx = idxs[i];
687  } else {
688  idx = i;
689  }
690  r.template process<FixedSize>(
691  ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
692  }
693 
694  r.template finish<FixedSize>(ctx, &context_);
695  // check correctness of the next segment
696  if (i < N) {
697  CAFFE_ENFORCE_EQ(
698  s_ids[start] + 1,
699  s_ids[i],
700  "Indices must be sorted and not have gaps");
701  }
702  }
703  return true;
704  }
705 
706  enum {
707  INDICES = Reducer::kInputCount,
708  SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
709  };
710  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
711  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
712 
713  private:
714  InputAccessor inputAccessor_;
715 };
716 
717 // Gradient actually doesn't depend on whether sparse lookup is fused or not
718 template <typename T, typename SIndex, class Context, class ReducerGradient>
719 class AbstractSortedSegmentGradientOp : public Operator<Context> {
720  public:
721  USE_OPERATOR_CONTEXT_FUNCTIONS;
722  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentGradientOp);
723 
724  bool RunOnDevice() override {
725  // If more complicated fixed size logic becomes necessary, it can be moved
726  // to the reducer class
727  int64_t grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
728  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
729  this, grad_block_size);
730  }
731 
732  template <int FixedSize>
733  bool DoRunWithValue() {
734  auto& segment_grads = Input(SEGMENT_GRADS);
735  auto& segment_ids = Input(SEGMENT_IDS);
736 
737  CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
738  int64_t N = segment_ids.size(0);
739 
740  typename ReducerGradient::Meta ctx(segment_grads, 1);
741  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
742  auto& aux_in = Input(i);
743  CAFFE_ENFORCE_EQ(
744  N,
745  aux_in.size(0),
746  "Input ",
747  i,
748  " must have the same first dim as SEGMENT_IDS");
749  ctx.observeOriginalInput(
750  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
751  }
752 
753  const SIndex* s_ids = segment_ids.template data<SIndex>();
754  const T* s_grads = segment_grads.template data<T>();
755 
756  vector<int64_t> shape;
757  shape.push_back(N);
758  ctx.appendGradShape(&shape);
759  auto* data_grads = Output(0, shape, at::dtype<T>());
760 
761  int64_t d_block_size = data_grads->size_from_dim(1);
762  const SIndex K = segment_grads.size(0);
763  int64_t s_block_size = segment_grads.size_from_dim(1);
764  T* out = data_grads->template mutable_data<T>();
765 
766  if (N == 0) {
767  return true;
768  }
769 
770  // Assume the segments are sorted and there are no gaps
771  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
772  // repeat the check from forward op
773  CAFFE_ENFORCE_EQ(
774  K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
775  for (int64_t i = 0; i < N;) {
776  int64_t start = i;
777  int64_t end = start;
778 
779  if (ReducerGradient::computeLength()) {
780  for (; end < N && s_ids[start] == s_ids[end]; ++end) {
781  }
782  }
783 
784  ReducerGradient r(ctx, s_grads + s_block_size * s_ids[start], &context_);
785  for (; i < N && s_ids[start] == s_ids[i]; ++i) {
786  r.template fillGrad<FixedSize>(
787  ctx, out + d_block_size * i, i, &context_, end - start);
788  }
789 
790  // check correctness of the next segment
791  if (i < N) {
792  CAFFE_ENFORCE_EQ(
793  s_ids[start] + 1,
794  s_ids[i],
795  "Indices must be sorted and not have gaps");
796  }
797  }
798  return true;
799  }
800 
801  // Input layout:
802  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
803  // orig_argXs represent original op's inputs and will be passed to the reducer
804  // directly
805  static constexpr int kNumInputs =
806  ReducerGradient::originalInputs().size() + 2;
807  enum _InputTags {
808  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
809  SEGMENT_IDS
810  };
811 };
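
For a sum reducer, the gradient computed by the class above amounts to fanning each segment's gradient back out to the rows that contributed to it. A minimal sketch of that semantics (illustrative only; plain C++, invented function name, sum reducer assumed):

#include <cstddef>
#include <cstdint>
#include <vector>

// Each input row i receives the gradient of the segment it belonged to.
std::vector<std::vector<float>> SortedSegmentSumGradientSketch(
    const std::vector<std::vector<float>>& segment_grads,
    const std::vector<int64_t>& segment_ids) {
  std::vector<std::vector<float>> data_grads(segment_ids.size());
  for (std::size_t i = 0; i < segment_ids.size(); ++i) {
    data_grads[i] = segment_grads[segment_ids[i]];
  }
  return data_grads;
}

// Example: SEGMENT_GRADS = {{1,1},{2,2}}, SEGMENT_IDS = {0,0,1}
// -> DATA_GRADS = {{1,1},{1,1},{2,2}}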
812 
813 // base implementation of sorted/unsorted sparse/non-sparse gradient computation
814 template <
815  typename ForwardOp,
816  typename ReducerDef,
817  typename ReducerGradient,
818  bool Sorted,
819  bool SparseFused>
820 struct SegmentOpGetGradient : public GradientMakerBase {
821  using GradientMakerBase::GradientMakerBase;
822  vector<OperatorDef> GetGradientDefs() override {
823  CAFFE_ENFORCE(
824  !ReducerGradient::requiresDataInput(Def()),
825  "grads on aux inputs are not yet implemented for Segment operators.");
826  vector<string> grad_ins;
827  for (const int i : ReducerGradient::originalInputs()) {
828  grad_ins.push_back(I(i));
829  }
830  grad_ins.push_back(GO(0));
831  grad_ins.push_back(I(ForwardOp::SEGMENT_IDS));
832  vector<OperatorDef> r{CreateOperatorDef(
833  string(Sorted ? "SortedSegment" : "UnsortedSegment") +
834  ReducerDef::name + "Gradient",
835  "",
836  grad_ins,
837  // no gradient on segment_ids or auxiliary inputs for now
838  vector<string>{SparseFused ? GI_V(0) : GI(0)})};
839  if (SparseFused) {
840  SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
841  }
842  return r;
843  }
844 };
845 
846 template <typename T, typename SIndex, typename Context, typename ReducerDef>
847 struct AbstractSortedSegmentDef {
848  using OpDef = ReducerDef;
849  static constexpr const char* basename = "SortedSegment";
850  static constexpr const char* doc = R"DOC(
851 Applies '{op}' to each segment of input tensor. Segments need to be sorted and
852 contiguous. See also UnsortedSegment{op} that doesn't have this requirement.
853 
854 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
855 DATA to a particular group (segment). Values belonging to the same segment are
856 aggregated together.
857 
858 The first dimension of the output is equal to the number of input segments,
859 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
860 
861 {op_doc}
862  )DOC";
863  static void PopulateSchema(OpSchema& schema) {
864  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
865  schema.Input(
866  Reducer::kInputCount,
867  "SEGMENT_IDS",
868  "Vector with the same length as the first dimension of DATA "
869  "and values in the range 0..K-1 and in increasing order that "
870  "maps each slice of DATA to one of the segments");
871  schema.Output(
872  0,
873  "OUTPUT",
874  "Aggregated output tensor. Has the first dimension of K "
875  "(the number of segments).");
876  ReducerDef::PopulateSchema(schema);
877  }
878  using Reducer = typename ReducerDef::template Reducer<T, Context>;
879  using ReducerGradient =
880  typename ReducerDef::template ReducerGradient<T, Context>;
881  using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer, false>;
882  using BackwardOp =
883  AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
884  using GetGradient = SegmentOpGetGradient<
885  ForwardOp,
886  ReducerDef,
887  ReducerGradient,
888  true /*Sorted*/,
889  false /*SparseFused*/>;
890 };
891 
892 template <typename T, typename SIndex, typename Context, typename ReducerDef>
893 struct AbstractSparseSortedSegmentDef {
894  using OpDef = ReducerDef;
895  static constexpr const char* basename = "SparseSortedSegment";
896  static constexpr const char* doc = R"DOC(
897 Pulls in slices of the input tensor, groups them into segments and applies
898 '{op}' to each segment. Segments need to be sorted and contiguous. See also
899 SparseUnsortedSegment{op} that doesn't have this requirement.
900 
901 This op is basically Gather and SortedSegment{op} fused together.
902 
903 INDICES should contain integers in range 0..N-1 where N is the first dimension
904 of DATA. INDICES represent which slices of DATA need to be pulled in.
905 
906 SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
907 particular group (segment). Values belonging to the same segment are aggregated
908 together. SEGMENT_IDS should have the same dimension as INDICES.
909 
910 The first dimension of the output is equal to the number of input segments,
911 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
912 
913 {op_doc}
914  )DOC";
915  static void PopulateSchema(OpSchema& schema) {
916  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
917  schema.Input(
918  Reducer::kInputCount,
919  "INDICES",
920  "Integer vector containing indices of the first dimension of DATA for "
921  "the slices that are being aggregated");
922  schema.Input(
923  Reducer::kInputCount + 1,
924  "SEGMENT_IDS",
925  "Vector with the same length as INDICES and values in the range "
926  "0..K-1 and in increasing order that maps each slice of DATA referenced"
927  " by INDICES to one of the segments");
928  schema.Output(
929  0,
930  "OUTPUT",
931  "Aggregated output tensor. Has the first dimension of K "
932  "(the number of segments).");
933  ReducerDef::PopulateSchema(schema);
934  }
935  using Reducer = typename ReducerDef::template Reducer<T, Context>;
936  using ReducerGradient =
937  typename ReducerDef::template ReducerGradient<T, Context>;
938  using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer>;
939  // TODO(dzhulgakov): we're registering the same class twice here,
940  // consider avoiding op duplication here
941  using BackwardOp =
942  AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
943  using GetGradient = SegmentOpGetGradient<
944  ForwardOp,
945  ReducerDef,
946  ReducerGradient,
947  true /*Sorted*/,
948  true /*SparseFused*/>;
949 };
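
The "Gather and SortedSegment{op} fused together" description above can be captured in a few lines. The sketch below is illustrative only (plain C++, invented function name, sum reducer assumed): INDICES selects rows of DATA, and SEGMENT_IDS, sorted and gap-free, assigns each selected row to an output segment.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::vector<float>> SparseSortedSegmentSumSketch(
    const std::vector<std::vector<float>>& data,
    const std::vector<int64_t>& indices,
    const std::vector<int64_t>& segment_ids) {
  assert(indices.size() == segment_ids.size());
  if (segment_ids.empty()) {
    return {};
  }
  const std::size_t block_size = data.empty() ? 0 : data[0].size();
  std::vector<std::vector<float>> out(
      segment_ids.back() + 1, std::vector<float>(block_size, 0.f));
  for (std::size_t i = 0; i < indices.size(); ++i) {
    const std::vector<float>& row = data[indices[i]]; // the "gather" part
    for (std::size_t j = 0; j < block_size; ++j) {
      out[segment_ids[i]][j] += row[j]; // the "sorted segment sum" part
    }
  }
  return out;
}

// Example: DATA = {{1,2},{3,4},{5,6}}, INDICES = {2,0,0}, SEGMENT_IDS = {0,0,1}
// -> OUTPUT = {{6,8},{1,2}}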
950 
980 template <
981  typename T,
982  typename SIndex,
983  class Context,
984  class Reducer,
985  bool SparseFused = true,
986  class InputAccessor = BaseInputAccessor<T>>
987 class AbstractUnsortedSegmentOp : public Operator<Context> {
988  public:
989  USE_OPERATOR_CONTEXT_FUNCTIONS;
990 
991  template <class... Args>
992  explicit AbstractUnsortedSegmentOp(Args&&... args)
993  : Operator<Context>(std::forward<Args>(args)...),
994  OP_SINGLE_ARG(int, "num_segments", num_segments_, -1) {}
995 
996  bool RunOnDevice() override {
997  if (SparseFused) {
998  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
999  this, Input(INDICES));
1000  } else {
1001  // type doesn't matter
1002  return DoRunWithType<int64_t>();
1003  }
1004  }
1005 
1006  template <typename IndexType>
1007  bool DoRunWithType() {
1008  // If more complicated fixed size logic becomes necessary, it can be moved
1009  // to the reducer class
1010  int64_t in_block_size = Input(0).size_from_dim(1);
1011  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
1012  this, in_block_size);
1013  }
1014 
1015  template <typename IndexType, int FixedSize>
1016  bool DoRunWithValue() {
1017  auto& data = Input(0);
1018  auto& segment_ids = Input(SEGMENT_IDS);
1019 
1020  CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
1021  int64_t N = segment_ids.size(0);
1022  const int64_t M = data.size(0);
1023 
1024  const IndexType* idxs;
1025  if (SparseFused) { // static if
1026  auto& indices = Input(INDICES);
1027  CAFFE_ENFORCE_EQ(1, indices.dim(), "INDICES must be a vector");
1028  CAFFE_ENFORCE_EQ(
1029  N,
1030  indices.size(0),
1031  "SEGMENT_IDS must have the same length as INDICES");
1032  idxs = indices.template data<IndexType>();
1033  } else {
1034  CAFFE_ENFORCE_EQ(
1035  N, M, "DATA must have the same first dimension as SEGMENT_IDS");
1036  }
1037 
1038  // It would probably look nicer with varargs templates but it's too much
1039  // metaprogramming
1040  typename Reducer::Meta ctx;
1041  ctx.observeInput(0, data, 1);
1042  for (int i = 1; i < Reducer::kInputCount; ++i) {
1043  auto& aux_in = Input(i);
1044  CAFFE_ENFORCE_EQ(
1045  N,
1046  aux_in.size(0),
1047  "Input ",
1048  i,
1049  " must have the same first dim as SEGMENT_IDS");
1050  ctx.observeInput(i, aux_in, 1);
1051  }
1052 
1053  const SIndex* s_ids = segment_ids.template data<SIndex>();
1054  OPERATOR_NEEDS_FEATURE(
1055  inputAccessor_.observeInput(data),
1056  "Unsupported input type: ",
1057  data.dtype().name(),
1058  ".");
1059 
1060  // determine the number of segments
1061  SIndex K;
1062  if (num_segments_ != -1) {
1063  K = num_segments_;
1064  } else {
1065  K = 0;
1066  for (int64_t i = 0; i < N; ++i) {
1067  K = std::max(K, s_ids[i] + 1);
1068  }
1069  }
1070 
1071  vector<int64_t> shape;
1072  shape.push_back(K);
1073  ctx.appendOutputShape(&shape);
1074  auto* output = Output(0, shape, at::dtype<T>());
1075 
1076  int64_t in_block_size = data.size_from_dim(1);
1077  int64_t out_block_size = output->size_from_dim(1);
1078  T* out = output->template mutable_data<T>();
1079 
1080  reducers_.clear();
1081  reducers_.reserve(K);
1082  for (int64_t i = 0; i < K; ++i) {
1083  reducers_.emplace_back(ctx, out + out_block_size * i, &context_);
1084  }
1085 
1086  for (int64_t i = 0; i < N; ++i) {
1087  auto s_id = s_ids[i];
1088  CAFFE_ENFORCE(
1089  0 <= s_id && s_id < K,
1090  "Segment id out of range: ",
1091  s_id,
1092  ", range 0 to ",
1093  K);
1094  IndexType idx;
1095  if (SparseFused) { // static if
1096  CAFFE_ENFORCE(
1097  0 <= idxs[i] && idxs[i] < M,
1098  "Index out of bounds: ",
1099  idxs[i],
1100  ", range 0 to ",
1101  M);
1102  idx = idxs[i];
1103  } else {
1104  idx = i;
1105  }
1106  reducers_[s_id].template process<FixedSize>(
1107  ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
1108  }
1109 
1110  for (int64_t i = 0; i < K; ++i) {
1111  reducers_[i].template finish<FixedSize>(ctx, &context_);
1112  }
1113  // call reducers' destructors (if there are any)
1114  reducers_.clear();
1115  return true;
1116  }
1117 
1118  enum {
1119  INDICES = Reducer::kInputCount,
1120  SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
1121  };
1122  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
1123  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
1124 
1125  private:
1126  int64_t num_segments_;
1127  // member field to reuse memory
1128  vector<Reducer> reducers_;
1129  InputAccessor inputAccessor_;
1130 };
1131 
1132 // Gradient actually doesn't depend on whether sparse lookup is fused or not
1133 template <typename T, typename SIndex, class Context, class ReducerGradient>
1134 class AbstractUnsortedSegmentGradientOp : public Operator<Context> {
1135  public:
1136  USE_OPERATOR_CONTEXT_FUNCTIONS;
1137  USE_SIMPLE_CTOR_DTOR(AbstractUnsortedSegmentGradientOp);
1138 
1139  bool RunOnDevice() override {
1140  // If more complicated fixed size logic becomes necessary, it can be moved
1141  // to the reducer class
1142  int64_t grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1143  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1144  this, grad_block_size);
1145  }
1146 
1147  template <int FixedSize>
1148  bool DoRunWithValue() {
1149  auto& segment_grads = Input(SEGMENT_GRADS);
1150  auto& segment_ids = Input(SEGMENT_IDS);
1151 
1152  CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
1153  int64_t N = segment_ids.size(0);
1154 
1155  typename ReducerGradient::Meta ctx(segment_grads, 1);
1156  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1157  auto& aux_in = Input(i);
1158  CAFFE_ENFORCE_EQ(
1159  N,
1160  aux_in.size(0),
1161  "Input ",
1162  i,
1163  " must have the same first dim as SEGMENT_IDS");
1164  ctx.observeOriginalInput(
1165  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
1166  }
1167 
1168  const SIndex* s_ids = segment_ids.template data<SIndex>();
1169  const T* s_grads = segment_grads.template data<T>();
1170 
1171  vector<int64_t> shape;
1172  shape.push_back(N);
1173  ctx.appendGradShape(&shape);
1174  auto* data_grads = Output(0, shape, at::dtype<T>());
1175 
1176  int64_t d_block_size = data_grads->size_from_dim(1);
1177  const SIndex K = segment_grads.size(0);
1178  int64_t s_block_size = segment_grads.size_from_dim(1);
1179  T* out = data_grads->template mutable_data<T>();
1180 
1181  if (ReducerGradient::computeLength()) {
1182  segment_length_.resize(K, 0);
1183  for (int i = 0; i < N; ++i) {
1184  auto s_id = s_ids[i];
1185  CAFFE_ENFORCE(
1186  0 <= s_id && s_id < K,
1187  "Segment id out of range: ",
1188  s_id,
1189  ", range 0 to ",
1190  K);
1191  segment_length_[s_ids[i]]++;
1192  }
1193  }
1194 
1195  reducers_.clear();
1196  reducers_.reserve(K);
1197  for (SIndex i = 0; i < K; ++i) {
1198  reducers_.emplace_back(ctx, s_grads + s_block_size * i, &context_);
1199  }
1200 
1201  for (int64_t i = 0; i < N; ++i) {
1202  auto s_id = s_ids[i];
1203  if (ReducerGradient::computeLength()) {
1204  reducers_[s_id].template fillGrad<FixedSize>(
1205  ctx, out + d_block_size * i, i, &context_, segment_length_[s_id]);
1206  } else {
1207  reducers_[s_id].template fillGrad<FixedSize>(
1208  ctx, out + d_block_size * i, i, &context_, 0);
1209  }
1210  }
1211  // call reducers' destructors (if there are any)
1212  reducers_.clear();
1213  return true;
1214  }
1215 
1216  // Input layout:
1217  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
1218  // orig_argXs represent original op's inputs and will be passed to the reducer
1219  // directly
1220  static constexpr int kNumInputs =
1221  ReducerGradient::originalInputs().size() + 2;
1222  enum _InputTags {
1223  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1224  SEGMENT_IDS
1225  };
1226 
1227  private:
1228  // member field to reuse memory
1229  vector<ReducerGradient> reducers_;
1230  vector<int> segment_length_;
1231 };
1232 
1233 template <typename T, typename SIndex, typename Context, typename ReducerDef>
1234 struct AbstractUnsortedSegmentDef {
1235  using OpDef = ReducerDef;
1236  static constexpr const char* basename = "UnsortedSegment";
1237  static constexpr const char* doc = R"DOC(
1238 Applies '{op}' to each segment of input tensor. Segments ids can appear in
1239 arbitrary order (unlike in SortedSegment{op}).
1240 
1241 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
1242 DATA to a particular group (segment). Values belonging to the same segment are
1243 aggregated together.
1244 
1245 If the `num_segments` argument is passed, it is used as the first dimension of
1246 the output. Otherwise, it is computed dynamically as the max value of
1247 SEGMENT_IDS plus one. Other output dimensions are inherited from the input
1248 tensor.
1249 
1250 {op_doc}
1251  )DOC";
1252  static void PopulateSchema(OpSchema& schema) {
1253  schema.Arg(
1254  "num_segments",
1255  "Optional int argument specifying the number of output segments and "
1256  "thus the first dimension of the output");
1257  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1258  schema.Input(
1259  Reducer::kInputCount,
1260  "SEGMENT_IDS",
1261  "Integer vector with the same length as the first dimension of DATA "
1262  "that maps each slice of DATA to one of the segments");
1263  schema.Output(
1264  0,
1265  "OUTPUT",
1266  "Aggregated output tensor. Has the first dimension equal to the "
1267  "number of segments.");
1268  ReducerDef::PopulateSchema(schema);
1269  }
1270  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1271  using ReducerGradient =
1272  typename ReducerDef::template ReducerGradient<T, Context>;
1273  using ForwardOp = AbstractUnsortedSegmentOp<
1274  T,
1275  SIndex,
1276  Context,
1277  typename ReducerDef::template Reducer<T, Context>,
1278  false>;
1279  using BackwardOp =
1280  AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
1281  using GetGradient = SegmentOpGetGradient<
1282  ForwardOp,
1283  ReducerDef,
1284  ReducerGradient,
1285  false /*Sorted*/,
1286  false /*SparseFused*/>;
1287 };
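
A brief illustration of the unsorted variant described above (plain C++, invented function name, sum reducer assumed): segment ids may appear in any order, and the number of output rows is either the `num_segments` argument or max(SEGMENT_IDS) + 1 when the argument is omitted (modelled here as -1).

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::vector<float>> UnsortedSegmentSumSketch(
    const std::vector<std::vector<float>>& data,
    const std::vector<int64_t>& segment_ids,
    int64_t num_segments = -1) {
  int64_t K = num_segments;
  if (K == -1) {
    K = 0;
    for (int64_t id : segment_ids) {
      K = std::max(K, id + 1); // dynamically determine the number of segments
    }
  }
  const std::size_t block_size = data.empty() ? 0 : data[0].size();
  std::vector<std::vector<float>> out(K, std::vector<float>(block_size, 0.f));
  for (std::size_t i = 0; i < data.size(); ++i) {
    for (std::size_t j = 0; j < block_size; ++j) {
      out[segment_ids[i]][j] += data[i][j];
    }
  }
  return out;
}

// Example: DATA = {{1,2},{3,4},{5,6}}, SEGMENT_IDS = {1,0,1}
// -> OUTPUT = {{3,4},{6,8}}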
1288 
1289 template <typename T, typename SIndex, typename Context, typename ReducerDef>
1290 struct AbstractSparseUnsortedSegmentDef {
1291  using OpDef = ReducerDef;
1292  static constexpr const char* basename = "SparseUnsortedSegment";
1293  static constexpr const char* doc = R"DOC(
1294 Pulls in slices of the input tensor, groups them into segments and applies
1295 '{op}' to each segment. Segments ids can appear in arbitrary order (unlike in
1296 SparseSortedSegment{op}).
1297 
1298 This op is basically Gather and UnsortedSegment{op} fused together.
1299 
1300 INDICES should contain integers in range 0..N-1 where N is the first dimension
1301 of DATA. INDICES represent which slices of DATA need to be pulled in.
1302 
1303 SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
1304 particular group (segment). Values belonging to the same segment are aggregated
1305 together. SEGMENT_IDS should have the same dimension as INDICES.
1306 
1307 If the `num_segments` argument is passed, it is used as the first dimension of
1308 the output. Otherwise, it is computed dynamically as the max value of
1309 SEGMENT_IDS plus one. Other output dimensions are inherited from the input
1310 tensor.
1311 
1312 {op_doc}
1313  )DOC";
1314  static void PopulateSchema(OpSchema& schema) {
1315  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1316  schema.Input(
1317  Reducer::kInputCount,
1318  "INDICES",
1319  "Integer vector containing indices of the first dimension of DATA for "
1320  "the slices that are being aggregated");
1321  schema.Input(
1322  Reducer::kInputCount + 1,
1323  "SEGMENT_IDS",
1324  "Integer vector with the same length as INDICES that maps each slice "
1325  "of DATA referenced by INDICES to one of the segments");
1326  schema.Output(
1327  0,
1328  "OUTPUT",
1329  "Aggregated output tensor. Has the first dimension equal to the "
1330  "number of segments.");
1331  ReducerDef::PopulateSchema(schema);
1332  }
1333  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1334  using ReducerGradient =
1335  typename ReducerDef::template ReducerGradient<T, Context>;
1336  using ForwardOp = AbstractUnsortedSegmentOp<T, SIndex, Context, Reducer>;
1337  // TODO(dzhulgakov): we're registering the same class twice here,
1338  // consider avoiding op duplication here
1339  using BackwardOp =
1340  AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
1341  using GetGradient = SegmentOpGetGradient<
1342  ForwardOp,
1343  ReducerDef,
1344  ReducerGradient,
1345  false /*Sorted*/,
1346  true /*SparseFused*/>;
1347 };
1348 
1371 // TODO(dzhulgakov): for now it's implemented with incremental reducers because
1372 // of fused sparse support. But using "lengths" representation actually implies
1373 // contiguous segments, and thus range reducers could be used for the non-sparse
1374 // version.
1375 
1376 template <
1377  typename TData,
1378  typename TLengths,
1379  class Context,
1380  class Reducer,
1381  bool SparseFused = true,
1382  class InputAccessor = BaseInputAccessor<TData>>
1383 class AbstractLengthsOp : public Operator<Context> {
1384  public:
1385  USE_OPERATOR_CONTEXT_FUNCTIONS;
1386  USE_SIMPLE_CTOR_DTOR(AbstractLengthsOp);
1387 
1388  bool RunOnDevice() override {
1389  if (SparseFused) {
1390  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1391  this, Input(INDICES));
1392  } else {
1393  // type doesn't matter
1394  return DoRunWithType<int64_t>();
1395  }
1396  }
1397 
1398  template <typename IndexType>
1399  bool DoRunWithType() {
1400  // If more complicated fixed size logic becomes necessary, it can be moved
1401  // to the reducer class
1402  int64_t in_block_size = Input(0).size_from_dim(1);
1403  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
1404  this, in_block_size);
1405  }
1406 
1407  template <typename IndexType, int FixedSize>
1408  bool DoRunWithValue() {
1409  auto& dataInput = Input(0);
1410  auto& lengthsInput = Input(LENGTHS);
1411 
1412  CAFFE_ENFORCE_EQ(1, lengthsInput.dim(), "LENGTHS must be a vector");
1413  const int64_t dataSize = dataInput.size(0);
1414  // Either the first dim of the data or how many indices we pull in from it
1415  int64_t dataToReduceSize;
1416  const int64_t outputSize = lengthsInput.size(0);
1417 
1418  const IndexType* indices;
1419  if (SparseFused) { // static if
1420  auto& indicesInput = Input(INDICES);
1421  CAFFE_ENFORCE_EQ(1, indicesInput.dim(), "INDICES must be a vector");
1422  indices = indicesInput.template data<IndexType>();
1423  dataToReduceSize = indicesInput.size(0);
1424  } else {
1425  dataToReduceSize = dataSize;
1426  }
1427 
1428  typename Reducer::Meta ctx;
1429  ctx.observeInput(0, dataInput, 1);
1430  for (int i = 1; i < Reducer::kInputCount; ++i) {
1431  auto& aux_in = Input(i);
1432  CAFFE_ENFORCE(
1433  dataToReduceSize == aux_in.size(0),
1434  "Input ",
1435  i,
1436  " must have the same first dim as SEGMENT_IDS");
1437  ctx.observeInput(i, aux_in, 1);
1438  }
1439 
1440  const TLengths* lengths = lengthsInput.template data<TLengths>();
1441 
1442  OPERATOR_NEEDS_FEATURE(
1443  inputAccessor_.observeInput(dataInput),
1444  "Unsupported input type: ",
1445  dataInput.dtype().name(),
1446  ".");
1447 
1448  vector<int64_t> shape{outputSize};
1449  ctx.appendOutputShape(&shape);
1450  auto* output = Output(0, shape, at::dtype<TData>());
1451 
1452  int64_t in_block_size = dataInput.size_from_dim(1);
1453  int64_t out_block_size = output->size_from_dim(1);
1454  TData* out = output->template mutable_data<TData>();
1455 
1456  int64_t dataIndex = 0;
1457  for (int64_t rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) {
1458  Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_);
1459  for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
1460  ++dataIndex) {
1461  IndexType idx;
1462  if (SparseFused) { // static if
1463  idx = indices[dataIndex];
1464  CAFFE_ENFORCE(
1465  0 <= idx && idx < dataSize,
1466  "The ",
1467  dataIndex,
1468  "th index from the input indices is out of bounds: ",
1469  idx,
1470  " vs. valid range 0 to ",
1471  dataSize);
1472  } else {
1473  idx = dataIndex;
1474  CAFFE_ENFORCE(
1475  0 <= idx && idx < dataSize,
1476  "When calculating the ",
1477  rangeIndex,
1478  "th output with length=",
1479  lengths[rangeIndex],
1480  ", the index is out of bounds: ",
1481  idx,
1482  " vs. valid range 0 to ",
1483  dataSize);
1484  }
1485 
1486  const TData* input = inputAccessor_.getBlockPtr(in_block_size, idx);
1487  reducer.template process<FixedSize>(ctx, input, dataIndex, &context_);
1488  }
1489  reducer.template finish<FixedSize>(ctx, &context_);
1490  }
1491  CAFFE_ENFORCE(
1492  dataIndex == dataToReduceSize, dataIndex, " != ", dataToReduceSize);
1493 
1494  return true;
1495  }
1496 
1497  enum {
1498  INDICES = Reducer::kInputCount,
1499  LENGTHS = Reducer::kInputCount + (SparseFused ? 1 : 0)
1500  };
1501  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
1502  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
1503 
1504  private:
1505  InputAccessor inputAccessor_;
1506 };
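
The LENGTHS-based grouping used by the class above can be summarized with a small sketch (plain C++, invented function name, sum reducer assumed): LENGTHS gives the number of consecutive DATA rows in each segment, so its elements must sum to the number of rows being reduced, and the output has len(LENGTHS) rows.

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<std::vector<float>> LengthsSumSketch(
    const std::vector<std::vector<float>>& data,
    const std::vector<int32_t>& lengths) {
  const std::size_t block_size = data.empty() ? 0 : data[0].size();
  std::vector<std::vector<float>> out;
  out.reserve(lengths.size());
  std::size_t data_index = 0;
  for (int32_t length : lengths) {
    std::vector<float> acc(block_size, 0.f);
    for (int32_t k = 0; k < length; ++k, ++data_index) {
      for (std::size_t j = 0; j < block_size; ++j) {
        acc[j] += data[data_index][j];
      }
    }
    out.push_back(acc);
  }
  return out;
}

// Example: DATA = {{1,2},{3,4},{5,6}}, LENGTHS = {2,1} -> {{4,6},{5,6}},
// matching the "LENGTHS = [2, 1]" example in the doc string below.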
1507 
1508 /*
1509  * Some notice:
1510  * 1. Gradient actually doesn't depend on whether sparse lookup is fused or not
1511  * 2. INDICES are not used in the CPU version, but they are needed in the async
1512  * CUDA version. So we register the 3-input version for CPU as the gradient op
1513  * for GPU/CPU conversion, and also register the 2-input version for CPU for
1514  * backward compatibility with older nets.
1515  */
1516 template <
1517  typename T,
1518  typename TLengths,
1519  class Context,
1520  class ReducerGradient,
1521  bool GradientNeedIndices = false>
1522 class AbstractLengthsGradientOp : public Operator<Context> {
1523  public:
1524  USE_OPERATOR_CONTEXT_FUNCTIONS;
1525  USE_SIMPLE_CTOR_DTOR(AbstractLengthsGradientOp);
1526 
1527  bool RunOnDevice() override {
1528  // If more complicated fixed size logic becomes necessary, it can be moved
1529  // to the reducer class
1530  int64_t gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1);
1531  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1532  this, gradBlockSize);
1533  }
1534 
1535  template <int FixedSize>
1536  bool DoRunWithValue() {
1537  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1538  auto& lengthsInput = Input(LENGTHS);
1539 
1540  CAFFE_ENFORCE(lengthsInput.dim() == 1, "LENGTHS must be a vector");
1541  int64_t reducedDataSize = 0;
1542  int64_t numSegments = lengthsInput.size(0);
1543  CAFFE_ENFORCE(segmentGradsInput.dim() > 0);
1544  CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0));
1545  const TLengths* lengths = lengthsInput.template data<TLengths>();
1546  for (int64_t i = 0; i < numSegments; ++i) {
1547  reducedDataSize += lengths[i];
1548  }
1549 
1550  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1551  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1552  auto& aux_in = Input(i);
1553  CAFFE_ENFORCE_EQ(
1554  reducedDataSize,
1555  aux_in.size(0),
1556  "Input ",
1557  i,
1558  " must have the same first dim as SEGMENT_IDS");
1559  ctx.observeOriginalInput(
1560  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
1561  }
1562 
1563  const T* segmentGrads = segmentGradsInput.template data<T>();
1564 
1565  vector<int64_t> shape;
1566  shape.push_back(reducedDataSize);
1567  ctx.appendGradShape(&shape);
1568  auto* dataGradsOutput = Output(0, shape, at::dtype<T>());
1569 
1570  int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1571  int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1);
1572  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1573 
1574  int64_t dataIndex = 0;
1575  for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1576  ReducerGradient reducer(
1577  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1578  for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
1579  ++dataIndex) {
1580  reducer.template fillGrad<FixedSize>(
1581  ctx,
1582  dataGrads + dataGradsBlockSize * dataIndex,
1583  dataIndex,
1584  &context_,
1585  lengths[rangeIndex]);
1586  }
1587  }
1588  CAFFE_ENFORCE(
1589  dataIndex == reducedDataSize, dataIndex, " != ", reducedDataSize);
1590  return true;
1591  }
1592 
1593  // Input layout:
1594  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS, INDICES
1595  // orig_argXs represent original op's inputs and will be passed to the reducer
1596  // directly
1597  static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
1598  2 + (GradientNeedIndices ? 1 : 0);
1599  enum _InputTags {
1600  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1601  LENGTHS,
1602  INDICES
1603  };
1604 };
1605 
1606 // Version of gradient that requires the main input and thus needs to receive
1607 // length, indices and other stuff
1608 template <
1609  typename Tembedding,
1610  typename T,
1611  typename TLengths,
1612  class Context,
1613  class ReducerGradient,
1614  bool SparseFused = true,
1615  bool GradientNeedIndices = false>
1616 class AbstractLengthsWithMainInputGradientOp : public Operator<Context> {
1617  public:
1618  USE_OPERATOR_CONTEXT_FUNCTIONS;
1619  USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputGradientOp);
1620 
1621  bool RunOnDevice() override {
1622  if (SparseFused) {
1623  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1624  this, Input(INDICES));
1625  } else {
1626  // type doesn't matter
1627  return DoRunWithType<int64_t>();
1628  }
1629  }
1630 
1631  template <typename IndexType>
1632  bool DoRunWithType() {
1633  // If more complicated fixed size logic becomes necessary, it can be moved
1634  // to the reducer class
1635  int64_t in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1636  return DispatchHelper<typename ReducerGradient::FixedDispatch, IndexType>::
1637  call(this, in_block_size);
1638  }
1639 
1640  template <typename IndexType, int FixedSize>
1641  bool DoRunWithValue() {
1642  auto& dataInput = Input(DATA_INPUT);
1643  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1644  auto& lengthsInput = Input(LENGTHS);
1645 
1646  CAFFE_ENFORCE(lengthsInput.dim() == 1, "LENGTHS must be a vector");
1647  int64_t numSegments = lengthsInput.size(0);
1648  CAFFE_ENFORCE(segmentGradsInput.dim() > 0);
1649  CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0));
1650  const TLengths* lengths = lengthsInput.template data<TLengths>();
1651 
1652  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1653  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1654  int aux_num = ReducerGradient::originalInputs()[i];
1655  auto& aux_in = Input(i);
1656  auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
1657  ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
1658  }
1659 
1660  // Either the first dim of the data or how many indices we pull in from it
1661  int64_t dataToReduceSize;
1662  const IndexType* indices = nullptr;
1663  if (SparseFused) { // static if
1664  auto& indicesInput = Input(INDICES);
1665  indices = indicesInput.template data<IndexType>();
1666  dataToReduceSize = indicesInput.size(0);
1667  } else {
1668  dataToReduceSize = dataInput.size(0);
1669  }
1670 
1671  const T* segmentGrads = segmentGradsInput.template data<T>();
1672 
1673  vector<int64_t> shape;
1674  shape.push_back(dataToReduceSize);
1675  ctx.appendGradShape(&shape);
1676  auto* dataGradsOutput = Output(0, shape, at::dtype<T>());
1677 
1678  int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1679  int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1);
1680  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1681 
1682  const Tembedding* data = dataInput.template data<Tembedding>();
1683  int64_t dataIndex = 0;
1684  for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1685  ReducerGradient reducer(
1686  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1687  for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
1688  ++dataIndex) {
1689  IndexType data_pos;
1690  // No range checking, should've been verified in forward pass
1691  if (SparseFused) { // static if
1692  data_pos = indices[dataIndex];
1693  } else {
1694  data_pos = dataIndex;
1695  }
1696  reducer.template fillGradWithMainInput<FixedSize>(
1697  ctx,
1698  data + dataGradsBlockSize * data_pos,
1699  dataGrads + dataGradsBlockSize * dataIndex,
1700  dataIndex,
1701  &context_,
1702  lengths[rangeIndex]);
1703  }
1704  }
1705  return true;
1706  }
1707 
1708  // Input layout:
1709  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS,
1710  // DATA_INPUT, [INDICES]
1711  // orig_argXs represent original op's inputs and will be passed to the reducer
1712  // directly
1713  static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
1714  3 + (SparseFused ? 1 : 0) + (GradientNeedIndices ? 1 : 0);
1715  enum _InputTags {
1716  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1717  LENGTHS,
1718  DATA_INPUT,
1719  INDICES,
1720  };
1721 };
1722 
1723 // Version of gradient that requires the main input as well as the output of the
1724 // forward op.
1725 template <typename T, typename TLengths, class Context, class ReducerGradient>
1726 class AbstractLengthsWithMainInputAndForwardOutputGradientOp
1727  : public Operator<Context> {
1728  public:
1729  USE_OPERATOR_CONTEXT_FUNCTIONS;
1730  USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputAndForwardOutputGradientOp);
1731 
1732  bool RunOnDevice() override {
1733  // If more complicated fixed size logic becomes necessary, it can be moved
1734  // to the reducer class.
1735  int64_t in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1736  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1737  this, in_block_size);
1738  }
1739 
1740  template <int FixedSize>
1741  bool DoRunWithValue() {
1742  auto& dataInput = Input(DATA_INPUT);
1743  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1744  auto& lengthsInput = Input(LENGTHS);
1745  auto& forwardOutputInput = Input(FORWARD_OUTPUT);
1746 
1747  CAFFE_ENFORCE(lengthsInput.dim() == 1, "LENGTHS must be a vector");
1748  int64_t numSegments = lengthsInput.size(0);
1749  CAFFE_ENFORCE(segmentGradsInput.dim() > 0);
1750  CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0));
1751  const TLengths* lengths = lengthsInput.template data<TLengths>();
1752 
1753  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1754  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1755  int aux_num = ReducerGradient::originalInputs()[i];
1756  auto& aux_in = Input(i);
1757  auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
1758  ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
1759  }
1760 
1761  CAFFE_ENFORCE(forwardOutputInput.dim() > 0);
1762  CAFFE_ENFORCE(numSegments == forwardOutputInput.size(0));
1763  const T* forwardOutput = forwardOutputInput.template data<T>();
1764 
1765  int64_t dataToReduceSize = dataInput.size(0);
1766 
1767  const T* segmentGrads = segmentGradsInput.template data<T>();
1768 
1769  vector<int64_t> shape;
1770  shape.push_back(dataToReduceSize);
1771  ctx.appendGradShape(&shape);
1772  auto* dataGradsOutput = Output(0, shape, at::dtype<T>());
1773 
1774  int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1775  int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1);
1776  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1777 
1778  const T* data = dataInput.template data<T>();
1779 
1780  int64_t dataIndex = 0;
1781  for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1782  ReducerGradient reducer(
1783  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1784  for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
1785  ++dataIndex) {
1786  // No range checking, should've been verified in forward pass
1787  reducer.template fillGradWithMainInputAndForwardOutput<FixedSize>(
1788  ctx,
1789  data + dataGradsBlockSize * dataIndex,
1790  dataGrads + dataGradsBlockSize * dataIndex,
1791  forwardOutput + segmentBlockSize * rangeIndex,
1792  dataIndex,
1793  &context_,
1794  lengths[rangeIndex]);
1795  }
1796  }
1797  return true;
1798  }
1799 
1800  // Input layout:
1801  // orig_arg1, orig_arg2, ..., orig_argN, FORWARD_OUTPUT, SEGMENT_GRADS,
1802  // LENGTHS, DATA_INPUT
1803  // orig_argXs represent original op's inputs and will be passed to the reducer
1804  // directly
1805  static constexpr int kNumInputs =
1806  ReducerGradient::originalInputs().size() + 4;
1807  enum _InputTags {
1808  FORWARD_OUTPUT = ReducerGradient::originalInputs().size(),
1809  SEGMENT_GRADS,
1810  LENGTHS,
1811  DATA_INPUT,
1812  };
1813 };
1814 
1815 // base implementation of sparse/non-sparse gradient computation
1816 template <
1817  typename ForwardOp,
1818  typename ReducerDef,
1819  typename ReducerGradient,
1820  bool SparseFused,
1821  bool GradientNeedIndices = false>
1822 struct LengthsOpGetGradient : public GradientMakerBase {
1823  using GradientMakerBase::GradientMakerBase;
1824  vector<OperatorDef> GetGradientDefs() override {
1825  vector<string> grad_ins;
1826  string suffix = "Gradient";
1827  for (const int i : ReducerGradient::originalInputs()) {
1828  grad_ins.push_back(I(i));
1829  }
1830  if (ReducerGradient::requiresForwardOutput()) {
1831  grad_ins.push_back(O(0));
1832  CAFFE_ENFORCE(
1833  !SparseFused,
1834  "Forward pass output not yet supported as input for backward pass "
1835  "for SparseLengthsXXX operators");
1836  suffix = "AndForwardOutput" + suffix;
1837  }
1838  grad_ins.push_back(GO(0));
1839  grad_ins.push_back(I(ForwardOp::LENGTHS));
1840  bool indices_pushed = false;
1841  if (ReducerGradient::requiresDataInput(Def())) {
1842  grad_ins.push_back(I(0));
1843  if (SparseFused) {
1844  grad_ins.push_back(I(ForwardOp::INDICES));
1845  indices_pushed = true;
1846  }
1847  suffix = "WithMainInput" + suffix;
1848  }
1849  if (GradientNeedIndices && !indices_pushed) {
1850  if (SparseFused) {
1851  grad_ins.push_back(I(ForwardOp::INDICES));
1852  } else {
1853  // Hacky: using Input as Indices, remove this after we have specialized
1854  // cuda LengthsIndicesInGradientSumGradient
1855  grad_ins.push_back(I(0));
1856  }
1857  }
1858  vector<string> grad_outs;
1859  grad_outs.push_back({SparseFused ? GI_V(0) : GI(0)});
1860  int aux_grads = ReducerGradient::numAuxInputsWithGrads(Def());
1861  for (int i = 1; i <= aux_grads; ++i) {
1862  grad_outs.push_back(GI(i));
1863  }
1864  vector<OperatorDef> r{CreateOperatorDef(
1865  string(SparseFused ? "SparseLengths" : "Lengths") +
1866  string(GradientNeedIndices ? "IndicesInGradient" : "") +
1867  ReducerDef::name + suffix,
1868  "",
1869  grad_ins,
1870  grad_outs)};
1871  if (SparseFused) {
1872  SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
1873  }
1874  return r;
1875  }
1876 };
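// Sketch of the naming scheme used by GetGradientDefs() above (an illustration,
// not a Caffe2 API): the emitted gradient operator is named
//   ("SparseLengths" | "Lengths") + ["IndicesInGradient"] + ReducerDef::name + suffix
// where suffix accumulates "WithMainInput" / "AndForwardOutput" before "Gradient".
// The helper name below is hypothetical; std::string is assumed available via the
// existing includes.
inline std::string composeLengthsGradientOpName(
    bool sparse_fused,
    bool indices_in_gradient,
    const std::string& reducer_name,
    bool requires_main_input,
    bool requires_forward_output) {
  std::string suffix = "Gradient";
  if (requires_forward_output) {
    suffix = "AndForwardOutput" + suffix;
  }
  if (requires_main_input) {
    suffix = "WithMainInput" + suffix;
  }
  return std::string(sparse_fused ? "SparseLengths" : "Lengths") +
      (indices_in_gradient ? "IndicesInGradient" : "") + reducer_name + suffix;
}
// e.g. composeLengthsGradientOpName(false, false, "Sum", false, false)
//   returns "LengthsSumGradient"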
1877 
1878 template <
1879  typename T,
1880  typename SIndex,
1881  typename Context,
1882  typename ReducerDef,
1883  bool GradientNeedIndices = false>
1884 struct AbstractLengthsDef {
1885  using OpDef = ReducerDef;
1886  static constexpr const char* basename = "Lengths";
1887  static constexpr const char* doc = R"DOC(
1888 Applies '{op}' to each segment of the input tensor. Segments are defined
1889 by their *LENGTHS*. *LENGTHS* is a vector that maps each of the slices of
1890 *DATA* to a particular segment. Values belonging to the same segment are
1891 aggregated together and considered for the '{op}' operation.
1892 
1893 For example, *LENGTHS = [2, 1]* stands for segments *DATA[0..1]* and *DATA[2]*.
1894 
1895 The sum of elements in *LENGTHS* must equal the number of elements in the first
1896 dimension of *DATA*. The length of *OUTPUT* is equal to the number of input
1897 segments, i.e. len(*LENGTHS*).
1898 
1899 {op_doc}
1900 
1901 {extra}
1902  )DOC";
1903  static void PopulateSchema(OpSchema& schema) {
1904  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1905  schema.Input(
1906  Reducer::kInputCount,
1907  "LENGTHS",
1908  "Vector with the same sum of elements as the first dimension of DATA");
1909  schema.Output(
1910  0,
1911  "OUTPUT",
1912  "Aggregated output tensor. Has the first dimension of len(LENGTHS) ");
1913  schema.TensorInferenceFunction(
1914  [](const OperatorDef& def, const vector<TensorShape>& in) {
1915  vector<TensorShape> out(0);
1916  TensorShape output;
1917  for (int d : in[Reducer::kInputCount].dims()) {
1918  output.add_dims(d);
1919  }
1920  for (int j = 1; j < in[0].dims_size(); j++) {
1921  output.add_dims(in[0].dims(j));
1922  }
1923  output.set_data_type(in[0].data_type());
1924  out.push_back(output);
1925  return out;
1926  });
1927  ReducerDef::PopulateSchema(schema);
1928  }
1929  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1930  using ReducerGradient =
1931  typename ReducerDef::template ReducerGradient<T, Context>;
1932  using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer, false>;
1933  using BackwardOp =
1934  AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>;
1935  using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
1936  T,
1937  T,
1938  SIndex,
1939  Context,
1940  ReducerGradient,
1941  false>;
1942  using WithMainInputAndForwardOutputBackwardOp =
1943  AbstractLengthsWithMainInputAndForwardOutputGradientOp<
1944  T,
1945  SIndex,
1946  Context,
1947  ReducerGradient>;
1948  using GetGradient = LengthsOpGetGradient<
1949  ForwardOp,
1950  ReducerDef,
1951  ReducerGradient,
1952  false /*SparseFused*/,
1953  GradientNeedIndices>;
1954 };
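// Worked example for the doc string above (a sketch, not the operator itself):
// a Lengths<op> reduction with op == Sum over scalar rows. With DATA = {1, 2, 3}
// and LENGTHS = {2, 1} the segments are DATA[0..1] and DATA[2], so
// OUTPUT = {1 + 2, 3} = {3, 3}. The function name is hypothetical.
inline std::vector<float> lengthsSumSketch(
    const std::vector<float>& data,
    const std::vector<int>& lengths) {
  std::vector<float> output;
  size_t row = 0;
  for (int len : lengths) {
    float acc = 0.0f;
    for (int j = 0; j < len; ++j) {
      acc += data[row++]; // consume `len` consecutive rows of DATA
    }
    output.push_back(acc);
  }
  return output;
}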
1955 
1956 OpSchema::Cost CostInferenceForSparseLengths(
1957  const OperatorDef& def,
1958  const vector<TensorShape>& inputs,
1959  bool use_weight);
1960 
1961 template <
1962  typename T,
1963  typename SIndex,
1964  typename Context,
1965  typename ReducerDef,
1966  bool GradientNeedIndices = false>
1967 struct AbstractSparseLengthsDef {
1968  using OpDef = ReducerDef;
1969  static constexpr const char* basename = "SparseLengths";
1970  static constexpr const char* doc = R"DOC(
1971 Pulls in slices of the input tensor, groups them into segments and applies
1972 '{op}' to each segment. Segments are defined by their LENGTHS.
1973 
1974 This op is basically Gather and Lengths{op} fused together.
1975 
1976 INDICES should contain integers in range 0..N-1 where N is the first dimension
1977 of DATA. INDICES represent which slices of DATA need to be pulled in.
1978 
1979 LENGTHS is a vector that defines how many of the gathered slices belong to
1980 each segment. Values belonging to the same segment are aggregated together.
1981 sum(LENGTHS) must match the size of INDICES.
1982 
1983 The first dimension of the output is equal to the number of input segments,
1984 i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
1985 
1986 {op_doc}
1987  )DOC";
1988  static void PopulateSchema(OpSchema& schema) {
1989  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1990  schema.Input(
1991  Reducer::kInputCount,
1992  "INDICES",
1993  "Integer vector containing indices of the first dimension of DATA for "
1994  "the slices that are being aggregated");
1995  schema.Input(
1996  Reducer::kInputCount + 1,
1997  "LENGTHS",
1998  "Non negative vector with sum of elements equal to INDICES length");
1999  schema.Output(
2000  0,
2001  "OUTPUT",
2002  "Aggregated output tensor. Has the first dimension of K "
2003  "(the number of segments).");
2004  schema.TensorInferenceFunction(
2005  [](const OperatorDef&, const std::vector<TensorShape>& input_types) {
2006  std::vector<TensorShape> out(1);
2007  out[0] = input_types[0];
2008  out[0].set_dims(0, input_types[Reducer::kInputCount + 1].dims(0));
2009  return out;
2010  });
2011  ReducerDef::PopulateSchema(schema);
2012 
2013  schema.CostInferenceFunction(
2014  [](const OperatorDef& def,
2015  const vector<TensorShape>& inputs) -> OpSchema::Cost {
2016  return CostInferenceForSparseLengths(
2017  def, inputs, strcmp(OpDef::name, "WeightedSum") == 0);
2018  });
2019  }
2020  using Reducer = typename ReducerDef::template Reducer<T, Context>;
2021  using ReducerGradient =
2022  typename ReducerDef::template ReducerGradient<T, Context>;
2023  using ForwardOp = AbstractSparseLengthsOp<T, SIndex, Context, Reducer>;
2024  // TODO(dzhulgakov): we're registering the same class twice here,
2025  // consider avoiding op duplication here
2026  // Note: registering the 2-input version for now because of naming in the
2027  // macro; the 3-input version will be registered separately
2028  /* INDICES are not used in the CPU version, but they are needed in the async
2029  * CUDA version. So we register the 3-input version for CPU as the gradient
2030  * op for GPU/CPU conversion. We then register the 2-input version for CPU
2031  * for backward compatibility with older nets.
2032  */
2033  using BackwardOp = AbstractLengthsGradientOp<
2034  T,
2035  SIndex,
2036  Context,
2037  ReducerGradient,
2038  false /*GradientNeedIndices*/>;
2039  using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
2040  T,
2041  T,
2042  SIndex,
2043  Context,
2044  ReducerGradient>;
2045  // Will return the 3-input version. This aligns the new CPU/GPU nets.
2046  using GetGradient = LengthsOpGetGradient<
2047  ForwardOp,
2048  ReducerDef,
2049  ReducerGradient,
2050  true /*SparseFused*/,
2051  GradientNeedIndices>;
2052 };
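// Sketch of the fused semantics documented above (illustration only): a
// SparseLengths<op> with op == Sum over scalar rows is equivalent to gathering
// DATA rows by INDICES and then reducing them segment-wise by LENGTHS. With
// DATA = {10, 20, 30}, INDICES = {2, 0, 0}, LENGTHS = {1, 2}:
//   gathered = {30, 10, 10} and OUTPUT = {30, 10 + 10} = {30, 20}.
// The function name is hypothetical.
inline std::vector<float> sparseLengthsSumSketch(
    const std::vector<float>& data,
    const std::vector<int64_t>& indices,
    const std::vector<int>& lengths) {
  std::vector<float> output;
  size_t pos = 0;
  for (int len : lengths) {
    float acc = 0.0f;
    for (int j = 0; j < len; ++j) {
      acc += data[indices[pos++]]; // gather a row and reduce it in one pass
    }
    output.push_back(acc);
  }
  return output;
}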
2053 } // namespace caffe2
2054 
2055 #endif // CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_