Caffe2 - C++ API
A deep learning, cross-platform ML framework
segment_reduction_op.h
17 #ifndef CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
18 #define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
19 
20 #include "caffe2/core/context.h"
21 #include "caffe2/core/logging.h"
22 #include "caffe2/core/operator.h"
23 #include "caffe2/operators/reducer_functors.h"
24 
25 namespace caffe2 {
26 
27 template <typename TData>
28 class BaseInputAccessor {
29  public:
30  BaseInputAccessor() {}
31 
32  bool observeInput(const Tensor<CPUContext>& dataInput) {
33  data_ = dataInput.raw_data();
34  return dataInput.template IsType<TData>();
35  }
36 
37  inline const TData*
38  getBlockPtr(TIndex in_block_size, TIndex idx, TIndex /* blocks */ = 1) {
39  return static_cast<const TData*>(data_) + in_block_size * idx;
40  }
41 
42  protected:
43  const void* data_ = nullptr;
44 };
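// Usage note: for a row-major DATA tensor of shape (N, D), observeInput()
// stores the raw pointer and checks that the element type is TData, and
// getBlockPtr(D, i) then returns a pointer to row i, i.e. data + D * i. The
// operators below use this accessor to fetch one input slice ("block") per
// element being reduced.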
45 
47 // Range reducer ops: leverage that input segment is continuous and allow
48 // reducer functors to do something special
49 // Note: for now there are no real use cases for it yet :)
50 // Also, doesn't support additional arguments for now
52 
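// A RangeReducer is a functor applied once per contiguous segment. Judging
// from the call site in AbstractSortedSegmentRangeOp below, a minimal
// (hypothetical) sum reducer would look roughly like this sketch:
//
//   struct SumRangeReducerSketch {
//     template <typename T, class Context>
//     void operator()(
//         TIndex block_size, // elements per slice
//         TIndex blocks,     // number of slices in this segment
//         const T* in,       // first slice of the segment
//         T* out,            // output block for this segment
//         Context* /*context*/) {
//       for (TIndex j = 0; j < block_size; ++j) {
//         out[j] = 0;
//         for (TIndex b = 0; b < blocks; ++b) {
//           out[j] += in[b * block_size + j];
//         }
//       }
//     }
//   };
//
// The real range reducers are defined in caffe2/operators/reducer_functors.h.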
59 template <
60  typename T,
61  typename SIndex,
62  class Context,
63  class RangeReducer,
64  class InputAccessor = BaseInputAccessor<T>>
65 class AbstractSortedSegmentRangeOp : public Operator<Context> {
66  public:
67  USE_OPERATOR_CONTEXT_FUNCTIONS;
68  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeOp);
69 
70  bool RunOnDevice() override {
71  auto& dataInput = Input(DATA);
72  auto& segment_ids = Input(SEGMENT_IDS);
73  auto* output = Output(0);
74 
75  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
76  auto N = segment_ids.dim(0);
77  CAFFE_ENFORCE_EQ(
78  N,
79  dataInput.dim(0),
80  "SEGMENT_IDS must have the same length as outer dimension of DATA");
81 
82  OPERATOR_NEEDS_FEATURE(
83  inputAccessor_.observeInput(dataInput),
84  "Unsupported input type: ",
85  dataInput.meta().name(),
86  ".");
87 
88  const SIndex* s_ids = segment_ids.template data<SIndex>();
89 
90  const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
91  auto shape = dataInput.dims();
92  shape[0] = K;
93  output->Resize(shape);
94 
95  T* out = output->template mutable_data<T>();
96 
97  if (N == 0) {
98  return true;
99  }
100 
101  TIndex block_size = dataInput.size() / N;
102 
103  // Assume the segments are sorted and there are no gaps
104  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
105  for (TIndex i = 0; i < N;) {
106  TIndex start = i;
107  for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
108  ;
109 
110  RangeReducer()(
111  block_size,
112  i - start,
113  inputAccessor_.getBlockPtr(block_size, start, i - start),
114  out + block_size * s_ids[start],
115  &context_);
116 
117  // check correctness of the next segment
118  if (i < N) {
119  CAFFE_ENFORCE_EQ(
120  s_ids[start] + 1,
121  s_ids[i],
122  "Indices must be sorted and not have gaps");
123  }
124  }
125  return true;
126  }
127 
128  static constexpr int kNumInputs = 2;
129  INPUT_TAGS(DATA, SEGMENT_IDS);
130 
131  private:
132  InputAccessor inputAccessor_;
133 };
134 
135 template <
136  typename T,
137  typename SIndex,
138  class Context,
139  class RangeReducerGradient>
140 class AbstractSortedSegmentRangeGradientOp : public Operator<Context> {
141  public:
142  USE_OPERATOR_CONTEXT_FUNCTIONS;
143  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeGradientOp);
144 
145  bool RunOnDevice() override {
146  // TODO(azzolini): avoid using input/output if not used by a particular op
147  auto& data_in = Input(DATA_IN);
148  auto& data_out = Input(DATA_OUT);
149  auto& segment_grads = Input(SEGMENT_GRADS);
150  auto& segment_ids = Input(SEGMENT_IDS);
151  auto* data_grads = Output(0);
152 
153  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
154  TIndex N = segment_ids.dim(0);
155 
156  const SIndex* s_ids = segment_ids.template data<SIndex>();
157  const T* s_grads = segment_grads.template data<T>();
158  const T* d_in = data_in.template data<T>();
159  const T* d_out = data_out.template data<T>();
160 
161  auto shape = segment_grads.dims();
162  shape[0] = N;
163  data_grads->Resize(shape);
164 
165  const SIndex K = segment_grads.dim(0);
166  T* out = data_grads->template mutable_data<T>();
167 
168  if (N == 0) {
169  return true;
170  }
171 
172  TIndex block_size = segment_grads.size_from_dim(1);
173 
174  // Assume the segments are sorted and there are no gaps
175  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
176  // repeat the check from forward op
177  CAFFE_ENFORCE_EQ(
178  K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
179  for (TIndex i = 0; i < N;) {
180  TIndex start = i;
181  for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
182  ;
183 
184  auto expanded_idx = block_size * start;
185  auto reduced_idx = block_size * s_ids[start];
186  RangeReducerGradient()(
187  block_size,
188  i - start,
189  s_grads + reduced_idx,
190  out + expanded_idx,
191  d_in + expanded_idx,
192  d_out + reduced_idx,
193  &context_);
194 
195  // check correctness of the next segment
196  if (i < N) {
197  CAFFE_ENFORCE_EQ(
198  s_ids[start] + 1,
199  s_ids[i],
200  "Indices must be sorted and not have gaps");
201  }
202  }
203  return true;
204  }
205 
206  static constexpr int kNumInputs = 4;
207  INPUT_TAGS(DATA_IN, DATA_OUT, SEGMENT_GRADS, SEGMENT_IDS);
208 };
209 
210 template <typename T, typename SIndex, typename Context, typename ReducerDef>
211 struct AbstractSortedSegmentRangeDef {
212  using OpDef = ReducerDef;
213  static constexpr const char* basename = "SortedSegmentRange";
214  static constexpr const char* doc = R"DOC(
215 Applies '{op}' to each segment of input tensor. In order to allow for more
216 efficient implementation of '{op}', the input segments have to be contiguous
217 and non-empty.
218 
219 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
220 DATA to a particular group (segment). Values belonging to the same segment are
221 aggregated together.
222 
223 The first dimension of the output is equal to the number of input segments,
224 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
225 
226 {op_doc}
227  )DOC";
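// Example (illustrative values, with '{op}' = Sum):
//   DATA        = [[1, 2], [3, 4], [5, 6]]
//   SEGMENT_IDS = [0, 0, 1]
//   OUTPUT      = [[4, 6], [5, 6]]   // rows 0..1 form segment 0, row 2 forms segment 1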
228  static void PopulateSchema(OpSchema& schema) {
229  schema.Input(0, "DATA", "Input tensor to be aggregated");
230  schema.Input(
231  1,
232  "SEGMENT_IDS",
233  "Vector with the same length as the first dimension of DATA "
234  "and values in the range 0..K-1 and in increasing order that "
235  "maps each slice of DATA to one of the segments");
236  schema.Output(
237  0,
238  "OUTPUT",
239  "Aggregated tensor with the first dimension of K and the "
240  "other dimentsions inherited from DATA");
241  }
242  using ForwardOp = AbstractSortedSegmentRangeOp<
243  T,
244  SIndex,
245  Context,
246  typename ReducerDef::template Reducer<T, Context>>;
247  using BackwardOp = AbstractSortedSegmentRangeGradientOp<
248  T,
249  SIndex,
250  Context,
251  typename ReducerDef::template ReducerGradient<T, Context>>;
252  struct GetGradient : public GradientMakerBase {
253  using GradientMakerBase::GradientMakerBase;
254  vector<OperatorDef> GetGradientDefs() override {
255  return SingleGradientDef(
256  string(basename) + ReducerDef::name + "Gradient",
257  "",
258  vector<string>{I(0), O(0), GO(0), I(1)},
259  // no gradient on segment_ids!
260  vector<string>{GI(0)});
261  }
262  };
263 };
264 
266 // Incremental reducer ops: assume that reducer consumes pieces of data one by
267 // one. Also, supports additional arguments passed to reducer, e.g. scalers for
268 // weighted sum.
269 //
270 // Note: in current implementation additional inputs are considered auxiliary
271 // constants and have limitations:
272 // - there is no gradient computation for auxiliary inputs
273 // - auxiliary inputs aren't affected by fused embedding lookup in operations
274 // like sparse_sorted_segment
276 
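// The incremental Reducer contract, as exercised by the operators below (a
// rough sketch inferred from the call sites; the actual reducers live in
// caffe2/operators/reducer_functors.h):
//
//   struct SomeReducer {
//     static constexpr int kInputCount = ...; // DATA plus auxiliary inputs
//     struct Meta {
//       void observeInput(int input, const Tensor<Context>& value, int skip_dims);
//       void appendOutputShape(vector<TIndex>* output_shape);
//     };
//     SomeReducer(const Meta& meta, T* out, Context* context);
//     template <int FixedSize>
//     void process(const Meta& meta, const T* in, TIndex offset, Context* context);
//     template <int FixedSize>
//     void finish(const Meta& meta, Context* context);
//   };
//
// The matching ReducerGradient additionally exposes originalInputs(),
// computeLength(), and fillGrad<FixedSize>(), which the gradient ops below
// rely on.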
293 template <
294  typename T,
295  class Context,
296  class Reducer,
297  bool FirstDim,
298  class InputAccessor = BaseInputAccessor<T>>
299 class AbstractReduceFrontOrBackOp : public Operator<Context> {
300  public:
301  USE_OPERATOR_CONTEXT_FUNCTIONS;
302 
303  AbstractReduceFrontOrBackOp(const OperatorDef& operator_def, Workspace* ws)
304  : Operator<Context>(operator_def, ws),
305  OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
306 
307  bool RunOnDevice() override {
308  auto& data = Input(0);
309  // If more complicated fixed size logic becomes necessary, it can be moved
310  // to the reducer class
311  TIndex in_block_size = FirstDim
312  ? data.size_from_dim(num_reduce_dims_)
313  : data.size_to_dim(data.ndim() - num_reduce_dims_);
314  return DispatchHelper<typename Reducer::FixedDispatch>::call(
315  this, in_block_size);
316  }
317 
318  template <int FixedSize>
319  bool DoRunWithValue() {
320  auto& data = Input(0);
321  auto* output = Output(0);
322 
323  CAFFE_ENFORCE_LE(num_reduce_dims_, data.ndim());
324 
325  typename Reducer::Meta ctx(FirstDim);
326  ctx.observeInput(0, data, num_reduce_dims_);
327  for (int i = 1; i < Reducer::kInputCount; ++i) {
328  auto& aux_in = Input(i);
329  ctx.observeInput(i, aux_in, num_reduce_dims_);
330  }
331 
332  OPERATOR_NEEDS_FEATURE(
333  inputAccessor_.observeInput(data),
334  "Unsupported input type: ",
335  data.meta().name(),
336  ".");
337 
338  vector<TIndex> shape;
339  ctx.appendOutputShape(&shape);
340  output->Resize(shape);
341 
342  T* out = output->template mutable_data<T>();
343 
344  const int block_size = FirstDim
345  ? data.size_from_dim(num_reduce_dims_)
346  : data.size_from_dim(data.ndim() - num_reduce_dims_);
347 
348  const int num_blocks = block_size > 0 ? data.size() / block_size : 0;
349 
350  Reducer r(ctx, out, &context_);
351  for (TIndex i = 0; i < num_blocks; ++i) {
352  r.template process<FixedSize>(
353  ctx, inputAccessor_.getBlockPtr(block_size, i), i, &context_);
354  }
355  r.template finish<FixedSize>(ctx, &context_);
356  return true;
357  }
358 
359  static constexpr int kNumInputs = Reducer::kInputCount;
360 
361  private:
362  int num_reduce_dims_;
363  InputAccessor inputAccessor_;
364 };
365 
366 template <
367  typename T,
368  class Context,
369  class ReducerGradient,
370  bool FirstDim = true>
371 class AbstractReduceFrontOrBackGradientOp : public Operator<Context> {
372  public:
373  USE_OPERATOR_CONTEXT_FUNCTIONS;
374 
375  AbstractReduceFrontOrBackGradientOp(
376  const OperatorDef& operator_def,
377  Workspace* ws)
378  : Operator<Context>(operator_def, ws),
379  OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
380 
381  bool RunOnDevice() override {
382  // If more complicated fixed size logic becomes necessary, it can be moved
383  // to the reducer class
384  TIndex grad_block_size = Input(REDUCTION_GRAD).size();
385  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
386  this, grad_block_size);
387  }
388 
389  template <int FixedSize>
390  bool DoRunWithValue() {
391  auto& reduction_grad = Input(REDUCTION_GRAD);
392  auto& source_shape = OperatorBase::Input<TensorCPU>(SOURCE_SHAPE);
393 
394  auto* data_grads = Output(0);
395 
396  typename ReducerGradient::Meta ctx(reduction_grad, 0, FirstDim);
397  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
398  auto& aux_in = Input(i);
399  ctx.observeOriginalInput(
400  ReducerGradient::originalInputs()[i],
401  aux_in,
402  nullptr, /*no grad*/
403  num_reduce_dims_);
404  }
405 
406  const T* r_grad = reduction_grad.template data<T>();
407 
408  CAFFE_ENFORCE_LE(num_reduce_dims_, source_shape.size());
409 
410  vector<TIndex> shape(
411  source_shape.template data<TIndex>(),
412  source_shape.template data<TIndex>() + source_shape.size());
413 
414  data_grads->Resize(shape);
415 
416  TIndex block_size = FirstDim
417  ? data_grads->size_from_dim(num_reduce_dims_)
418  : data_grads->size_from_dim(data_grads->ndim() - num_reduce_dims_);
419  TIndex block_num = block_size > 0 ? data_grads->size() / block_size : 0;
420 
421  T* out = data_grads->template mutable_data<T>();
422 
423  ReducerGradient r(ctx, r_grad, &context_);
424  for (TIndex i = 0; i < block_num; ++i) {
425  r.template fillGrad<FixedSize>(
426  ctx,
427  out + block_size * i,
428  i,
429  &context_,
430  FirstDim ? block_num : block_size);
431  }
432  return true;
433  }
434 
435  static constexpr int kNumInputs =
436  ReducerGradient::originalInputs().size() + 2;
437  enum _InputTags {
438  REDUCTION_GRAD = ReducerGradient::originalInputs().size(),
439  SOURCE_SHAPE
440  };
441 
442  private:
443  int num_reduce_dims_;
444 };
445 
446 template <typename T, typename Context, typename ReducerDef>
447 struct AbstractReduceFrontDef {
448  using OpDef = ReducerDef;
449  static constexpr const char* basename = "ReduceFront";
450  static constexpr const char* doc = R"DOC(
451 Reduces the input tensor along the first dimension of the input tensor by
452 applying '{op}'. This op acts in a similar way to SortedSegment{op} and
453 UnsortedSegment{op} but as if all input slices belong to a single segment.
454 
455 {op_doc}
456  )DOC";
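// Example (illustrative, with '{op}' = Sum): for DATA of shape (2, 3, 4) and
// num_reduce_dim = 1, OUTPUT has shape (3, 4) with
// OUTPUT[j][k] = DATA[0][j][k] + DATA[1][j][k]; with num_reduce_dim = 2 the
// output shape would be (4).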
457  static void PopulateSchema(OpSchema& schema) {
458  schema.Input(
459  0, "DATA", "Input tensor to be reduced on the first dimension");
460  schema.TensorInferenceFunction([](const OperatorDef& def,
461  const vector<TensorShape>& in) {
462  CAFFE_ENFORCE_EQ(1, in.size());
463  ArgumentHelper helper(def);
464  int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
465  typename ReducerDef::template Reducer<T, Context>::Meta ctx(true);
466  vector<TIndex> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
467  return vector<TensorShape>{
468  CreateTensorShape(out_dims, in[0].data_type())};
469  });
470  ReducerDef::PopulateSchema(schema);
471  }
472  using ReducerGradient =
473  typename ReducerDef::template ReducerGradient<T, Context>;
474  using ForwardOp = AbstractReduceFrontOrBackOp<
475  T,
476  Context,
477  typename ReducerDef::template Reducer<T, Context>,
478  true>;
479  using BackwardOp =
480  AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, true>;
481  struct GetGradient : public GradientMakerBase {
482  using GradientMakerBase::GradientMakerBase;
483  vector<OperatorDef> GetGradientDefs() override {
484  // Have utility function generating these names?
485  string tmp_dims = "_" + O(0) + "_dims";
486 
487  vector<string> grad_ins;
488  for (const int i : ReducerGradient::originalInputs()) {
489  grad_ins.push_back(I(i));
490  }
491  grad_ins.push_back(GO(0));
492  grad_ins.push_back(tmp_dims);
493 
494  vector<Argument> args;
495  if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
496  args.push_back(GetArgument(def_, "num_reduce_dim"));
497  }
498  // FIXME: pass in num_reduce_dims?!
499  return vector<OperatorDef>{
500  CreateOperatorDef(
501  "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
502  CreateOperatorDef(
503  string(basename) + ReducerDef::name + "Gradient",
504  "",
505  grad_ins,
506  // no gradient on auxiliary inputs for now
507  vector<string>{GI(0)}),
508  };
509  }
510  };
511 };
512 
513 template <typename T, typename Context, typename ReducerDef>
514 struct AbstractReduceBackDef {
515  using OpDef = ReducerDef;
516  static constexpr const char* basename = "ReduceBack";
517  static constexpr const char* doc = R"DOC(
518 Reduces the input tensor along the last dimension of the input tensor by
519 applying '{op}'. This op acts in a similar way to SortedSegment{op} and
520 UnsortedSegment{op} but as if all input slices belong to a single segment.
521 
522 {op_doc}
523  )DOC";
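// Example (illustrative, with '{op}' = Sum): for DATA of shape (2, 3, 4) and
// num_reduce_dim = 1, OUTPUT has shape (2, 3) with
// OUTPUT[i][j] = sum over k of DATA[i][j][k].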
524  static void PopulateSchema(OpSchema& schema) {
525  schema.Input(
526  0, "DATA", "Input tensor to be reduced on the first dimension");
527  schema.TensorInferenceFunction([](const OperatorDef& def,
528  const vector<TensorShape>& in) {
529  CAFFE_ENFORCE_EQ(1, in.size());
530  ArgumentHelper helper(def);
531  int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
532  typename ReducerDef::template Reducer<T, Context>::Meta ctx(false);
533  vector<TIndex> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
534  return vector<TensorShape>{
535  CreateTensorShape(out_dims, in[0].data_type())};
536  });
537  ReducerDef::PopulateSchema(schema);
538  }
539  using ReducerGradient =
540  typename ReducerDef::template ReducerGradient<T, Context>;
541  using ForwardOp = AbstractReduceFrontOrBackOp<
542  T,
543  Context,
544  typename ReducerDef::template Reducer<T, Context>,
545  false>;
546  using BackwardOp =
547  AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, false>;
548  struct GetGradient : public GradientMakerBase {
549  using GradientMakerBase::GradientMakerBase;
550  vector<OperatorDef> GetGradientDefs() override {
551  // Have utility function generating these names?
552  string tmp_dims = "_" + O(0) + "_dims";
553 
554  vector<string> grad_ins;
555  for (const int i : ReducerGradient::originalInputs()) {
556  grad_ins.push_back(I(i));
557  }
558  grad_ins.push_back(GO(0));
559  grad_ins.push_back(tmp_dims);
560 
561  vector<Argument> args;
562  if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
563  args.push_back(GetArgument(def_, "num_reduce_dim"));
564  }
565  // FIXME: pass in num_reduce_dims?!
566  return vector<OperatorDef>{
567  CreateOperatorDef(
568  "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
569  CreateOperatorDef(
570  string(basename) + ReducerDef::name + "Gradient",
571  "",
572  grad_ins,
573  // no gradient on auxiliary inputs for now
574  vector<string>{GI(0)}),
575  };
576  }
577  };
578 };
579 
602 template <
603  typename T,
604  typename SIndex,
605  class Context,
606  class Reducer,
607  bool SparseFused = true,
608  class InputAccessor = BaseInputAccessor<T>>
609 class AbstractSortedSegmentOp : public Operator<Context> {
610  public:
611  USE_OPERATOR_CONTEXT_FUNCTIONS;
612  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentOp);
613 
614  bool RunOnDevice() override {
615  if (SparseFused) {
616  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
617  this, Input(INDICES));
618  } else {
619  // type doesn't matter
620  return DoRunWithType<TIndex>();
621  }
622  }
623 
624  template <typename IndexType>
625  bool DoRunWithType() {
626  // If more complicated fixed size logic becomes necessary, it can be moved
627  // to the reducer class
628  TIndex in_block_size = Input(0).size_from_dim(1);
629  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
630  this, in_block_size);
631  }
632 
633  template <typename IndexType, int FixedSize>
634  bool DoRunWithValue() {
635  auto& dataInput = Input(0);
636  auto& segment_ids = Input(SEGMENT_IDS);
637  auto* output = Output(0);
638 
639  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
640  TIndex N = segment_ids.dim(0);
641  const TIndex M = dataInput.dim(0);
642 
643  const IndexType* idxs;
644  if (SparseFused) { // static if
645  auto& indices = Input(INDICES);
646  CAFFE_ENFORCE_EQ(1, indices.ndim(), "INDICES must be a vector");
647  CAFFE_ENFORCE_EQ(
648  N,
649  indices.dim(0),
650  "SEGMENT_IDS must have the same length as INDICES");
651  idxs = indices.template data<IndexType>();
652  } else {
653  CAFFE_ENFORCE_EQ(
654  N, M, "DATA must have the same first dimension as SEGMENT_IDS");
655  }
656 
657  // It would probably look nicer with varargs templates but it's too much
658  // metaprogramming
659  typename Reducer::Meta ctx;
660  ctx.observeInput(0, dataInput, 1);
661  for (int i = 1; i < Reducer::kInputCount; ++i) {
662  auto& aux_in = Input(i);
663  CAFFE_ENFORCE_EQ(
664  N,
665  aux_in.dim(0),
666  "Input ",
667  i,
668  " must have the same first dim as SEGMENT_IDS");
669  ctx.observeInput(i, aux_in, 1);
670  }
671 
672  OPERATOR_NEEDS_FEATURE(
673  inputAccessor_.observeInput(dataInput),
674  "Unsupported input type: ",
675  dataInput.meta().name(),
676  ".");
677 
678  const SIndex* s_ids = segment_ids.template data<SIndex>();
679 
680  const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
681  vector<TIndex> shape;
682  shape.push_back(K);
683  ctx.appendOutputShape(&shape);
684  output->Resize(shape);
685 
686  T* out = output->template mutable_data<T>();
687  if (N == 0) {
688  return true;
689  }
690  TIndex in_block_size = dataInput.size_from_dim(1);
691  TIndex out_block_size = output->size_from_dim(1);
692 
693  // Assume the segments are sorted and there are no gaps
694  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
695  for (TIndex i = 0; i < N;) {
696  TIndex start = i;
697 
698  Reducer r(ctx, out + out_block_size * s_ids[start], &context_);
699  for (; i < N && s_ids[start] == s_ids[i]; ++i) {
700  IndexType idx;
701  if (SparseFused) { // static if
702  CAFFE_ENFORCE(
703  0 <= idxs[i] && idxs[i] < M,
704  "Index out of bounds: ",
705  idxs[i],
706  ", range 0 to ",
707  M);
708  idx = idxs[i];
709  } else {
710  idx = i;
711  }
712  r.template process<FixedSize>(
713  ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
714  }
715 
716  r.template finish<FixedSize>(ctx, &context_);
717  // check correctness of the next segment
718  if (i < N) {
719  CAFFE_ENFORCE_EQ(
720  s_ids[start] + 1,
721  s_ids[i],
722  "Indices must be sorted and not have gaps");
723  }
724  }
725  return true;
726  }
727 
728  enum {
729  INDICES = Reducer::kInputCount,
730  SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
731  };
732  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
733  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
734 
735  private:
736  InputAccessor inputAccessor_;
737 };
738 
739 // Gradient actually doesn't depend on whether sparse lookup is fused or not
740 template <typename T, typename SIndex, class Context, class ReducerGradient>
741 class AbstractSortedSegmentGradientOp : public Operator<Context> {
742  public:
743  USE_OPERATOR_CONTEXT_FUNCTIONS;
744  USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentGradientOp);
745 
746  bool RunOnDevice() override {
747  // If more complicated fixed size logic becomes necessary, it can be moved
748  // to the reducer class
749  TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
750  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
751  this, grad_block_size);
752  }
753 
754  template <int FixedSize>
755  bool DoRunWithValue() {
756  auto& segment_grads = Input(SEGMENT_GRADS);
757  auto& segment_ids = Input(SEGMENT_IDS);
758  auto* data_grads = Output(0);
759 
760  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
761  TIndex N = segment_ids.dim(0);
762 
763  typename ReducerGradient::Meta ctx(segment_grads, 1);
764  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
765  auto& aux_in = Input(i);
766  CAFFE_ENFORCE_EQ(
767  N,
768  aux_in.dim(0),
769  "Input ",
770  i,
771  " must have the same first dim as SEGMENT_IDS");
772  ctx.observeOriginalInput(
773  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
774  }
775 
776  const SIndex* s_ids = segment_ids.template data<SIndex>();
777  const T* s_grads = segment_grads.template data<T>();
778 
779  vector<TIndex> shape;
780  shape.push_back(N);
781  ctx.appendGradShape(&shape);
782  data_grads->Resize(shape);
783 
784  TIndex d_block_size = data_grads->size_from_dim(1);
785  const SIndex K = segment_grads.dim(0);
786  TIndex s_block_size = segment_grads.size_from_dim(1);
787  T* out = data_grads->template mutable_data<T>();
788 
789  if (N == 0) {
790  return true;
791  }
792 
793  // Assume the segments are sorted and there are no gaps
794  CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
795  // repeat the check from forward op
796  CAFFE_ENFORCE_EQ(
797  K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
798  for (TIndex i = 0; i < N;) {
799  TIndex start = i;
800  TIndex end = start;
801 
802  if (ReducerGradient::computeLength()) {
803  for (; end < N && s_ids[start] == s_ids[end]; ++end) {
804  }
805  }
806 
807  ReducerGradient r(ctx, s_grads + s_block_size * s_ids[start], &context_);
808  for (; i < N && s_ids[start] == s_ids[i]; ++i) {
809  r.template fillGrad<FixedSize>(
810  ctx, out + d_block_size * i, i, &context_, end - start);
811  }
812 
813  // check correctness of the next segment
814  if (i < N) {
815  CAFFE_ENFORCE_EQ(
816  s_ids[start] + 1,
817  s_ids[i],
818  "Indices must be sorted and not have gaps");
819  }
820  }
821  return true;
822  }
823 
824  // Input layout:
825  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
826  // orig_argXs represent original op's inputs and will be passed to the reducer
827  // directly
828  static constexpr int kNumInputs =
829  ReducerGradient::originalInputs().size() + 2;
830  enum _InputTags {
831  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
832  SEGMENT_IDS
833  };
834 };
835 
836 // base implementation of sorted/unsorted sparse/non-sparse gradient computation
837 template <
838  typename ForwardOp,
839  typename ReducerDef,
840  typename ReducerGradient,
841  bool Sorted,
842  bool SparseFused>
843 struct SegmentOpGetGradient : public GradientMakerBase {
844  using GradientMakerBase::GradientMakerBase;
845  vector<OperatorDef> GetGradientDefs() override {
846  CAFFE_ENFORCE(
847  !ReducerGradient::requiresDataInput(Def()),
848  "grads on aux inputs are not yet implemented for Segment operators.");
849  vector<string> grad_ins;
850  for (const int i : ReducerGradient::originalInputs()) {
851  grad_ins.push_back(I(i));
852  }
853  grad_ins.push_back(GO(0));
854  grad_ins.push_back(I(ForwardOp::SEGMENT_IDS));
855  vector<OperatorDef> r{CreateOperatorDef(
856  string(Sorted ? "SortedSegment" : "UnsortedSegment") +
857  ReducerDef::name + "Gradient",
858  "",
859  grad_ins,
860  // no gradient on segment_ids or auxiliary inputs for now
861  vector<string>{SparseFused ? GI_V(0) : GI(0)})};
862  if (SparseFused) {
863  SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
864  }
865  return r;
866  }
867 };
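// For example, for SortedSegmentSum (Sorted = true, SparseFused = false, and a
// reducer with no auxiliary inputs) this maker emits a single
// "SortedSegmentSumGradient" op taking the output gradient and SEGMENT_IDS and
// producing the gradient of DATA. In the SparseFused case the data gradient is
// emitted as a sparse (indexed) gradient tied to INDICES via SetSparse().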
868 
869 template <typename T, typename SIndex, typename Context, typename ReducerDef>
870 struct AbstractSortedSegmentDef {
871  using OpDef = ReducerDef;
872  static constexpr const char* basename = "SortedSegment";
873  static constexpr const char* doc = R"DOC(
874 Applies '{op}' to each segment of input tensor. Segments need to be sorted and
875 contiguous. See also UnsortedSegment{op} that doesn't have this requirement.
876 
877 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
878 DATA to a particular group (segment). Values belonging to the same segment are
879 aggregated together.
880 
881 The first dimension of the output is equal to the number of input segments,
882 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
883 
884 {op_doc}
885  )DOC";
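// Example (illustrative values, with '{op}' = Sum):
//   DATA        = [[1, 1], [2, 2], [3, 3], [4, 4]]
//   SEGMENT_IDS = [0, 0, 1, 2]
//   OUTPUT      = [[3, 3], [3, 3], [4, 4]]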
886  static void PopulateSchema(OpSchema& schema) {
887  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
888  schema.Input(
889  Reducer::kInputCount,
890  "SEGMENT_IDS",
891  "Vector with the same length as the first dimension of DATA "
892  "and values in the range 0..K-1 and in increasing order that "
893  "maps each slice of DATA to one of the segments");
894  schema.Output(
895  0,
896  "OUTPUT",
897  "Aggregated output tensor. Has the first dimension of K "
898  "(the number of segments).");
899  ReducerDef::PopulateSchema(schema);
900  }
901  using Reducer = typename ReducerDef::template Reducer<T, Context>;
902  using ReducerGradient =
903  typename ReducerDef::template ReducerGradient<T, Context>;
904  using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer, false>;
905  using BackwardOp =
906  AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
907  using GetGradient = SegmentOpGetGradient<
908  ForwardOp,
909  ReducerDef,
910  ReducerGradient,
911  true /*Sorted*/,
912  false /*SparseFused*/>;
913 };
914 
915 template <typename T, typename SIndex, typename Context, typename ReducerDef>
916 struct AbstractSparseSortedSegmentDef {
917  using OpDef = ReducerDef;
918  static constexpr const char* basename = "SparseSortedSegment";
919  static constexpr const char* doc = R"DOC(
920 Pulls in slices of the input tensor, groups them into segments and applies
921 '{op}' to each segment. Segments need to be sorted and contiguous. See also
922 SparseUnsortedSegment{op} that doesn't have this requirement.
923 
924 This op is basically Gather and SortedSegment{op} fused together.
925 
926 INDICES should contain integers in range 0..N-1 where N is the first dimension
927 of DATA. INDICES represent which slices of DATA need to be pulled in.
928 
929 SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
930 particular group (segment). Values belonging to the same segment are aggregated
931 together. SEGMENT_IDS should have the same dimension as INDICES.
932 
933 The first dimension of the output is equal to the number of input segments,
934 i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
935 
936 {op_doc}
937  )DOC";
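// Example (illustrative values, with '{op}' = Sum):
//   DATA        = [[1, 1], [2, 2], [3, 3]]
//   INDICES     = [2, 0, 0]
//   SEGMENT_IDS = [0, 0, 1]
//   OUTPUT      = [[4, 4], [1, 1]]   // segment 0 = DATA[2] + DATA[0], segment 1 = DATA[0]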
938  static void PopulateSchema(OpSchema& schema) {
939  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
940  schema.Input(
941  Reducer::kInputCount,
942  "INDICES",
943  "Integer vector containing indices of the first dimension of DATA for "
944  "the slices that are being aggregated");
945  schema.Input(
946  Reducer::kInputCount + 1,
947  "SEGMENT_IDS",
948  "Vector with the same length as INDICES and values in the range "
949  "0..K-1 and in increasing order that maps each slice of DATA referenced"
950  " by INDICES to one of the segments");
951  schema.Output(
952  0,
953  "OUTPUT",
954  "Aggregated output tensor. Has the first dimension of K "
955  "(the number of segments).");
956  ReducerDef::PopulateSchema(schema);
957  }
958  using Reducer = typename ReducerDef::template Reducer<T, Context>;
959  using ReducerGradient =
960  typename ReducerDef::template ReducerGradient<T, Context>;
961  using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer>;
962  // TODO(dzhulgakov): we're registering the same class twice here,
963  // consider avoiding op duplication here
964  using BackwardOp =
965  AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
966  using GetGradient = SegmentOpGetGradient<
967  ForwardOp,
968  ReducerDef,
969  ReducerGradient,
970  true /*Sorted*/,
971  true /*SparseFused*/>;
972 };
973 
1003 template <
1004  typename T,
1005  typename SIndex,
1006  class Context,
1007  class Reducer,
1008  bool SparseFused = true,
1009  class InputAccessor = BaseInputAccessor<T>>
1010 class AbstractUnsortedSegmentOp : public Operator<Context> {
1011  public:
1012  USE_OPERATOR_CONTEXT_FUNCTIONS;
1013 
1014  AbstractUnsortedSegmentOp(const OperatorDef& operator_def, Workspace* ws)
1015  : Operator<Context>(operator_def, ws),
1016  OP_SINGLE_ARG(int, "num_segments", num_segments_, -1) {}
1017 
1018  bool RunOnDevice() override {
1019  if (SparseFused) {
1020  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1021  this, Input(INDICES));
1022  } else {
1023  // type doesn't matter
1024  return DoRunWithType<TIndex>();
1025  }
1026  }
1027 
1028  template <typename IndexType>
1029  bool DoRunWithType() {
1030  // If more complicated fixed size logic becomes necessary, it can be moved
1031  // to the reducer class
1032  TIndex in_block_size = Input(0).size_from_dim(1);
1033  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
1034  this, in_block_size);
1035  }
1036 
1037  template <typename IndexType, int FixedSize>
1038  bool DoRunWithValue() {
1039  auto& data = Input(0);
1040  auto& segment_ids = Input(SEGMENT_IDS);
1041  auto* output = Output(0);
1042 
1043  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
1044  TIndex N = segment_ids.dim(0);
1045  const TIndex M = data.dim(0);
1046 
1047  const IndexType* idxs;
1048  if (SparseFused) { // static if
1049  auto& indices = Input(INDICES);
1050  CAFFE_ENFORCE_EQ(1, indices.ndim(), "INDICES must be a vector");
1051  CAFFE_ENFORCE_EQ(
1052  N,
1053  indices.dim(0),
1054  "SEGMENT_IDS must have the same length as INDICES");
1055  idxs = indices.template data<IndexType>();
1056  } else {
1057  CAFFE_ENFORCE_EQ(
1058  N, M, "DATA must have the same first dimension as SEGMENT_IDS");
1059  }
1060 
1061  // It would probably look nicer with varargs templates but it's too much
1062  // metaprogramming
1063  typename Reducer::Meta ctx;
1064  ctx.observeInput(0, data, 1);
1065  for (int i = 1; i < Reducer::kInputCount; ++i) {
1066  auto& aux_in = Input(i);
1067  CAFFE_ENFORCE_EQ(
1068  N,
1069  aux_in.dim(0),
1070  "Input ",
1071  i,
1072  " must have the same first dim as SEGMENT_IDS");
1073  ctx.observeInput(i, aux_in, 1);
1074  }
1075 
1076  const SIndex* s_ids = segment_ids.template data<SIndex>();
1077  OPERATOR_NEEDS_FEATURE(
1078  inputAccessor_.observeInput(data),
1079  "Unsupported input type: ",
1080  data.meta().name(),
1081  ".");
1082 
1083  // determine the number of segments
1084  SIndex K;
1085  if (num_segments_ != -1) {
1086  K = num_segments_;
1087  } else {
1088  K = 0;
1089  for (TIndex i = 0; i < N; ++i) {
1090  K = std::max(K, s_ids[i] + 1);
1091  }
1092  }
1093 
1094  vector<TIndex> shape;
1095  shape.push_back(K);
1096  ctx.appendOutputShape(&shape);
1097  output->Resize(shape);
1098 
1099  TIndex in_block_size = data.size_from_dim(1);
1100  TIndex out_block_size = output->size_from_dim(1);
1101  T* out = output->template mutable_data<T>();
1102 
1103  reducers_.clear();
1104  reducers_.reserve(K);
1105  for (TIndex i = 0; i < K; ++i) {
1106  reducers_.emplace_back(ctx, out + out_block_size * i, &context_);
1107  }
1108 
1109  for (TIndex i = 0; i < N; ++i) {
1110  auto s_id = s_ids[i];
1111  CAFFE_ENFORCE(
1112  0 <= s_id && s_id < K,
1113  "Segment id out of range: ",
1114  s_id,
1115  ", range 0 to ",
1116  K);
1117  IndexType idx;
1118  if (SparseFused) { // static if
1119  CAFFE_ENFORCE(
1120  0 <= idxs[i] && idxs[i] < M,
1121  "Index out of bounds: ",
1122  idxs[i],
1123  ", range 0 to ",
1124  M);
1125  idx = idxs[i];
1126  } else {
1127  idx = i;
1128  }
1129  reducers_[s_id].template process<FixedSize>(
1130  ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
1131  }
1132 
1133  for (TIndex i = 0; i < K; ++i) {
1134  reducers_[i].template finish<FixedSize>(ctx, &context_);
1135  }
1136  // call reducers destructors (if there is any)
1137  reducers_.clear();
1138  return true;
1139  }
1140 
1141  enum {
1142  INDICES = Reducer::kInputCount,
1143  SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
1144  };
1145  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
1146  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
1147 
1148  private:
1149  TIndex num_segments_;
1150  // member field to reuse memory
1151  vector<Reducer> reducers_;
1152  InputAccessor inputAccessor_;
1153 };
1154 
1155 // Gradient actually doesn't depend on whether sparse lookup is fused or not
1156 template <typename T, typename SIndex, class Context, class ReducerGradient>
1157 class AbstractUnsortedSegmentGradientOp : public Operator<Context> {
1158  public:
1159  USE_OPERATOR_CONTEXT_FUNCTIONS;
1160  USE_SIMPLE_CTOR_DTOR(AbstractUnsortedSegmentGradientOp);
1161 
1162  bool RunOnDevice() override {
1163  // If more complicated fixed size logic becomes necessary, it can be moved
1164  // to the reducer class
1165  TIndex grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1166  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1167  this, grad_block_size);
1168  }
1169 
1170  template <int FixedSize>
1171  bool DoRunWithValue() {
1172  auto& segment_grads = Input(SEGMENT_GRADS);
1173  auto& segment_ids = Input(SEGMENT_IDS);
1174  auto* data_grads = Output(0);
1175 
1176  CAFFE_ENFORCE_EQ(1, segment_ids.ndim(), "SEGMENT_IDS must be a vector");
1177  TIndex N = segment_ids.dim(0);
1178 
1179  typename ReducerGradient::Meta ctx(segment_grads, 1);
1180  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1181  auto& aux_in = Input(i);
1182  CAFFE_ENFORCE_EQ(
1183  N,
1184  aux_in.dim(0),
1185  "Input ",
1186  i,
1187  " must have the same first dim as SEGMENT_IDS");
1188  ctx.observeOriginalInput(
1189  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
1190  }
1191 
1192  const SIndex* s_ids = segment_ids.template data<SIndex>();
1193  const T* s_grads = segment_grads.template data<T>();
1194 
1195  vector<TIndex> shape;
1196  shape.push_back(N);
1197  ctx.appendGradShape(&shape);
1198  data_grads->Resize(shape);
1199 
1200  TIndex d_block_size = data_grads->size_from_dim(1);
1201  const SIndex K = segment_grads.dim(0);
1202  TIndex s_block_size = segment_grads.size_from_dim(1);
1203  T* out = data_grads->template mutable_data<T>();
1204 
1205  if (ReducerGradient::computeLength()) {
1206  segment_length_.resize(K, 0);
1207  for (int i = 0; i < N; ++i) {
1208  auto s_id = s_ids[i];
1209  CAFFE_ENFORCE(
1210  0 <= s_id && s_id < K,
1211  "Segment id out of range: ",
1212  s_id,
1213  ", range 0 to ",
1214  K);
1215  segment_length_[s_ids[i]]++;
1216  }
1217  }
1218 
1219  reducers_.clear();
1220  reducers_.reserve(K);
1221  for (SIndex i = 0; i < K; ++i) {
1222  reducers_.emplace_back(ctx, s_grads + s_block_size * i, &context_);
1223  }
1224 
1225  for (TIndex i = 0; i < N; ++i) {
1226  auto s_id = s_ids[i];
1227  if (ReducerGradient::computeLength()) {
1228  reducers_[s_id].template fillGrad<FixedSize>(
1229  ctx, out + d_block_size * i, i, &context_, segment_length_[s_id]);
1230  } else {
1231  reducers_[s_id].template fillGrad<FixedSize>(
1232  ctx, out + d_block_size * i, i, &context_, 0);
1233  }
1234  }
1235  // call reducers destructors (if there is any)
1236  reducers_.clear();
1237  return true;
1238  }
1239 
1240  // Input layout:
1241  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
1242  // orig_argXs represent original op's inputs and will be passed to the reducer
1243  // directly
1244  static constexpr int kNumInputs =
1245  ReducerGradient::originalInputs().size() + 2;
1246  enum _InputTags {
1247  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1248  SEGMENT_IDS
1249  };
1250 
1251  private:
1252  // member field to reuse memory
1253  vector<ReducerGradient> reducers_;
1254  vector<int> segment_length_;
1255 };
1256 
1257 template <typename T, typename SIndex, typename Context, typename ReducerDef>
1258 struct AbstractUnsortedSegmentDef {
1259  using OpDef = ReducerDef;
1260  static constexpr const char* basename = "UnsortedSegment";
1261  static constexpr const char* doc = R"DOC(
1262 Applies '{op}' to each segment of input tensor. Segments ids can appear in
1263 arbitrary order (unlike in SortedSegment{op}).
1264 
1265 SEGMENT_IDS is a vector that maps each of the first dimension slices of the
1266 DATA to a particular group (segment). Values belonging to the same segment are
1267 aggregated together.
1268 
1269 If the `num_segments` argument is passed, it is used as the first dimension of
1270 the output. Otherwise, it is computed dynamically as the max value of
1271 SEGMENT_IDS plus one. Other output dimensions are inherited from the input
1272 tensor.
1273 
1274 {op_doc}
1275  )DOC";
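// Example (illustrative values, with '{op}' = Sum and no num_segments given):
//   DATA        = [[1, 1], [2, 2], [3, 3]]
//   SEGMENT_IDS = [1, 0, 1]
//   OUTPUT      = [[2, 2], [4, 4]]   // K = max(SEGMENT_IDS) + 1 = 2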
1276  static void PopulateSchema(OpSchema& schema) {
1277  schema.Arg(
1278  "num_segments",
1279  "Optional int argument specifying the number of output segments and "
1280  "thus the first dimension of the output");
1281  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1282  schema.Input(
1283  Reducer::kInputCount,
1284  "SEGMENT_IDS",
1285  "Integer vector with the same length as the first dimension of DATA "
1286  "that maps each slice of DATA to one of the segments");
1287  schema.Output(
1288  0,
1289  "OUTPUT",
1290  "Aggregated output tensor. Has the first dimension of equal to the "
1291  "number of segments.");
1292  ReducerDef::PopulateSchema(schema);
1293  }
1294  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1295  using ReducerGradient =
1296  typename ReducerDef::template ReducerGradient<T, Context>;
1297  using ForwardOp = AbstractUnsortedSegmentOp<
1298  T,
1299  SIndex,
1300  Context,
1301  typename ReducerDef::template Reducer<T, Context>,
1302  false>;
1303  using BackwardOp =
1304  AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
1305  using GetGradient = SegmentOpGetGradient<
1306  ForwardOp,
1307  ReducerDef,
1308  ReducerGradient,
1309  false /*Sorted*/,
1310  false /*SparseFused*/>;
1311 };
1312 
1313 template <typename T, typename SIndex, typename Context, typename ReducerDef>
1314 struct AbstractSparseUnsortedSegmentDef {
1315  using OpDef = ReducerDef;
1316  static constexpr const char* basename = "SparseUnsortedSegment";
1317  static constexpr const char* doc = R"DOC(
1318 Pulls in slices of the input tensor, groups them into segments and applies
1319 '{op}' to each segment. Segments ids can appear in arbitrary order (unlike in
1320 SparseSortedSegment{op}).
1321 
1322 This op is basically Gather and UnsortedSegment{op} fused together.
1323 
1324 INDICES should contain integers in range 0..N-1 where N is the first dimension
1325 of DATA. INDICES represent which slices of DATA need to be pulled in.
1326 
1327 SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
1328 particular group (segment). Values belonging to the same segment are aggregated
1329 together. SEGMENT_IDS should have the same dimension as INDICES.
1330 
1331 If the `num_segments` argument is passed, it is used as the first dimension of
1332 the output. Otherwise, it is computed dynamically as the max value of
1333 SEGMENT_IDS plus one. Other output dimensions are inherited from the input
1334 tensor.
1335 
1336 {op_doc}
1337  )DOC";
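// Example (illustrative values, with '{op}' = Sum):
//   DATA        = [[1, 1], [2, 2], [3, 3]]
//   INDICES     = [1, 2, 1]
//   SEGMENT_IDS = [1, 0, 1]
//   OUTPUT      = [[3, 3], [4, 4]]   // segment 0 = DATA[2], segment 1 = DATA[1] + DATA[1]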
1338  static void PopulateSchema(OpSchema& schema) {
1339  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1340  schema.Input(
1341  Reducer::kInputCount,
1342  "INDICES",
1343  "Integer vector containing indices of the first dimension of DATA for "
1344  "the slices that are being aggregated");
1345  schema.Input(
1346  Reducer::kInputCount + 1,
1347  "SEGMENT_IDS",
1348  "Integer vector with the same length as INDICES that maps each slice "
1349  "of DATA referenced by INDICES to one of the segments");
1350  schema.Output(
1351  0,
1352  "OUTPUT",
1353  "Aggregated output tensor. Has the first dimension of equal to the "
1354  "number of segments.");
1355  ReducerDef::PopulateSchema(schema);
1356  }
1357  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1358  using ReducerGradient =
1359  typename ReducerDef::template ReducerGradient<T, Context>;
1360  using ForwardOp = AbstractUnsortedSegmentOp<T, SIndex, Context, Reducer>;
1361  // TODO(dzhulgakov): we're registering the same class twice here,
1362  // consider avoiding op duplication here
1363  using BackwardOp =
1364  AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
1365  using GetGradient = SegmentOpGetGradient<
1366  ForwardOp,
1367  ReducerDef,
1368  ReducerGradient,
1369  false /*Sorted*/,
1370  true /*SparseFused*/>;
1371 };
1372 
1395 // TODO(dzhulgakov): for now it's implemented with incremental reducers because
1396 // of fused sparse support. But using "lengths" representation actually implies
1397 // continuous segments and thus range reducers can be used for non-sparse
1398 // version.
1399 
1400 template <
1401  typename TData,
1402  typename TLengths,
1403  class Context,
1404  class Reducer,
1405  bool SparseFused = true,
1406  class InputAccessor = BaseInputAccessor<TData>>
1407 class AbstractLengthsOp : public Operator<Context> {
1408  public:
1409  USE_OPERATOR_CONTEXT_FUNCTIONS;
1410  USE_SIMPLE_CTOR_DTOR(AbstractLengthsOp);
1411 
1412  bool RunOnDevice() override {
1413  if (SparseFused) {
1414  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1415  this, Input(INDICES));
1416  } else {
1417  // type doesn't matter
1418  return DoRunWithType<TIndex>();
1419  }
1420  }
1421 
1422  template <typename IndexType>
1423  bool DoRunWithType() {
1424  // If more complicated fixed size logic becomes necessary, it can be moved
1425  // to the reducer class
1426  TIndex in_block_size = Input(0).size_from_dim(1);
1427  return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
1428  this, in_block_size);
1429  }
1430 
1431  template <typename IndexType, int FixedSize>
1432  bool DoRunWithValue() {
1433  auto& dataInput = Input(0);
1434  auto& lengthsInput = Input(LENGTHS);
1435  auto* output = Output(0);
1436 
1437  CAFFE_ENFORCE_EQ(1, lengthsInput.ndim(), "LENGTHS must be a vector");
1438  const TIndex dataSize = dataInput.dim(0);
1439  // Either the first dim of the data or how many indices we pull in from it
1440  TIndex dataToReduceSize;
1441  const TIndex outputSize = lengthsInput.dim(0);
1442 
1443  const IndexType* indices;
1444  if (SparseFused) { // static if
1445  auto& indicesInput = Input(INDICES);
1446  CAFFE_ENFORCE_EQ(1, indicesInput.ndim(), "INDICES must be a vector");
1447  indices = indicesInput.template data<IndexType>();
1448  dataToReduceSize = indicesInput.dim(0);
1449  } else {
1450  dataToReduceSize = dataSize;
1451  }
1452 
1453  typename Reducer::Meta ctx;
1454  ctx.observeInput(0, dataInput, 1);
1455  for (int i = 1; i < Reducer::kInputCount; ++i) {
1456  auto& aux_in = Input(i);
1457  CAFFE_ENFORCE(
1458  dataToReduceSize == aux_in.dim(0),
1459  "Input ",
1460  i,
1461  " must have the same first dim as SEGMENT_IDS");
1462  ctx.observeInput(i, aux_in, 1);
1463  }
1464 
1465  const TLengths* lengths = lengthsInput.template data<TLengths>();
1466 
1467  OPERATOR_NEEDS_FEATURE(
1468  inputAccessor_.observeInput(dataInput),
1469  "Unsupported input type: ",
1470  dataInput.meta().name(),
1471  ".");
1472 
1473  vector<TIndex> shape{outputSize};
1474  ctx.appendOutputShape(&shape);
1475  output->Resize(shape);
1476 
1477  TIndex in_block_size = dataInput.size_from_dim(1);
1478  TIndex out_block_size = output->size_from_dim(1);
1479  TData* out = output->template mutable_data<TData>();
1480 
1481  TIndex dataIndex = 0;
1482  for (TIndex rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) {
1483  Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_);
1484  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1485  ++dataIndex) {
1486  IndexType idx;
1487  if (SparseFused) { // static if
1488  idx = indices[dataIndex];
1489  CAFFE_ENFORCE(
1490  0 <= idx && idx < dataSize,
1491  "Index ",
1492  dataIndex,
1493  " is out of bounds: ",
1494  idx,
1495  ", range 0 to ",
1496  dataSize);
1497  } else {
1498  idx = dataIndex;
1499  CAFFE_ENFORCE(
1500  idx < dataSize,
1501  "Range ",
1502  rangeIndex,
1503  " of length ",
1504  lengths[rangeIndex],
1505  " is out of bound ",
1506  dataSize);
1507  }
1508 
1509  const TData* input = inputAccessor_.getBlockPtr(in_block_size, idx);
1510  reducer.template process<FixedSize>(ctx, input, dataIndex, &context_);
1511  }
1512  reducer.template finish<FixedSize>(ctx, &context_);
1513  }
1514  CAFFE_ENFORCE(
1515  dataIndex == dataToReduceSize, dataIndex, " != ", dataToReduceSize);
1516 
1517  return true;
1518  }
1519 
1520  enum {
1521  INDICES = Reducer::kInputCount,
1522  LENGTHS = Reducer::kInputCount + (SparseFused ? 1 : 0)
1523  };
1524  static constexpr int kSelfInputs = SparseFused ? 2 : 1;
1525  static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
1526 
1527  private:
1528  InputAccessor inputAccessor_;
1529 };
1530 
1531 /*
1532  * Some notice:
1533  * 1. Gradient actually doesn't depend on whether sparse lookup is fused or not
1534  * 2. INDICES are not used in CPU version, but they are needed in async CUDA
1535  * version. So we register 3 input version for CPU as gradient op for
1536  * GPU/CPU convert. We then register 2 input version for CPU for backward
1537  * compatibility with older nets.
1538  */
1539 template <
1540  typename T,
1541  typename TLengths,
1542  class Context,
1543  class ReducerGradient,
1544  bool GradientNeedIndices = false>
1545 class AbstractLengthsGradientOp : public Operator<Context> {
1546  public:
1547  USE_OPERATOR_CONTEXT_FUNCTIONS;
1548  USE_SIMPLE_CTOR_DTOR(AbstractLengthsGradientOp);
1549 
1550  bool RunOnDevice() override {
1551  // If more complicated fixed size logic becomes necessary, it can be moved
1552  // to the reducer class
1553  TIndex gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1);
1554  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1555  this, gradBlockSize);
1556  }
1557 
1558  template <int FixedSize>
1559  bool DoRunWithValue() {
1560  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1561  auto& lengthsInput = Input(LENGTHS);
1562  auto* dataGradsOutput = Output(0);
1563 
1564  CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
1565  TIndex reducedDataSize = 0;
1566  TIndex numSegments = lengthsInput.dim(0);
1567  CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
1568  CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
1569  const TLengths* lengths = lengthsInput.template data<TLengths>();
1570  for (TIndex i = 0; i < numSegments; ++i) {
1571  reducedDataSize += lengths[i];
1572  }
1573 
1574  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1575  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1576  auto& aux_in = Input(i);
1577  CAFFE_ENFORCE_EQ(
1578  reducedDataSize,
1579  aux_in.dim(0),
1580  "Input ",
1581  i,
1582  " must have the same first dim as SEGMENT_IDS");
1583  ctx.observeOriginalInput(
1584  ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
1585  }
1586 
1587  const T* segmentGrads = segmentGradsInput.template data<T>();
1588 
1589  vector<TIndex> shape;
1590  shape.push_back(reducedDataSize);
1591  ctx.appendGradShape(&shape);
1592  dataGradsOutput->Resize(shape);
1593 
1594  TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1595  TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
1596  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1597 
1598  TIndex dataIndex = 0;
1599  for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1600  ReducerGradient reducer(
1601  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1602  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1603  ++dataIndex) {
1604  reducer.template fillGrad<FixedSize>(
1605  ctx,
1606  dataGrads + dataGradsBlockSize * dataIndex,
1607  dataIndex,
1608  &context_,
1609  lengths[rangeIndex]);
1610  }
1611  }
1612  CAFFE_ENFORCE(
1613  dataIndex == reducedDataSize, dataIndex, " != ", reducedDataSize);
1614  return true;
1615  }
1616 
1617  // Input layout:
1618  // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
1619  // orig_argXs represent original op's inputs and will be passed to the reducer
1620  // directly
1621  static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
1622  2 + (GradientNeedIndices ? 1 : 0);
1623  enum _InputTags {
1624  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1625  LENGTHS,
1626  INDICES
1627  };
1628 };
1629 
1630 // Version of gradient that requires the main input and thus needs to receive
1631 // length, indices and other stuff
1632 template <
1633  typename T,
1634  typename TLengths,
1635  class Context,
1636  class ReducerGradient,
1637  bool SparseFused = true,
1638  bool GradientNeedIndices = false>
1639 class AbstractLengthsWithMainInputGradientOp : public Operator<Context> {
1640  public:
1641  USE_OPERATOR_CONTEXT_FUNCTIONS;
1642  USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputGradientOp);
1643 
1644  bool RunOnDevice() override {
1645  if (SparseFused) {
1646  return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
1647  this, Input(INDICES));
1648  } else {
1649  // type doesn't matter
1650  return DoRunWithType<TIndex>();
1651  }
1652  }
1653 
1654  template <typename IndexType>
1655  bool DoRunWithType() {
1656  // If more complicated fixed size logic becomes necessary, it can be moved
1657  // to the reducer class
1658  TIndex in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1659  return DispatchHelper<typename ReducerGradient::FixedDispatch, IndexType>::
1660  call(this, in_block_size);
1661  }
1662 
1663  template <typename IndexType, int FixedSize>
1664  bool DoRunWithValue() {
1665  auto& dataInput = Input(DATA_INPUT);
1666  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1667  auto& lengthsInput = Input(LENGTHS);
1668  auto* dataGradsOutput = Output(0);
1669 
1670  CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
1671  TIndex numSegments = lengthsInput.dim(0);
1672  CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
1673  CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
1674  const TLengths* lengths = lengthsInput.template data<TLengths>();
1675 
1676  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1677  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1678  int aux_num = ReducerGradient::originalInputs()[i];
1679  auto& aux_in = Input(i);
1680  auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
1681  ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
1682  }
1683 
1684  // Either the first dim of the data or how many indices we pull in from it
1685  TIndex dataToReduceSize;
1686  const IndexType* indices = nullptr;
1687  if (SparseFused) { // static if
1688  auto& indicesInput = Input(INDICES);
1689  indices = indicesInput.template data<IndexType>();
1690  dataToReduceSize = indicesInput.dim(0);
1691  } else {
1692  dataToReduceSize = dataInput.dim(0);
1693  }
1694 
1695  const T* segmentGrads = segmentGradsInput.template data<T>();
1696 
1697  vector<TIndex> shape;
1698  shape.push_back(dataToReduceSize);
1699  ctx.appendGradShape(&shape);
1700  dataGradsOutput->Resize(shape);
1701 
1702  TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1703  TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
1704  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1705 
1706  const T* data = dataInput.template data<T>();
1707 
1708  TIndex dataIndex = 0;
1709  for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1710  ReducerGradient reducer(
1711  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1712  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1713  ++dataIndex) {
1714  IndexType data_pos;
1715  // No range checking, should've been verified in forward pass
1716  if (SparseFused) { // static if
1717  data_pos = indices[dataIndex];
1718  } else {
1719  data_pos = dataIndex;
1720  }
1721  reducer.template fillGradWithMainInput<FixedSize>(
1722  ctx,
1723  data + dataGradsBlockSize * data_pos,
1724  dataGrads + dataGradsBlockSize * dataIndex,
1725  dataIndex,
1726  &context_,
1727  lengths[rangeIndex]);
1728  }
1729  }
1730  return true;
1731  }
1732 
1733  // Input layout:
1734  // orig_arg1, orig_arg2, ..., orig_argN, DATA_INPUT, SEGMENT_GRADS,
1735  // SEGMENT_LENGTHS, [INDICES]
1736  // orig_argXs represent original op's inputs and will be passed to the reducer
1737  // directly
1738  static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
1739  3 + (SparseFused ? 1 : 0) + (GradientNeedIndices ? 1 : 0);
1740  enum _InputTags {
1741  SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
1742  LENGTHS,
1743  DATA_INPUT,
1744  INDICES,
1745  };
1746 };
1747 
1748 // Version of gradient that requires the main input as well as the output of the
1749 // forward op.
1750 template <typename T, typename TLengths, class Context, class ReducerGradient>
1751 class AbstractLengthsWithMainInputAndForwardOutputGradientOp
1752  : public Operator<Context> {
1753  public:
1754  USE_OPERATOR_CONTEXT_FUNCTIONS;
1755  USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputAndForwardOutputGradientOp);
1756 
1757  bool RunOnDevice() override {
1758  // If more complicated fixed size logic becomes necessary, it can be moved
1759  // to the reducer class.
1760  TIndex in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
1761  return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
1762  this, in_block_size);
1763  }
1764 
1765  template <int FixedSize>
1766  bool DoRunWithValue() {
1767  auto& dataInput = Input(DATA_INPUT);
1768  auto& segmentGradsInput = Input(SEGMENT_GRADS);
1769  auto& lengthsInput = Input(LENGTHS);
1770  auto& forwardOutputInput = Input(FORWARD_OUTPUT);
1771  auto* dataGradsOutput = Output(0);
1772 
1773  CAFFE_ENFORCE(lengthsInput.ndim() == 1, "LENGTHS must be a vector");
1774  TIndex numSegments = lengthsInput.dim(0);
1775  CAFFE_ENFORCE(segmentGradsInput.ndim() > 0);
1776  CAFFE_ENFORCE(numSegments == segmentGradsInput.dim(0));
1777  const TLengths* lengths = lengthsInput.template data<TLengths>();
1778 
1779  typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
1780  for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
1781  int aux_num = ReducerGradient::originalInputs()[i];
1782  auto& aux_in = Input(i);
1783  auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
1784  ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
1785  }
1786 
1787  CAFFE_ENFORCE(forwardOutputInput.ndim() > 0);
1788  CAFFE_ENFORCE(numSegments == forwardOutputInput.dim(0));
1789  const T* forwardOutput = forwardOutputInput.template data<T>();
1790 
1791  TIndex dataToReduceSize = dataInput.dim(0);
1792 
1793  const T* segmentGrads = segmentGradsInput.template data<T>();
1794 
1795  vector<TIndex> shape;
1796  shape.push_back(dataToReduceSize);
1797  ctx.appendGradShape(&shape);
1798  dataGradsOutput->Resize(shape);
1799 
1800  TIndex dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
1801  TIndex segmentBlockSize = segmentGradsInput.size_from_dim(1);
1802  T* dataGrads = dataGradsOutput->template mutable_data<T>();
1803 
1804  const T* data = dataInput.template data<T>();
1805 
1806  TIndex dataIndex = 0;
1807  for (TIndex rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) {
1808  ReducerGradient reducer(
1809  ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
1810  for (TIndex start = dataIndex; dataIndex < start + lengths[rangeIndex];
1811  ++dataIndex) {
1812  // No range checking, should've been verified in forward pass
1813  reducer.template fillGradWithMainInputAndForwardOutput<FixedSize>(
1814  ctx,
1815  data + dataGradsBlockSize * dataIndex,
1816  dataGrads + dataGradsBlockSize * dataIndex,
1817  forwardOutput + segmentBlockSize * rangeIndex,
1818  dataIndex,
1819  &context_,
1820  lengths[rangeIndex]);
1821  }
1822  }
1823  return true;
1824  }
1825 
1826  // Input layout:
1827  // orig_arg1, orig_arg2, ..., orig_argN, FORWARD_OUTPUT, DATA_INPUT,
1828  // SEGMENT_GRADS, SEGMENT_LENGTHS
1829  // orig_argXs represent original op's inputs and will be passed to the reducer
1830  // directly
1831  static constexpr int kNumInputs =
1832  ReducerGradient::originalInputs().size() + 4;
1833  enum _InputTags {
1834  FORWARD_OUTPUT = ReducerGradient::originalInputs().size(),
1835  SEGMENT_GRADS,
1836  LENGTHS,
1837  DATA_INPUT,
1838  };
1839 };
1840 
1841 // base implementation of sparse/non-sparse gradient computation
1842 template <
1843  typename ForwardOp,
1844  typename ReducerDef,
1845  typename ReducerGradient,
1846  bool SparseFused,
1847  bool GradientNeedIndices = false>
1848 struct LengthsOpGetGradient : public GradientMakerBase {
1849  using GradientMakerBase::GradientMakerBase;
1850  vector<OperatorDef> GetGradientDefs() override {
1851  vector<string> grad_ins;
1852  string suffix = "Gradient";
1853  for (const int i : ReducerGradient::originalInputs()) {
1854  grad_ins.push_back(I(i));
1855  }
1856  if (ReducerGradient::requiresForwardOutput()) {
1857  grad_ins.push_back(O(0));
1858  CAFFE_ENFORCE(
1859  !SparseFused,
1860  "Forward pass output not yet supported as input for backward pass "
1861  "for SparseLengthsXXX operators");
1862  suffix = "AndForwardOutput" + suffix;
1863  }
1864  grad_ins.push_back(GO(0));
1865  grad_ins.push_back(I(ForwardOp::LENGTHS));
1866  bool indices_pushed = false;
1867  if (ReducerGradient::requiresDataInput(Def())) {
1868  grad_ins.push_back(I(0));
1869  if (SparseFused) {
1870  grad_ins.push_back(I(ForwardOp::INDICES));
1871  indices_pushed = true;
1872  }
1873  suffix = "WithMainInput" + suffix;
1874  }
1875  if (GradientNeedIndices && !indices_pushed) {
1876  if (SparseFused) {
1877  grad_ins.push_back(I(ForwardOp::INDICES));
1878  } else {
1879  // Hacky: using Input as Indices, remove this after we have specialized
1880  // cuda LengthsIndicesInGradientSumGradient
1881  grad_ins.push_back(I(0));
1882  }
1883  }
1884  vector<string> grad_outs;
1885  grad_outs.push_back({SparseFused ? GI_V(0) : GI(0)});
1886  int aux_grads = ReducerGradient::numAuxInputsWithGrads(Def());
1887  for (int i = 1; i <= aux_grads; ++i) {
1888  grad_outs.push_back(GI(i));
1889  }
1890  vector<OperatorDef> r{CreateOperatorDef(
1891  string(SparseFused ? "SparseLengths" : "Lengths") +
1892  string(GradientNeedIndices ? "IndicesInGradient" : "") +
1893  ReducerDef::name + suffix,
1894  "",
1895  grad_ins,
1896  grad_outs)};
1897  if (SparseFused) {
1898  SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
1899  }
1900  return r;
1901  }
1902 };
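The gradient operator name is assembled from the SparseFused and GradientNeedIndices flags, the reducer name and the suffix built in GetGradientDefs above. A minimal sketch of that composition, assuming ReducerDef::name is "Sum" and a reducer that needs neither the forward output nor the main input (so the suffix stays "Gradient"):

#include <iostream>
#include <string>

int main() {
  const bool sparse_fused = true;
  const bool need_indices = false;
  // suffix would be extended with "WithMainInput"/"AndForwardOutput" when required
  std::string suffix = "Gradient";
  std::string name = std::string(sparse_fused ? "SparseLengths" : "Lengths") +
      std::string(need_indices ? "IndicesInGradient" : "") + "Sum" + suffix;
  std::cout << name << std::endl; // prints: SparseLengthsSumGradient
  return 0;
}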
1903 
1904 template <
1905  typename T,
1906  typename SIndex,
1907  typename Context,
1908  typename ReducerDef,
1909  bool GradientNeedIndices = false>
 1910 struct AbstractLengthsDef {
 1911  using OpDef = ReducerDef;
1912  static constexpr const char* basename = "Lengths";
1913  static constexpr const char* doc = R"DOC(
1914 Applies '{op}' to each segment of the input tensor. Segments are defined
1915 by their LENGTHS.
1916 
1917 LENGTHS is a vector that maps each of the first dimension slices of the
1918 DATA to a particular group (segment). Values belonging to the same segment are
1919 aggregated together.
1920 
 1921 For example, LENGTHS = [2, 1] stands for segments DATA[0..1] and DATA[2].
1922 
1923 The first dimension of the output is equal to the number of input segments,
1924 i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
1925 
1926 {op_doc}
1927  )DOC";
1928  static void PopulateSchema(OpSchema& schema) {
1929  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
1930  schema.Input(
1931  Reducer::kInputCount,
1932  "LENGTHS",
1933  "Vector with the same sum of elements as the first dimension of DATA");
1934  schema.Output(
1935  0,
1936  "OUTPUT",
1937  "Aggregated output tensor. Has the first dimension of len(LENGTHS) ");
1938  schema.TensorInferenceFunction(
1939  [](const OperatorDef& def, const vector<TensorShape>& in) {
1940  vector<TensorShape> out(0);
1941  TensorShape output;
1942  for (int d : in[Reducer::kInputCount].dims()) {
1943  output.add_dims(d);
1944  }
1945  for (int j = 1; j < in[0].dims_size(); j++) {
1946  output.add_dims(in[0].dims(j));
1947  }
1948  output.set_data_type(in[0].data_type());
1949  out.push_back(output);
1950  return out;
1951  });
1952  ReducerDef::PopulateSchema(schema);
1953  }
1954  using Reducer = typename ReducerDef::template Reducer<T, Context>;
1955  using ReducerGradient =
1956  typename ReducerDef::template ReducerGradient<T, Context>;
 1957  using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer, false>;
 1958  using BackwardOp =
 1959  AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>;
 1960  using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
 1961  T,
 1962  SIndex,
 1963  Context,
 1964  ReducerGradient,
 1965  false>;
 1966  using WithMainInputAndForwardOutputBackwardOp =
 1967  AbstractLengthsWithMainInputAndForwardOutputGradientOp<
 1968  T,
 1969  SIndex,
 1970  Context,
 1971  ReducerGradient>;
 1972  using GetGradient = LengthsOpGetGradient<
 1973  ForwardOp,
 1974  ReducerDef,
 1975  ReducerGradient,
 1976  false /*SparseFused*/,
 1977  GradientNeedIndices>;
1978 };
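To make the Lengths{op} doc string above concrete, here is a standalone sketch of the described semantics with a sum reducer (plain C++, independent of the Caffe2 API): DATA is 4x2, LENGTHS = [2, 1, 1], so the output is 3x2.

#include <iostream>
#include <vector>

int main() {
  const int block = 2;                                  // DATA is 4 rows of 2
  std::vector<float> data = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<int> lengths = {2, 1, 1};                 // sum(lengths) == DATA.dim(0)
  std::vector<float> out(lengths.size() * block, 0.f);  // len(LENGTHS) x 2
  int row = 0;
  for (size_t seg = 0; seg < lengths.size(); ++seg) {
    for (int l = 0; l < lengths[seg]; ++l, ++row) {
      for (int j = 0; j < block; ++j) {
        // rows 0-1 go to segment 0, row 2 to segment 1, row 3 to segment 2
        out[seg * block + j] += data[row * block + j];
      }
    }
  }
  for (float v : out) std::cout << v << " "; // prints: 4 6 5 6 7 8
  std::cout << std::endl;
  return 0;
}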
1979 
1980 template <
1981  typename T,
1982  typename SIndex,
1983  typename Context,
1984  typename ReducerDef,
1985  bool GradientNeedIndices = false>
 1986 struct AbstractSparseLengthsDef {
 1987  using OpDef = ReducerDef;
1988  static constexpr const char* basename = "SparseLengths";
1989  static constexpr const char* doc = R"DOC(
1990 Pulls in slices of the input tensor, groups them into segments and applies
1991 '{op}' to each segment. Segments are defined by their LENGTHS.
1992 
1993 This op is basically Gather and Lengths{op} fused together.
1994 
1995 INDICES should contain integers in range 0..N-1 where N is the first dimension
1996 of DATA. INDICES represent which slices of DATA need to be pulled in.
1997 
 1998 LENGTHS is a vector that defines the segment sizes, i.e. how many of the gathered
 1999 slices belong to each segment. Values belonging to the same segment are
 2000 aggregated together. sum(LENGTHS) has to match the size of INDICES.
 2001 
 2002 The first dimension of the output is equal to the number of input segments,
 2003 i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
2004 
2005 {op_doc}
2006  )DOC";
2007  static void PopulateSchema(OpSchema& schema) {
2008  schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
2009  schema.Input(
2010  Reducer::kInputCount,
2011  "INDICES",
2012  "Integer vector containing indices of the first dimension of DATA for "
2013  "the slices that are being aggregated");
2014  schema.Input(
2015  Reducer::kInputCount + 1,
2016  "LENGTHS",
2017  "Non negative vector with sum of elements equal to INDICES length");
2018  schema.Output(
2019  0,
2020  "OUTPUT",
2021  "Aggregated output tensor. Has the first dimension of K "
2022  "(the number of segments).");
2023  ReducerDef::PopulateSchema(schema);
2024  }
2025  using Reducer = typename ReducerDef::template Reducer<T, Context>;
2026  using ReducerGradient =
2027  typename ReducerDef::template ReducerGradient<T, Context>;
 2028  using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer>;
 2029  // TODO(dzhulgakov): we're registering the same class twice here,
 2030  // consider avoiding op duplication here
 2031  // Note: registering the 2-input version for now because of naming in the
 2032  // macro; the 3-input version will be registered separately
2033  /* INDICES are not used in CPU version, but they are needed in async CUDA
2034  * version. So we register 3 input version for CPU as gradient op for
2035  * GPU/CPU convert. We then register 2 input version for CPU for backward
2036  * compatibility with older nets.
2037  */
 2038  using BackwardOp = AbstractLengthsGradientOp<
 2039  T,
2040  SIndex,
2041  Context,
2042  ReducerGradient,
2043  false /*GradientNeedIndices*/>;
 2044  using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
 2045  T,
2046  SIndex,
2047  Context,
2048  ReducerGradient>;
 2049  // Will return the 3-input version. This aligns new CPU and GPU nets.
 2050  using GetGradient = LengthsOpGetGradient<
 2051  ForwardOp,
2052  ReducerDef,
2053  ReducerGradient,
2054  true /*SparseFused*/,
2055  GradientNeedIndices>;
2056 };
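Similarly, a standalone sketch of the SparseLengths{op} semantics documented above with a sum reducer: rows of DATA are first gathered by INDICES, then reduced according to LENGTHS (block size 1 for brevity).

#include <iostream>
#include <vector>

int main() {
  std::vector<float> data = {10, 20, 30, 40}; // 4 rows, block size 1
  std::vector<int> indices = {3, 0, 0, 2};    // which rows of DATA to pull in
  std::vector<int> lengths = {2, 2};          // sum(lengths) == indices.size()
  std::vector<float> out(lengths.size(), 0.f);
  int pos = 0;
  for (size_t seg = 0; seg < lengths.size(); ++seg) {
    for (int l = 0; l < lengths[seg]; ++l, ++pos) {
      // gather by index, then accumulate into the segment's output slot
      out[seg] += data[indices[pos]];
    }
  }
  for (float v : out) std::cout << v << " "; // prints: 50 40
  std::cout << std::endl;
  return 0;
}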
2057 } // namespace caffe2
2058 
2059 #endif // CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_