Caffe2 - C++ API
A deep learning, cross-platform ML framework
reducer_functors.h
#ifndef CAFFE2_OPERATORS_RECUDER_FUNCTORS_H_
#define CAFFE2_OPERATORS_RECUDER_FUNCTORS_H_

#include <algorithm>
#include <array>
#include <cmath>
#include <cstring>
#include <limits>

#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/proto_utils.h"

namespace caffe2 {

////////////////////////////////////////////////////////////////////////////
// Range reducers: can leverage the fact that the input segment is contiguous
// and provide a specialized implementation
////////////////////////////////////////////////////////////////////////////

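// A range reducer collapses `blocks` contiguous blocks of `block_size`
// elements each into a single output block of `block_size` elements. The
// input is laid out block-major, so element j of block i lives at
// in[i * block_size + j] and output element j aggregates that position across
// all blocks. Rough illustration (not part of the original header), for the
// Sum reducer with block_size = 3 and blocks = 2:
//
//   in  = {1, 2, 3,   // block 0
//          4, 5, 6}   // block 1
//   out = {5, 7, 9}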
// Put forward and backward in the same template?
template <typename T, class Context>
class SumRangeReducer;
template <typename T, class Context>
class SumRangeReducerGradient;

template <typename T>
class SumRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    // do we need to go through wrapper in math.h?
    EigenVectorMap<T> out_vec(out, block_size);
    out_vec = ConstEigenMatrixMap<T>(in, block_size, blocks).rowwise().sum();
  }
};

template <typename T, class Context>
class SumRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad,
      T* data_grad,
      const T* /*data_in*/, // unused
      const T* /*data_out*/, // unused
      Context* context) {
    // do we have some op that does it smartly with minimum number of memcpy?
    for (TIndex i = 0; i < blocks; ++i) {
      context->template Copy<T, Context, Context>(
          block_size, segment_grad, data_grad + block_size * i);
    }
  }
};

struct SumRangeReducerDef {
  template <typename T, class Context>
  using Reducer = SumRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = SumRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Sum";
  static constexpr const char* doc =
      "Summation is done element-wise across slices of the input tensor and "
      "doesn't change the shape of the individual blocks.";
};
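// Note on the *Def structs (here and below): each one bundles the forward
// Reducer, its ReducerGradient, and the name/doc strings. The segment
// reduction operator templates elsewhere in Caffe2 (e.g.
// caffe2/operators/segment_reduction_op.h) are parameterized on these Defs to
// stamp out the concrete segment- and lengths-based ops.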

// Put forward and backward in the same template?
template <typename T, class Context>
class LogSumExpRangeReducer;
template <typename T, class Context>
class LogSumExpRangeReducerGradient;

template <typename T>
class LogSumExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
  T r{1};
};
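// The reducer above computes out[j] = log(sum_i exp(in[i * block_size + j]))
// using the standard stabilization trick: with m = max_i x_i,
//   log(sum_i exp(x_i)) = m + log(sum_i exp(x_i - m)),
// which keeps exp() from overflowing for large inputs.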

template <typename T, class Context>
class LogSumExpRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset);
      }
    }
  }
};
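// Gradient of logsumexp: with o = log(sum_k exp(x_k)),
//   do/dx_i = exp(x_i) / sum_k exp(x_k) = exp(x_i - o),
// so every input element receives out_grad * exp(data_in - data_out), which
// is exactly what the inner loop above computes.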

struct LogSumExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogSumExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogSumExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogSumExp";
  static constexpr const char* doc =
      "LogSumExp computes the element-wise log of the sum of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};

template <typename T, class Context>
class LogMeanExpRangeReducer;
template <typename T, class Context>
class LogMeanExpRangeReducerGradient;

template <typename T>
class LogMeanExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      scaled_exp_sum /= blocks;
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
};

template <typename T, class Context>
class LogMeanExpRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks;
      }
    }
  }
};
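// For LogMeanExp, o = log((1/n) * sum_k exp(x_k)) with n = blocks, so
//   do/dx_i = exp(x_i) / sum_k exp(x_k) = exp(x_i - o) / n,
// matching the extra division by `blocks` in the gradient above.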

struct LogMeanExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogMeanExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogMeanExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogMeanExp";
  static constexpr const char* doc =
      "LogMeanExp computes the element-wise log of the mean of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};

template <typename T, class Context>
class MeanRangeReducer;
template <typename T, class Context>
class MeanRangeReducerGradient;

template <typename T>
class MeanRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T avg_value = 0;
      for (int i = 0; i < blocks; ++i) {
        avg_value += in[i * block_size + j] / blocks;
      }
      *(out++) = avg_value;
    }
  }
};

template <typename T, class Context>
class MeanRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* /*data_in*/, // I
      const T* /*data_out*/, // O
      Context* /*context*/) {
    const auto in_grad = 1.0 / blocks;
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * in_grad;
      }
    }
  }
};
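// The mean is linear in its inputs, so its gradient is simply
// out_grad / blocks broadcast to every element of the segment, independent of
// the input values.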

struct MeanRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MeanRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MeanRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Mean";
  static constexpr const char* doc =
      "Mean computation is done element-wise, so that each element of the "
      "output slice corresponds to the average value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks.";
};

template <typename T, class Context>
class MaxRangeReducer;
template <typename T, class Context>
class MaxRangeReducerGradient;

template <typename T>
class MaxRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      *(out++) = max_value;
    }
  }
};

template <typename T, class Context>
class MaxRangeReducerGradient {
 public:
  void operator()(
      const TIndex block_size,
      const TIndex blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    std::memset(
        static_cast<void*>(data_grad), 0, blocks * block_size * sizeof(T));
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T out = data_out[j];
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        if (out == data_in[idx]) {
          data_grad[idx] = out_grad;
        }
      }
    }
  }
};
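// Subgradient of max: out_grad is routed to positions whose value equals the
// segment maximum (all tied maxima receive the gradient in this
// implementation); every other position keeps the zero written by the memset
// above.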

struct MaxRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MaxRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computation is done element-wise, so that each element of the "
      "output slice corresponds to the max value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks. This implementation imitates torch nn.Max operator. "
      "If the maximum value occurs more than once, the operator will return "
      "the first occurrence of value. When computing the gradient using the "
      "backward propagation, the gradient input corresponding to the first "
      "occurrence of the maximum value will be used.";
};

////////////////////////////////////////////////////////////////////////////
// Incremental reducers: consume elements one by one
////////////////////////////////////////////////////////////////////////////

// Base implementation; everything can be overridden
class BaseReducer {
 public:
  static constexpr int kInputCount = 1;

  struct Meta {
    TIndex block_size;
    vector<TIndex> block_shape;
    bool first_dim;

    explicit Meta(bool first = true) : first_dim(first) {}

    void computeMeta(const std::vector<TIndex>& dims, int skip_dims) {
      first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
                : block_shape.assign(dims.begin(), dims.end() - skip_dims);
      block_size = first_dim ? size_from_dim_(skip_dims, dims)
                             : size_from_dim_(dims.size() - skip_dims, dims);
    }

    void
    observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
      DCHECK_EQ(0, input);
      auto& dims = value.dims();
      computeMeta(dims, skip_dims);
    }

    void appendOutputShape(vector<TIndex>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }

    vector<TIndex> getOutputShape(const TensorShape& in, int skip_dims) {
      vector<TIndex> dims(in.dims().begin(), in.dims().end());
      computeMeta(dims, skip_dims);
      return block_shape;
    }
  };

  template <int FixedSize>
  void finish(const Meta& /*meta*/, CPUContext* /*context*/) {}
};
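// Worked example for Meta::computeMeta (illustrative only): for input dims
// {N, D1, D2} with skip_dims = 1 and first_dim = true, block_shape becomes
// {D1, D2} and block_size = D1 * D2, i.e. each of the N leading slices of
// D1 * D2 elements is one "block" handed to the reducer. The first_dim =
// false branch mirrors this for reductions over the trailing dimensions.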

class BaseReducerGradient {
 public:
  // which of the original inputs are required for gradient computation
  static constexpr std::array<int, 0> originalInputs() {
    return std::array<int, 0>();
  }

  static constexpr bool computeLength() {
    return false;
  }

  static int numAuxInputsWithGrads(const OperatorDef& /*def*/) {
    return 0;
  }

  static bool requiresDataInput(const OperatorDef& /*def*/) {
    return false;
  }

  // True if the backward op requires the output of the forward op.
  static bool requiresForwardOutput() {
    return false;
  }

  struct Meta {
    TIndex block_size;
    vector<TIndex> block_shape;
    bool first_dim;

    Meta(
        const Tensor<CPUContext>& out_grad,
        int skip_dims,
        bool first_dim = true)
        : first_dim(first_dim) {
      auto& dims = out_grad.dims();
      first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
                : block_shape.assign(dims.begin(), dims.end() - skip_dims);
      block_size = first_dim
          ? out_grad.size_from_dim(skip_dims)
          : out_grad.size_from_dim(out_grad.ndim() - skip_dims);
    }

    void observeOriginalInput(
        int /*original_input*/,
        const Tensor<CPUContext>& /*value*/,
        Tensor<CPUContext>* /*input_grad*/, // optional grad to populate
        int /*skip_dims*/) {}

    void appendGradShape(vector<TIndex>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }
  };
};
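// Rough sketch of how a segment operator drives an incremental reducer over
// one segment of `length` rows (an assumption inferred from the interface
// below; `ReducerType`, `row_ptr`, `segment_out`, and `FixedSize` are
// placeholders, not names from this header):
//
//   typename ReducerType::Meta meta;
//   meta.observeInput(0, data, /*skip_dims=*/1);
//   ReducerType reducer(meta, segment_out, &context);
//   for (TIndex r = 0; r < length; ++r) {
//     reducer.template process<FixedSize>(meta, row_ptr(r), /*offset=*/r, &context);
//   }
//   reducer.template finish<FixedSize>(meta, &context);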

// Put forward and backward in the same template?
template <typename T, class Context>
class SumReducer;
template <typename T, class Context>
class SumReducerGradient;

template <typename T>
class SumReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  SumReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : current_size_(0), out_(out) {
    // add a wrapper in Context for it
    if (meta.first_dim) {
      memset(out, 0, sizeof(T) * meta.block_size);
    }
  }
  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      TIndex /*offset*/,
      CPUContext* context) {
    if (meta.first_dim) {
      math::AxpyFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1, in, out_, context);
    } else {
      math::Sum<T, CPUContext>(
          meta.block_size, in, out_ + current_size_++, context);
    }
  }

 private:
  int current_size_;
  T* out_;
};

template <typename T, class Context>
class SumReducerGradient : public BaseReducerGradient {
 public:
  using FixedDispatch = FixedValues<1>;

  SumReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      TIndex offset,
      Context* context,
      const int length) {
    if (FixedSize == 1) { // static if
      *data_grad = *s_grad_;
    } else if (meta.first_dim) {
      context->template Copy<T, Context, Context>(
          meta.block_size, s_grad_, data_grad);
    } else {
      math::Set<T, Context>(length, s_grad_[offset], data_grad, context);
    }
  }

 private:
  const T* s_grad_;
};
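// fillGrad propagates the segment's output gradient unchanged, since the
// gradient of a sum is the identity: FixedSize == 1 is the compile-time fast
// path for scalar blocks, the first_dim branch copies one gradient block, and
// the last branch fills `length` elements with a single gradient value when
// reducing over trailing dimensions.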
507 
509  template <typename T, class Context>
511  template <typename T, class Context>
513  static constexpr const char* name = "Sum";
514  static constexpr const char* doc =
515  "Summation is done element-wise across slices of the input tensor and "
516  "doesn't change the shape of the individual blocks.";
517  static void PopulateSchema(OpSchema& /*schema*/) {}
518 };
519 
520 // Put forward and backward in the same template?
521 template <typename T, class Context>
523 template <typename T, class Context>
525 
526 template <typename T>
528  public:
529  static constexpr int kInputCount = 2;
530 
532 
534  const T* scalars;
535 
536  bool first_dim;
537 
538  explicit Meta(bool first = true) : first_dim(first) {}
539 
540  void
541  observeInput(int input, const Tensor<CPUContext>& value, int skip_dims) {
542  if (input == 1) {
543  CAFFE_ENFORCE_EQ(
544  skip_dims, value.ndim(), "SCALARS mustn't have extra dimensions");
545  scalars = value.data<T>();
546  return;
547  }
548  BaseReducer::Meta::observeInput(input, value, skip_dims);
549  }
550  };
551 
552  WeightedSumReducer(const Meta& meta, T* out, CPUContext* /*context*/)
553  : out_(out) {
554  // do we have a wrapper for it?
555  memset(out, 0, sizeof(T) * meta.block_size);
556  }
557  template <int FixedSize>
558  void
559  process(const Meta& meta, const T* in, TIndex offset, CPUContext* context) {
560  CAFFE_ENFORCE(
561  meta.first_dim,
562  "WeightedSumReducer implemented only for "
563  "front dimensions reduction");
564  math::AxpyFixedSize<T, CPUContext, FixedSize>(
565  meta.block_size, meta.scalars[offset], in, out_, context);
566  }
567 
568  private:
569  T* out_;
570 };
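// Element-wise, the weighted sum is
//   out[j] = sum_i scalars[i] * in_i[j],
// where i runs over the rows of the segment; the `offset` passed to process()
// selects the scalar weight for the row currently being consumed.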

template <typename T, class Context>
class WeightedSumReducerGradient : public BaseReducerGradient {
 public:
  // which of the original inputs are required for gradient computation
  static constexpr std::array<int, 1> originalInputs() {
    return {{1}};
  }

  static int numAuxInputsWithGrads(const OperatorDef& def) {
    return GetFlagArgument(def, "grad_on_weights");
  }

  static bool requiresDataInput(const OperatorDef& def) {
    return numAuxInputsWithGrads(def) > 0;
  }

  using FixedDispatch = FixedValues<1>;

  struct Meta : public BaseReducerGradient::Meta {
    const T* scalars;
    T* scalars_grad;

    using BaseReducerGradient::Meta::Meta;

    void observeOriginalInput(
        int original_input,
        const Tensor<CPUContext>& value,
        Tensor<CPUContext>* input_grad, // optional grad to populate
        int /*skip_dims*/) {
      CAFFE_ENFORCE_EQ(1, original_input);
      scalars = value.data<T>();
      if (input_grad) {
        input_grad->ResizeLike(value);
        scalars_grad = input_grad->mutable_data<T>();
      }
    }
  };

  WeightedSumReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      TIndex offset,
      Context* context,
      const int /*length*/) {
    math::ScaleFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
  }

  // Special version which is called with the main input too, used only if
  // additional input grad is requested
  template <int FixedSize>
  void fillGradWithMainInput(
      const Meta& meta,
      const T* data,
      T* data_grad,
      TIndex offset,
      Context* context,
      const int /*length*/) {
    math::ScaleFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
    math::Dot(
        meta.block_size, s_grad_, data, meta.scalars_grad + offset, context);
  }

 private:
  const T* s_grad_;
};

struct WeightedSumReducerDef {
  template <typename T, class Context>
  using Reducer = WeightedSumReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = WeightedSumReducerGradient<T, Context>;
  static constexpr const char* name = "WeightedSum";
  static constexpr const char* doc =
      "Input slices are first scaled by SCALARS and then summed element-wise. "
      "It doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& schema) {
    schema.Input(0, "DATA", "Input tensor for the summation");
    schema.Input(
        1,
        "SCALARS",
        "Scalar multipliers for the input slices. Must be a vector with the "
        "length matching the first dimension of DATA");
    schema.Arg(
        "grad_on_weights",
        "Also produce gradients for `weights`. For now this is only supported "
        "in `Lengths`-based operators");
  }
};

template <typename T, class Context>
class MeanReducer;
template <typename T, class Context>
class MeanReducerGradient;

template <typename T>
class MeanReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  MeanReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out), current_size_(0) {
    if (meta.first_dim) {
      memset(out, 0, sizeof(T) * meta.block_size);
    }
  }

  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      TIndex /*offset*/,
      CPUContext* context) {
    if (meta.first_dim) {
      math::AxpyFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1, in, out_, context);
    } else {
      math::Sum<T, CPUContext>(
          meta.block_size, in, out_ + current_size_, context);
    }
    current_size_++;
  }

  template <int FixedSize>
  void finish(const Meta& meta, CPUContext* context) {
    if (meta.first_dim) {
      if (current_size_ > 0) {
        math::ScaleFixedSize<T, CPUContext, FixedSize>(
            meta.block_size, 1.0 / current_size_, out_, out_, context);
      }
    } else {
      math::ScaleFixedSize<T, CPUContext, FixedSize>(
          current_size_, 1.0 / meta.block_size, out_, out_, context);
    }
  }

 private:
  T* out_;
  int current_size_;
};
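// finish() turns the accumulated sums into means: when reducing over the
// leading dimension, each of the block_size accumulators is rescaled by
// 1 / current_size_ (the number of rows consumed); otherwise each of the
// current_size_ partial sums is rescaled by 1 / block_size, the number of
// elements reduced per output.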

template <typename T, class Context>
class MeanReducerGradient : public BaseReducerGradient {
 public:
  static constexpr bool computeLength() {
    return true;
  }

  using FixedDispatch = FixedValues<1>;

  MeanReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      TIndex offset,
      Context* context,
      const int length) {
    CAFFE_ENFORCE_GT(length, 0, "Segment length must be > 0");
    if (meta.first_dim) {
      math::ScaleFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1.0 / length, s_grad_, data_grad, context);
    } else {
      math::Set<T, CPUContext>(
          length, s_grad_[offset] * 1.0f / length, data_grad, context);
    }
  }

 private:
  const T* s_grad_;
};

struct MeanReducerDef {
  template <typename T, class Context>
  using Reducer = MeanReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MeanReducerGradient<T, Context>;
  static constexpr const char* name = "Mean";
  static constexpr const char* doc =
      "Mean computes the element-wise mean of the input slices. "
      "Operation doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};

template <typename T, class Context>
class MaxReducer;
template <typename T, class Context>
class MaxReducerGradient;

template <typename T>
class MaxReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  MaxReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out), current_size_(0) {}

  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      TIndex /*offset*/,
      CPUContext* context) {
    CAFFE_ENFORCE(
        meta.first_dim,
        "MaxReducer implemented only for front dimensions reduction");
    if (current_size_ > 0) {
      EigenVectorMap<T> output_vec(out_, meta.block_size);
      output_vec =
          output_vec.cwiseMax(ConstEigenVectorMap<T>(in, meta.block_size));
    } else {
      memcpy(out_, in, sizeof(T) * meta.block_size);
    }
    ++current_size_;
  }

 private:
  T* out_;
  int current_size_;
};

template <typename T, class Context>
class MaxReducerGradient : public BaseReducerGradient {
 public:
  static bool requiresDataInput(const OperatorDef& /*def*/) {
    return true;
  }

  static bool requiresForwardOutput() {
    return true;
  }

  using FixedDispatch = FixedValues<1>;

  MaxReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGradWithMainInputAndForwardOutput(
      const Meta& meta,
      const T* data,
      T* data_grad,
      const T* forward_output,
      TIndex /*offset*/,
      Context* /*context*/,
      const int /*length*/) {
    for (TIndex i = 0; i < meta.block_size; ++i) {
      data_grad[i] = data[i] == forward_output[i] ? s_grad_[i] : 0;
    }
  }

 private:
  const T* s_grad_;
};

struct MaxReducerDef {
  template <typename T, class Context>
  using Reducer = MaxReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computes the element-wise max of the input slices. "
      "Operation doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_RECUDER_FUNCTORS_H_