Caffe2 - C++ API
A deep learning, cross-platform ML framework
reducer_functors.h

#ifndef CAFFE2_OPERATORS_RECUDER_FUNCTORS_H_
#define CAFFE2_OPERATORS_RECUDER_FUNCTORS_H_

#include <array>

#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/proto_utils.h"

namespace caffe2 {

// Range reducers: can leverage the fact that the input segment is contiguous
// and provide a specialized implementation

// Put forward and backward in the same template?
template <typename T, class Context>
class SumRangeReducer;
template <typename T, class Context>
class SumRangeReducerGradient;

template <typename T>
class SumRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    // do we need to go through wrapper in math.h?
    EigenVectorMap<T> out_vec(out, block_size);
    out_vec = ConstEigenMatrixMap<T>(in, block_size, blocks).rowwise().sum();
  }
};
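
// The Eigen expression above views the contiguous segment as a column-major
// (block_size x blocks) matrix, so the row-wise sum reduces across blocks
// while keeping each within-block position separate. For example, with
// block_size = 2 and blocks = 3, in = {a0, a1, b0, b1, c0, c1} yields
// out = {a0 + b0 + c0, a1 + b1 + c1}.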

template <typename T, class Context>
class SumRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad,
      T* data_grad,
      const T* /*data_in*/, // unused
      const T* /*data_out*/, // unused
      Context* context) {
    // do we have some op that does it smartly with minimum number of memcpy?
    for (int64_t i = 0; i < blocks; ++i) {
      context->template CopySameDevice<T>(
          block_size, segment_grad, data_grad + block_size * i);
    }
  }
};
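
// Every block contributes to the segment sum with weight 1, so the gradient
// is a broadcast: each of the `blocks` slices of data_grad receives an
// unmodified copy of segment_grad.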

struct SumRangeReducerDef {
  template <typename T, class Context>
  using Reducer = SumRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = SumRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Sum";
  static constexpr const char* doc =
      "Summation is done element-wise across slices of the input tensor and "
      "doesn't change the shape of the individual blocks.";
};
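
// Each *RangeReducerDef bundles a forward reducer, its gradient, an operator
// name, and a doc fragment; the templated segment-reduction operators
// instantiate one concrete operator per Def. A minimal usage sketch of the
// forward reducer itself (local names here are illustrative only):
//
//   // reduce a contiguous segment of `blocks` blocks of `block_size` floats
//   SumRangeReducer<float, CPUContext> reducer;
//   reducer(block_size, blocks, segment_data, segment_out, &context);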

// Put forward and backward in the same template?
template <typename T, class Context>
class LogSumExpRangeReducer;
template <typename T, class Context>
class LogSumExpRangeReducerGradient;

template <typename T>
class LogSumExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
  T r{1};
};
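
// The loop above applies the standard log-sum-exp trick:
// log(sum_i exp(x_i)) = m + log(sum_i exp(x_i - m)) with m = max_i x_i,
// which keeps exp() from overflowing for large inputs.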

template <typename T, class Context>
class LogSumExpRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset);
      }
    }
  }
};
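
// Gradient of log-sum-exp: with O = log(sum_k exp(I_k)),
// dO/dI_i = exp(I_i) / sum_k exp(I_k) = exp(I_i - O), so
// GI_i = GO * exp(I_i - O); `offset` holds the forward output O for the
// current within-block position.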

struct LogSumExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogSumExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogSumExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogSumExp";
  static constexpr const char* doc =
      "LogSumExp computes the element-wise log of the sum of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};

template <typename T, class Context>
class LogMeanExpRangeReducer;
template <typename T, class Context>
class LogMeanExpRangeReducerGradient;

template <typename T>
class LogMeanExpRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      T scaled_exp_sum = 0;
      for (int i = 0; i < blocks; ++i) {
        scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
      }
      scaled_exp_sum /= blocks;
      *(out++) = std::log(scaled_exp_sum) + max_value;
    }
  }
};

template <typename T, class Context>
class LogMeanExpRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T offset = *(data_out++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks;
      }
    }
  }
};
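
// LogMeanExp only adds a 1/N factor inside the log: O = log((1/N) * sum_k
// exp(I_k)) with N = blocks, so dO/dI_i = exp(I_i - O) / N, hence the extra
// division by `blocks` compared to the LogSumExp gradient.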

struct LogMeanExpRangeReducerDef {
  template <typename T, class Context>
  using Reducer = LogMeanExpRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = LogMeanExpRangeReducerGradient<T, Context>;
  static constexpr const char* name = "LogMeanExp";
  static constexpr const char* doc =
      "LogMeanExp computes the element-wise log of the mean of exponentials of "
      "input slices. Operation doesn't change the shape of individual blocks.";
};

template <typename T, class Context>
class MeanRangeReducer;
template <typename T, class Context>
class MeanRangeReducerGradient;

template <typename T>
class MeanRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T avg_value = 0;
      for (int i = 0; i < blocks; ++i) {
        avg_value += in[i * block_size + j] / blocks;
      }
      *(out++) = avg_value;
    }
  }
};

template <typename T, class Context>
class MeanRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* /*data_in*/, // I
      const T* /*data_out*/, // O
      Context* /*context*/) {
    const auto in_grad = 1.0 / blocks;
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        data_grad[idx] = out_grad * in_grad;
      }
    }
  }
};
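
// The mean weights every block by 1/blocks, so each element of data_grad is
// the corresponding segment gradient scaled by 1/blocks.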

struct MeanRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MeanRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MeanRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Mean";
  static constexpr const char* doc =
      "Mean computation is done element-wise, so that each element of the "
      "output slice corresponds to the average value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks.";
};

template <typename T, class Context>
class MaxRangeReducer;
template <typename T, class Context>
class MaxRangeReducerGradient;

template <typename T>
class MaxRangeReducer<T, CPUContext> {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* in,
      T* out,
      CPUContext* /*context*/) {
    for (int j = 0; j < block_size; ++j) {
      T max_value = std::numeric_limits<T>::lowest();
      for (int i = 0; i < blocks; ++i) {
        max_value = std::max(max_value, in[i * block_size + j]);
      }
      *(out++) = max_value;
    }
  }
};

template <typename T, class Context>
class MaxRangeReducerGradient {
 public:
  void operator()(
      const int64_t block_size,
      const int64_t blocks,
      const T* segment_grad, // GO
      T* data_grad, // GI
      const T* data_in, // I
      const T* data_out, // O
      Context* /*context*/) {
    std::memset(
        static_cast<void*>(data_grad), 0, blocks * block_size * sizeof(T));
    for (int j = 0; j < block_size; ++j) {
      const T out_grad = *(segment_grad++);
      const T out = data_out[j];
      for (int i = 0; i < blocks; ++i) {
        auto idx = i * block_size + j;
        if (out == data_in[idx]) {
          data_grad[idx] = out_grad;
        }
      }
    }
  }
};
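
// data_grad is zeroed first; only positions whose input value equals the
// segment maximum for that within-block position receive the incoming
// gradient, all other positions keep a zero gradient.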

struct MaxRangeReducerDef {
  template <typename T, class Context>
  using Reducer = MaxRangeReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxRangeReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computation is done element-wise, so that each element of the "
      "output slice corresponds to the max value of the respective "
      "elements in the input slices. Operation doesn't change the shape of "
      "individual blocks. This implementation imitates torch nn.Max operator. "
      "If the maximum value occurs more than once, the operator will return "
      "the first occurrence of value. When computing the gradient using the "
      "backward propagation, the gradient input corresponding to the first "
      "occurrence of the maximum value will be used.";
};

// Incremental reducers: consume elements one by one

// Base implementation, everything can be overwritten
class BaseReducer {
 public:
  static constexpr int kInputCount = 1;

  struct Meta {
    int64_t block_size;
    vector<int64_t> block_shape;
    bool first_dim;

    explicit Meta(bool first = true) : first_dim(first) {}

    void computeMeta(at::IntArrayRef dims, int skip_dims) {
      first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
                : block_shape.assign(dims.begin(), dims.end() - skip_dims);
      block_size = first_dim ? size_from_dim_(skip_dims, dims)
                             : size_from_dim_(dims.size() - skip_dims, dims);
    }

    void observeInput(int input, const Tensor& value, int skip_dims) {
      DCHECK_EQ(0, input);
      auto dims = value.sizes();
      computeMeta(dims, skip_dims);
    }

    void appendOutputShape(vector<int64_t>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }

    vector<int64_t> getOutputShape(const TensorShape& in, int skip_dims) {
      vector<int64_t> dims(in.dims().begin(), in.dims().end());
      computeMeta(dims, skip_dims);
      return block_shape;
    }
  };

  template <int FixedSize>
  void finish(const Meta& /*meta*/, CPUContext* /*context*/) {}
};

class BaseReducerGradient {
 public:
  // which of the original inputs are required for gradient computation
  static constexpr std::array<int, 0> originalInputs() {
    return std::array<int, 0>();
  }

  static constexpr bool computeLength() {
    return false;
  }

  static int numAuxInputsWithGrads(const OperatorDef& /*def*/) {
    return 0;
  }

  static bool requiresDataInput(const OperatorDef& /*def*/) {
    return false;
  }

  // True if the backward op requires the output of the forward op.
  static bool requiresForwardOutput() {
    return false;
  }

  struct Meta {
    int64_t block_size;
    vector<int64_t> block_shape;
    bool first_dim;

    Meta(const Tensor& out_grad, int skip_dims, bool first_dim = true)
        : first_dim(first_dim) {
      auto dims = out_grad.sizes();
      first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
                : block_shape.assign(dims.begin(), dims.end() - skip_dims);
      block_size = first_dim
          ? out_grad.size_from_dim(skip_dims)
          : out_grad.size_from_dim(out_grad.dim() - skip_dims);
    }

    void observeOriginalInput(
        int /*original_input*/,
        const Tensor& /*value*/,
        Tensor* /*input_grad*/, // optional grad to populate
        int /*skip_dims*/) {}

    void appendGradShape(vector<int64_t>* output_shape) {
      output_shape->insert(
          output_shape->end(), block_shape.begin(), block_shape.end());
    }
  };
};
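
// Incremental reducers are driven one element at a time instead of over a
// contiguous range. A rough sketch of the protocol a driver operator follows
// for some reducer type R (names below are illustrative only):
//
//   typename R::Meta meta;
//   meta.observeInput(0, data, /*skip_dims=*/1);     // infer block shape/size
//   R reducer(meta, segment_out, &context);          // initialize the output
//   for (/* each row i of data that belongs to the segment */) {
//     reducer.template process<FixedSize>(
//         meta, data_ptr + i * meta.block_size, i, &context);
//   }
//   reducer.template finish<FixedSize>(meta, &context);  // e.g. mean scaling
//
// The *ReducerGradient classes mirror this: their Meta is constructed from
// the output gradient, and fillGrad() (or one of its variants) is called once
// per element to populate that element's slice of the data gradient.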

// Put forward and backward in the same template?
template <typename T, class Context>
class SumReducer;
template <typename T, class Context>
class SumReducerGradient;

template <typename T>
class SumReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  SumReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : current_size_(0), out_(out) {
    // add a wrapper in Context for it
    if (meta.first_dim) {
      memset(out, 0, sizeof(T) * meta.block_size);
    }
  }
  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      int64_t /*offset*/,
      CPUContext* context) {
    if (meta.first_dim) {
      math::AxpyFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1, in, out_, context);
    } else {
      math::Sum<T, CPUContext>(
          meta.block_size, in, out_ + current_size_++, context);
    }
  }

 private:
  int current_size_;
  T* out_;
};

template <typename T, class Context>
class SumReducerGradient : public BaseReducerGradient {
 public:
  using FixedDispatch = FixedValues<1>;

  SumReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      int64_t offset,
      Context* context,
      const int length) {
    if (FixedSize == 1) { // static if
      *data_grad = *s_grad_;
    } else if (meta.first_dim) {
      context->template CopySameDevice<T>(meta.block_size, s_grad_, data_grad);
    } else {
      math::Set<T, Context>(length, s_grad_[offset], data_grad, context);
    }
  }

 private:
  const T* s_grad_;
};

struct SumReducerDef {
  template <typename T, class Context>
  using Reducer = SumReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = SumReducerGradient<T, Context>;
  static constexpr const char* name = "Sum";
  static constexpr const char* doc =
      "Summation is done element-wise across slices of the input tensor and "
      "doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};

// Put forward and backward in the same template?
template <typename T, class Context>
class WeightedSumReducer;
template <typename T, class Context>
class WeightedSumReducerGradient;

template <typename T>
class WeightedSumReducer<T, CPUContext> : public BaseReducer {
 public:
  static constexpr int kInputCount = 2;

  using FixedDispatch = FixedValues<1>;

  struct Meta : BaseReducer::Meta {
    const T* scalars;

    bool first_dim;

    explicit Meta(bool first = true) : first_dim(first) {}

    void observeInput(int input, const Tensor& value, int skip_dims) {
      if (input == 1) {
        CAFFE_ENFORCE_EQ(
            skip_dims, value.dim(), "SCALARS mustn't have extra dimensions");
        scalars = value.data<T>();
        return;
      }
      BaseReducer::Meta::observeInput(input, value, skip_dims);
    }
  };

  WeightedSumReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out) {
    // do we have a wrapper for it?
    memset(out, 0, sizeof(T) * meta.block_size);
  }
  template <int FixedSize>
  void
  process(const Meta& meta, const T* in, int64_t offset, CPUContext* context) {
    CAFFE_ENFORCE(
        meta.first_dim,
        "WeightedSumReducer implemented only for "
        "front dimensions reduction");
    math::AxpyFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], in, out_, context);
  }

 private:
  T* out_;
};

template <typename T, class Context>
class WeightedSumReducerGradient : public BaseReducerGradient {
 public:
  // which of the original inputs are required for gradient computation
  static constexpr std::array<int, 1> originalInputs() {
    return {{1}};
  }

  static int numAuxInputsWithGrads(const OperatorDef& def) {
    return GetFlagArgument(def, "grad_on_weights");
  }

  static bool requiresDataInput(const OperatorDef& def) {
    return numAuxInputsWithGrads(def) > 0;
  }

  using FixedDispatch = FixedValues<1>;

  struct Meta : public BaseReducerGradient::Meta {
    const T* scalars;
    T* scalars_grad;

    using BaseReducerGradient::Meta::Meta;

    void observeOriginalInput(
        int original_input,
        const Tensor& value,
        Tensor* input_grad, // optional grad to populate
        int /*skip_dims*/) {
      CAFFE_ENFORCE_EQ(1, original_input);
      scalars = value.data<T>();
      if (input_grad) {
        input_grad->ResizeLike(value);
        scalars_grad = input_grad->template mutable_data<T>();
      }
    }
  };

  WeightedSumReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      int64_t offset,
      Context* context,
      const int /*length*/) {
    math::ScaleFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
  }

  // Special version which is called with the main input too, used only if
  // additional input grad is requested
  template <int FixedSize>
  void fillGradWithMainInput(
      const Meta& meta,
      const T* data,
      T* data_grad,
      int64_t offset,
      Context* context,
      const int /*length*/) {
    math::ScaleFixedSize<T, CPUContext, FixedSize>(
        meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
    math::Dot(
        meta.block_size, s_grad_, data, meta.scalars_grad + offset, context);
  }

 private:
  const T* s_grad_;
};
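
// For the weighted sum, out = sum_i scalars[i] * data_i, so the gradient of
// slice i is scalars[i] * GO (the ScaleFixedSize call), and, when a gradient
// for the weights is requested, d out / d scalars[i] reduces to
// dot(GO, data_i) (the math::Dot call in fillGradWithMainInput).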

struct WeightedSumReducerDef {
  template <typename T, class Context>
  using Reducer = WeightedSumReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = WeightedSumReducerGradient<T, Context>;
  static constexpr const char* name = "WeightedSum";
  static constexpr const char* doc =
      "Input slices are first scaled by SCALARS and then summed element-wise. "
      "It doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& schema) {
    schema.Input(0, "DATA", "Input tensor for the summation");
    schema.Input(
        1,
        "SCALARS",
        "Scalar multipliers for the input slices. Must be a vector with the "
        "length matching the number of slices");
    schema.Arg(
        "grad_on_weights",
        "Also produce a gradient for `weights`. For now it's only supported "
        "in `Lengths`-based operators");
  }
};

template <typename T, class Context>
class MeanReducer;
template <typename T, class Context>
class MeanReducerGradient;

template <typename T>
class MeanReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  MeanReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out), current_size_(0) {
    if (meta.first_dim) {
      memset(out, 0, sizeof(T) * meta.block_size);
    }
  }

  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      int64_t /*offset*/,
      CPUContext* context) {
    if (meta.first_dim) {
      math::AxpyFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1, in, out_, context);
    } else {
      math::Sum<T, CPUContext>(
          meta.block_size, in, out_ + current_size_, context);
    }
    current_size_++;
  }

  template <int FixedSize>
  void finish(const Meta& meta, CPUContext* context) {
    if (meta.first_dim) {
      if (current_size_ > 0) {
        math::ScaleFixedSize<T, CPUContext, FixedSize>(
            meta.block_size, 1.0 / current_size_, out_, out_, context);
      }
    } else {
      math::ScaleFixedSize<T, CPUContext, FixedSize>(
          current_size_, 1.0 / meta.block_size, out_, out_, context);
    }
  }

 private:
  T* out_;
  int current_size_;
};
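
// MeanReducer accumulates a running sum in process() and rescales once in
// finish(): by the number of processed elements (current_size_) when reducing
// over the leading dimensions, or by block_size when reducing over the
// trailing dimensions.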

template <typename T, class Context>
class MeanReducerGradient : public BaseReducerGradient {
 public:
  static constexpr bool computeLength() {
    return true;
  }

  using FixedDispatch = FixedValues<1>;

  MeanReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGrad(
      const Meta& meta,
      T* data_grad,
      int64_t offset,
      Context* context,
      const int length) {
    CAFFE_ENFORCE_GT(length, 0, "Segment length must be > 0");
    if (meta.first_dim) {
      math::ScaleFixedSize<T, CPUContext, FixedSize>(
          meta.block_size, 1.0 / length, s_grad_, data_grad, context);
    } else {
      math::Set<T, CPUContext>(
          length, s_grad_[offset] * 1.0f / length, data_grad, context);
    }
  }

 private:
  const T* s_grad_;
};

struct MeanReducerDef {
  template <typename T, class Context>
  using Reducer = MeanReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MeanReducerGradient<T, Context>;
  static constexpr const char* name = "Mean";
  static constexpr const char* doc =
      "Mean computes the element-wise mean of the input slices. "
      "Operation doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};

template <typename T, class Context>
class MaxReducer;
template <typename T, class Context>
class MaxReducerGradient;

template <typename T>
class MaxReducer<T, CPUContext> : public BaseReducer {
 public:
  using FixedDispatch = FixedValues<1>;

  MaxReducer(const Meta& meta, T* out, CPUContext* /*context*/)
      : out_(out), current_size_(0) {
    // add a wrapper in Context for it
    memset(out, 0, sizeof(T) * meta.block_size);
  }

  template <int FixedSize>
  void process(
      const Meta& meta,
      const T* in,
      int64_t /*offset*/,
      CPUContext* context) {
    CAFFE_ENFORCE(
        meta.first_dim,
        "MaxReducer implemented only for front dimensions reduction");
    if (current_size_ > 0) {
      EigenVectorMap<T> output_vec(out_, meta.block_size);
      output_vec =
          output_vec.cwiseMax(ConstEigenVectorMap<T>(in, meta.block_size));
    } else {
      memcpy(out_, in, sizeof(T) * meta.block_size);
    }
    ++current_size_;
  }

 private:
  T* out_;
  int current_size_;
};

template <typename T, class Context>
class MaxReducerGradient : public BaseReducerGradient {
 public:
  static bool requiresDataInput(const OperatorDef& /*def*/) {
    return true;
  }

  static bool requiresForwardOutput() {
    return true;
  }

  using FixedDispatch = FixedValues<1>;

  MaxReducerGradient(
      const Meta& /*meta*/,
      const T* s_grad,
      CPUContext* /*context*/)
      : s_grad_(s_grad) {}

  template <int FixedSize>
  void fillGradWithMainInputAndForwardOutput(
      const Meta& meta,
      const T* data,
      T* data_grad,
      const T* forward_output,
      int64_t /*offset*/,
      Context* /*context*/,
      const int /*length*/) {
    for (int64_t i = 0; i < meta.block_size; ++i) {
      data_grad[i] = data[i] == forward_output[i] ? s_grad_[i] : 0;
    }
  }

 private:
  const T* s_grad_;
};

struct MaxReducerDef {
  template <typename T, class Context>
  using Reducer = MaxReducer<T, Context>;
  template <typename T, class Context>
  using ReducerGradient = MaxReducerGradient<T, Context>;
  static constexpr const char* name = "Max";
  static constexpr const char* doc =
      "Max computes the element-wise max of the input slices. "
      "Operation doesn't change the shape of the individual blocks.";
  static void PopulateSchema(OpSchema& /*schema*/) {}
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_RECUDER_FUNCTORS_H_