2 #ifndef CAFFE2_OPERATORS_RECUDER_FUNCTORS_H_ 3 #define CAFFE2_OPERATORS_RECUDER_FUNCTORS_H_ 7 #include "caffe2/core/context.h" 8 #include "caffe2/core/tensor.h" 9 #include "caffe2/utils/eigen_utils.h" 10 #include "caffe2/utils/math.h" 11 #include "caffe2/utils/proto_utils.h" 21 template <
typename T,
class Context>
23 template <
typename T,
class Context>
30 const int64_t block_size,
36 EigenVectorMap<T> out_vec(out, block_size);
37 out_vec = ConstEigenMatrixMap<T>(in, block_size, blocks).rowwise().sum();
41 template <
typename T,
class Context>
45 const int64_t block_size,
47 const T* segment_grad,
53 for (int64_t i = 0; i < blocks; ++i) {
54 context->template CopySameDevice<T>(
55 block_size, segment_grad, data_grad + block_size * i);
61 template <
typename T,
class Context>
63 template <
typename T,
class Context>
65 static constexpr
const char* name =
"Sum";
66 static constexpr
const char* doc =
67 "Summation is done element-wise across slices of the input tensor and " 68 "doesn't change the shape of the individual blocks.";
72 template <
typename T,
class Context>
74 template <
typename T,
class Context>
81 const int64_t block_size,
86 for (
int j = 0; j < block_size; ++j) {
87 T max_value = std::numeric_limits<T>::lowest();
88 for (
int i = 0; i < blocks; ++i) {
89 max_value = std::max(max_value, in[i * block_size + j]);
92 for (
int i = 0; i < blocks; ++i) {
93 scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
95 *(out++) = std::log(scaled_exp_sum) + max_value;
101 template <
typename T,
class Context>
105 const int64_t block_size,
106 const int64_t blocks,
107 const T* segment_grad,
112 for (
int j = 0; j < block_size; ++j) {
113 const T out_grad = *(segment_grad++);
114 const T offset = *(data_out++);
115 for (
int i = 0; i < blocks; ++i) {
116 auto idx = i * block_size + j;
117 data_grad[idx] = out_grad * std::exp(data_in[idx] - offset);
124 template <
typename T,
class Context>
126 template <
typename T,
class Context>
128 static constexpr
const char* name =
"LogSumExp";
129 static constexpr
const char* doc =
130 "LogSumExp computes the element-wise log of the sum of exponentials of " 131 "input slices. Operation doesn't change the shape of individual blocks.";
134 template <
typename T,
class Context>
136 template <
typename T,
class Context>
139 template <
typename T>
143 const int64_t block_size,
144 const int64_t blocks,
148 for (
int j = 0; j < block_size; ++j) {
149 T max_value = std::numeric_limits<T>::lowest();
150 for (
int i = 0; i < blocks; ++i) {
151 max_value = std::max(max_value, in[i * block_size + j]);
153 T scaled_exp_sum = 0;
154 for (
int i = 0; i < blocks; ++i) {
155 scaled_exp_sum += std::exp(in[i * block_size + j] - max_value);
157 scaled_exp_sum /= blocks;
158 *(out++) = std::log(scaled_exp_sum) + max_value;
163 template <
typename T,
class Context>
167 const int64_t block_size,
168 const int64_t blocks,
169 const T* segment_grad,
174 for (
int j = 0; j < block_size; ++j) {
175 const T out_grad = *(segment_grad++);
176 const T offset = *(data_out++);
177 for (
int i = 0; i < blocks; ++i) {
178 auto idx = i * block_size + j;
179 data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks;
186 template <
typename T,
class Context>
188 template <
typename T,
class Context>
190 static constexpr
const char* name =
"LogMeanExp";
191 static constexpr
const char* doc =
192 "LogMeanExp computes the element-wise log of the mean of exponentials of " 193 "input slices. Operation doesn't change the shape of individual blocks.";
196 template <
typename T,
class Context>
198 template <
typename T,
class Context>
201 template <
typename T>
205 const int64_t block_size,
206 const int64_t blocks,
210 for (
int j = 0; j < block_size; ++j) {
212 for (
int i = 0; i < blocks; ++i) {
213 avg_value += in[i * block_size + j] / blocks;
215 *(out++) = avg_value;
220 template <
typename T,
class Context>
224 const int64_t block_size,
225 const int64_t blocks,
226 const T* segment_grad,
231 const auto in_grad = 1.0 / blocks;
232 for (
int j = 0; j < block_size; ++j) {
233 const T out_grad = *(segment_grad++);
234 for (
int i = 0; i < blocks; ++i) {
235 auto idx = i * block_size + j;
236 data_grad[idx] = out_grad * in_grad;
243 template <
typename T,
class Context>
245 template <
typename T,
class Context>
247 static constexpr
const char* name =
"Mean";
248 static constexpr
const char* doc =
249 "Mean computation is done element-wise, so that each element of the " 250 "output slice corresponds to the average value of the respective " 251 "elements in the input slices. Operation doesn't change the shape of " 252 "individual blocks.";
255 template <
typename T,
class Context>
257 template <
typename T,
class Context>
260 template <
typename T>
264 const int64_t block_size,
265 const int64_t blocks,
269 for (
int j = 0; j < block_size; ++j) {
270 T max_value = std::numeric_limits<T>::lowest();
271 for (
int i = 0; i < blocks; ++i) {
272 max_value = std::max(max_value, in[i * block_size + j]);
274 *(out++) = max_value;
279 template <
typename T,
class Context>
283 const int64_t block_size,
284 const int64_t blocks,
285 const T* segment_grad,
291 static_cast<void*>(data_grad), 0, blocks * block_size *
sizeof(
T));
292 for (
int j = 0; j < block_size; ++j) {
293 const T out_grad = *(segment_grad++);
294 const T out = data_out[j];
295 for (
int i = 0; i < blocks; ++i) {
296 auto idx = i * block_size + j;
297 if (out == data_in[idx]) {
298 data_grad[idx] = out_grad;
306 template <
typename T,
class Context>
308 template <
typename T,
class Context>
310 static constexpr
const char* name =
"Max";
311 static constexpr
const char* doc =
312 "Max computation is done element-wise, so that each element of the " 313 "output slice corresponds to the max value of the respective " 314 "elements in the input slices. Operation doesn't change the shape of " 315 "individual blocks. This implementation imitates torch nn.Max operator. " 316 "If the maximum value occurs more than once, the operator will return " 317 "the first occurence of value. When computing the gradient using the " 318 "backward propagation, the gradient input corresponding to the first " 319 "occurence of the maximum value will be used.";
329 static constexpr
int kInputCount = 1;
333 vector<int64_t> block_shape;
336 explicit Meta(
bool first =
true) : first_dim(first) {}
339 first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
340 : block_shape.assign(dims.begin(), dims.end() - skip_dims);
342 : size_from_dim_(dims.
size() - skip_dims, dims);
345 void observeInput(
int input,
const Tensor& value,
int skip_dims) {
347 auto dims = value.sizes();
348 computeMeta(dims, skip_dims);
351 void appendOutputShape(vector<int64_t>* output_shape) {
352 output_shape->insert(
353 output_shape->end(), block_shape.begin(), block_shape.end());
356 vector<int64_t> getOutputShape(
const TensorShape& in,
int skip_dims) {
357 vector<int64_t> dims(in.dims().begin(), in.dims().end());
358 computeMeta(dims, skip_dims);
363 template <
int FixedSize>
370 static constexpr std::array<int, 0> originalInputs() {
371 return std::array<int, 0>();
374 static constexpr
bool computeLength() {
378 static int numAuxInputsWithGrads(
const OperatorDef& ) {
382 static bool requiresDataInput(
const OperatorDef& ) {
387 static bool requiresForwardOutput() {
393 vector<int64_t> block_shape;
396 Meta(
const Tensor& out_grad,
int skip_dims,
bool first_dim =
true)
397 : first_dim(first_dim) {
398 auto dims = out_grad.sizes();
399 first_dim ? block_shape.assign(dims.begin() + skip_dims, dims.end())
400 : block_shape.assign(dims.begin(), dims.end() - skip_dims);
401 block_size = first_dim
402 ? out_grad.size_from_dim(skip_dims)
403 : out_grad.size_from_dim(out_grad.dim() - skip_dims);
406 void observeOriginalInput(
412 void appendGradShape(vector<int64_t>* output_shape) {
413 output_shape->insert(
414 output_shape->end(), block_shape.begin(), block_shape.end());
420 template <
typename T,
class Context>
422 template <
typename T,
class Context>
425 template <
typename T>
431 : current_size_(0), out_(out) {
433 if (meta.first_dim) {
434 memset(out, 0,
sizeof(
T) * meta.block_size);
437 template <
int FixedSize>
443 if (meta.first_dim) {
444 math::AxpyFixedSize<T, CPUContext, FixedSize>(
445 meta.block_size, 1, in, out_, context);
447 math::Sum<T, CPUContext>(
448 meta.block_size, in, out_ + current_size_++, context);
457 template <
typename T,
class Context>
468 template <
int FixedSize>
475 if (FixedSize == 1) {
476 *data_grad = *s_grad_;
477 }
else if (meta.first_dim) {
478 context->template CopySameDevice<T>(meta.block_size, s_grad_, data_grad);
480 math::Set<T, Context>(length, s_grad_[offset], data_grad, context);
489 template <
typename T,
class Context>
491 template <
typename T,
class Context>
493 static constexpr
const char* name =
"Sum";
494 static constexpr
const char* doc =
495 "Summation is done element-wise across slices of the input tensor and " 496 "doesn't change the shape of the individual blocks.";
497 static void PopulateSchema(
OpSchema& ) {}
501 template <
typename T,
class Context>
503 template <
typename T,
class Context>
506 template <
typename T>
509 static constexpr
int kInputCount = 2;
518 explicit Meta(
bool first =
true) : first_dim(first) {}
520 void observeInput(
int input,
const Tensor& value,
int skip_dims) {
523 skip_dims, value.dim(),
"SCALARS mustn't have extra dimensions");
524 scalars = value.data<
T>();
527 BaseReducer::Meta::observeInput(input, value, skip_dims);
534 memset(out, 0,
sizeof(
T) * meta.block_size);
536 template <
int FixedSize>
538 process(
const Meta& meta,
const T* in, int64_t offset,
CPUContext* context) {
541 "WeightedSumReducer implemented only for " 542 "front dimensions reduction");
543 math::AxpyFixedSize<T, CPUContext, FixedSize>(
544 meta.block_size, meta.scalars[offset], in, out_, context);
551 template <
typename T,
class Context>
555 static constexpr std::array<int, 1> originalInputs() {
559 static int numAuxInputsWithGrads(
const OperatorDef& def) {
560 return GetFlagArgument(def,
"grad_on_weights");
563 static bool requiresDataInput(
const OperatorDef& def) {
564 return numAuxInputsWithGrads(def) > 0;
573 using BaseReducerGradient::Meta::Meta;
575 void observeOriginalInput(
580 CAFFE_ENFORCE_EQ(1, original_input);
581 scalars = value.data<
T>();
583 input_grad->ResizeLike(value);
584 scalars_grad = input_grad->template mutable_data<T>();
595 template <
int FixedSize>
602 math::ScaleFixedSize<T, CPUContext, FixedSize>(
603 meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
608 template <
int FixedSize>
609 void fillGradWithMainInput(
616 math::ScaleFixedSize<T, CPUContext, FixedSize>(
617 meta.block_size, meta.scalars[offset], s_grad_, data_grad, context);
619 meta.block_size, s_grad_, data, meta.scalars_grad + offset, context);
627 template <
typename T,
class Context>
629 template <
typename T,
class Context>
631 static constexpr
const char* name =
"WeightedSum";
632 static constexpr
const char* doc =
633 "Input slices are first scaled by SCALARS and then summed element-wise. " 634 "It doesn't change the shape of the individual blocks.";
635 static void PopulateSchema(
OpSchema& schema) {
636 schema.Input(0,
"DATA",
"Input tensor for the summation");
640 "Scalar multipliers for the input slices. Must be a vector with the " 641 "length matching the number of slices");
644 "Produce also gradient for `weights`. For now it's only supported in " 645 "`Lengths`-based operators");
649 template <
typename T,
class Context>
651 template <
typename T,
class Context>
654 template <
typename T>
660 : out_(out), current_size_(0) {
661 if (meta.first_dim) {
662 memset(out, 0,
sizeof(
T) * meta.block_size);
666 template <
int FixedSize>
672 if (meta.first_dim) {
673 math::AxpyFixedSize<T, CPUContext, FixedSize>(
674 meta.block_size, 1, in, out_, context);
676 math::Sum<T, CPUContext>(
677 meta.block_size, in, out_ + current_size_, context);
682 template <
int FixedSize>
684 if (meta.first_dim) {
685 if (current_size_ > 0) {
686 math::ScaleFixedSize<T, CPUContext, FixedSize>(
687 meta.block_size, 1.0 / current_size_, out_, out_, context);
690 math::ScaleFixedSize<T, CPUContext, FixedSize>(
691 current_size_, 1.0 / meta.block_size, out_, out_, context);
700 template <
typename T,
class Context>
703 static constexpr
bool computeLength() {
715 template <
int FixedSize>
722 CAFFE_ENFORCE_GT(length, 0,
"Segment length must be > 0");
723 if (meta.first_dim) {
724 math::ScaleFixedSize<T, CPUContext, FixedSize>(
725 meta.block_size, 1.0 / length, s_grad_, data_grad, context);
727 math::Set<T, CPUContext>(
728 length, s_grad_[offset] * 1.0f / length, data_grad, context);
737 template <
typename T,
class Context>
739 template <
typename T,
class Context>
741 static constexpr
const char* name =
"Mean";
742 static constexpr
const char* doc =
743 "Mean computes the element-wise mean of the input slices. " 744 "Operation doesn't change the shape of the individual blocks.";
745 static void PopulateSchema(
OpSchema& ) {}
748 template <
typename T,
class Context>
750 template <
typename T,
class Context>
753 template <
typename T>
759 : out_(out), current_size_(0) {
761 memset(out, 0,
sizeof(
T) * meta.block_size);
764 template <
int FixedSize>
772 "MaxReducer implemented only for front dimensions reduction");
773 if (current_size_ > 0) {
774 EigenVectorMap<T> output_vec(out_, meta.block_size);
776 output_vec.cwiseMax(ConstEigenVectorMap<T>(in, meta.block_size));
778 memcpy(out_, in,
sizeof(
T) * meta.block_size);
788 template <
typename T,
class Context>
791 static bool requiresDataInput(
const OperatorDef& ) {
795 static bool requiresForwardOutput() {
807 template <
int FixedSize>
808 void fillGradWithMainInputAndForwardOutput(
812 const T* forward_output,
816 for (int64_t i = 0; i < meta.block_size; ++i) {
817 data_grad[i] = data[i] == forward_output[i] ? s_grad_[i] : 0;
826 template <
typename T,
class Context>
828 template <
typename T,
class Context>
830 static constexpr
const char* name =
"Max";
831 static constexpr
const char* doc =
832 "Max computes the element-wise max of the input slices. " 833 "Operation doesn't change the shape of the individual blocks.";
834 static void PopulateSchema(
OpSchema& ) {}
839 #endif // CAFFE2_OPERATORS_RECUDER_FUNCTORS_H_
A class to record the schema of an op.
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
int64_t size_from_dim_(int k, IntArrayRef dims)
Return product of all dimensions starting from k.
constexpr size_t size() const
size - Get the array size.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...