#ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_OP_IMPL_H_

#include "caffe2/operators/conv_op.h"

#include "caffe2/core/context.h"
#include "caffe2/core/flags.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
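
// Templated implementation of ConvOp and ConvGradientOp for the NCHW and
// NHWC storage orders. Both orders lower convolution onto Im2Col/Col2Im
// buffers followed by GEMM calls from caffe2/utils/math.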
template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
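  // NCHW forward pass: validate shapes, then either take the 1x1 fast path
  // (a pure GEMM) or lower each image with Im2Col and contract it with the
  // filter matrix; the bias is broadcast with a GEMM against a vector of ones.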
  const auto& X = Input(INPUT);
  const auto& filter = Input(FILTER);
  Tensor* Y = Output(0);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int G = group_;
  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE_EQ(
      C,
      filter.dim32(1) * G,
      "Convolution op: input channels does not match: # of input channels ",
      C,
      " is not equal to kernel channels * group: ",
      filter.dim32(1),
      "*",
      G);
  CAFFE_ENFORCE_EQ(
      M % G, 0,
      "The number of output channels is not divisible by group.");
  int kernel_size = 1;
  for (std::size_t i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
    kernel_size *= kernel_[i];
  }
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, M);
  Y->template mutable_data<T>();
  const vector<int> X_dims = GetDims(X);
  const vector<int> Y_dims = GetDims(*Y);
  const int X_HxW = X.numel() / (N * C);
  const int Y_HxW = Y->numel() / (N * M);
  const vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend());
  vector<int> buffer_shape(Y_dims.size() + 1);
  buffer_shape[0] = C * kernel_size;
  std::copy(Y_dims.cbegin(), Y_dims.cend(), buffer_shape.begin() + 1);
  const int buffer_size = C * kernel_size * Y_HxW;
  const int kernel_dim = C / G * kernel_size;
  const int X_stride = C * X_HxW;
  const int Y_stride = M * Y_HxW;
  const int filter_stride = filter.numel() / G;
  const T* X_data = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* bias_data = nullptr;
  if (InputSize() == 3) {
    const auto& bias = Input(BIAS);
    CAFFE_ENFORCE_EQ(bias.dim(), 1);
    CAFFE_ENFORCE_EQ(bias.dim32(0), M);
    bias_data = bias.template data<T>();
    ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
        Y_HxW, &bias_multiplier_);
  }
  T* Y_data = Y->template mutable_data<T>();
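  // Shortcut: a 1x1 kernel with no padding and unit stride reduces to a
  // batched matrix multiplication over the spatial positions.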
  if (kernel_size == 1 && !HasPad() && !HasStride()) {
    return Run1x1ConvOnDeviceWithOrderNCHW(
        N, C, X_HxW, M, X_data, filter_data, bias_data, Y_data);
  }
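  // General path: for each image, expand the input into a column buffer of
  // shape (C * kernel_size) x Y_HxW, then contract it with the filters via
  // math::Gemm / math::GemmStridedBatched (the latter covering the grouped
  // case), and finally add the bias through one more GEMM.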
  const auto func = [&](Tensor* col_buffer) {
    col_buffer->Resize(buffer_shape);
    T* col_buffer_data = col_buffer->template mutable_data<T>();
    for (int image_id = 0; image_id < N; ++image_id) {
      if (kernel_.size() == 2) {
        math::Im2Col<T, Context, StorageOrder::NCHW>(
        math::Im2ColNd<T, Context, StorageOrder::NCHW>(
        math::Gemm<T, Context>(
        math::GemmStridedBatched<T, Context>(
      if (bias_data != nullptr) {
        math::Gemm<T, Context>(
            bias_multiplier_.template data<T>(),
  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
    runWithSharedBuffer<Context>(ws_, func);
template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  CAFFE_ENFORCE_LE(
      kernel_.size(),
      3,
      "Only 1-3d convolution is supported for NHWC storage type");
  const Tensor& X = Input(INPUT);
  const auto& filter = Input(FILTER);
  Tensor* Y = Output(0);
  const int N = X.dim32(0), C = X.dim32(X.dim() - 1);
  const int G = group_;
  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE_EQ(
      C,
      filter.dim32(filter.dim() - 1) * G,
      "Convolution op: input channels does not match: # of input channels ",
      C,
      " is not equal to kernel channels * group: ",
      filter.dim32(filter.dim() - 1),
      "*",
      G);
  CAFFE_ENFORCE_EQ(
      M % G, 0,
      "The number of output channels is not divisible by group.");
  int kernel_size = 1;
  for (std::size_t i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
    kernel_size *= kernel_[i];
  }
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, M);
  Y->template mutable_data<T>();
  const vector<int> Y_dims = GetDims(*Y);
  const int X_HxW = X.numel() / (N * C);
  const int Y_HxW = Y->numel() / (N * M);
  const vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend());
  vector<int> buffer_shape(Y_dims.size() + 1);
  std::copy(Y_dims.cbegin(), Y_dims.cend(), buffer_shape.begin());
  buffer_shape.back() = C * kernel_size;
  const int buffer_size = C * kernel_size * Y_HxW;
  const int kernel_dim = C / G * kernel_size;
  const int input_offset = X_HxW * C;
  const int output_offset = Y->numel() / Y->dim32(0);
  const T* X_data = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* bias_data = nullptr;
  if (InputSize() == 3) {
    const auto& bias = Input(BIAS);
    CAFFE_ENFORCE_EQ(bias.dim(), 1);
    CAFFE_ENFORCE_EQ(bias.dim32(0), M);
    bias_data = bias.template data<T>();
  }
  T* Y_data = Y->template mutable_data<T>();
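  // Shortcut: a pointwise (1x1) kernel with no padding or stride is a single
  // GEMM over all N * X_HxW pixels.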
  if (kernel_dim == (C / group_) && !HasPad() && !HasStride()) {
    if (bias_data != nullptr) {
      ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
          N * X_HxW, &bias_multiplier_);
    }
    return Run1x1ConvOnDeviceWithOrderNHWC(
        N, C, X_HxW, M, X_data, filter_data, bias_data, Y_data);
  }
  if (bias_data != nullptr) {
    ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
        Y_HxW, &bias_multiplier_);
  }
276 auto f = [&](
Tensor* col_buffer) {
277 col_buffer->Resize(buffer_shape);
278 T* col_buffer_data = col_buffer->template mutable_data<T>();
280 for (
int image_id = 0; image_id < N; ++image_id) {
281 if (kernel_.size() <= 2) {
282 math::Im2Col<T, Context, StorageOrder::NHWC>(
285 kernel_.size() == 2 ? X.dim32(2) : 1,
287 kernel_.size() == 2 ? kernel_w() : 1,
289 kernel_.size() == 2 ? dilation_w() : 1,
291 kernel_.size() == 2 ? pad_l() : 0,
292 kernel_.size() == 2 ? pad_b() : pad_l(),
293 kernel_.size() == 2 ? pad_r() : 0,
295 kernel_.size() == 2 ? stride_w() : 1,
301 math::Im2ColNd<T, Context, StorageOrder::NHWC>(
317 for (
int group_id = 0; group_id < group_; ++group_id) {
320 math::GemmEx<T, Context>(
327 col_buffer_data + group_id * kernel_dim,
329 filter_data + group_id * (M / group_) * kernel_dim,
332 Y_data + group_id * (M / group_),
336 if (bias_data !=
nullptr) {
338 math::Gemm<T, Context>(
345 bias_multiplier_.template data<T>(),
351 X_data += input_offset;
352 Y_data += output_offset;
355 if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
356 runWithSharedBuffer<Context>(ws_, f);
template <typename T, class Context>
bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNCHW(
    const int N, const int C, const int HxW, const int M,
    const T* X, const T* filter, const T* bias, T* Y) {
  const int G = group_;
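  // The ungrouped case can run as one strided-batched GEMM over the images;
  // the grouped case builds per-image, per-group pointer arrays below and
  // hands them to GemmBatched.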
  math::GemmStridedBatched<T, Context>(
  const int batch_size = N * G;
  const int D_X = C / G;
  const int D_Y = M / G;
  const int X_stride = D_X * HxW;
  const int W_stride = D_Y * D_X;
  const int Y_stride = D_Y * HxW;
  std::vector<const T*> X_ptr(N * G);
  std::vector<const T*> W_ptr(N * G);
  std::vector<T*> Y_ptr(N * G);
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < G; ++j) {
      const int index = i * G + j;
      X_ptr[index] = X + index * X_stride;
      W_ptr[index] = filter + j * W_stride;
      Y_ptr[index] = Y + index * Y_stride;
    }
  }
  math::GemmBatched<T, Context>(
  if (bias != nullptr) {
    const T* bias_multiplier_data = bias_multiplier_.template data<T>();
    math::GemmStridedBatched<T, Context>(
        bias_multiplier_data,
template <typename T, class Context>
bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNHWC(
    const int N, const int C, const int HxW, const int M,
    const T* X, const T* filter, const T* bias, T* Y) {
  const int G = group_;
  const int kernel_dim = C / G;
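  // In NHWC a 1x1 convolution is, per group, a (N * HxW) x (C / G) by
  // (C / G) x (M / G) matrix product; GemmEx takes explicit leading
  // dimensions so each group can address its slice of X, filter, and Y
  // in place.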
  for (int group_id = 0; group_id < group_; ++group_id) {
    math::GemmEx<T, Context>(
        X + group_id * kernel_dim,
        filter + group_id * (M / group_) * kernel_dim,
        Y + group_id * (M / group_),
  if (bias != nullptr) {
    const T* bias_multiplier_data = bias_multiplier_.template data<T>();
    math::Gemm<T, Context>(
        bias_multiplier_data,
template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
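  // NCHW backward pass: the filter gradient is accumulated over images as a
  // GEMM of dY with the im2col buffer, and the bias gradient as a Gemv of dY
  // against a vector of ones; if the input gradient is requested, a second
  // pass multiplies the filters with dY and scatters the result back with
  // Col2Im.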
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  const int N = X.dim32(0), C = X.dim32(1);
  const vector<int> input_dims = this->GetDims(X);
  const int input_image_size = this->GetDimsSize(X);
  const vector<int> output_dims = this->GetDims(dY);
  const int output_image_size = this->GetDimsSize(dY);
  ConvPoolOpBase<Context>::ComputePads(input_dims);
  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE_EQ(C, filter.dim32(1) * group_);
  int kernel_dims_size = 1;
  for (int i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
    kernel_dims_size *= kernel_[i];
  }
  CAFFE_ENFORCE_EQ(M % group_, 0);
  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());
  const int kernel_dim = C / group_ * kernel_dims_size;
  vector<int> img_shape;
  img_shape.assign(X.sizes().begin() + 1, X.sizes().end());
  vector<int> col_buffer_shape;
  col_buffer_shape.push_back(C / group_ * kernel_dims_size);
  col_buffer_shape.insert(
      col_buffer_shape.end(), output_dims.begin(), output_dims.end());
  vector<int64_t> col_buffer_shape_64;
  std::copy(
      col_buffer_shape.cbegin(),
      col_buffer_shape.cend(),
      std::back_inserter(col_buffer_shape_64));
  ReinitializeTensor(
      &col_buffer_,
      col_buffer_shape_64,
      at::dtype<T>().device(Context::GetDeviceType()));
  if (kernel_.size() != 2) {
    SetDeviceTensor(img_shape, &img_shape_device_);
    SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
  }
  const int col_buffer_size =
      (C / group_) * kernel_dims_size * output_image_size;
  const T* Xdata = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* dYdata = dY.template data<T>();
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();
  // Pre-set the filter gradient to zero; it is accumulated over images.
  math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_);
  T* dbias_data = nullptr;
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>());
    ReinitializeTensor(
        &bias_multiplier_,
        vector<int64_t>(1, output_image_size),
        at::dtype<T>().device(Context::GetDeviceType()));
    math::Set<T, Context>(
        output_image_size,
        static_cast<T>(1),
        bias_multiplier_.template mutable_data<T>(),
        &context_);
    dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_);
  }
  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    auto* dX = Output(
        no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
    dX->template mutable_data<T>();
  }
  const int input_offset = C / group_ * input_image_size;
  const int output_offset = dY.numel() / dY.dim32(0) / group_;
  const int filter_offset = filter.numel() / group_;
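  // First pass over the images: accumulate dfilter (and dbias) group by group.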
  for (int image_id = 0; image_id < N; ++image_id) {
    for (int group_id = 0; group_id < group_; ++group_id) {
      if (kernel_.size() == 2) {
        math::Im2Col<T, Context, StorageOrder::NCHW>(
            Xdata + group_id * input_offset,
        math::Im2ColNd<T, Context, StorageOrder::NCHW>(
            col_buffer_shape.data(),
            Xdata + group_id * input_offset,
      math::Gemm<T, Context>(
          dYdata + group_id * output_offset,
          dfilter_data + group_id * filter_offset,
      math::Gemv<T, Context>(
          bias_multiplier_.template data<T>(),
    Xdata += input_offset * group_;
    dYdata += output_offset * group_;
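  // Second pass: compute the input gradient only when it is requested as an
  // output (its slot depends on whether the op was created with no_bias).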
  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    auto* dX = Output(
        no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
    T* dXdata = dX->template mutable_data<T>();
    dYdata = dY.template data<T>();
    for (int image_id = 0; image_id < N; ++image_id) {
      for (int group_id = 0; group_id < group_; ++group_id) {
        math::Gemm<T, Context>(
            filter_data + group_id * filter_offset,
        if (kernel_.size() == 2) {
          math::Col2Im<T, Context, StorageOrder::NCHW>(
          math::Col2ImNd<T, Context, StorageOrder::NCHW>(
              col_buffer_shape.data(),
      dXdata += input_offset;
      dYdata += output_offset;
template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
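  // NHWC backward pass: same structure as the NCHW version, but the column
  // buffer keeps channels innermost, so every group is addressed through
  // GemmEx with explicit leading dimensions instead of contiguous strides.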
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  const int N = X.dim32(0), C = X.dim32(X.dim() - 1);
  const vector<int> input_dims = this->GetDims(X);
  const int input_image_size = this->GetDimsSize(X);
  const vector<int> output_dims = this->GetDims(dY);
  const int output_image_size = this->GetDimsSize(dY);
  ConvPoolOpBase<Context>::ComputePads(input_dims);
  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE_EQ(C, filter.dim32(filter.dim() - 1) * group_);
  int kernel_dims_size = 1;
  for (size_t i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
    kernel_dims_size *= kernel_[i];
  }
  CAFFE_ENFORCE_EQ(M % group_, 0);
  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());
  const int kernel_dim = C / group_ * kernel_dims_size;
  vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend());
  vector<int> col_buffer_shape(output_dims.size() + 1);
  std::copy(output_dims.cbegin(), output_dims.cend(), col_buffer_shape.begin());
  col_buffer_shape.back() = C * kernel_dims_size;
  vector<int64_t> col_buffer_shape_64;
  std::copy(
      col_buffer_shape.cbegin(),
      col_buffer_shape.cend(),
      std::back_inserter(col_buffer_shape_64));
  ReinitializeTensor(
      &col_buffer_,
      col_buffer_shape_64,
      at::dtype<T>().device(Context::GetDeviceType()));
  if (kernel_.size() != 2) {
    SetDeviceTensor(img_shape, &img_shape_device_);
    SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
  }
  const int col_buffer_size = C * kernel_dims_size * output_image_size;
  const T* Xdata = X.template data<T>();
  const T* const filter_data = filter.template data<T>();
  const T* const dYdata = dY.template data<T>();
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();
  // Pre-set the filter gradient to zero; it is accumulated over images.
  math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_);
  T* dbias_data = nullptr;
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>());
    dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_);
    ReinitializeTensor(
        &bias_multiplier_,
        vector<int64_t>(1, output_image_size),
        at::dtype<T>().device(Context::GetDeviceType()));
    math::Set<T, Context>(
        output_image_size,
        static_cast<T>(1),
        bias_multiplier_.template mutable_data<T>(),
        &context_);
  }
  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    auto* dX = Output(
        no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
    dX->template mutable_data<T>();
  }
  const int input_offset = C * input_image_size;
  const int output_offset = dY.numel() / dY.dim32(0);
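  // First pass over the images: accumulate dfilter (and dbias).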
  for (int image_id = 0; image_id < N; ++image_id) {
    if (kernel_.size() <= 2) {
      math::Im2Col<T, Context, StorageOrder::NHWC>(
          kernel_.size() == 2 ? X.dim32(2) : 1,
          kernel_.size() == 2 ? kernel_w() : 1,
          kernel_.size() == 2 ? dilation_w() : 1,
          kernel_.size() == 2 ? pad_l() : 0,
          kernel_.size() == 2 ? pad_b() : pad_l(),
          kernel_.size() == 2 ? pad_r() : 0,
          kernel_.size() == 2 ? stride_w() : 1,
      math::Im2ColNd<T, Context, StorageOrder::NHWC>(
          C * input_image_size,
          col_buffer_shape.data(),
    for (int group_id = 0; group_id < group_; ++group_id) {
      math::GemmEx<T, Context>(
          dYdata + output_offset * image_id + group_id * (M / group_),
          col_buffer_data + group_id * kernel_dim,
          dfilter_data + group_id * (M / group_) * kernel_dim,
    math::Gemv<T, Context>(
        dYdata + output_offset * image_id,
        bias_multiplier_.template data<T>(),
    Xdata += input_offset;
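  // Second pass: compute dX by multiplying dY with the filters, then scatter
  // the column buffer back to image layout with Col2Im.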
  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    auto* dX = Output(
        no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
    T* dXdata = dX->template mutable_data<T>();
    for (int image_id = 0; image_id < N; ++image_id) {
      for (int group_id = 0; group_id < group_; ++group_id) {
        math::GemmEx<T, Context>(
            dYdata + output_offset * image_id + group_id * (M / group_),
            filter_data + group_id * (M / group_) * kernel_dim,
            col_buffer_data + group_id * kernel_dim,
      if (kernel_.size() <= 2) {
        math::Col2Im<T, Context, StorageOrder::NHWC>(
            kernel_.size() == 2 ? X.dim32(2) : 1,
            kernel_.size() == 2 ? kernel_w() : 1,
            kernel_.size() == 2 ? dilation_w() : 1,
            kernel_.size() == 2 ? pad_l() : 0,
            kernel_.size() == 2 ? pad_b() : pad_l(),
            kernel_.size() == 2 ? pad_r() : 0,
            kernel_.size() == 2 ? stride_w() : 1,
        math::Col2ImNd<T, Context, StorageOrder::NHWC>(
            C * input_image_size,
            col_buffer_shape.data(),
      dXdata += input_offset;
#endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_