Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_op_impl.h
1 // conv_op_impl.h is the templated implementation of the conv_op.h file.
2 #ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
3 #define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
4 
5 #include "caffe2/operators/conv_op.h"
6 
7 #include <array>
8 #include <vector>
9 
10 #include "caffe2/core/context.h"
11 #include "caffe2/core/flags.h"
12 #include "caffe2/core/logging.h"
13 #include "caffe2/core/operator.h"
14 #include "caffe2/operators/conv_pool_op_base.h"
15 #include "caffe2/utils/eigen_utils.h"
16 #include "caffe2/utils/math.h"
17 
18 namespace caffe2 {
19 
20 template <typename T, class Context>
21 bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
22  const auto& X = Input(INPUT);
23  const auto& filter = Input(FILTER);
24  auto* Y = Output(0);
25  const int N = X.dim32(0);
26  const int C = X.dim32(1);
27  const int G = group_;
28  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
29  const int M = filter.dim32(0);
30  CAFFE_ENFORCE_EQ(
31  C,
32  filter.dim32(1) * G,
33  "Convolution op: input channels does not match: # of input channels ",
34  C,
35  " is not equal to kernel channels * group: ",
36  filter.dim32(1),
37  "*",
38  G);
39  CAFFE_ENFORCE_EQ(
40  M % G, 0, "The number of output channels is not divisible by group.");
41 
42  int kernel_size = 1;
43  for (std::size_t i = 0; i < kernel_.size(); ++i) {
44  CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
45  kernel_size *= kernel_[i];
46  }
47  ConvPoolOpBase<Context>::SetOutputSize(X, Y, M);
48 
49  if (N == 0) {
50  Y->template mutable_data<T>();
51  return true;
52  }
53 
54  const vector<int> X_dims = GetDims(X);
55  const vector<int> Y_dims = GetDims(*Y);
56  const int X_HxW = X.numel() / (N * C);
57  const int Y_HxW = Y->numel() / (N * M);
58  const vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend());
59  vector<int> buffer_shape(Y_dims.size() + 1);
60  buffer_shape[0] = C * kernel_size;
61  std::copy(Y_dims.cbegin(), Y_dims.cend(), buffer_shape.begin() + 1);
62 
63  const int buffer_size = C * kernel_size * Y_HxW;
64 
65  // The dimension of each kernel
66  const int kernel_dim = C / G * kernel_size;
67  const int X_stride = C * X_HxW;
68  const int Y_stride = M * Y_HxW;
69  const int filter_stride = filter.numel() / G;
70 
71  // The col buffer is stored in CHW order as well - kernel_dim, and the height
72  // and width.
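 // Concretely: for group_ == 1 the col buffer holds a (C * kernel_size) x
 // Y_HxW matrix per image and the filter is viewed as an M x (C * kernel_size)
 // matrix, so each image reduces to one GEMM, Y_i = filter * col(X_i).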
73  const T* X_data = X.template data<T>();
74  const T* filter_data = filter.template data<T>();
75  const T* bias_data = nullptr;
76  if (InputSize() == 3) {
77  const auto& bias = Input(BIAS);
78  CAFFE_ENFORCE_EQ(bias.dim(), 1);
79  CAFFE_ENFORCE_EQ(bias.dim32(0), M);
80  bias_data = bias.template data<T>();
81  ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
82  Y_HxW, &bias_multiplier_);
83  }
84  T* Y_data = Y->template mutable_data<T>();
85 
86  // Shortcut for 1x1 conv.
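 // With a 1x1 kernel, no padding and unit stride, im2col would just copy X
 // (the col buffer would equal the C x X_HxW input), so it is skipped and the
 // GEMMs run directly on X.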
87  if (kernel_size == 1 && !HasPad() && !HasStride()) {
88  return Run1x1ConvOnDeviceWithOrderNCHW(
89  N, C, X_HxW, M, X_data, filter_data, bias_data, Y_data);
90  }
91 
92  const auto func = [&](Tensor* col_buffer) {
93  col_buffer->Resize(buffer_shape);
94  T* col_buffer_data = col_buffer->template mutable_data<T>();
95  // Im2Col, followed by gemm.
96  for (int image_id = 0; image_id < N; ++image_id) {
97  if (kernel_.size() == 2) {
98  math::Im2Col<T, Context, StorageOrder::NCHW>(
99  C,
100  X_dims[0],
101  X_dims[1],
102  kernel_h(),
103  kernel_w(),
104  dilation_h(),
105  dilation_w(),
106  pad_t(),
107  pad_l(),
108  pad_b(),
109  pad_r(),
110  stride_h(),
111  stride_w(),
112  X_data,
113  col_buffer_data,
114  &context_);
115  } else {
116  math::Im2ColNd<T, Context, StorageOrder::NCHW>(
117  kernel_.size(),
118  C * X_HxW,
119  buffer_size,
120  img_shape.data(),
121  buffer_shape.data(),
122  kernel_.data(),
123  stride_.data(),
124  dilation_.data(),
125  pads_.data(),
126  X_data,
127  col_buffer_data,
128  &context_);
129  }
130  // Weight term
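 // With groups, the filter, col buffer and output are split into G contiguous
 // blocks (strides filter_stride, buffer_size / G and Y_stride / G), so the
 // grouped case is one strided-batched GEMM per image.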
131  if (G == 1) {
132  math::Gemm<T, Context>(
133  CblasNoTrans,
134  CblasNoTrans,
135  M,
136  Y_HxW,
137  kernel_dim,
138  1.0f,
139  filter_data,
140  col_buffer_data,
141  0.0f,
142  Y_data,
143  &context_);
144  } else {
145  math::GemmStridedBatched<T, Context>(
146  CblasNoTrans,
147  CblasNoTrans,
148  G,
149  M / G,
150  Y_HxW,
151  kernel_dim,
152  1.0f,
153  filter_data,
154  filter_stride,
155  col_buffer_data,
156  buffer_size / G,
157  0.0f,
158  Y_data,
159  Y_stride / G,
160  &context_);
161  }
162  if (bias_data != nullptr) {
163  // Bias term can be carried out outside the group definition
164  // to be efficient.
165  math::Gemm<T, Context>(
166  CblasNoTrans,
167  CblasNoTrans,
168  M,
169  Y_HxW,
170  1,
171  1.0f,
172  bias_data,
173  bias_multiplier_.template data<T>(),
174  1.0f,
175  Y_data,
176  &context_);
177  }
178  X_data += X_stride;
179  Y_data += Y_stride;
180  }
181  };
182  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
183  runWithSharedBuffer<Context>(ws_, func);
184  } else {
185  func(&col_buffer_);
186  }
187  return true;
188 }
189 
190 // The implementations.
191 template <typename T, class Context>
192 bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
193  CAFFE_ENFORCE_LE(
194  kernel_.size(),
195  3,
196  "Only 1-3d convolution is supported for NHWC storage type");
197  const Tensor& X = Input(INPUT);
198  const auto& filter = Input(FILTER);
199  Tensor* Y = Output(0);
200  const int N = X.dim32(0), C = X.dim32(X.dim() - 1);
201  const int G = group_;
202  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
203  const int M = filter.dim32(0);
204  CAFFE_ENFORCE_EQ(
205  C,
206  filter.dim32(filter.dim() - 1) * G,
207  "Convolution op: input channels does not match: # of input channels ",
208  C,
209  " is not equal to kernel channels * group: ",
210  filter.dim32(filter.dim() - 1),
211  "*",
212  G);
213  CAFFE_ENFORCE_EQ(
214  M % G, 0, "The number of output channels is not divisible by group.");
215 
216  int kernel_size = 1;
217  for (std::size_t i = 0; i < kernel_.size(); ++i) {
218  CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
219  kernel_size *= kernel_[i];
220  }
221  ConvPoolOpBase<Context>::SetOutputSize(X, Y, M);
222 
223  if (N == 0) {
224  Y->template mutable_data<T>();
225  return true;
226  }
227 
228  const vector<int> Y_dims = GetDims(*Y);
229  const int X_HxW = X.numel() / (N * C);
230  const int Y_HxW = Y->numel() / (N * M);
231  const vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend());
232  vector<int> buffer_shape(Y_dims.size() + 1);
233  std::copy(Y_dims.cbegin(), Y_dims.cend(), buffer_shape.begin());
234  buffer_shape.back() = C * kernel_size;
235 
236  const int buffer_size = C * kernel_size * Y_HxW;
237 
238  // The dimension of each kernel
239  const int kernel_dim = C / G * kernel_size;
240  // The offset corresponding to a single input image, and a single output
241  // image.
242  const int input_offset = X_HxW * C;
243  const int output_offset = Y->numel() / Y->dim32(0);
244 
245  // The output image size is the spatial size of the output.
246  // The col buffer is stored in HWC order as well - the height and width, and
247  // kernel_dim.
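 // I.e., per image the col buffer is a Y_HxW x (C * kernel_size) matrix with
 // the kernel/channel dimension innermost, which is why the GEMMs below pass
 // the filter transposed (CblasTrans) and write output rows of length M.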
248  const T* X_data = X.template data<T>();
249  const T* filter_data = filter.template data<T>();
250  const T* bias_data = nullptr;
251  if (InputSize() == 3) {
252  const auto& bias = Input(BIAS);
253  CAFFE_ENFORCE_EQ(bias.dim(), 1);
254  CAFFE_ENFORCE_EQ(bias.dim32(0), M);
255  bias_data = bias.template data<T>();
256  }
257  T* Y_data = Y->template mutable_data<T>();
258 
259  // Specialized path for 1 by 1 convolution with stride 1, pad 0 - we
260  // can skip im2col.
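 // In NHWC the whole batch is already an (N * X_HxW) x C matrix, so the 1x1
 // case needs just one GEMM per group over all images at once (hence the
 // larger bias_multiplier_ of length N * X_HxW below).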
261  if (kernel_dim == (C / group_) && !HasPad() && !HasStride()) {
262  if (bias_data != nullptr) {
263  // For this specialized path, we need a bigger bias_multiplier_ because
264  // we're doing just 1 big GEMM.
265  ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
266  N * X_HxW, &bias_multiplier_);
267  }
268  return Run1x1ConvOnDeviceWithOrderNHWC(
269  N, C, X_HxW, M, X_data, filter_data, bias_data, Y_data);
270  }
271 
272  if (bias_data != nullptr) {
273  ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
274  Y_HxW, &bias_multiplier_);
275  }
276  auto f = [&](Tensor* col_buffer) {
277  col_buffer->Resize(buffer_shape);
278  T* col_buffer_data = col_buffer->template mutable_data<T>();
279  // Im2Col, followed by gemm.
280  for (int image_id = 0; image_id < N; ++image_id) {
281  if (kernel_.size() <= 2) {
282  math::Im2Col<T, Context, StorageOrder::NHWC>(
283  C,
284  X.dim32(1),
285  kernel_.size() == 2 ? X.dim32(2) : 1,
286  kernel_h(),
287  kernel_.size() == 2 ? kernel_w() : 1,
288  dilation_h(),
289  kernel_.size() == 2 ? dilation_w() : 1,
290  pad_t(),
291  kernel_.size() == 2 ? pad_l() : 0,
292  kernel_.size() == 2 ? pad_b() : pad_l(),
293  kernel_.size() == 2 ? pad_r() : 0,
294  stride_h(),
295  kernel_.size() == 2 ? stride_w() : 1,
296  X_data,
297  col_buffer_data,
298  &context_,
299  group_);
300  } else {
301  math::Im2ColNd<T, Context, StorageOrder::NHWC>(
302  kernel_.size(),
303  C * X_HxW,
304  buffer_size,
305  img_shape.data(),
306  buffer_shape.data(),
307  kernel_.data(),
308  stride_.data(),
309  dilation_.data(),
310  pads_.data(),
311  X_data,
312  col_buffer_data,
313  &context_,
314  group_);
315  }
316  // Weight term
317  for (int group_id = 0; group_id < group_; ++group_id) {
318  // col_buffer_data in G (H W) (R S C/G) layout
319  // filter_data in G K/G (R S C/G) layout
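 // Each iteration computes Y_g = col_g * W_g^T on strided views: group g
 // occupies a contiguous block of kernel_dim columns in every row of the col
 // buffer (lda = group_ * kernel_dim) and M / group_ columns in every row of
 // Y (ldc = M).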
320  math::GemmEx<T, Context>(
321  CblasNoTrans,
322  CblasTrans,
323  Y_HxW,
324  M / group_,
325  kernel_dim,
326  1,
327  col_buffer_data + group_id * kernel_dim,
328  group_ * kernel_dim,
329  filter_data + group_id * (M / group_) * kernel_dim,
330  kernel_dim,
331  0,
332  Y_data + group_id * (M / group_),
333  M,
334  &context_);
335  }
336  if (bias_data != nullptr) {
337  // Bias term
338  math::Gemm<T, Context>(
339  CblasNoTrans,
340  CblasNoTrans,
341  Y_HxW,
342  M,
343  1,
344  1,
345  bias_multiplier_.template data<T>(),
346  bias_data,
347  1,
348  Y_data,
349  &context_);
350  }
351  X_data += input_offset;
352  Y_data += output_offset;
353  }
354  };
355  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
356  runWithSharedBuffer<Context>(ws_, f);
357  } else {
358  f(&col_buffer_);
359  }
360  return true;
361 }
362 
363 template <typename T, class Context>
364 bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNCHW(
365  const int N,
366  const int C,
367  const int HxW,
368  const int M,
369  const T* X,
370  const T* filter,
371  const T* bias,
372  T* Y) {
373  const int G = group_;
374  if (G == 1) {
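 // A filter stride of 0 reuses the same M x C weight matrix for every one of
 // the N batched GEMMs, so this computes Y_i = filter * X_i for each image.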
375  math::GemmStridedBatched<T, Context>(
376  CblasNoTrans,
377  CblasNoTrans,
378  N,
379  M,
380  HxW,
381  C,
382  1.0f,
383  filter,
384  0,
385  X,
386  C * HxW,
387  0.0f,
388  Y,
389  M * HxW,
390  &context_);
391  } else {
392  const int batch_size = N * G;
393  const int D_X = C / G;
394  const int D_Y = M / G;
395  const int X_stride = D_X * HxW;
396  const int W_stride = D_Y * D_X;
397  const int Y_stride = D_Y * HxW;
398  std::vector<const T*> X_ptr(N * G);
399  std::vector<const T*> W_ptr(N * G);
400  std::vector<T*> Y_ptr(N * G);
401  for (int i = 0; i < N; ++i) {
402  for (int j = 0; j < G; ++j) {
403  const int index = i * G + j;
404  X_ptr[index] = X + index * X_stride;
405  W_ptr[index] = filter + j * W_stride;
406  Y_ptr[index] = Y + index * Y_stride;
407  }
408  }
409  math::GemmBatched<T, Context>(
410  CblasNoTrans,
411  CblasNoTrans,
412  batch_size,
413  D_Y,
414  HxW,
415  D_X,
416  1.0f,
417  W_ptr.data(),
418  X_ptr.data(),
419  0.0f,
420  Y_ptr.data(),
421  &context_);
422  }
423  if (bias != nullptr) {
424  const T* bias_multiplier_data = bias_multiplier_.template data<T>();
425  math::GemmStridedBatched<T, Context>(
426  CblasNoTrans,
427  CblasNoTrans,
428  N,
429  M,
430  HxW,
431  1,
432  1.0f,
433  bias,
434  0,
435  bias_multiplier_data,
436  0,
437  1.0f,
438  Y,
439  M * HxW,
440  &context_);
441  }
442  return true;
443 }
444 
445 template <typename T, class Context>
446 bool ConvOp<T, Context>::Run1x1ConvOnDeviceWithOrderNHWC(
447  const int N,
448  const int C,
449  const int HxW,
450  const int M,
451  const T* X,
452  const T* filter,
453  const T* bias,
454  T* Y) {
455  const int G = group_;
456  const int kernel_dim = C / G;
457  for (int group_id = 0; group_id < group_; ++group_id) {
458  math::GemmEx<T, Context>(
459  CblasNoTrans,
460  CblasTrans,
461  N * HxW,
462  M / group_,
463  kernel_dim,
464  1.0f,
465  X + group_id * kernel_dim,
466  C,
467  filter + group_id * (M / group_) * kernel_dim,
468  kernel_dim,
469  0.0f,
470  Y + group_id * (M / group_),
471  M,
472  &context_);
473  }
474  if (bias != nullptr) {
475  const T* bias_multiplier_data = bias_multiplier_.template data<T>();
476  math::Gemm<T, Context>(
477  CblasNoTrans,
478  CblasNoTrans,
479  N * HxW,
480  M,
481  1,
482  1.0f,
483  bias_multiplier_data,
484  bias,
485  1.0f,
486  Y,
487  &context_);
488  }
489  return true;
490 }
491 
492 template <typename T, class Context>
493 bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
494  auto& X = Input(INPUT);
495  auto& filter = Input(FILTER);
496  auto& dY = Input(OUTPUT_GRAD);
497 
498  const int N = X.dim32(0), C = X.dim32(1);
499 
500  const vector<int> input_dims = this->GetDims(X);
501  const int input_image_size = this->GetDimsSize(X);
502 
503  const vector<int> output_dims = this->GetDims(dY);
504  // The output image size is the spatial size of the output.
505  const int output_image_size = this->GetDimsSize(dY);
506 
507  ConvPoolOpBase<Context>::ComputePads(input_dims);
508  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
509  const int M = filter.dim32(0);
510  CAFFE_ENFORCE_EQ(C, filter.dim32(1) * group_);
511 
512  int kernel_dims_size = 1;
513  for (std::size_t i = 0; i < kernel_.size(); ++i) {
514  CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]);
515  kernel_dims_size *= kernel_[i];
516  }
517 
518  CAFFE_ENFORCE_EQ(M % group_, 0);
519  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());
520  // The dimension of each kernel
521  const int kernel_dim = C / group_ * kernel_dims_size;
522  // The col buffer is stored in CHW order as well - kernel_dim, and the height
523  // and width.
524  vector<int> img_shape;
525  img_shape.assign(X.sizes().begin() + 1, X.sizes().end());
526  vector<int> col_buffer_shape;
527  col_buffer_shape.push_back(C / group_ * kernel_dims_size);
528  col_buffer_shape.insert(
529  col_buffer_shape.end(), output_dims.begin(), output_dims.end());
530  vector<int64_t> col_buffer_shape_64;
531  std::copy(
532  col_buffer_shape.cbegin(),
533  col_buffer_shape.cend(),
534  std::back_inserter(col_buffer_shape_64));
535  ReinitializeTensor(
536  &col_buffer_,
537  col_buffer_shape_64,
538  at::dtype<T>().device(Context::GetDeviceType()));
539 
540  if (kernel_.size() != 2) {
541  // TODO: SetDeviceTensor accept vector<int64_t>
542  SetDeviceTensor(img_shape, &img_shape_device_);
543  SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
544  }
545 
546  const int col_buffer_size =
547  (C / group_) * kernel_dims_size * output_image_size;
548  const T* Xdata = X.template data<T>();
549  const T* filter_data = filter.template data<T>();
550  const T* dYdata = dY.template data<T>();
551  T* col_buffer_data = col_buffer_.template mutable_data<T>();
552  T* dfilter_data = dfilter->template mutable_data<T>();
553 
554  // Pre-setting the gradients to zero.
555  math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_);
556 
557  T* dbias_data = nullptr;
558  if (!no_bias_) {
559  auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>());
560  // Removed the check for whether bias_multiplier_ has correct size or not
561  ReinitializeTensor(
562  &bias_multiplier_,
563  vector<int64_t>(1, output_image_size),
564  at::dtype<T>().device(Context::GetDeviceType()));
565  math::Set<T, Context>(
566  output_image_size,
567  static_cast<T>(1),
568  bias_multiplier_.template mutable_data<T>(),
569  &context_);
570  dbias_data = dbias->template mutable_data<T>();
571  math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_);
572  }
573 
574  if (N == 0) {
575  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
576  auto* dX = Output(
577  no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
578  X.sizes(),
579  at::dtype<T>());
580  dX->template mutable_data<T>();
581  }
582  return true;
583  }
584 
585  // The offset corresponding to a single input image, and a single output
586  // image.
587  const int input_offset = C / group_ * input_image_size;
588  const int output_offset = dY.numel() / dY.dim32(0) / group_;
589  const int filter_offset = filter.numel() / group_;
590  for (int image_id = 0; image_id < N; ++image_id) {
591  for (int group_id = 0; group_id < group_; ++group_id) {
592  // When we compute the gradient with respect to the filters, we need to do
593  // im2col to allow gemm-type computation.
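 // Per image and group this accumulates dfilter_g += dY_g * col^T, an
 // (M / group_) x kernel_dim update, which the transposed GEMM below performs
 // with beta = 1 so contributions sum over images.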
594  if (kernel_.size() == 2) {
595  math::Im2Col<T, Context, StorageOrder::NCHW>(
596  C / group_,
597  input_dims[0],
598  input_dims[1],
599  kernel_h(),
600  kernel_w(),
601  dilation_h(),
602  dilation_w(),
603  pad_t(),
604  pad_l(),
605  pad_b(),
606  pad_r(),
607  stride_h(),
608  stride_w(),
609  Xdata + group_id * input_offset,
610  col_buffer_data,
611  &context_);
612  } else {
613  math::Im2ColNd<T, Context, StorageOrder::NCHW>(
614  kernel_.size(),
615  input_offset,
616  col_buffer_size,
617  img_shape.data(),
618  col_buffer_shape.data(),
619  kernel_.data(),
620  stride_.data(),
621  dilation_.data(),
622  pads_.data(),
623  Xdata + group_id * input_offset,
624  col_buffer_data,
625  &context_);
626  }
627  // Gradient with respect to filter.
628  math::Gemm<T, Context>(
629  CblasNoTrans,
630  CblasTrans,
631  M / group_,
632  kernel_dim,
633  output_image_size,
634  1,
635  dYdata + group_id * output_offset,
636  col_buffer_data,
637  1,
638  dfilter_data + group_id * filter_offset,
639  &context_);
640  }
641  if (!no_bias_) {
642  // Gradient with respect to bias can be computed independent from group.
643  math::Gemv<T, Context>(
644  CblasNoTrans,
645  M,
646  output_image_size,
647  1,
648  dYdata,
649  bias_multiplier_.template data<T>(),
650  1,
651  dbias_data,
652  &context_);
653  }
654  Xdata += input_offset * group_;
655  dYdata += output_offset * group_;
656  }
657  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
658  // Compute the gradient w.r.t. the input.
659 
660  auto* dX = Output(
661  no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
662  T* dXdata = dX->template mutable_data<T>();
663  dYdata = dY.template data<T>();
664  for (int image_id = 0; image_id < N; ++image_id) {
665  for (int group_id = 0; group_id < group_; ++group_id) {
666  // Compute gradient into col_buffer.
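 // col = filter_g^T * dY_g; col2im below then scatters these columns back
 // into dX, summing wherever receptive fields overlap.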
667  math::Gemm<T, Context>(
668  CblasTrans,
669  CblasNoTrans,
670  kernel_dim,
671  output_image_size,
672  M / group_,
673  1,
674  filter_data + group_id * filter_offset,
675  dYdata,
676  0,
677  col_buffer_data,
678  &context_);
679  if (kernel_.size() == 2) {
680  math::Col2Im<T, Context, StorageOrder::NCHW>(
681  C / group_,
682  input_dims[0],
683  input_dims[1],
684  kernel_h(),
685  kernel_w(),
686  dilation_h(),
687  dilation_w(),
688  pad_t(),
689  pad_l(),
690  pad_b(),
691  pad_r(),
692  stride_h(),
693  stride_w(),
694  col_buffer_data,
695  dXdata,
696  &context_);
697  } else {
698  math::Col2ImNd<T, Context, StorageOrder::NCHW>(
699  kernel_.size(),
700  input_offset,
701  col_buffer_size,
702  img_shape.data(),
703  col_buffer_shape.data(),
704  kernel_.data(),
705  stride_.data(),
706  dilation_.data(),
707  pads_.data(),
708  col_buffer_data,
709  dXdata,
710  &context_);
711  }
712  dXdata += input_offset;
713  dYdata += output_offset;
714  }
715  }
716  }
717  return true;
718 }
719 
720 template <typename T, class Context>
721 bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
722  auto& X = Input(INPUT);
723  auto& filter = Input(FILTER);
724  auto& dY = Input(OUTPUT_GRAD);
725 
726  const int N = X.dim32(0), C = X.dim32(X.dim() - 1);
727 
728  const vector<int> input_dims = this->GetDims(X);
729  const int input_image_size = this->GetDimsSize(X);
730 
731  const vector<int> output_dims = this->GetDims(dY);
732  // The output image size is the spatial size of the output.
733  const int output_image_size = this->GetDimsSize(dY);
734 
735  ConvPoolOpBase<Context>::ComputePads(input_dims);
736  CAFFE_ENFORCE_EQ(X.dim(), filter.dim());
737  const int M = filter.dim32(0);
738  CAFFE_ENFORCE_EQ(C, filter.dim32(filter.dim() - 1) * group_);
739 
740  int kernel_dims_size = 1;
741  for (size_t i = 0; i < kernel_.size(); ++i) {
742  CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]);
743  kernel_dims_size *= kernel_[i];
744  }
745 
746  CAFFE_ENFORCE_EQ(M % group_, 0);
747  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());
748  // The dimension of each kernel
749  const int kernel_dim = C / group_ * kernel_dims_size;
750 
751  // The col buffer is stored in HWC order as well - the height and width, and
752  // kernel_dim.
753  vector<int> img_shape(X.sizes().cbegin() + 1, X.sizes().cend());
754  vector<int> col_buffer_shape(output_dims.size() + 1);
755  std::copy(output_dims.cbegin(), output_dims.cend(), col_buffer_shape.begin());
756  col_buffer_shape.back() = C * kernel_dims_size;
757  vector<int64_t> col_buffer_shape_64;
758  std::copy(
759  col_buffer_shape.cbegin(),
760  col_buffer_shape.cend(),
761  std::back_inserter(col_buffer_shape_64));
762  ReinitializeTensor(
763  &col_buffer_,
764  col_buffer_shape_64,
765  at::dtype<T>().device(Context::GetDeviceType()));
766 
767  if (kernel_.size() != 2) {
768  SetDeviceTensor(img_shape, &img_shape_device_);
769  SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
770  }
771 
772  const int col_buffer_size = C * kernel_dims_size * output_image_size;
773  const T* Xdata = X.template data<T>();
774  const T* const filter_data = filter.template data<T>();
775  const T* const dYdata = dY.template data<T>();
776  T* col_buffer_data = col_buffer_.template mutable_data<T>();
777  T* dfilter_data = dfilter->template mutable_data<T>();
778 
779  // Pre-setting the gradients to zero.
780  math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_);
781 
782  T* dbias_data = nullptr;
783  if (!no_bias_) {
784  auto* dbias = Output(BIAS_OR_INPUT_GRAD, {M}, at::dtype<T>());
785  dbias_data = dbias->template mutable_data<T>();
786  math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_);
787  // Removed the check for whether bias_multiplier_ has correct size or not
788  ReinitializeTensor(
789  &bias_multiplier_,
790  vector<int64_t>(1, output_image_size),
791  at::dtype<T>().device(Context::GetDeviceType()));
792  math::Set<T, Context>(
793  output_image_size,
794  static_cast<T>(1),
795  bias_multiplier_.template mutable_data<T>(),
796  &context_);
797  }
798 
799  if (N == 0) {
800  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
801  auto* dX = Output(
802  no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD,
803  X.sizes(),
804  at::dtype<T>());
805  dX->template mutable_data<T>();
806  }
807  return true;
808  }
809 
810  // The offset corresponding to a single input image, and a single output
811  // image.
812  const int input_offset = C * input_image_size;
813  const int output_offset = dY.numel() / dY.dim32(0);
814  for (int image_id = 0; image_id < N; ++image_id) {
815  // When we compute the gradient with respect to the filters, we need to do
816  // im2col to allow gemm-type computation.
817  if (kernel_.size() <= 2) {
818  math::Im2Col<T, Context, StorageOrder::NHWC>(
819  C,
820  X.size(1),
821  kernel_.size() == 2 ? X.dim32(2) : 1,
822  kernel_h(),
823  kernel_.size() == 2 ? kernel_w() : 1,
824  dilation_h(),
825  kernel_.size() == 2 ? dilation_w() : 1,
826  pad_t(),
827  kernel_.size() == 2 ? pad_l() : 0,
828  kernel_.size() == 2 ? pad_b() : pad_l(),
829  kernel_.size() == 2 ? pad_r() : 0,
830  stride_h(),
831  kernel_.size() == 2 ? stride_w() : 1,
832  Xdata,
833  col_buffer_data,
834  &context_,
835  group_);
836  } else {
837  math::Im2ColNd<T, Context, StorageOrder::NHWC>(
838  kernel_.size(),
839  C * input_image_size,
840  col_buffer_size,
841  img_shape.data(),
842  col_buffer_shape.data(),
843  kernel_.data(),
844  stride_.data(),
845  dilation_.data(),
846  pads_.data(),
847  Xdata,
848  col_buffer_data,
849  &context_,
850  group_);
851  }
852  // Gradient with respect to filter.
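 // Same update as the NCHW path, dfilter_g += dY_g^T * col_g, but here both
 // operands are strided HWC views, hence the leading dimensions M and
 // group_ * kernel_dim.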
853  for (int group_id = 0; group_id < group_; ++group_id) {
854  math::GemmEx<T, Context>(
855  CblasTrans,
856  CblasNoTrans,
857  M / group_,
858  kernel_dim,
859  output_image_size,
860  1,
861  dYdata + output_offset * image_id + group_id * (M / group_),
862  M,
863  col_buffer_data + group_id * kernel_dim,
864  group_ * kernel_dim,
865  1,
866  dfilter_data + group_id * (M / group_) * kernel_dim,
867  kernel_dim,
868  &context_);
869  }
870  if (!no_bias_) {
871  // Gradient with respect to bias
872  math::Gemv<T, Context>(
873  CblasTrans,
874  output_image_size,
875  M,
876  1,
877  dYdata + output_offset * image_id,
878  bias_multiplier_.template data<T>(),
879  1,
880  dbias_data,
881  &context_);
882  }
883  Xdata += input_offset;
884  } // for each image
885 
886  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
887  // Compute the gradient w.r.t. the input.
888 
889  auto* dX = Output(
890  no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
891  T* dXdata = dX->template mutable_data<T>();
892  for (int image_id = 0; image_id < N; ++image_id) {
893  // Compute gradient into col_buffer.
894  for (int group_id = 0; group_id < group_; ++group_id) {
895  math::GemmEx<T, Context>(
896  CblasNoTrans,
897  CblasNoTrans,
898  output_image_size,
899  kernel_dim,
900  M / group_,
901  1,
902  dYdata + output_offset * image_id + group_id * (M / group_),
903  M,
904  filter_data + group_id * (M / group_) * kernel_dim,
905  kernel_dim,
906  0,
907  col_buffer_data + group_id * kernel_dim,
908  group_ * kernel_dim,
909  &context_);
910  }
911  if (kernel_.size() <= 2) {
912  math::Col2Im<T, Context, StorageOrder::NHWC>(
913  C,
914  X.size(1),
915  kernel_.size() == 2 ? X.dim32(2) : 1,
916  kernel_h(),
917  kernel_.size() == 2 ? kernel_w() : 1,
918  dilation_h(),
919  kernel_.size() == 2 ? dilation_w() : 1,
920  pad_t(),
921  kernel_.size() == 2 ? pad_l() : 0,
922  kernel_.size() == 2 ? pad_b() : pad_l(),
923  kernel_.size() == 2 ? pad_r() : 0,
924  stride_h(),
925  kernel_.size() == 2 ? stride_w() : 1,
926  col_buffer_data,
927  dXdata,
928  &context_,
929  group_);
930  } else {
931  math::Col2ImNd<T, Context, StorageOrder::NHWC>(
932  kernel_.size(),
933  C * input_image_size,
934  col_buffer_size,
935  img_shape.data(),
936  col_buffer_shape.data(),
937  kernel_.data(),
938  stride_.data(),
939  dilation_.data(),
940  pads_.data(),
941  col_buffer_data,
942  dXdata,
943  &context_,
944  group_);
945  }
946  dXdata += input_offset;
947  } // for each image
948  }
949  return true;
950 }
951 } // namespace caffe2
952 
953 #endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_