// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// File: conv_op_impl.h
17 // conv_op_impl.h is the templated implementation of the conv_op.h file.
18 #ifndef CAFFE2_OPERATORS_CONV_OP_IMPL_H_
19 #define CAFFE2_OPERATORS_CONV_OP_IMPL_H_
20 
21 #include "caffe2/core/context.h"
22 #include "caffe2/core/flags.h"
23 #include "caffe2/core/logging.h"
24 #include "caffe2/core/operator.h"
25 #include "caffe2/operators/conv_op.h"
26 #include "caffe2/operators/conv_pool_op_base.h"
27 #include "caffe2/utils/math.h"
28 
29 namespace caffe2 {
30 
// Forward convolution in NCHW layout. Supports grouped convolution and
// arbitrary spatial rank: the 2-D case takes the specialized Im2col path,
// while N-D kernels go through Im2colNd with shape metadata staged in
// device-side tensors. Computation is im2col followed by a GEMM per group,
// plus an optional bias GEMM per image.
template <typename T, class Context>
bool ConvOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const Tensor<Context>& X = Input(INPUT);
  auto& filter = Input(FILTER);
  Tensor<Context>* Y = Output(0);
  const int N = X.dim32(0), C = X.dim32(1);
  CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
  // M: total number of output channels (filter is M x C/group x kernel...).
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(
      C == filter.dim32(1) * group_,
      "Convolution op: input channels does not match: # of input channels ",
      C,
      " is not equal to kernel channels * group:",
      filter.dim32(1),
      "*",
      group_);
  CAFFE_ENFORCE(
      M % group_ == 0,
      "The number of output channels is not divisible by group.");

  // Product of the kernel extents over all spatial dimensions; also verifies
  // that the filter's spatial dims match the configured kernel shape.
  int kernel_dims_size = 1;
  for (int i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]);
    kernel_dims_size *= kernel_[i];
  }

  ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));

  const vector<int> input_dims = GetDims(X);
  const vector<int> output_dims = GetDims(*Y);
  const int input_image_size = this->GetDimsSize(X);
  const int output_image_size = this->GetDimsSize(*Y);

  // Per-image input shape: all dims of X except the leading batch dim.
  vector<int> img_shape;
  img_shape.assign(X.dims().begin() + 1, X.dims().end());

  // Col buffer shape: [C/group * prod(kernel), output spatial dims...].
  vector<int> buffer_shape;
  buffer_shape.push_back(C / group_ * kernel_dims_size);
  buffer_shape.insert(
      buffer_shape.end(), output_dims.begin(), output_dims.end());

  if (kernel_.size() != 2) {
    // The N-d im2col kernel reads its shape metadata from device memory.
    SetDeviceTensor(img_shape, &img_shape_device_);
    SetDeviceTensor(buffer_shape, &col_buffer_shape_device_);
  }

  const int col_buffer_size =
      (C / group_) * kernel_dims_size * output_image_size;

  // The dimension of each kernel
  const int kernel_dim = C / group_ * kernel_dims_size;
  // The offset corresponding to a single input image, and a single output
  // image. Note both are per-group strides.
  const int input_offset = C / group_ * input_image_size;
  const int output_offset = Y->size() / Y->dim32(0) / group_;
  const int filter_offset = filter.size() / group_;

  // The col buffer is stored in CHW order as well - kernel_dim, and the height
  // and width.
  const T* Xdata = X.template data<T>();
  if (InputSize() == 3) {
    // Optional bias input: must be a length-M vector.
    const auto& bias = Input(BIAS);
    CAFFE_ENFORCE(bias.ndim() == 1);
    CAFFE_ENFORCE(bias.dim32(0) == M);
    ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
        output_image_size, &bias_multiplier_);
  }
  T* Ydata = Y->template mutable_data<T>();

  // The main computation, parameterized on the col buffer so it can run with
  // either the op-local buffer or a workspace-shared one.
  auto f = [&](Tensor<Context>* col_buffer) {
    col_buffer->Resize(buffer_shape);
    T* col_buffer_data = col_buffer->template mutable_data<T>();
    // Im2col, followed by gemm.
    for (int image_id = 0; image_id < N; ++image_id) {
      for (int group_id = 0; group_id < group_; ++group_id) {
        if (kernel_.size() == 2) {
          math::Im2col<T, Context, StorageOrder::NCHW>(
              Xdata + group_id * input_offset,
              C / group_,
              input_dims[0],
              input_dims[1],
              kernel_h(),
              kernel_w(),
              dilation_h(),
              dilation_w(),
              pad_t(),
              pad_l(),
              pad_b(),
              pad_r(),
              stride_h(),
              stride_w(),
              col_buffer_data,
              &context_);
        } else {
          math::Im2colNd<T, Context, StorageOrder::NCHW>(
              Xdata + group_id * input_offset,
              img_shape_device_.template data<int>(),
              col_buffer_shape_device_.template data<int>(),
              C * input_image_size,
              col_buffer_size,
              kernel_device_.template data<int>(),
              stride_device_.template data<int>(),
              dilation_device_.template data<int>(),
              pads_device_.template data<int>(),
              kernel_.size(),
              col_buffer_data,
              &context_);
        }
        // Weight term: (M/group x kernel_dim) * (kernel_dim x out_size)
        // written into this group's slice of the output image.
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            M / group_,
            output_image_size,
            kernel_dim,
            1,
            filter.template data<T>() + group_id * filter_offset,
            col_buffer_data,
            0,
            Ydata + group_id * output_offset,
            &context_);
      }
      if (InputSize() == 3) {
        // Bias term can be carried out outside the group definition
        // to be efficient.
        auto* bias_data = Input(BIAS).template data<T>();
        // Rank-1 update: bias (M x 1) * ones (1 x out_size), accumulated
        // onto the already-computed weight term (beta = 1).
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            M,
            output_image_size,
            1,
            1,
            bias_data,
            bias_multiplier_.template data<T>(),
            1,
            Ydata,
            &context_);
      }
      // Advance to the next image in the batch.
      Xdata += input_offset * group_;
      Ydata += output_offset * group_;
    }
  };

  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
    runWithSharedBuffer<Context>(ws_, f);
  } else {
    f(&col_buffer_);
  }
  return true;
}
182 
183 // The implementations.
184 template <typename T, class Context>
185 bool ConvOp<T, Context>::RunOnDeviceWithOrderNHWC() {
186  const Tensor<Context>& X = Input(INPUT);
187  auto& filter = Input(FILTER);
188  Tensor<Context>* Y = Output(0);
189  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
190 
191  CAFFE_ENFORCE_EQ(
192  kernel_.size(),
193  2,
194  "Only 2d convolution is supported for NHWC storage type");
195 
196  CAFFE_ENFORCE(X.ndim(), filter.ndim());
197  const int M = filter.dim32(0);
198  CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
199  CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
200  CAFFE_ENFORCE(filter.dim32(3) == C);
201 
202  ConvPoolOpBase<Context>::SetOutputSize(X, Y, filter.dim32(0));
203  // The dimension of each kernel
204  const int kernel_dim = kernel_h() * kernel_w() * C;
205  // The offset corresponding to a single input image, and a single output
206  // image.
207  const int input_offset = H * W * C;
208  const int output_offset = Y->size() / Y->dim32(0);
209  // The output image size is the spatial size of the output.
210  const int output_image_size = Y->dim32(1) * Y->dim32(2);
211  // The col buffer is stored in HWC order as well - kernel_dim, and the height
212  // and width.
213  const T* Xdata = X.template data<T>();
214  T* Ydata = Y->template mutable_data<T>();
215  // Specialized path for 1 by 1 convolution with stride 1, pad 0 - we
216  // can skip im2col.
217  if (kernel_dim == C && Y->dim32(1) == X.dim32(1) &&
218  Y->dim32(2) == X.dim32(2) && stride_h() == 1 && stride_w() == 1 &&
219  pad_t() == 0 && pad_b() == 0 && pad_l() == 0 && pad_r() == 0) {
220  math::Gemm<T, Context>(
221  CblasNoTrans,
222  CblasTrans,
223  N * H * W,
224  M,
225  C,
226  1,
227  Xdata,
228  filter.template data<T>(),
229  0,
230  Ydata,
231  &context_);
232  if (InputSize() == 3) {
233  auto& bias = Input(BIAS);
234  CAFFE_ENFORCE(1 == bias.ndim());
235  CAFFE_ENFORCE(bias.dim32(0) == M);
236  if (bias_multiplier_.size() != N * H * W) {
237  // If the helper bias multiplier is not M, reshape and fill it with one.
238  bias_multiplier_.Resize(vector<TIndex>(1, N * H * W));
239  math::Set<T, Context>(
240  N * H * W,
241  static_cast<T>(1),
242  bias_multiplier_.template mutable_data<T>(),
243  &context_);
244  }
245  math::Gemm<T, Context>(
246  CblasNoTrans,
247  CblasNoTrans,
248  N * H * W,
249  M,
250  1,
251  1,
252  bias_multiplier_.template data<T>(),
253  bias.template data<T>(),
254  1,
255  Ydata,
256  &context_);
257  }
258  } else {
259  if (InputSize() == 3) {
260  const auto& bias = Input(BIAS);
261  CAFFE_ENFORCE(1 == bias.ndim());
262  CAFFE_ENFORCE(bias.dim32(0) == M);
263  ConvPoolOpBase<Context>::template SetBiasMultiplier<T>(
264  output_image_size, &bias_multiplier_);
265  }
266  auto f = [&](Tensor<Context>* col_buffer) {
267  col_buffer->Resize(
268  vector<TIndex>{Y->dim32(1), Y->dim32(2), kernel_h(), kernel_w(), C});
269  T* col_buffer_data = col_buffer->template mutable_data<T>();
270  // Im2col, followed by gemm.
271  for (int image_id = 0; image_id < N; ++image_id) {
272  math::Im2col<T, Context, StorageOrder::NHWC>(
273  Xdata,
274  C,
275  H,
276  W,
277  kernel_h(),
278  kernel_w(),
279  dilation_h(),
280  dilation_w(),
281  pad_t(),
282  pad_l(),
283  pad_b(),
284  pad_r(),
285  stride_h(),
286  stride_w(),
287  col_buffer_data,
288  &context_);
289  // Weight term
290  math::Gemm<T, Context>(
291  CblasNoTrans,
292  CblasTrans,
293  output_image_size,
294  M,
295  kernel_dim,
296  1,
297  col_buffer_data,
298  filter.template data<T>(),
299  0,
300  Ydata,
301  &context_);
302  if (InputSize() == 3) {
303  // Bias term
304  math::Gemm<T, Context>(
305  CblasNoTrans,
306  CblasNoTrans,
307  output_image_size,
308  M,
309  1,
310  1,
311  bias_multiplier_.template data<T>(),
312  Input(BIAS).template data<T>(),
313  1,
314  Ydata,
315  &context_);
316  }
317  Xdata += input_offset;
318  Ydata += output_offset;
319  }
320  };
321  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
322  runWithSharedBuffer<Context>(ws_, f);
323  } else {
324  f(&col_buffer_);
325  }
326  }
327  return true;
328 }
329 
// Backward pass of grouped N-D convolution in NCHW layout. Always computes
// dfilter (and dbias unless no_bias_); computes dX only when the extra
// output slot is requested. Filter gradients accumulate across images via
// GEMM with beta = 1 after an explicit zero-fill.
template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  auto* dfilter = Output(FILTER_GRAD);
  const int N = X.dim32(0), C = X.dim32(1);

  const vector<int> input_dims = this->GetDims(X);
  const int input_image_size = this->GetDimsSize(X);

  const vector<int> output_dims = this->GetDims(dY);
  // The output image size is the spatial size of the output.
  const int output_image_size = this->GetDimsSize(dY);

  ConvPoolOpBase<Context>::ComputePads(input_dims);
  CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(filter.dim32(1) * group_ == C);

  // Product of kernel extents; also validates the filter's spatial dims.
  int kernel_dims_size = 1;
  for (int i = 0; i < kernel_.size(); ++i) {
    CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]);
    kernel_dims_size *= kernel_[i];
  }

  CAFFE_ENFORCE(M % group_ == 0);
  dfilter->ResizeLike(filter);
  // The dimension of each kernel
  const int kernel_dim = C / group_ * kernel_dims_size;
  // The offset corresponding to a single input image, and a single output
  // image. Note both are per-group strides.
  const int input_offset = C / group_ * input_image_size;
  const int output_offset = dY.size() / dY.dim32(0) / group_;
  const int filter_offset = filter.size() / group_;
  // The col buffer is stored in CHW order as well - kernel_dim, and the height
  // and width.

  // Per-image input shape: all dims of X except the leading batch dim.
  vector<int> img_shape;
  img_shape.assign(X.dims().begin() + 1, X.dims().end());
  // Col buffer shape: [C/group * prod(kernel), output spatial dims...].
  vector<int> col_buffer_shape;
  col_buffer_shape.push_back(C / group_ * kernel_dims_size);
  col_buffer_shape.insert(
      col_buffer_shape.end(), output_dims.begin(), output_dims.end());
  col_buffer_.Resize(col_buffer_shape);

  if (kernel_.size() != 2) {
    // The N-d im2col/col2im kernels read shape metadata from device memory.
    SetDeviceTensor(img_shape, &img_shape_device_);
    SetDeviceTensor(col_buffer_shape, &col_buffer_shape_device_);
  }

  const int col_buffer_size =
      (C / group_) * kernel_dims_size * output_image_size;
  const T* Xdata = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* dYdata = dY.template data<T>();
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();

  // Pre-setting the gradients to zero.
  math::Set<T, Context>(dfilter->size(), 0, dfilter_data, &context_);

  T* dbias_data = nullptr;
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    dbias->Resize(M);
    if (bias_multiplier_.size() != output_image_size) {
      // If the helper bias multiplier has the wrong size, reshape and fill
      // it with ones.
      bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
      math::Set<T, Context>(
          output_image_size,
          static_cast<T>(1),
          bias_multiplier_.template mutable_data<T>(),
          &context_);
    }
    dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->size(), 0, dbias_data, &context_);
  }

  for (int image_id = 0; image_id < N; ++image_id) {
    for (int group_id = 0; group_id < group_; ++group_id) {
      // When we compute the gradient with respect to the filters, we need to do
      // im2col to allow gemm-type computation.
      if (kernel_.size() == 2) {
        math::Im2col<T, Context, StorageOrder::NCHW>(
            Xdata + group_id * input_offset,
            C / group_,
            input_dims[0],
            input_dims[1],
            kernel_h(),
            kernel_w(),
            dilation_h(),
            dilation_w(),
            pad_t(),
            pad_l(),
            pad_b(),
            pad_r(),
            stride_h(),
            stride_w(),
            col_buffer_data,
            &context_);
      } else {
        math::Im2colNd<T, Context, StorageOrder::NCHW>(
            Xdata + group_id * input_offset,
            img_shape_device_.template data<int>(),
            col_buffer_shape_device_.template data<int>(),
            C * input_image_size,
            col_buffer_size,
            kernel_device_.template data<int>(),
            stride_device_.template data<int>(),
            dilation_device_.template data<int>(),
            pads_device_.template data<int>(),
            kernel_.size(),
            col_buffer_data,
            &context_);
      }
      // Gradient with respect to filter. Accumulates over images and spatial
      // positions (beta = 1): dW += dY * col^T.
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasTrans,
          M / group_,
          kernel_dim,
          output_image_size,
          1,
          dYdata + group_id * output_offset,
          col_buffer_data,
          1,
          dfilter_data + group_id * filter_offset,
          &context_);
    }
    if (!no_bias_) {
      // Gradient with respect to bias can be computed independent from group.
      math::Gemv<T, Context>(
          CblasNoTrans,
          M,
          output_image_size,
          1,
          dYdata,
          bias_multiplier_.template data<T>(),
          1,
          dbias_data,
          &context_);
    }
    // Advance to the next image in the batch.
    Xdata += input_offset * group_;
    dYdata += output_offset * group_;
  }
  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    // Compute the gradient w.r.t. the input. When no_bias_ is set, dX lives
    // in the BIAS_OR_INPUT_GRAD output slot.
    auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
    dX->ResizeLike(X);
    T* dXdata = dX->template mutable_data<T>();
    dYdata = dY.template data<T>();
    for (int image_id = 0; image_id < N; ++image_id) {
      for (int group_id = 0; group_id < group_; ++group_id) {
        // Compute gradient into col_buffer: col = W^T * dY.
        math::Gemm<T, Context>(
            CblasTrans,
            CblasNoTrans,
            kernel_dim,
            output_image_size,
            M / group_,
            1,
            filter_data + group_id * filter_offset,
            dYdata,
            0,
            col_buffer_data,
            &context_);
        // Scatter-accumulate the col buffer back into image layout.
        if (kernel_.size() == 2) {
          math::Col2im<T, Context, StorageOrder::NCHW>(
              col_buffer_data,
              C / group_,
              input_dims[0],
              input_dims[1],
              kernel_h(),
              kernel_w(),
              dilation_h(),
              dilation_w(),
              pad_t(),
              pad_l(),
              pad_b(),
              pad_r(),
              stride_h(),
              stride_w(),
              dXdata,
              &context_);
        } else {
          math::Col2imNd<T, Context, StorageOrder::NCHW>(
              col_buffer_data,
              img_shape_device_.template data<int>(),
              col_buffer_shape_device_.template data<int>(),
              C * input_image_size,
              col_buffer_size,
              kernel_device_.template data<int>(),
              stride_device_.template data<int>(),
              dilation_device_.template data<int>(),
              pads_device_.template data<int>(),
              kernel_.size(),
              dXdata,
              &context_);
        }
        // Per-group stride through dX and dY.
        dXdata += input_offset;
        dYdata += output_offset;
      }
    }
  }
  return true;
}
537 
// Backward pass of 2-D convolution in NHWC layout (no group support in this
// path; filter must be M x kH x kW x C). Always computes dfilter (and dbias
// unless no_bias_); computes dX only when the extra output is requested.
template <typename T, class Context>
bool ConvGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);
  auto* dfilter = Output(FILTER_GRAD);

  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
  ConvPoolOpBase<Context>::ComputePads({H, W});
  CAFFE_ENFORCE(4 == filter.ndim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
  CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
  CAFFE_ENFORCE(filter.dim32(3) == C);
  dfilter->ResizeLike(filter);

  // The dimension of each kernel
  const int kernel_dim = kernel_h() * kernel_w() * C;
  // The offset corresponding to a single input image, and a single output
  // image.
  const int input_offset = H * W * C;
  const int output_offset = dY.size() / dY.dim32(0);
  // The output image size is the spatial size of the output.
  const int output_image_size = dY.dim32(1) * dY.dim32(2);
  // The col buffer is stored in CHW order as well - kernel_dim, and the height
  // and width.
  col_buffer_.Resize(output_image_size, kernel_dim);

  const T* Xdata = X.template data<T>();
  const T* const filter_data = filter.template data<T>();
  const T* const dYdata = dY.template data<T>();
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();

  // Pre-setting the gradients to zero.
  math::Set<T, Context>(dfilter->size(), 0, dfilter_data, &context_);

  T* dbias_data = nullptr;
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    dbias->Resize(M);
    dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->size(), 0, dbias_data, &context_);
    if (bias_multiplier_.size() != output_image_size) {
      // If the helper bias multiplier has the wrong size, reshape and fill
      // it with ones.
      bias_multiplier_.Resize(vector<TIndex>(1, output_image_size));
      math::Set<T, Context>(
          output_image_size,
          static_cast<T>(1),
          bias_multiplier_.template mutable_data<T>(),
          &context_);
    }
  }

  for (int image_id = 0; image_id < N; ++image_id) {
    // When we compute the gradient with respect to the filters, we need to do
    // im2col to allow gemm-type computation.
    math::Im2col<T, Context, StorageOrder::NHWC>(
        Xdata,
        C,
        H,
        W,
        kernel_h(),
        kernel_w(),
        dilation_h(),
        dilation_w(),
        pad_t(),
        pad_l(),
        pad_b(),
        pad_r(),
        stride_h(),
        stride_w(),
        col_buffer_data,
        &context_);
    // Gradient with respect to filter. Accumulates over images (beta = 1):
    // dW += dY^T * col.
    math::Gemm<T, Context>(
        CblasTrans,
        CblasNoTrans,
        M,
        kernel_dim,
        output_image_size,
        1,
        dYdata + output_offset * image_id,
        col_buffer_data,
        1,
        dfilter_data,
        &context_);
    if (!no_bias_) {
      // Gradient with respect to bias: sum dY over spatial positions.
      math::Gemv<T, Context>(
          CblasTrans,
          output_image_size,
          M,
          1,
          dYdata + output_offset * image_id,
          bias_multiplier_.template data<T>(),
          1,
          dbias_data,
          &context_);
    }
    // Advance to the next image in the batch.
    Xdata += input_offset;
  }

  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    // Compute the gradient w.r.t. the input. When no_bias_ is set, dX lives
    // in the BIAS_OR_INPUT_GRAD output slot.
    auto* dX = Output(no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD);
    dX->ResizeLike(X);
    T* dXdata = dX->template mutable_data<T>();
    for (int image_id = 0; image_id < N; ++image_id) {
      // Compute gradient into col_buffer: col = dY * W.
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasNoTrans,
          output_image_size,
          kernel_dim,
          M,
          1,
          dYdata + output_offset * image_id,
          filter_data,
          0,
          col_buffer_data,
          &context_);
      // Scatter-accumulate the col buffer back into image layout.
      math::Col2im<T, Context, StorageOrder::NHWC>(
          col_buffer_data,
          C,
          H,
          W,
          kernel_h(),
          kernel_w(),
          dilation_h(),
          dilation_w(),
          pad_t(),
          pad_l(),
          pad_b(),
          pad_r(),
          stride_h(),
          stride_w(),
          dXdata,
          &context_);
      dXdata += input_offset;
    }
  }
  return true;
}
682 } // namespace caffe2
683 
684 #endif // CAFFE2_OPERATORS_CONV_OP_IMPL_H_
// Copyright (c) 2016-present, Facebook, Inc.