Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_transpose_op_impl.h
// conv_transpose_op_impl.h is the templated implementation of the
// conv_transpose_op.h file.
#ifndef CAFFE2_OPERATORS_CONV_TRANSPOSE_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_TRANSPOSE_OP_IMPL_H_

#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_transpose_op.h"
#include "caffe2/operators/conv_transpose_unpool_op_base.h"
#include "caffe2/utils/math.h"

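// When the flag below (or the op's "shared_buffer" argument) is set, the
// column buffer used by the operators in this file is borrowed from a
// buffer shared across the workspace via runWithSharedBuffer() instead of
// a per-op tensor, trading some serialization for lower peak memory use.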
C10_DECLARE_bool(caffe2_force_shared_col_buffer);

namespace caffe2 {

template <typename T, class Context>
bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const Tensor& X = Input(INPUT);
  auto& filter = Input(FILTER);
  const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  CAFFE_ENFORCE(filter.dim() == 4, "filter must be 4D tensor");
  CAFFE_ENFORCE(
      filter.dim32(0) == M,
      "filter number must be equal to input channel number");
  const int C = filter.dim32(1);
  CAFFE_ENFORCE(
      filter.dim32(2) == this->kernel_h(),
      "filter height must be equal to kernel height");
  CAFFE_ENFORCE(
      filter.dim32(3) == this->kernel_w(),
      "filter width must be equal to kernel width");
  auto sizes = ConvTransposeUnpoolBase<Context>::GetOutputSize(X, C);
  Tensor* Y = Output(0, sizes, at::dtype<T>());

  const int kernel_dim = C * this->kernel_h() * this->kernel_w();
  const int input_image_size = H * W;
  const int output_image_size = Y->dim32(2) * Y->dim32(3);

  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE(bias.dim() == 1, "bias must be 1D tensor");
    CAFFE_ENFORCE(
        bias.dim32(0) == C,
        "bias dimension must be equal to output channel number");
    // A vector of ones; multiplying the bias by it broadcasts the bias
    // over every output pixel in the GEMM below.
    ReinitializeTensor(
        &bias_multiplier_,
        {1, output_image_size},
        at::dtype<T>().device(Context::GetDeviceType()));
    T* bm_data = bias_multiplier_.template mutable_data<T>();
    math::Set<T, Context>(
        output_image_size,
        static_cast<T>(1),
        bm_data,
        &context_);
  }

  const T* Xdata = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  T* Ydata = Y->template mutable_data<T>();

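  // The per-image work is wrapped in a lambda so the same code can run
  // either on the workspace-wide shared column buffer or on this op's
  // private col_buffer_ (see the dispatch at the end of this function).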
  auto f = [&](Tensor* col_buffer) {
    ReinitializeTensor(
        col_buffer,
        vector<int64_t>{C, this->kernel_h(), this->kernel_w(), H, W},
        at::dtype<T>().device(Context::GetDeviceType()));
    T* col_buffer_data = col_buffer->template mutable_data<T>();
    for (auto image_id = 0; image_id < N; ++image_id) {
      // Weight term: col = filter^T * X_n, a
      // [kernel_dim x M] * [M x input_image_size] GEMM.
      math::Gemm<T, Context>(
          CblasTrans,
          CblasNoTrans,
          kernel_dim,
          input_image_size,
          M,
          1,
          filter_data,
          Xdata,
          0,
          col_buffer_data,
          &context_);

      // Col2Im: overlap-add the per-pixel patches in the column buffer
      // into the C x H_out x W_out output image.
      math::Col2Im<T, Context, StorageOrder::NCHW>(
          C,
          Y->dim32(2),
          Y->dim32(3),
          this->kernel_h(),
          this->kernel_w(),
          1,
          1,
          this->pad_t(),
          this->pad_l(),
          this->pad_b(),
          this->pad_r(),
          this->stride_h(),
          this->stride_w(),
          col_buffer_data,
          Ydata,
          &context_);

      // Bias term
      if (InputSize() == 3) {
        const T* bias_data = Input(BIAS).template data<T>();
        const T* bm_data = bias_multiplier_.template data<T>();
#if !defined(__ARM_NEON__) && !defined(__ARM_NEON)
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            C,
            output_image_size,
            1,
            1,
            bias_data,
            bm_data,
            1,
            Ydata,
            &context_);
#else
        math::BiasCHW<T, Context>(
            bias_data,
            bm_data,
            C,
            output_image_size,
            Ydata,
            &context_);
#endif // !defined(__ARM_NEON__) && !defined(__ARM_NEON)
      }

      Xdata += M * H * W;
      Ydata += Y->numel() / Y->dim32(0);
    }
  };
  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
    runWithSharedBuffer<Context>(ws_, f);
  } else {
    f(&col_buffer_);
  }
  return true;
}

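The NCHW path above is one GEMM per image followed by a scatter-add: the GEMM forms, for every input pixel, the kernel_h x kernel_w patch it contributes to each output channel, and Col2Im overlap-adds those patches into Y. A minimal standalone sketch of the shape bookkeeping (not from the Caffe2 sources; the concrete numbers are illustrative):

#include <cstdio>

int main() {
  // One 3-channel 4x4 input, 2 output channels, 3x3 kernel, stride 2,
  // no padding.
  const int M = 3, H = 4, W = 4;    // input channels and size
  const int C = 2, kH = 3, kW = 3;  // output channels and kernel
  const int sH = 2, sW = 2, pad = 0;

  // Transposed-conv output size (ignoring the output-padding "adj"
  // arguments that GetOutputSize also supports):
  const int H_out = (H - 1) * sH + kH - 2 * pad;  // 9
  const int W_out = (W - 1) * sW + kW - 2 * pad;  // 9

  // GEMM: filter^T [kernel_dim x M] * X_n [M x H*W] -> column buffer.
  const int kernel_dim = C * kH * kW;
  std::printf("col buffer: %d x %d\n", kernel_dim, H * W);
  // Col2Im scatter-adds the column buffer into Y: C x H_out x W_out.
  std::printf("Y: %d x %d x %d\n", C, H_out, W_out);
  return 0;
}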
template <typename T, class Context>
bool ConvTransposeOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  const Tensor& X = Input(INPUT);
  auto& filter = Input(FILTER);
  const auto N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), M = X.dim32(3);
  CAFFE_ENFORCE(filter.dim() == 4, "filter must be 4D tensor");
  CAFFE_ENFORCE(
      filter.dim32(0) == M,
      "filter number must be equal to input channel number");
  CAFFE_ENFORCE(
      filter.dim32(1) == this->kernel_h(),
      "filter height must be equal to kernel height");
  CAFFE_ENFORCE(
      filter.dim32(2) == this->kernel_w(),
      "filter width must be equal to kernel width");
  const int C = filter.dim32(3);
  auto sizes = ConvTransposeUnpoolBase<Context>::GetOutputSize(X, C);
  Tensor* Y = Output(0, sizes, at::dtype<T>());

  const auto kernel_dim = C * this->kernel_h() * this->kernel_w();
  const auto input_image_size = H * W;
  const auto output_image_size = Y->dim32(1) * Y->dim32(2);

  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE(bias.dim() == 1, "bias must be 1D tensor");
    CAFFE_ENFORCE(
        bias.dim32(0) == C,
        "bias dimension must be equal to output channel number");
    // TODO(jerryzh): is it OK to remove the check of whether numel is
    // output_image_size
    ReinitializeTensor(
        &bias_multiplier_,
        {1, output_image_size},
        at::dtype<T>().device(Context::GetDeviceType()));
    T* bm_data = bias_multiplier_.template mutable_data<T>();
    math::Set<T, Context>(
        output_image_size,
        static_cast<T>(1),
        bm_data,
        &context_);
  }
  const T* Xdata = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  T* Ydata = Y->template mutable_data<T>();

  // Note: unlike the NCHW path, this lambda ignores the tensor it is
  // handed and always uses the member col_buffer_, so the shared-buffer
  // dispatch below does not actually share memory here.
  auto f = [&](Tensor* /*col_buffer*/) {
    ReinitializeTensor(
        &col_buffer_,
        vector<int64_t>{H, W, this->kernel_h(), this->kernel_w(), C},
        at::dtype<T>().device(Context::GetDeviceType()));
    T* col_buffer_data = col_buffer_.template mutable_data<T>();
    for (auto image_id = 0; image_id < N; ++image_id) {
      // Weight term: col = X_n * filter, an
      // [input_image_size x M] * [M x kernel_dim] GEMM.
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasNoTrans,
          input_image_size,
          kernel_dim,
          M,
          1,
          Xdata,
          filter_data,
          0,
          col_buffer_data,
          &context_);
      // Col2Im
      math::Col2Im<T, Context, StorageOrder::NHWC>(
          C,
          Y->dim32(1),
          Y->dim32(2),
          this->kernel_h(),
          this->kernel_w(),
          1,
          1,
          this->pad_t(),
          this->pad_l(),
          this->pad_b(),
          this->pad_r(),
          this->stride_h(),
          this->stride_w(),
          col_buffer_data,
          Ydata,
          &context_);
      // Bias term: ones [output_image_size x 1] * bias [1 x C], added to Y.
      if (InputSize() == 3) {
        const T* bm_data = bias_multiplier_.template data<T>();
        const T* bias_data = Input(BIAS).template data<T>();
        math::Gemm<T, Context>(
            CblasNoTrans,
            CblasNoTrans,
            output_image_size,
            C,
            1,
            1,
            bm_data,
            bias_data,
            1,
            Ydata,
            &context_);
      }
      Xdata += M * H * W;
      Ydata += Y->numel() / Y->dim32(0);
    }
  };
  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
    runWithSharedBuffer<Context>(ws_, f);
  } else {
    f(&col_buffer_);
  }
  return true;
}

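The two layouts differ only in which side of the GEMM the filter sits on, as the Gemm arguments above imply:

  NCHW: col [kernel_dim x H*W] = filter^T [kernel_dim x M] * X_n [M x H*W]
  NHWC: col [H*W x kernel_dim] = X_n [H*W x M] * filter [M x kernel_dim]

In NHWC the channel index of each patch is innermost, matching what the NHWC Col2Im expects, and the bias GEMM is likewise flipped so that C varies fastest in the output.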
template <typename T, class Context>
bool ConvTransposeGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);

  const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  // We only handle the LegacyPadding::NOTSET case and ignore the cases of
  // LegacyPadding::VALID and LegacyPadding::SAME, so we don't need to
  // compute padding values ourselves; we simply use the values from the
  // user.
  CAFFE_ENFORCE(filter.dim() == 4);
  const int C = filter.dim32(1);
  CAFFE_ENFORCE(
      filter.dim32(2) == this->kernel_h(),
      "filter height must be equal to kernel height");
  CAFFE_ENFORCE(
      filter.dim32(3) == this->kernel_w(),
      "filter width must be equal to kernel width");
  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());

  const int kernel_dim = C * this->kernel_h() * this->kernel_w();
  const int output_image_size = dY.dim32(2) * dY.dim32(3);
  // The col buffer is stored in CHW order as well
  ReinitializeTensor(
      &col_buffer_,
      vector<int64_t>{C, this->kernel_h(), this->kernel_w(), H, W},
      at::dtype<T>().device(Context::GetDeviceType()));
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    dbias->Resize(C);
    // TODO(jerryzh): is it OK to remove the check of whether numel is
    // output_image_size
    ReinitializeTensor(
        &bias_multiplier_,
        {1, output_image_size},
        at::dtype<T>().device(Context::GetDeviceType()));
    T* bm_data = bias_multiplier_.template mutable_data<T>();
    math::Set<T, Context>(
        output_image_size,
        static_cast<T>(1),
        bm_data,
        &context_);
  }
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  const T* Xdata = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* dYdata = dY.template data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();
  // Pre-set the gradients to zero; the per-image GEMMs below accumulate
  // into them with beta = 1.
  math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_);
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    T* dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_);
  }
  for (auto image_id = 0; image_id < N; ++image_id) {
    // Gradient w.r.t. filters: Im2Col followed by Gemm.
    // Im2Col gathers the dY patch that each input pixel scattered to in
    // the forward pass.
    math::Im2Col<T, Context, StorageOrder::NCHW>(
        C,
        dY.dim32(2),
        dY.dim32(3),
        this->kernel_h(),
        this->kernel_w(),
        1,
        1,
        this->pad_t(),
        this->pad_l(),
        this->pad_b(),
        this->pad_r(),
        this->stride_h(),
        this->stride_w(),
        dYdata,
        col_buffer_data,
        &context_);
    // Gemm: dfilter [M x kernel_dim] += X_n [M x H*W] * col^T.
    math::Gemm<T, Context>(
        CblasNoTrans,
        CblasTrans,
        M,
        kernel_dim,
        H * W,
        1,
        Xdata,
        col_buffer_data,
        1,
        dfilter_data,
        &context_);
    // Gradient w.r.t. bias: reduce dY over the output image.
    if (!no_bias_) {
      const T* bm_data = bias_multiplier_.template data<T>();
      T* input_grad_data =
          Output(BIAS_OR_INPUT_GRAD)->template mutable_data<T>();
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasNoTrans,
          C,
          1,
          output_image_size,
          1,
          dYdata,
          bm_data,
          1,
          input_grad_data,
          &context_);
    }
    dYdata += dY.numel() / dY.dim32(0);
    Xdata += X.numel() / X.dim32(0);
  }
  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    // Compute gradients w.r.t. the input.
    // Since we have changed dYdata in the above loop, we need to reset it.
    dYdata = dY.template data<T>();

    auto* dX = Output(
        no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
    T* dXdata = dX->template mutable_data<T>();
    for (auto image_id = 0; image_id < N; ++image_id) {
      // Im2Col.
      // TODO(zyan3): Probably duplicates work done in the gradient
      // computation w.r.t. filters above.
      math::Im2Col<T, Context, StorageOrder::NCHW>(
          C,
          dY.dim32(2),
          dY.dim32(3),
          this->kernel_h(),
          this->kernel_w(),
          1,
          1,
          this->pad_t(),
          this->pad_l(),
          this->pad_b(),
          this->pad_r(),
          this->stride_h(),
          this->stride_w(),
          dYdata,
          col_buffer_data,
          &context_);
      // Gemm: dX_n [M x H*W] = filter [M x kernel_dim] * col.
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasNoTrans,
          M,
          H * W,
          kernel_dim,
          1,
          filter_data,
          col_buffer_data,
          0,
          dXdata,
          &context_);
      dYdata += dY.numel() / dY.dim32(0);
      dXdata += X.numel() / X.dim32(0);
    }
  }
  return true;
}

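The backward pass inverts the forward decomposition: Im2Col gathers, for each input-pixel position, exactly the dY patch that the forward Col2Im scattered to, and the gradients then reduce to plain GEMMs (NCHW shapes shown; the NHWC variant below mirrors them with the operands transposed):

  dfilter [M x kernel_dim] += X_n [M x H*W] * col^T [H*W x kernel_dim]
  dbias   [C x 1]          += dY_n [C x output_image_size] * ones
  dX_n    [M x H*W]         = filter [M x kernel_dim] * col [kernel_dim x H*W]

dX is thus an ordinary convolution of dY with the filter, the adjoint of the forward scatter. Note that dbias is written through the BIAS_OR_INPUT_GRAD output slot, which is why the pointer in that branch is (confusingly) named input_grad_data.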
template <typename T, class Context>
bool ConvTransposeGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto& dY = Input(OUTPUT_GRAD);

  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), M = X.dim32(3);
  // We only handle the LegacyPadding::NOTSET case and ignore the cases of
  // LegacyPadding::VALID and LegacyPadding::SAME, so we don't need to
  // compute padding values ourselves; we simply use the values from the
  // user.
  CAFFE_ENFORCE(filter.dim() == 4, "filter must be 4D tensor");
  CAFFE_ENFORCE(
      filter.dim32(1) == this->kernel_h(),
      "filter height must be equal to kernel height");
  CAFFE_ENFORCE(
      filter.dim32(2) == this->kernel_w(),
      "filter width must be equal to kernel width");
  const int C = filter.dim32(3);
  auto* dfilter = Output(FILTER_GRAD, filter.sizes(), at::dtype<T>());

  const int kernel_dim = C * this->kernel_h() * this->kernel_w();
  const int output_image_size = dY.dim32(1) * dY.dim32(2);
  // The col buffer is stored in HWC order as well
  ReinitializeTensor(
      &col_buffer_,
      vector<int64_t>{H, W, this->kernel_h(), this->kernel_w(), C},
      at::dtype<T>().device(Context::GetDeviceType()));
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    dbias->Resize(C);
    // TODO(jerryzh): is it OK to remove the check of whether numel is
    // output_image_size
    ReinitializeTensor(
        &bias_multiplier_,
        {1, output_image_size},
        at::dtype<T>().device(Context::GetDeviceType()));
    T* bm_data = bias_multiplier_.template mutable_data<T>();
    math::Set<T, Context>(
        output_image_size,
        static_cast<T>(1),
        bm_data,
        &context_);
  }
  T* col_buffer_data = col_buffer_.template mutable_data<T>();
  const T* Xdata = X.template data<T>();
  const T* filter_data = filter.template data<T>();
  const T* dYdata = dY.template data<T>();
  T* dfilter_data = dfilter->template mutable_data<T>();
  // Pre-set the gradients to zero; the per-image GEMMs below accumulate
  // into them with beta = 1.
  math::Set<T, Context>(dfilter->numel(), 0, dfilter_data, &context_);
  if (!no_bias_) {
    auto* dbias = Output(BIAS_OR_INPUT_GRAD);
    T* dbias_data = dbias->template mutable_data<T>();
    math::Set<T, Context>(dbias->numel(), 0, dbias_data, &context_);
  }
  for (auto image_id = 0; image_id < N; ++image_id) {
    // Gradient w.r.t. filters: Im2Col followed by Gemm.
    math::Im2Col<T, Context, StorageOrder::NHWC>(
        C,
        dY.dim32(1),
        dY.dim32(2),
        this->kernel_h(),
        this->kernel_w(),
        1,
        1,
        this->pad_t(),
        this->pad_l(),
        this->pad_b(),
        this->pad_r(),
        this->stride_h(),
        this->stride_w(),
        dYdata,
        col_buffer_data,
        &context_);
    // Gemm: dfilter [M x kernel_dim] += X_n^T [M x H*W] * col.
    math::Gemm<T, Context>(
        CblasTrans,
        CblasNoTrans,
        M,
        kernel_dim,
        H * W,
        1,
        Xdata,
        col_buffer_data,
        1,
        dfilter_data,
        &context_);
    // Gradient w.r.t. bias: reduce dY over the output image.
    if (!no_bias_) {
      const T* bm_data = bias_multiplier_.template data<T>();
      T* input_grad_data =
          Output(BIAS_OR_INPUT_GRAD)->template mutable_data<T>();
      math::Gemm<T, Context>(
          CblasTrans,
          CblasNoTrans,
          C,
          1,
          output_image_size,
          1,
          dYdata,
          bm_data,
          1,
          input_grad_data,
          &context_);
    }
    dYdata += dY.numel() / dY.dim32(0);
    Xdata += X.numel() / X.dim32(0);
  }
  if (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))) {
    // Compute gradients w.r.t. the input.
    // Since we have changed dYdata in the above loop, we need to reset it.
    dYdata = dY.template data<T>();

    auto* dX = Output(
        no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype<T>());
    T* dXdata = dX->template mutable_data<T>();
    for (auto image_id = 0; image_id < N; ++image_id) {
      // Im2Col.
      // TODO(zyan3): Probably duplicates work done in the gradient
      // computation w.r.t. filters above.
      math::Im2Col<T, Context, StorageOrder::NHWC>(
          C,
          dY.dim32(1),
          dY.dim32(2),
          this->kernel_h(),
          this->kernel_w(),
          1,
          1,
          this->pad_t(),
          this->pad_l(),
          this->pad_b(),
          this->pad_r(),
          this->stride_h(),
          this->stride_w(),
          dYdata,
          col_buffer_data,
          &context_);
      // Gemm: dX_n [H*W x M] = col [H*W x kernel_dim] * filter^T.
      math::Gemm<T, Context>(
          CblasNoTrans,
          CblasTrans,
          H * W,
          M,
          kernel_dim,
          1,
          col_buffer_data,
          filter_data,
          0,
          dXdata,
          &context_);
      dYdata += dY.numel() / dY.dim32(0);
      dXdata += X.numel() / X.dim32(0);
    }
  }
  return true;
}

} // namespace caffe2
#endif // CAFFE2_OPERATORS_CONV_TRANSPOSE_OP_IMPL_H_
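For reference, a minimal standalone sketch (not part of this header; the function name and layout choices are ours) of the computation the NCHW forward path performs, written as the direct scatter it decomposes into: every input pixel adds a stride-spaced, filter-weighted kH x kW patch into each output channel. A GEMM-plus-Col2Im implementation and this loop nest should agree up to floating-point rounding.

#include <vector>

// X: [M x H x W], filter: [M x C x kH x kW], Y: [C x H_out x W_out],
// with H_out = (H - 1) * stride + kH - 2 * pad (and likewise for W_out).
// Y must be zero-initialized by the caller.
void NaiveConvTranspose2D(
    const std::vector<float>& X, int M, int H, int W,
    const std::vector<float>& filter, int C, int kH, int kW,
    int stride, int pad,
    std::vector<float>& Y, int H_out, int W_out) {
  for (int m = 0; m < M; ++m) {
    for (int y = 0; y < H; ++y) {
      for (int x = 0; x < W; ++x) {
        const float v = X[(m * H + y) * W + x];
        for (int c = 0; c < C; ++c) {
          for (int ky = 0; ky < kH; ++ky) {
            for (int kx = 0; kx < kW; ++kx) {
              const int oy = y * stride + ky - pad;
              const int ox = x * stride + kx - pad;
              if (oy < 0 || oy >= H_out || ox < 0 || ox >= W_out) {
                continue;  // patch pixels that land in the padding are dropped
              }
              Y[(c * H_out + oy) * W_out + ox] +=
                  v * filter[((m * C + c) * kH + ky) * kW + kx];
            }
          }
        }
      }
    }
  }
}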