Caffe2 - C++ API
A deep learning, cross-platform ML framework
pool_op_cudnn.cc
#include "caffe2/operators/pool_op.h"

#include <algorithm>
#include <array>
#include <type_traits>
#include <vector>

#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"

namespace caffe2 {

namespace {

void SetTensorDescriptor(
    const cudnnDataType_t data_type,
    const StorageOrder order,
    const std::vector<std::int64_t>& dims,
    cudnnTensorDescriptor_t* desc) {
  const int ndim = dims.size();
  const int N = dims[0];
  const int C = order == StorageOrder::NCHW ? dims[1] : dims[ndim - 1];
  switch (ndim) {
    case 4: {
      const int H = order == StorageOrder::NCHW ? dims[2] : dims[1];
      const int W = order == StorageOrder::NCHW ? dims[3] : dims[2];
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          *desc, GetCudnnTensorFormat(order), data_type, N, C, H, W));
      break;
    }
    case 5: {
      const int D = order == StorageOrder::NCHW ? dims[2] : dims[1];
      const int H = order == StorageOrder::NCHW ? dims[3] : dims[2];
      const int W = order == StorageOrder::NCHW ? dims[4] : dims[3];
      const std::array<int, 5> dims_arr = {N, C, D, H, W};
      const std::array<int, 5> strides_arr = order == StorageOrder::NCHW
          ? std::array<int, 5>{C * D * H * W, D * H * W, H * W, W, 1}
          : std::array<int, 5>{D * H * W * C, 1, H * W * C, W * C, C};
      CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
          *desc, data_type, 5, dims_arr.data(), strides_arr.data()));
      break;
    }
    default: {
      CAFFE_THROW("Unsupported tensor dim: ", ndim);
      break;
    }
  }
}

template <class Functor>
class CuDNNPoolOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  template <class... Args>
  explicit CuDNNPoolOp(Args&&... args)
      : ConvPoolOpBase<CUDAContext>(std::forward<Args>(args)...),
        cudnn_wrapper_(&context_),
        functor_(*this),
        equal_padding_(std::equal(
            pads_.cbegin(),
            pads_.cbegin() + kernel_.size(),
            pads_.cbegin() + kernel_.size())) {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_));
    CUDNN_ENFORCE(cudnnCreatePoolingDescriptor(&pooling_desc_));
    // The cuDNN pooling descriptor takes a single pad per spatial dimension,
    // so it is only configured when the begin and end pads match.
    if (!global_pooling_ && equal_padding_) {
      if (kernel_.size() == 2) {
        CUDNN_ENFORCE(cudnnSetPooling2dDescriptor(
            pooling_desc_,
            functor_.GetPoolingMode(),
            CUDNN_NOT_PROPAGATE_NAN,
            kernel_h(),
            kernel_w(),
            pad_t(),
            pad_l(),
            stride_h(),
            stride_w()));
      } else if (kernel_.size() == 3) {
        CUDNN_ENFORCE(cudnnSetPoolingNdDescriptor(
            pooling_desc_,
            functor_.GetPoolingMode(),
            CUDNN_NOT_PROPAGATE_NAN,
            kernel_.size(),
            kernel_.data(),
            pads_.data(),
            stride_.data()));
      }
    }
  }

  ~CuDNNPoolOp() override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_));
    CUDNN_ENFORCE(cudnnDestroyPoolingDescriptor(pooling_desc_));
  }

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& X = Input(0);
    const int ndim = X.dim();
    const int N = X.dim32(0);
    const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1);
    auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, C);
    auto* Y = Output(0, sizes, at::dtype<T>());
    const T* X_data = X.template data<T>();
    T* Y_data = Y->template mutable_data<T>();

    if (N == 0) {
      return true;
    }

    if (global_pooling_) {
      const int HxW = X.numel() / (N * C);
      if (order_ == StorageOrder::NCHW) {
        return functor_.template GlobalPoolingForward<T, StorageOrder::NCHW>(
            N, C, HxW, X_data, Y_data, &context_);
      } else {
        return functor_.template GlobalPoolingForward<T, StorageOrder::NHWC>(
            N, C, HxW, X_data, Y_data, &context_);
      }
    }

    const std::vector<int> X_HW_dims = GetDims(X);
    const std::vector<int> Y_HW_dims = GetDims(*Y);
    if (order_ == StorageOrder::NHWC) {
      // cuDNN pooling in NHWC order is very slow; fall back to the CUDA
      // implementation.
      return functor_.template Forward<T, StorageOrder::NHWC>(
          N,
          C,
          X_HW_dims,
          Y_HW_dims,
          kernel_,
          dilation_,
          stride_,
          pads_,
          X.template data<T>(),
          Y->template mutable_data<T>(),
          &context_);
    } else if (!equal_padding_ || ndim == 3) {
      return functor_.template Forward<T, StorageOrder::NCHW>(
          N,
          C,
          X_HW_dims,
          Y_HW_dims,
          kernel_,
          dilation_,
          stride_,
          pads_,
          X.template data<T>(),
          Y->template mutable_data<T>(),
          &context_);
    }

    // Equal-padding NCHW case: run cuDNN directly, updating the cached tensor
    // descriptors only when the input shape changes.
    const std::vector<std::int64_t> X_dims = X.sizes().vec();
    const std::vector<std::int64_t> Y_dims = Y->sizes().vec();
    if (cached_X_dims_ != X_dims) {
      constexpr cudnnDataType_t data_type = cudnnTypeWrapper<T>::type;
      SetTensorDescriptor(data_type, order_, X_dims, &X_desc_);
      SetTensorDescriptor(data_type, order_, Y_dims, &Y_desc_);
      cached_X_dims_ = X_dims;
    }
    CUDNN_ENFORCE(cudnnPoolingForward(
        cudnn_wrapper_.inline_cudnn_handle(),
        pooling_desc_,
        cudnnTypeWrapper<T>::kOne(),
        X_desc_,
        X_data,
        cudnnTypeWrapper<T>::kZero(),
        Y_desc_,
        Y_data));

    return true;
  }

 private:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t X_desc_;
  cudnnTensorDescriptor_t Y_desc_;
  cudnnPoolingDescriptor_t pooling_desc_;

  const Functor functor_;

  const bool equal_padding_;
  std::vector<std::int64_t> cached_X_dims_;
};

template <class Functor>
class CuDNNPoolGradientOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  template <class... Args>
  explicit CuDNNPoolGradientOp(Args&&... args)
      : ConvPoolOpBase<CUDAContext>(std::forward<Args>(args)...),
        cudnn_wrapper_(&context_),
        functor_(*this),
        equal_padding_(std::equal(
            pads_.cbegin(),
            pads_.cbegin() + kernel_.size(),
            pads_.cbegin() + kernel_.size())) {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_));
    CUDNN_ENFORCE(cudnnCreatePoolingDescriptor(&pooling_desc_));
    if (!global_pooling_ && equal_padding_) {
      if (kernel_.size() == 2) {
        CUDNN_ENFORCE(cudnnSetPooling2dDescriptor(
            pooling_desc_,
            functor_.GetPoolingMode(),
            CUDNN_NOT_PROPAGATE_NAN,
            kernel_h(),
            kernel_w(),
            pad_t(),
            pad_l(),
            stride_h(),
            stride_w()));
      } else if (kernel_.size() == 3) {
        CUDNN_ENFORCE(cudnnSetPoolingNdDescriptor(
            pooling_desc_,
            functor_.GetPoolingMode(),
            CUDNN_NOT_PROPAGATE_NAN,
            kernel_.size(),
            kernel_.data(),
            pads_.data(),
            stride_.data()));
      }
    }
  }

  ~CuDNNPoolGradientOp() override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_));
    CUDNN_ENFORCE(cudnnDestroyPoolingDescriptor(pooling_desc_));
  }

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& X = Input(0);
    const auto& Y = Input(1);
    const auto& dY = Input(2);
    auto* dX = Output(0, X.sizes(), at::dtype<T>());
    const int ndim = X.dim();
    const int N = X.dim32(0);
    const int C = order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1);
    const std::vector<int> X_HW_dims = GetDims(X);
    const std::vector<int> Y_HW_dims = GetDims(Y);
    ConvPoolOpBase<CUDAContext>::ComputePads(X_HW_dims);
    const T* dY_data = dY.template data<T>();
    const T* X_data = X.template data<T>();
    const T* Y_data = Y.template data<T>();
    T* dX_data = dX->template mutable_data<T>();

    if (N == 0) {
      return true;
    }

    if (global_pooling_) {
      const int HxW = X.numel() / (N * C);
      if (order_ == StorageOrder::NCHW) {
        return functor_.template GlobalPoolingBackward<T, StorageOrder::NCHW>(
            N, C, HxW, dY_data, X_data, Y_data, dX_data, &context_);
      } else {
        return functor_.template GlobalPoolingBackward<T, StorageOrder::NHWC>(
            N, C, HxW, dY_data, X_data, Y_data, dX_data, &context_);
      }
    }

    if (order_ == StorageOrder::NHWC) {
      // cuDNN pooling in NHWC order is very slow; fall back to the CUDA
      // implementation.
      return functor_.template Backward<T, StorageOrder::NHWC>(
          N,
          C,
          X_HW_dims,
          Y_HW_dims,
          kernel_,
          dilation_,
          stride_,
          pads_,
          dY_data,
          X_data,
          Y_data,
          dX_data,
          &context_);
    } else if (!equal_padding_ || ndim == 3) {
      return functor_.template Backward<T, StorageOrder::NCHW>(
          N,
          C,
          X_HW_dims,
          Y_HW_dims,
          kernel_,
          dilation_,
          stride_,
          pads_,
          dY_data,
          X_data,
          Y_data,
          dX_data,
          &context_);
    }

    const std::vector<std::int64_t> X_dims = X.sizes().vec();
    const std::vector<std::int64_t> Y_dims = Y.sizes().vec();
    if (cached_X_dims_ != X_dims) {
      constexpr cudnnDataType_t data_type = cudnnTypeWrapper<T>::type;
      SetTensorDescriptor(data_type, order_, X_dims, &X_desc_);
      SetTensorDescriptor(data_type, order_, Y_dims, &Y_desc_);
      cached_X_dims_ = X_dims;
    }
    CUDNN_ENFORCE(cudnnPoolingBackward(
        cudnn_wrapper_.inline_cudnn_handle(),
        pooling_desc_,
        cudnnTypeWrapper<T>::kOne(),
        Y_desc_,
        Y_data,
        Y_desc_,
        dY_data,
        X_desc_,
        X_data,
        cudnnTypeWrapper<T>::kZero(),
        X_desc_,
        dX_data));

    return true;
  }

 private:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t X_desc_;
  cudnnTensorDescriptor_t Y_desc_;
  cudnnPoolingDescriptor_t pooling_desc_;

  const Functor functor_;

  const bool equal_padding_;
  std::vector<std::int64_t> cached_X_dims_;
};

struct CuDNNAveragePoolFunctor {
  explicit CuDNNAveragePoolFunctor(const OperatorBase& op)
      : avg_pool_functor(op) {}

  cudnnPoolingMode_t GetPoolingMode() const {
    return avg_pool_functor.count_include_pad
        ? CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
        : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
  }

  template <typename T, StorageOrder kOrder>
  bool GlobalPoolingForward(
      const int N,
      const int C,
      const int HxW,
      const T* X,
      T* Y,
      CUDAContext* context) const {
    return avg_pool_functor.GlobalPoolingForward<T, kOrder>(
        N, C, HxW, X, Y, context);
  }

  template <typename T, StorageOrder kOrder>
  bool Forward(
      const int N,
      const int C,
      const std::vector<int>& X_dims,
      const std::vector<int>& Y_dims,
      const std::vector<int>& kernel,
      const std::vector<int>& dilation,
      const std::vector<int>& stride,
      const std::vector<int>& pads,
      const T* X,
      T* Y,
      CUDAContext* context) const {
    return avg_pool_functor.Forward<T, kOrder>(
        N, C, X_dims, Y_dims, kernel, dilation, stride, pads, X, Y, context);
  }

  template <typename T, StorageOrder kOrder>
  bool GlobalPoolingBackward(
      const int N,
      const int C,
      const int HxW,
      const T* dY,
      const T* X,
      const T* Y,
      T* dX,
      CUDAContext* context) const {
    return avg_pool_functor.GlobalPoolingBackward<T, kOrder>(
        N, C, HxW, dY, X, Y, dX, context);
  }

  template <typename T, StorageOrder kOrder>
  bool Backward(
      const int N,
      const int C,
      const std::vector<int>& X_dims,
      const std::vector<int>& Y_dims,
      const std::vector<int>& kernel,
      const std::vector<int>& dilation,
      const std::vector<int>& stride,
      const std::vector<int>& pads,
      const T* dY,
      const T* X,
      const T* Y,
      T* dX,
      CUDAContext* context) const {
    return avg_pool_functor.Backward<T, kOrder>(
        N,
        C,
        X_dims,
        Y_dims,
        kernel,
        dilation,
        stride,
        pads,
        dY,
        X,
        Y,
        dX,
        context);
  }

  const AveragePoolFunctor<CUDAContext> avg_pool_functor;
};

struct CuDNNMaxPoolFunctor {
  explicit CuDNNMaxPoolFunctor(const OperatorBase& op)
      : max_pool_functor(op),
        deterministic(op.GetSingleArgument<bool>("deterministic", false)) {}

  cudnnPoolingMode_t GetPoolingMode() const {
#if CUDNN_VERSION_MIN(6, 0, 0)
    return deterministic ? CUDNN_POOLING_MAX_DETERMINISTIC : CUDNN_POOLING_MAX;
#else
    return CUDNN_POOLING_MAX;
#endif
  }

  template <typename T, StorageOrder kOrder>
  bool GlobalPoolingForward(
      const int N,
      const int C,
      const int HxW,
      const T* X,
      T* Y,
      CUDAContext* context) const {
    return max_pool_functor.GlobalPoolingForward<T, kOrder>(
        N, C, HxW, X, Y, context);
  }

  template <typename T, StorageOrder kOrder>
  bool Forward(
      const int N,
      const int C,
      const std::vector<int>& X_dims,
      const std::vector<int>& Y_dims,
      const std::vector<int>& kernel,
      const std::vector<int>& dilation,
      const std::vector<int>& stride,
      const std::vector<int>& pads,
      const T* X,
      T* Y,
      CUDAContext* context) const {
    return max_pool_functor.Forward<T, kOrder>(
        N, C, X_dims, Y_dims, kernel, dilation, stride, pads, X, Y, context);
  }

  template <typename T, StorageOrder kOrder>
  bool GlobalPoolingBackward(
      const int N,
      const int C,
      const int HxW,
      const T* dY,
      const T* X,
      const T* Y,
      T* dX,
      CUDAContext* context) const {
    return max_pool_functor.GlobalPoolingBackward<T, kOrder>(
        N, C, HxW, dY, X, Y, dX, context);
  }

  template <typename T, StorageOrder kOrder>
  bool Backward(
      const int N,
      const int C,
      const std::vector<int>& X_dims,
      const std::vector<int>& Y_dims,
      const std::vector<int>& kernel,
      const std::vector<int>& dilation,
      const std::vector<int>& stride,
      const std::vector<int>& pads,
      const T* dY,
      const T* X,
      const T* Y,
      T* dX,
      CUDAContext* context) const {
    return max_pool_functor.Backward<T, kOrder>(
        N,
        C,
        X_dims,
        Y_dims,
        kernel,
        dilation,
        stride,
        pads,
        dY,
        X,
        Y,
        dX,
        context);
  }

  const MaxPoolFunctor<CUDAContext> max_pool_functor;
  const bool deterministic;
};

} // namespace

REGISTER_CUDNN_OPERATOR(AveragePool, CuDNNPoolOp<CuDNNAveragePoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    AveragePoolGradient,
    CuDNNPoolGradientOp<CuDNNAveragePoolFunctor>);

REGISTER_CUDNN_OPERATOR(AveragePool1D, CuDNNPoolOp<CuDNNAveragePoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    AveragePool1DGradient,
    CuDNNPoolGradientOp<CuDNNAveragePoolFunctor>);

REGISTER_CUDNN_OPERATOR(AveragePool2D, CuDNNPoolOp<CuDNNAveragePoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    AveragePool2DGradient,
    CuDNNPoolGradientOp<CuDNNAveragePoolFunctor>);

REGISTER_CUDNN_OPERATOR(AveragePool3D, CuDNNPoolOp<CuDNNAveragePoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    AveragePool3DGradient,
    CuDNNPoolGradientOp<CuDNNAveragePoolFunctor>);

REGISTER_CUDNN_OPERATOR(MaxPool, CuDNNPoolOp<CuDNNMaxPoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    MaxPoolGradient,
    CuDNNPoolGradientOp<CuDNNMaxPoolFunctor>);

REGISTER_CUDNN_OPERATOR(MaxPool1D, CuDNNPoolOp<CuDNNMaxPoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    MaxPool1DGradient,
    CuDNNPoolGradientOp<CuDNNMaxPoolFunctor>);

REGISTER_CUDNN_OPERATOR(MaxPool2D, CuDNNPoolOp<CuDNNMaxPoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    MaxPool2DGradient,
    CuDNNPoolGradientOp<CuDNNMaxPoolFunctor>);

REGISTER_CUDNN_OPERATOR(MaxPool3D, CuDNNPoolOp<CuDNNMaxPoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    MaxPool3DGradient,
    CuDNNPoolGradientOp<CuDNNMaxPoolFunctor>);

} // namespace caffe2
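Usage note: the classes above are only used when an operator requests the "CUDNN" engine. The following sketch (not part of pool_op_cudnn.cc) shows one way a caller might build and run the cuDNN MaxPool registered here through the Caffe2 C++ API. It assumes a CUDA/cuDNN-enabled build and a workspace that already holds a float NCHW input blob on the GPU; the blob names, kernel/stride values, and the helper name RunMaxPoolWithCuDNN are illustrative assumptions, not part of the library.

#include <memory>

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

// Hypothetical helper: builds a MaxPool OperatorDef that requests the CUDNN
// engine, so CreateOperator dispatches to CuDNNPoolOp<CuDNNMaxPoolFunctor>.
void RunMaxPoolWithCuDNN(caffe2::Workspace* ws) {
  caffe2::OperatorDef def;
  def.set_type("MaxPool");
  def.set_engine("CUDNN"); // selects the REGISTER_CUDNN_OPERATOR variant
  def.add_input("X");      // assumed to already exist as a GPU tensor blob
  def.add_output("Y");
  // PROTO_CUDA (== 1 in caffe2.proto) marks the op to run on a CUDA device.
  def.mutable_device_option()->set_device_type(caffe2::PROTO_CUDA);

  // 2x2 window, stride 2, NCHW order: exercises the equal-padding NCHW path.
  auto* kernel = def.add_arg();
  kernel->set_name("kernel");
  kernel->set_i(2);
  auto* stride = def.add_arg();
  stride->set_name("stride");
  stride->set_i(2);
  auto* order = def.add_arg();
  order->set_name("order");
  order->set_s("NCHW");

  std::unique_ptr<caffe2::OperatorBase> op = caffe2::CreateOperator(def, ws);
  CAFFE_ENFORCE(op, "Failed to create the CUDNN MaxPool operator.");
  CAFFE_ENFORCE(op->Run());
}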