#include "caffe2/operators/pool_op.h"

#include <algorithm>
#include <array>
#include <vector>

#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"

namespace caffe2 {

namespace {

void SetTensorDescriptor(
    const cudnnDataType_t data_type,
    const StorageOrder order,
    const std::vector<std::int64_t>& dims,
    cudnnTensorDescriptor_t* desc) {
  const int ndim = dims.size();
  const int N = dims[0];
  const int C = order == StorageOrder::NCHW ? dims[1] : dims[ndim - 1];
  switch (ndim) {
    case 4: {
      const int H = order == StorageOrder::NCHW ? dims[2] : dims[1];
      const int W = order == StorageOrder::NCHW ? dims[3] : dims[2];
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          *desc, GetCudnnTensorFormat(order), data_type, N, C, H, W));
      break;
    }
    case 5: {
      const int D = order == StorageOrder::NCHW ? dims[2] : dims[1];
      const int H = order == StorageOrder::NCHW ? dims[3] : dims[2];
      const int W = order == StorageOrder::NCHW ? dims[4] : dims[3];
      const std::array<int, 5> dims_arr = {N, C, D, H, W};
      const std::array<int, 5> strides_arr = order == StorageOrder::NCHW
          ? std::array<int, 5>{C * D * H * W, D * H * W, H * W, W, 1}
          : std::array<int, 5>{D * H * W * C, 1, H * W * C, W * C, C};
      CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
          *desc, data_type, 5, dims_arr.data(), strides_arr.data()));
      break;
    }
    default: {
      CAFFE_THROW("Unsupported tensor dim: ", ndim);
      break;
    }
  }
}

template <class Functor>
class CuDNNPoolOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CUDAContext);

  template <class... Args>
  explicit CuDNNPoolOp(Args&&... args)
      : ConvPoolOpBase<CUDAContext>(std::forward<Args>(args)...),
        cudnn_wrapper_(&context_),
        functor_(*this),
        equal_padding_(std::equal(
            pads_.cbegin(),
            pads_.cbegin() + kernel_.size(),
            pads_.cbegin() + kernel_.size())) {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_));
    CUDNN_ENFORCE(cudnnCreatePoolingDescriptor(&pooling_desc_));
    if (!global_pooling_ && equal_padding_) {
      if (kernel_.size() == 2) {
        CUDNN_ENFORCE(cudnnSetPooling2dDescriptor(
            pooling_desc_,
            functor_.GetPoolingMode(),
            CUDNN_NOT_PROPAGATE_NAN,
            kernel_h(), kernel_w(), pad_t(), pad_l(), stride_h(), stride_w()));
      } else if (kernel_.size() == 3) {
        CUDNN_ENFORCE(cudnnSetPoolingNdDescriptor(
            pooling_desc_,
            functor_.GetPoolingMode(),
            CUDNN_NOT_PROPAGATE_NAN,
            kernel_.size(), kernel_.data(), pads_.data(), stride_.data()));
      }
    }
  }

  ~CuDNNPoolOp() override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_));
    CUDNN_ENFORCE(cudnnDestroyPoolingDescriptor(pooling_desc_));
  }

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& X = Input(0);
    const int ndim = X.dim();
    const int N = X.dim32(0);
    const int C =
        order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1);
    auto sizes = ConvPoolOpBase<CUDAContext>::GetOutputSize(X, C);
    auto* Y = Output(0, sizes, at::dtype<T>());
    const T* X_data = X.template data<T>();
    T* Y_data = Y->template mutable_data<T>();

    if (global_pooling_) {
      const int HxW = X.numel() / (N * C);
      if (order_ == StorageOrder::NCHW) {
        return functor_.template GlobalPoolingForward<T, StorageOrder::NCHW>(
            N, C, HxW, X_data, Y_data, &context_);
      } else {
        return functor_.template GlobalPoolingForward<T, StorageOrder::NHWC>(
            N, C, HxW, X_data, Y_data, &context_);
      }
    }

    const std::vector<int> X_HW_dims = GetDims(X);
    const std::vector<int> Y_HW_dims = GetDims(*Y);
    if (order_ == StorageOrder::NHWC) {
      // cuDNN pooling on NHWC tensors is slow; fall back to the native CUDA
      // implementation.
      return functor_.template Forward<T, StorageOrder::NHWC>(
          N, C, X_HW_dims, Y_HW_dims, kernel_, dilation_, stride_, pads_,
          X.template data<T>(), Y->template mutable_data<T>(), &context_);
    } else if (!equal_padding_ || ndim == 3) {
      // Unequal padding and 1-D pooling are handled by the native NCHW
      // kernels rather than cuDNN.
      return functor_.template Forward<T, StorageOrder::NCHW>(
          N, C, X_HW_dims, Y_HW_dims, kernel_, dilation_, stride_, pads_,
          X.template data<T>(), Y->template mutable_data<T>(), &context_);
    }

    const std::vector<std::int64_t> X_dims = X.sizes().vec();
    const std::vector<std::int64_t> Y_dims = Y->sizes().vec();
    if (cached_X_dims_ != X_dims) {
      constexpr cudnnDataType_t data_type = cudnnTypeWrapper<T>::type;
      SetTensorDescriptor(data_type, order_, X_dims, &X_desc_);
      SetTensorDescriptor(data_type, order_, Y_dims, &Y_desc_);
      cached_X_dims_ = X_dims;
    }
    CUDNN_ENFORCE(cudnnPoolingForward(
        cudnn_wrapper_.inline_cudnn_handle(),
        pooling_desc_,
        cudnnTypeWrapper<T>::kOne(),
        X_desc_,
        X_data,
        cudnnTypeWrapper<T>::kZero(),
        Y_desc_,
        Y_data));
    return true;
  }

 private:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t X_desc_;
  cudnnTensorDescriptor_t Y_desc_;
  cudnnPoolingDescriptor_t pooling_desc_;

  const Functor functor_;

  const bool equal_padding_;
  std::vector<std::int64_t> cached_X_dims_;
};

template <class Functor>
class CuDNNPoolGradientOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CUDAContext);

  template <class... Args>
  explicit CuDNNPoolGradientOp(Args&&... args)
      : ConvPoolOpBase<CUDAContext>(std::forward<Args>(args)...),
        cudnn_wrapper_(&context_),
        functor_(*this),
        equal_padding_(std::equal(
            pads_.cbegin(),
            pads_.cbegin() + kernel_.size(),
            pads_.cbegin() + kernel_.size())) {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_));
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_));
    CUDNN_ENFORCE(cudnnCreatePoolingDescriptor(&pooling_desc_));
    if (!global_pooling_ && equal_padding_) {
      if (kernel_.size() == 2) {
        CUDNN_ENFORCE(cudnnSetPooling2dDescriptor(
            pooling_desc_,
            functor_.GetPoolingMode(),
            CUDNN_NOT_PROPAGATE_NAN,
            kernel_h(), kernel_w(), pad_t(), pad_l(), stride_h(), stride_w()));
      } else if (kernel_.size() == 3) {
        CUDNN_ENFORCE(cudnnSetPoolingNdDescriptor(
            pooling_desc_,
            functor_.GetPoolingMode(),
            CUDNN_NOT_PROPAGATE_NAN,
            kernel_.size(), kernel_.data(), pads_.data(), stride_.data()));
      }
    }
  }

  ~CuDNNPoolGradientOp() override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_));
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_));
    CUDNN_ENFORCE(cudnnDestroyPoolingDescriptor(pooling_desc_));
  }

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    const auto& X = Input(0);
    const auto& Y = Input(1);
    const auto& dY = Input(2);
    auto* dX = Output(0, X.sizes(), at::dtype<T>());
    const int ndim = X.dim();
    const int N = X.dim32(0);
    const int C =
        order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1);
    const std::vector<int> X_HW_dims = GetDims(X);
    const std::vector<int> Y_HW_dims = GetDims(Y);
    ConvPoolOpBase<CUDAContext>::ComputePads(X_HW_dims);
    const T* dY_data = dY.template data<T>();
    const T* X_data = X.template data<T>();
    const T* Y_data = Y.template data<T>();
    T* dX_data = dX->template mutable_data<T>();

    if (global_pooling_) {
      const int HxW = X.numel() / (N * C);
      if (order_ == StorageOrder::NCHW) {
        return functor_.template GlobalPoolingBackward<T, StorageOrder::NCHW>(
            N, C, HxW, dY_data, X_data, Y_data, dX_data, &context_);
      } else {
        return functor_.template GlobalPoolingBackward<T, StorageOrder::NHWC>(
            N, C, HxW, dY_data, X_data, Y_data, dX_data, &context_);
      }
    }

    if (order_ == StorageOrder::NHWC) {
      // cuDNN pooling on NHWC tensors is slow; fall back to the native CUDA
      // implementation.
      return functor_.template Backward<T, StorageOrder::NHWC>(
          N, C, X_HW_dims, Y_HW_dims, kernel_, dilation_, stride_, pads_,
          dY_data, X_data, Y_data, dX_data, &context_);
    } else if (!equal_padding_ || ndim == 3) {
      return functor_.template Backward<T, StorageOrder::NCHW>(
          N, C, X_HW_dims, Y_HW_dims, kernel_, dilation_, stride_, pads_,
          dY_data, X_data, Y_data, dX_data, &context_);
    }

    const std::vector<std::int64_t> X_dims = X.sizes().vec();
    const std::vector<std::int64_t> Y_dims = Y.sizes().vec();
    if (cached_X_dims_ != X_dims) {
      constexpr cudnnDataType_t data_type = cudnnTypeWrapper<T>::type;
      SetTensorDescriptor(data_type, order_, X_dims, &X_desc_);
      SetTensorDescriptor(data_type, order_, Y_dims, &Y_desc_);
      cached_X_dims_ = X_dims;
    }
    CUDNN_ENFORCE(cudnnPoolingBackward(
        cudnn_wrapper_.inline_cudnn_handle(),
        pooling_desc_,
        cudnnTypeWrapper<T>::kOne(),
        Y_desc_, Y_data, Y_desc_, dY_data, X_desc_, X_data,
        cudnnTypeWrapper<T>::kZero(),
        X_desc_, dX_data));
    return true;
  }

 private:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t X_desc_;
  cudnnTensorDescriptor_t Y_desc_;
  cudnnPoolingDescriptor_t pooling_desc_;

  const Functor functor_;

  const bool equal_padding_;
  std::vector<std::int64_t> cached_X_dims_;
};

struct CuDNNAveragePoolFunctor {
  explicit CuDNNAveragePoolFunctor(const OperatorBase& op)
      : avg_pool_functor(op) {}

  cudnnPoolingMode_t GetPoolingMode() const {
    return avg_pool_functor.count_include_pad
        ? CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
        : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
  }

  template <typename T, StorageOrder kOrder>
  bool GlobalPoolingForward(
      const int N,
      const int C,
      const int HxW,
      const T* X,
      T* Y,
      CUDAContext* context) const {
    return avg_pool_functor.GlobalPoolingForward<T, kOrder>(
        N, C, HxW, X, Y, context);
  }

  template <typename T, StorageOrder kOrder>
  bool Forward(
      const int N,
      const int C,
      const std::vector<int>& X_dims,
      const std::vector<int>& Y_dims,
      const std::vector<int>& kernel,
      const std::vector<int>& dilation,
      const std::vector<int>& stride,
      const std::vector<int>& pads,
      const T* X,
      T* Y,
      CUDAContext* context) const {
    return avg_pool_functor.Forward<T, kOrder>(
        N, C, X_dims, Y_dims, kernel, dilation, stride, pads, X, Y, context);
  }

  template <typename T, StorageOrder kOrder>
  bool GlobalPoolingBackward(
      const int N,
      const int C,
      const int HxW,
      const T* dY,
      const T* X,
      const T* Y,
      T* dX,
      CUDAContext* context) const {
    return avg_pool_functor.GlobalPoolingBackward<T, kOrder>(
        N, C, HxW, dY, X, Y, dX, context);
  }

  template <typename T, StorageOrder kOrder>
  bool Backward(
      const int N,
      const int C,
      const std::vector<int>& X_dims,
      const std::vector<int>& Y_dims,
      const std::vector<int>& kernel,
      const std::vector<int>& dilation,
      const std::vector<int>& stride,
      const std::vector<int>& pads,
      const T* dY,
      const T* X,
      const T* Y,
      T* dX,
      CUDAContext* context) const {
    return avg_pool_functor.Backward<T, kOrder>(
        N, C, X_dims, Y_dims, kernel, dilation, stride, pads,
        dY, X, Y, dX, context);
  }

  const AveragePoolFunctor<CUDAContext> avg_pool_functor;
};

struct CuDNNMaxPoolFunctor {
  explicit CuDNNMaxPoolFunctor(const OperatorBase& op)
      : max_pool_functor(op),
        deterministic(op.GetSingleArgument<bool>("deterministic", false)) {}

  cudnnPoolingMode_t GetPoolingMode() const {
#if CUDNN_VERSION_MIN(6, 0, 0)
    return deterministic ? CUDNN_POOLING_MAX_DETERMINISTIC : CUDNN_POOLING_MAX;
#else
    return CUDNN_POOLING_MAX;
#endif
  }

  template <typename T, StorageOrder kOrder>
  bool GlobalPoolingForward(
      const int N,
      const int C,
      const int HxW,
      const T* X,
      T* Y,
      CUDAContext* context) const {
    return max_pool_functor.GlobalPoolingForward<T, kOrder>(
        N, C, HxW, X, Y, context);
  }

  template <typename T, StorageOrder kOrder>
  bool Forward(
      const int N,
      const int C,
      const std::vector<int>& X_dims,
      const std::vector<int>& Y_dims,
      const std::vector<int>& kernel,
      const std::vector<int>& dilation,
      const std::vector<int>& stride,
      const std::vector<int>& pads,
      const T* X,
      T* Y,
      CUDAContext* context) const {
    return max_pool_functor.Forward<T, kOrder>(
        N, C, X_dims, Y_dims, kernel, dilation, stride, pads, X, Y, context);
  }

  template <typename T, StorageOrder kOrder>
  bool GlobalPoolingBackward(
      const int N,
      const int C,
      const int HxW,
      const T* dY,
      const T* X,
      const T* Y,
      T* dX,
      CUDAContext* context) const {
    return max_pool_functor.GlobalPoolingBackward<T, kOrder>(
        N, C, HxW, dY, X, Y, dX, context);
  }

  template <typename T, StorageOrder kOrder>
  bool Backward(
      const int N,
      const int C,
      const std::vector<int>& X_dims,
      const std::vector<int>& Y_dims,
      const std::vector<int>& kernel,
      const std::vector<int>& dilation,
      const std::vector<int>& stride,
      const std::vector<int>& pads,
      const T* dY,
      const T* X,
      const T* Y,
      T* dX,
      CUDAContext* context) const {
    return max_pool_functor.Backward<T, kOrder>(
        N, C, X_dims, Y_dims, kernel, dilation, stride, pads,
        dY, X, Y, dX, context);
  }

  const MaxPoolFunctor<CUDAContext> max_pool_functor;
  const bool deterministic;
};

} // namespace

REGISTER_CUDNN_OPERATOR(AveragePool, CuDNNPoolOp<CuDNNAveragePoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    AveragePoolGradient,
    CuDNNPoolGradientOp<CuDNNAveragePoolFunctor>);

REGISTER_CUDNN_OPERATOR(AveragePool1D, CuDNNPoolOp<CuDNNAveragePoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    AveragePool1DGradient,
    CuDNNPoolGradientOp<CuDNNAveragePoolFunctor>);

REGISTER_CUDNN_OPERATOR(AveragePool2D, CuDNNPoolOp<CuDNNAveragePoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    AveragePool2DGradient,
    CuDNNPoolGradientOp<CuDNNAveragePoolFunctor>);

REGISTER_CUDNN_OPERATOR(AveragePool3D, CuDNNPoolOp<CuDNNAveragePoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    AveragePool3DGradient,
    CuDNNPoolGradientOp<CuDNNAveragePoolFunctor>);

REGISTER_CUDNN_OPERATOR(MaxPool, CuDNNPoolOp<CuDNNMaxPoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    MaxPoolGradient,
    CuDNNPoolGradientOp<CuDNNMaxPoolFunctor>);

REGISTER_CUDNN_OPERATOR(MaxPool1D, CuDNNPoolOp<CuDNNMaxPoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    MaxPool1DGradient,
    CuDNNPoolGradientOp<CuDNNMaxPoolFunctor>);

REGISTER_CUDNN_OPERATOR(MaxPool2D, CuDNNPoolOp<CuDNNMaxPoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    MaxPool2DGradient,
    CuDNNPoolGradientOp<CuDNNMaxPoolFunctor>);

REGISTER_CUDNN_OPERATOR(MaxPool3D, CuDNNPoolOp<CuDNNMaxPoolFunctor>);
REGISTER_CUDNN_OPERATOR(
    MaxPool3DGradient,
    CuDNNPoolGradientOp<CuDNNMaxPoolFunctor>);

} // namespace caffe2
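
// Usage sketch (assumptions: the protobuf-generated OperatorDef setters and
// caffe2::CreateOperator; not part of this file). The operators registered
// above are selected at runtime when an OperatorDef requests the "CUDNN"
// engine on a pooling op running with a CUDA device_option, e.g.
//
//   OperatorDef def;
//   def.set_type("MaxPool");
//   def.set_engine("CUDNN");  // resolves to CuDNNPoolOp<CuDNNMaxPoolFunctor>
//   def.add_input("X");
//   def.add_output("Y");
//   // kernel/stride/pad/order are attached as Argument protos.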