1 #include "caffe2/operators/transpose_op.h" 7 #include "caffe2/core/context_gpu.h" 8 #include "caffe2/core/cudnn_wrappers.h" 9 #include "caffe2/core/types.h" 10 #include "caffe2/utils/math.h" 16 class CuDNNTransposeOp final :
public Operator<CUDAContext> {
18 USE_OPERATOR_FUNCTIONS(CUDAContext);
20 template <
class... Args>
21 explicit CuDNNTransposeOp(Args&&... args)
22 : Operator<CUDAContext>(
std::forward<Args>(args)...),
23 cudnn_wrapper_(&context_),
24 axes_(OperatorBase::GetRepeatedArgument<int>(
"axes")) {
26 std::vector<int> axes_sorted(axes_);
27 std::sort(axes_sorted.begin(), axes_sorted.end());
28 for (std::size_t i = 0; i < axes_sorted.size(); ++i) {
29 if (axes_sorted[i] != i) {
30 CAFFE_THROW(
"Axes should be a permutation of 0 to ndim.");
34 CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&X_desc_));
35 CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&Y_desc_));
38 ~CuDNNTransposeOp()
override {
39 CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(X_desc_));
40 CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(Y_desc_));
43 bool RunOnDevice()
override {
44 return DispatchHelper<TensorTypes<float, int>>::call(
this, Input(0));
48 bool DoRunWithType() {
49 const auto& X = Input(0);
50 const int ndim = X.dim();
53 std::iota(axes_.rbegin(), axes_.rend(), 0);
55 CAFFE_ENFORCE_EQ(axes_.size(), ndim);
57 std::vector<std::int64_t> X_dims = X.sizes().vec();
58 std::vector<std::int64_t> Y_dims(ndim);
59 for (
int i = 0; i < ndim; ++i) {
60 Y_dims[i] = X_dims[axes_[i]];
62 auto* Y = Output(0, Y_dims, at::dtype<T>());
63 const T* X_data = X.template data<T>();
64 T* Y_data = Y->template mutable_data<T>();
68 if (ndim < 3 || ndim > CUDNN_DIM_MAX ||
69 X.numel() > std::numeric_limits<std::int32_t>::max()) {
70 math::Transpose<std::int64_t, T, CUDAContext>(
71 ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
74 if (X_dims != cached_X_dims_) {
75 SetTensorDescriptor(cudnnTypeWrapper<T>::type, X_dims, Y_dims);
76 cached_X_dims_ = X_dims;
78 CUDNN_ENFORCE(cudnnTransformTensor(
79 cudnn_wrapper_.inline_cudnn_handle(),
80 cudnnTypeWrapper<T>::kOne(),
83 cudnnTypeWrapper<T>::kZero(),
90 void SetTensorDescriptor(
91 const cudnnDataType_t data_type,
92 const std::vector<std::int64_t>& X_dims,
93 const std::vector<std::int64_t>& Y_dims) {
94 const int ndim = X_dims.size();
95 std::vector<int> dims(Y_dims.cbegin(), Y_dims.cend());
96 std::vector<int> X_strides(ndim);
97 std::vector<int> X_buff(ndim);
98 std::vector<int> Y_strides(ndim);
100 Y_strides.back() = 1;
101 for (
int i = ndim - 1; i > 0; --i) {
102 X_buff[i - 1] = X_buff[i] * X_dims[i];
103 Y_strides[i - 1] = Y_strides[i] * Y_dims[i];
105 for (
int i = 0; i < ndim; ++i) {
106 X_strides[i] = X_buff[axes_[i]];
108 CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
109 X_desc_, data_type, ndim, dims.data(), X_strides.data()));
110 CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
111 Y_desc_, data_type, ndim, dims.data(), Y_strides.data()));
114 CuDNNWrapper cudnn_wrapper_;
115 cudnnTensorDescriptor_t X_desc_;
116 cudnnTensorDescriptor_t Y_desc_;
118 std::vector<std::int64_t> cached_X_dims_;
119 std::vector<std::int32_t> axes_;
122 #if !CUDNN_VERSION_MIN(6, 0, 0) 126 bool CuDNNTransposeOp::DoRunWithType<int>() {
127 const auto& X = Input(0);
128 const int ndim = X.dim();
131 std::iota(axes_.rbegin(), axes_.rend(), 0);
133 CAFFE_ENFORCE_EQ(axes_.size(), ndim);
135 std::vector<std::int64_t> X_dims = X.sizes().vec();
136 std::vector<std::int64_t> Y_dims(ndim);
137 for (
int i = 0; i < ndim; ++i) {
138 Y_dims[i] = X_dims[axes_[i]];
140 auto* Y = Output(0, Y_dims, at::dtype<T>());
141 const T* X_data = X.template data<T>();
142 T* Y_data = Y->template mutable_data<T>();
143 math::Transpose<std::int64_t, T, CUDAContext>(
144 ndim, X_dims.data(), axes_.data(), X_data, Y_data, &context_);
148 #endif // !CUDNN_VERSION_MIN(6, 0, 0) 152 REGISTER_CUDNN_OPERATOR(Transpose, CuDNNTransposeOp);
// A global dictionary that holds information about what Caffe2 modules have
// been loaded in the current runtime, together with utility functions to
// load and query those modules.