Caffe2 - C++ API
A deep learning, cross platform ML framework
transpose_op_cudnn.cc
1 
17 #include "caffe2/core/context_gpu.h"
18 #include "caffe2/core/cudnn_wrappers.h"
19 #include "caffe2/core/types.h"
20 #include "caffe2/operators/transpose_op.h"
21 #include "caffe2/utils/math.h"
22 
23 namespace caffe2 {
24 
25 #define MAX_DIMS 8
26 
27 class CuDNNTransposeOp final : public Operator<CUDAContext> {
28  public:
29  USE_OPERATOR_FUNCTIONS(CUDAContext);
30  USE_DISPATCH_HELPER;
31 
32  CuDNNTransposeOp(const OperatorDef& operator_def, Workspace* ws)
33  : Operator<CUDAContext>(operator_def, ws),
34  cudnn_wrapper_(&context_),
35  axes_(OperatorBase::GetRepeatedArgument<int>("axes")) {
36  // We will check the legality of axes_: it should be from 0 to axes_.size().
37  std::vector<int> axes_sorted(axes_);
38  std::sort(axes_sorted.begin(), axes_sorted.end());
39  for (int i = 0; i < axes_sorted.size(); ++i) {
40  if (axes_sorted[i] != i) {
41  CAFFE_THROW("Axes should be a permutation of 0 to ndim.");
42  }
43  }
44 
45  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&xDesc_));
46  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&yDesc_));
47  }
48 
49  ~CuDNNTransposeOp() {
50  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(xDesc_));
51  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(yDesc_));
52  }
53 
54  bool RunOnDevice() override {
55  const auto& X = Input(0);
56  auto* Y = Output(0);
57  const int num_axes = X.ndim();
58  const std::vector<int> x_dims(X.dims().cbegin(), X.dims().cend());
59  std::vector<int> y_dims(num_axes);
60  if (axes_.empty()) {
61  axes_.resize(num_axes);
62  for (int i = 0; i < num_axes; ++i) {
63  axes_[i] = num_axes - 1 - i;
64  }
65  y_dims.assign(X.dims().rbegin(), X.dims().rend());
66  } else {
67  CAFFE_ENFORCE_EQ(X.ndim(), axes_.size());
68  for (int i = 0; i < num_axes; ++i) {
69  y_dims[i] = X.dim32(axes_[i]);
70  }
71  }
72  Y->Resize(y_dims);
73  SetDeviceTensor(x_dims, &x_dims_device_);
74  SetDeviceTensor(y_dims, &y_dims_device_);
75  SetDeviceTensor(axes_, &axes_device_);
76  // Do the actual transpose, which is implemented in DoRunWithType().
77 #if CUDNN_VERSION_MIN(6, 0, 0)
78  return DispatchHelper<TensorTypes<float, int>>::call(this, Input(0));
79 #else
80  // CUDNN 5.1 does not have int support yet.
81  return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
82 #endif
83  }
84 
85  protected:
86  void SetDeviceTensor(
87  const std::vector<int>& data,
88  Tensor<CUDAContext>* tensor) {
89  tensor->Resize(data.size());
90  context_.template Copy<int, CPUContext, CUDAContext>(
91  data.size(), data.data(), tensor->template mutable_data<int>());
92  }
93 
94  template <typename T>
95  bool DoRunWithType() {
96  const auto& input = Input(0);
97  auto* output = Output(0);
98  int ndim = input.ndim();
99 
100  if (ndim == 0) {
101  return true;
102  }
103  if (ndim == 1) {
104  output->CopyFrom(input);
105  return true;
106  }
107 
108  cudnnDataType_t typedesc = cudnnTypeWrapper<T>::type;
109 #if CUDNN_VERSION_MIN(6, 0, 0)
110  if (typedesc == CUDNN_DATA_INT32) {
111  // CUDNN Transpose only support float for now
112  math::Transpose<int, CUDAContext>(
113  axes_.size(),
114  x_dims_device_.template data<int>(),
115  y_dims_device_.template data<int>(),
116  axes_device_.template data<int>(),
117  input.size(),
118  input.template data<int>(),
119  output->template mutable_data<int>(),
120  &context_);
121  return true;
122  }
123 #endif
124 
125  CAFFE_ENFORCE(ndim < MAX_DIMS, "Input ndim exceeds compile time max.");
126 
127  stride_y[ndim - 1] = 1;
128  for (int i = ndim - 2; i >= 0; i--) {
129  stride_y[i] = stride_y[i + 1] * output->dim32(i + 1);
130  }
131 
132  CHECK(axes_.size() >= ndim);
133 
134  stride_x[ndim] = 1;
135  for (int i = 0; i < ndim; i++) {
136  stride_x[i] = 1;
137  for (int j = axes_[i] + 1; j < ndim; j++) {
138  stride_x[i] *= input.dim32(j);
139  }
140  dim_y_int[i] = output->dim32(i);
141  }
142 
143  // CuDNN requires at least 3-dim tensors
144  for (int i = ndim; i < MAX_DIMS; i++) {
145  stride_x[i] = 1;
146  stride_y[i] = 1;
147  dim_y_int[i] = 1;
148  }
149 
150  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
151  xDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_x));
152 
153  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
154  yDesc_, typedesc, ndim < 4 ? 4 : ndim, dim_y_int, stride_y));
155 
156  CUDNN_ENFORCE(cudnnTransformTensor(
157  cudnn_wrapper_.inline_cudnn_handle(),
159  xDesc_,
160  static_cast<const void*>(input.template data<T>()),
162  yDesc_,
163  static_cast<void*>(output->template mutable_data<T>())));
164  return true;
165  }
166 
167  int stride_x[MAX_DIMS];
168  int stride_y[MAX_DIMS];
169  int dim_y_int[MAX_DIMS];
170 
171  cudnnTensorDescriptor_t xDesc_;
172  cudnnTensorDescriptor_t yDesc_;
173  CuDNNWrapper cudnn_wrapper_;
174 
175  std::vector<int> axes_;
176 
177  Tensor<CUDAContext> x_dims_device_;
178  Tensor<CUDAContext> y_dims_device_;
179  Tensor<CUDAContext> axes_device_;
180 };
181 
182 REGISTER_CUDNN_OPERATOR(Transpose, CuDNNTransposeOp);
183 
184 } // namespace caffe2
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:63
void Resize(Ts...dim_source)
Resizes a tensor.
Definition: tensor.h:304
Copyright (c) 2016-present, Facebook, Inc.
CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
cudnnHandle_t inline_cudnn_handle()
Returns the inline cudnn handle that executes on the current thread&#39;s cuda_stream.
cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type in a template function...
Definition: common_cudnn.h:127