Caffe2 - C++ API
A deep learning, cross-platform ML framework
Descriptors.h
#pragma once

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>

#include <ATen/cudnn/cudnn-wrapper.h>
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/ATenCUDAGeneral.h>
#include <cuda.h>

namespace at { namespace native {

// TODO: Add constructors for all of the descriptors

inline int dataSize(cudnnDataType_t dataType)
{
  switch (dataType) {
    case CUDNN_DATA_HALF: return 2;
    case CUDNN_DATA_FLOAT: return 4;
    default: return 8;
  }
}

// The stride for a size-1 dimension is not uniquely determined; in
// fact, it can be anything you want, because the fact that the
// tensor is size 1 at this dimension means that you will never actually
// try advancing your pointer by this stride.
//
// However, CuDNN has a much more stringent requirement on strides:
// if you are passing a contiguous input, it better be the case
// that the stride for dim i is the product of the sizes of dims
// i+1 to the end. This stride is indeed uniquely determined. This
// function modifies 'stride' in place so this invariant holds.
static inline void fixSizeOneDimStride(int dim, const int *size, int *stride) {
  int64_t z = 1;
  for (int d = dim - 1; d >= 0; d--) {
    if (size[d] == 1) {
      stride[d] = z;
    } else {
      z *= size[d];
    }
  }
}
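
// Worked example (illustrative, not part of the original source): a tensor
// viewed to sizes {2, 1, 3} may carry strides {3, 1, 1}, because the stride
// of the size-1 dimension is arbitrary. CuDNN instead demands
// stride[i] == size[i+1] * ... * size[dim-1] for contiguous inputs:
//
//   int size[3]   = {2, 1, 3};
//   int stride[3] = {3, 1, 1};
//   fixSizeOneDimStride(3, size, stride);
//   // stride is now {3, 3, 1}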

template <typename T, cudnnStatus_t (*dtor)(T*)>
struct DescriptorDeleter {
  void operator()(T* x) {
    if (x != nullptr) {
      AT_CUDNN_CHECK(dtor(x));
    }
  }
};

// A generic class for wrapping cuDNN descriptor types. All you need
// to do is give the underlying type that the Descriptor_t points to
// (usually, if it's cudnnTensorDescriptor_t it points to cudnnTensorStruct),
// the constructor, and the destructor. Subclasses are responsible
// for defining a set() function to actually set the descriptor.
//
// Descriptors default construct to a nullptr, and have a descriptor
// initialized the first time you call set() or any other initializing
// function.
template <typename T, cudnnStatus_t (*ctor)(T**), cudnnStatus_t (*dtor)(T*)>
class AT_CUDA_API Descriptor
{
public:
  // TODO: Figure out why const-correctness doesn't work here

  // Use desc() to access the underlying descriptor pointer in
  // a read-only fashion. Most client code should use this.
  // If the descriptor was never initialized, this will return
  // nullptr.
  T* desc() const { return desc_.get(); }
  T* desc() { return desc_.get(); }

  // Use mut_desc() to access the underlying descriptor pointer
  // if you intend to modify what it points to (e.g., using
  // cudnnSetFooDescriptor). This will ensure that the descriptor
  // is initialized. Code in this file will use this function.
  T* mut_desc() { init(); return desc_.get(); }
protected:
  void init() {
    if (desc_ == nullptr) {
      T* raw_desc;
      AT_CUDNN_CHECK(ctor(&raw_desc));
      desc_.reset(raw_desc);
    }
  }
private:
  std::unique_ptr<T, DescriptorDeleter<T, dtor>> desc_;
};
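
// Usage sketch (illustrative, not part of the original header): a concrete
// descriptor is declared by instantiating Descriptor with the cuDNN struct
// type and its create/destroy functions, then defining set(). The subclass
// below is hypothetical:
//
//   struct ExampleActivationDescriptor
//     : public Descriptor<cudnnActivationStruct,
//                         &cudnnCreateActivationDescriptor,
//                         &cudnnDestroyActivationDescriptor>
//   {
//     void set(cudnnActivationMode_t mode, double coef) {
//       // mut_desc() lazily allocates the underlying descriptor on first use
//       AT_CUDNN_CHECK(cudnnSetActivationDescriptor(
//           mut_desc(), mode, CUDNN_PROPAGATE_NAN, coef));
//     }
//   };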

class AT_CUDA_API TensorDescriptor
  : public Descriptor<cudnnTensorStruct,
                      &cudnnCreateTensorDescriptor,
                      &cudnnDestroyTensorDescriptor>
{
public:
  TensorDescriptor() {}
  explicit TensorDescriptor(const at::Tensor &t, size_t pad = 0) {
    set(t, pad);
  }

  // Note [CuDNN broadcast padding]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // pad specifies the minimum dimensionality of the tensor descriptor
  // we produce (it doesn't have anything to do with, e.g., convolution
  // padding). If 't' is lower-dimensional than 'pad', the remaining
  // dimensions (on the right) are padded with ones. This doesn't
  // affect the underlying data layout. This is particularly useful for
  // dealing with a peculiarity of the CuDNN API, in which broadcasting
  // is done in two steps: first, the client code is expected to pad out
  // input tensors (in the dimensions) to the same dimensionality as the
  // target broadcast, and then second, CuDNN takes care of actually
  // broadcasting size 1 dimensions.

  void set(const at::Tensor &t, size_t pad = 0);
  void set(cudnnDataType_t dataType, IntArrayRef sizes, IntArrayRef strides, size_t pad = 0);

  void print();

private:
  void set(cudnnDataType_t dataType, int dim, int* size, int* stride) {
    fixSizeOneDimStride(dim, size, stride);
    AT_CUDNN_CHECK(cudnnSetTensorNdDescriptor(mut_desc(), dataType, dim, size, stride));
  }
};

std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);
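
// Example (illustrative, not part of the original header): describe a 2-D
// tensor padded out to 4-D for broadcasting against a 4-D operand; see
// Note [CuDNN broadcast padding] above. The tensor here is hypothetical:
//
//   at::Tensor bias = at::empty({1, 3}, at::device(at::kCUDA).dtype(at::kFloat));
//   TensorDescriptor desc(bias, 4);             // described with sizes {1, 3, 1, 1}
//   cudnnTensorDescriptor_t raw = desc.desc();  // read-only handle for cuDNN calls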
132 
134  : public Descriptor<cudnnFilterStruct,
135  &cudnnCreateFilterDescriptor,
136  &cudnnDestroyFilterDescriptor>
137 {
138 public:
139  void set(const at::Tensor &t, int64_t pad = 0);
140 
141 private:
142  void set(cudnnDataType_t dataType, int dim, int* size) {
143  AT_CUDNN_CHECK(cudnnSetFilterNdDescriptor(mut_desc(), dataType, CUDNN_TENSOR_NCHW, dim, size));
144  }
145 };
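
// Example (illustrative, not part of the original header): describe a
// hypothetical NCHW convolution weight tensor:
//
//   at::Tensor weight = at::empty({64, 3, 7, 7}, at::device(at::kCUDA).dtype(at::kFloat));
//   FilterDescriptor filter;
//   filter.set(weight);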

struct AT_CUDA_API ConvolutionDescriptor
  : public Descriptor<cudnnConvolutionStruct,
                      &cudnnCreateConvolutionDescriptor,
                      &cudnnDestroyConvolutionDescriptor>
{
  void set(cudnnDataType_t dataType, int dim, int* pad, int* stride, int* upscale /* aka dilation */, int groups) {
    cudnnDataType_t mathType = dataType;
    if (dataType == CUDNN_DATA_HALF) mathType = CUDNN_DATA_FLOAT;
    AT_CUDNN_CHECK(cudnnSetConvolutionNdDescriptor(mut_desc(), dim, pad, stride, upscale,
                                                   CUDNN_CROSS_CORRELATION, mathType));
    AT_CUDNN_CHECK(cudnnSetConvolutionGroupCount(mut_desc(), groups));
    // See Note [behavior of cudnnFind and cudnnGet]
    AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH));
    if (dataType == CUDNN_DATA_HALF)
      AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH));
  }
};
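
// Example (illustrative, not part of the original header): a plain 2-D
// convolution with padding 1, stride 1, dilation 1, and a single group.
// For CUDNN_DATA_HALF, set() computes in float and enables Tensor Core math:
//
//   ConvolutionDescriptor conv;
//   int pad[2] = {1, 1}, stride[2] = {1, 1}, dilation[2] = {1, 1};
//   conv.set(CUDNN_DATA_HALF, 2, pad, stride, dilation, /*groups=*/1);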
165 
167  : public Descriptor<cudnnSpatialTransformerStruct,
168  &cudnnCreateSpatialTransformerDescriptor,
169  &cudnnDestroySpatialTransformerDescriptor>
170 {
171  void set(cudnnDataType_t dataType, int dim, int* size) {
172  AT_CUDNN_CHECK(cudnnSetSpatialTransformerNdDescriptor(mut_desc(), CUDNN_SAMPLER_BILINEAR, dataType, dim, size));
173  }
174 };

struct AT_CUDA_API DropoutDescriptor
  : public Descriptor<cudnnDropoutStruct,
                      &cudnnCreateDropoutDescriptor,
                      &cudnnDestroyDropoutDescriptor>
{
  at::Tensor state;

  // Initialize a dropout descriptor's RNG state.
  // WARNING: This function is very expensive, avoid calling this function!
  // NB: it takes a Type so that we can generate a Variable if necessary.
  void initialize_rng(cudnnHandle_t handle, float dropout, long long int seed, const TensorOptions& options) {
    AT_ASSERTM(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout");
    size_t state_size;
    AT_CUDNN_CHECK(cudnnDropoutGetStatesSize(handle, &state_size));
    AT_ASSERT(options.device().type() == kCUDA);
    AT_ASSERT(options.dtype() == kByte);
    state = at::empty({static_cast<int64_t>(state_size)}, options);
    AT_CUDNN_CHECK(cudnnSetDropoutDescriptor(mut_desc(), handle, dropout, state.data_ptr(), state_size, seed));
  }

  // Restore a dropout descriptor given a dropout probability and existing RNG state.
  // See Note [cuDNN dropout descriptor initialization]
  void set(cudnnHandle_t handle, float dropout, at::Tensor state_) {
    AT_ASSERTM(dropout > 0, "dropout must be nonzero; otherwise call set_no_dropout");
    state = state_;
    void *state_ptr = state.data_ptr();
    size_t state_size = state.size(0);
    // NB: The seed doesn't actually matter, so we give a dummy value
    AT_CUDNN_CHECK(cudnnRestoreDropoutDescriptor(mut_desc(), handle, dropout, state_ptr, state_size, 0 /* seed */));
  }

  // Restore a dropout descriptor corresponding to no dropout
  // See Note [cuDNN dropout descriptor initialization]
  void set_no_dropout(cudnnHandle_t handle) {
    // NB: seed doesn't matter when dropout = 0, because no random number
    // initialization actually takes place when there is no dropout.
    // NB: Empirically, cudnnSetDropoutDescriptor is cheap when
    // dropout == 0
    AT_CUDNN_CHECK(cudnnSetDropoutDescriptor(mut_desc(), handle, 0 /* dropout */, nullptr, 0 /* state_size */, 0 /* seed */));
  }
};
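
// Usage sketch (illustrative, not part of the original header): pay the
// expensive RNG initialization once, then rebuild descriptors cheaply from
// the cached state, or request no dropout at all. The handle is assumed to
// come from the caller:
//
//   cudnnHandle_t handle = /* cuDNN handle obtained elsewhere */;
//   auto opts = at::device(at::kCUDA).dtype(at::kByte);
//   DropoutDescriptor a;
//   a.initialize_rng(handle, /*dropout=*/0.5f, /*seed=*/42, opts);  // expensive
//   DropoutDescriptor b;
//   b.set(handle, 0.5f, a.state);  // cheap: reuses a's RNG state
//   DropoutDescriptor c;
//   c.set_no_dropout(handle);      // cheap: dropout == 0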

struct AT_CUDA_API RNNDescriptor
  : public Descriptor<cudnnRNNStruct,
                      &cudnnCreateRNNDescriptor,
                      &cudnnDestroyRNNDescriptor>
{
  DropoutDescriptor dropout_desc_;
  void set(cudnnHandle_t handle, int hidden_size, int num_layers, DropoutDescriptor&& dropout_desc,
           cudnnRNNInputMode_t input_mode, cudnnDirectionMode_t bidirectional,
           cudnnRNNMode_t mode, cudnnDataType_t datatype, cudnnDataType_t input_type, cudnnRNNAlgo_t algo) {
    dropout_desc_ = std::move(dropout_desc);
    AT_CUDNN_CHECK(cudnnSetRNNDescriptor_v6(
        handle,
        mut_desc(),
        hidden_size,
        num_layers,
        dropout_desc_.desc(),
        input_mode,
        bidirectional,
        mode,
        algo,
        datatype));
#if CUDA_VERSION >= 9000
    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
    if (prop->major >= 7) {
      if (input_type == CUDNN_DATA_HALF) {
        cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH);
      } else {
        // Technically, as the default it's not necessary to explicitly
        // set this.
        cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_DEFAULT_MATH);
      }
    }
#endif
  }
};
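
// Usage sketch (illustrative, not part of the original header): configure a
// single-layer unidirectional LSTM; CUDNN_DATA_HALF input enables Tensor Core
// math on sm_70+ devices per the logic above:
//
//   cudnnHandle_t handle = /* cuDNN handle obtained elsewhere */;
//   DropoutDescriptor dropout;
//   dropout.set_no_dropout(handle);
//   RNNDescriptor rnn;
//   rnn.set(handle, /*hidden_size=*/256, /*num_layers=*/1, std::move(dropout),
//           CUDNN_LINEAR_INPUT, CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
//           /*datatype=*/CUDNN_DATA_HALF, /*input_type=*/CUDNN_DATA_HALF,
//           CUDNN_RNN_ALGO_STANDARD);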

struct AT_CUDA_API CTCLossDescriptor
  : public Descriptor<cudnnCTCLossStruct,
                      &cudnnCreateCTCLossDescriptor,
                      &cudnnDestroyCTCLossDescriptor>
{
  void set(cudnnDataType_t datatype) {
    AT_CUDNN_CHECK(cudnnSetCTCLossDescriptor(mut_desc(), datatype));
  }
};

union Constant
{
  float f;
  double d;
  Constant(cudnnDataType_t dataType, double value) {
    if (dataType == CUDNN_DATA_HALF || dataType == CUDNN_DATA_FLOAT) {
      f = static_cast<float>(value);
    } else {
      d = value;
    }
  }
};
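
// Example (illustrative, not part of the original header): cuDNN scaling
// factors (alpha/beta) are passed as const void* and must point to a float
// for HALF/FLOAT computation or a double for DOUBLE; Constant stores the
// right member either way:
//
//   Constant one(CUDNN_DATA_HALF, 1.0);     // stores 1.0f in 'f'
//   Constant zero(CUDNN_DATA_DOUBLE, 0.0);  // stores 0.0 in 'd'
//   // ... pass &one / &zero as the alpha/beta arguments of a cuDNN call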

}} // namespace at::native