Caffe2 - C++ API
A deep learning, cross-platform ML framework
dropout_op_cudnn.cc
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"

namespace caffe2 {

// cudnnRestoreDropoutDescriptor is needed for correctness and
// doesn't exist prior to cuDNN v7
#if CUDNN_VERSION_MIN(7,0,0)

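// Implementation note: cuDNN dropout keeps two distinct buffers. The
// "states" buffer holds the RNG state; it is sized once via
// cudnnDropoutGetStatesSize, stored in a named workspace scratch blob, and
// shared with the gradient op below. The per-call "reserve space" holds the
// dropout mask: cudnnDropoutForward writes it and cudnnDropoutBackward reads
// it back, which is why the forward op exposes it as output(1).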
class CuDNNDropoutOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);

  explicit CuDNNDropoutOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
        is_test_(OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
        states_initialized_(false),
        random_seed_(operator_def.device_option().random_seed()) {
    CAFFE_ENFORCE_GE(ratio_, 0);
    CAFFE_ENFORCE_LT(ratio_, 1);
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));

    CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_));
    CUDNN_ENFORCE(cudnnDropoutGetStatesSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        reinterpret_cast<size_t*>(&states_size_in_bytes_)));

    if (!is_test_) {
      // The RNG states live in a workspace blob keyed by the mask output
      // name, so the gradient op can find and reuse them.
      scratch_blob_ =
          ws->CreateBlob(scratch_blob_name(operator_def.output(1)));
      CAFFE_ENFORCE(scratch_blob_);
    }
  }

  ~CuDNNDropoutOp() noexcept override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_));
  }

  template <typename T, typename M>
  bool DoRunWithType();

  bool RunOnDevice() override;

  static string scratch_blob_name(string mask_blob_name) {
    return "cudnn_dropout_scratch_" + mask_blob_name;
  }

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnDropoutDescriptor_t dropout_desc_;

  vector<int64_t> cudnn_input_dims_;

  float ratio_;
  bool is_test_;

  Blob* scratch_blob_ = nullptr;

  size_t states_size_in_bytes_, reserve_space_size_in_bytes_;
  // Input: X, Output: Y, mask_and_states

  // track whether states have been initialized - only needs to happen once
  bool states_initialized_;

  // random seed
  unsigned long long random_seed_;
};

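// The gradient op does not seed its own RNG: it reattaches the forward op's
// saved states with cudnnRestoreDropoutDescriptor, which (unlike
// cudnnSetDropoutDescriptor) does not re-initialize the states buffer. This
// is the call that requires cuDNN v7 or later.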
class CuDNNDropoutGradientOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);
  explicit CuDNNDropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
        is_test_(OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
        states_initialized_(false),
        random_seed_(operator_def.device_option().random_seed()) {
    CAFFE_ENFORCE_GE(ratio_, 0);
    CAFFE_ENFORCE_LT(ratio_, 1);
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));

    CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_));
    CUDNN_ENFORCE(cudnnDropoutGetStatesSize(
        cudnn_wrapper_.inline_cudnn_handle(),
        reinterpret_cast<size_t*>(&states_size_in_bytes_)));

    // Share scratch with the forward op
    scratch_blob_ =
        ws->GetBlob(CuDNNDropoutOp::scratch_blob_name(operator_def.input(1)));
    CAFFE_ENFORCE(scratch_blob_);
  }

  ~CuDNNDropoutGradientOp() noexcept override {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_));
  }

  template <typename T, typename M>
  bool DoRunWithType();

  bool RunOnDevice() override;

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnDropoutDescriptor_t dropout_desc_;

  vector<int64_t> cudnn_input_dims_;

  Blob* scratch_blob_;

  float ratio_;
  bool is_test_;

  size_t states_size_in_bytes_, reserve_space_size_in_bytes_;
  // Input: dY, mask_and_states, Output: dX

  // only need to initialize states once (size is static)
  bool states_initialized_;

  unsigned long long random_seed_;
};

template <typename T, typename M>
bool CuDNNDropoutOp::DoRunWithType() {
  const auto& X = Input(0);
  auto* Y = Output(0);

  auto size_prod = 1;
  for (auto dim : X.sizes()) {
    size_prod *= dim;
  }
  // now actually run the computation
  if (is_test_) {
    // In test mode dropout is the identity: cuDNN applies inverted dropout,
    // scaling kept activations by 1/(1 - ratio) at training time.
    if (Y != &X) {
      context_.CopySameDevice<T>(
          X.numel(), X.template data<T>(), Y->template mutable_data<T>());
    }
    return true;
  } else {
    // Reshape tensor descriptors if necessary
    if (X.sizes() != cudnn_input_dims_) {
      CAFFE_ENFORCE(scratch_blob_);
      Tensor* states = BlobGetMutableTensor(scratch_blob_, CUDA);
      cudnn_input_dims_ = X.sizes().vec();
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
          GetCudnnTensorFormat(StorageOrder::NCHW),
          cudnnTypeWrapper<T>::type,
          size_prod,
          1,
          1,
          1));

      // get the reserve space we need
      CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize(
          data_desc_, &reserve_space_size_in_bytes_));

      states->Resize(states_size_in_bytes_);

      if (!states_initialized_) {
        // set the dropout descriptor (note: need to allocate the states data
        // before acquiring the mutex)
        uint8_t* states_data = states->template mutable_data<uint8_t>();
        {
          // Need to protect as clashes with NCCL
          std::lock_guard<std::mutex> lk(CUDAContext::mutex());
          CUDNN_ENFORCE(cudnnSetDropoutDescriptor(
              dropout_desc_,
              cudnn_wrapper_.inline_cudnn_handle(),
              ratio_,
              states_data,
              states_size_in_bytes_,
              random_seed_));
        }
        states_initialized_ = true;
      }
    }
    auto* mask = Output(
        1,
        {static_cast<int64_t>(reserve_space_size_in_bytes_)},
        at::dtype<uint8_t>());
    CUDNN_ENFORCE(cudnnDropoutForward(
        cudnn_wrapper_.inline_cudnn_handle(),
        dropout_desc_,
        data_desc_,
        X.template data<T>(),
        data_desc_,
        Y->template mutable_data<T>(),
        mask->template mutable_data<uint8_t>(),
        reserve_space_size_in_bytes_));
  }
  return true;
}

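// Dispatch note: T is the tensor data type (float or at::Half). The second
// template parameter M follows Caffe2's cuDNN convention of naming the math
// (accumulation) type; both instantiations below use float, and M is not
// otherwise referenced in this file.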
bool CuDNNDropoutOp::RunOnDevice() {
  // dispatch based on contents of tensor(s)
  const auto& X = Input(0);
  auto* Y = Output(0);
  Y->ResizeLike(X);

  if (X.IsType<float>()) {
    return DoRunWithType<float, float>();
  } else if (X.IsType<at::Half>()) {
    return DoRunWithType<at::Half, float>();
  }
  return false;
}

template <typename T, typename M>
bool CuDNNDropoutGradientOp::DoRunWithType() {
  const auto& dY = Input(0);
  const auto& mask = Input(1);
  const Tensor& states = scratch_blob_->Get<Tensor>();
  auto* dX = Output(0);

  auto size_prod = 1;
  for (auto dim : dY.sizes()) {
    size_prod *= dim;
  }

  if (!states_initialized_) {
    // set the dropout descriptor
    {
      // Need to protect as clashes with NCCL
      std::lock_guard<std::mutex> lk(CUDAContext::mutex());
      CUDNN_ENFORCE(cudnnRestoreDropoutDescriptor(
          dropout_desc_,
          cudnn_wrapper_.inline_cudnn_handle(),
          ratio_,
          const_cast<uint8_t*>(states.data<uint8_t>()),
          states_size_in_bytes_,
          random_seed_));
    }
    states_initialized_ = true;
  }

  if (dY.sizes() != cudnn_input_dims_) {
    cudnn_input_dims_ = dY.sizes().vec();
    CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
        data_desc_,
        GetCudnnTensorFormat(StorageOrder::NCHW),
        cudnnTypeWrapper<T>::type,
        size_prod,
        1,
        1,
        1));

    // get the reserve space we need
    CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize(
        data_desc_, &reserve_space_size_in_bytes_));
  }

  // run the computation
  void* mask_data = const_cast<void*>(mask.raw_data());
  CUDNN_ENFORCE(cudnnDropoutBackward(
      cudnn_wrapper_.inline_cudnn_handle(),
      dropout_desc_,
      data_desc_,
      dY.data<T>(),
      data_desc_,
      dX->template mutable_data<T>(),
      mask_data,
      reserve_space_size_in_bytes_));
  return true;
}

bool CuDNNDropoutGradientOp::RunOnDevice() {
  // dispatch based on contents of tensor(s)
  const auto& dY = Input(0);
  auto* dX = Output(0);

  dX->ResizeLike(dY);

  if (dY.IsType<float>()) {
    return DoRunWithType<float, float>();
  } else if (dY.IsType<at::Half>()) {
    return DoRunWithType<at::Half, float>();
  }
  return false;
}

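// REGISTER_CUDNN_OPERATOR binds these implementations to the "CUDNN" engine
// of the CUDA operator registry, so a Dropout OperatorDef that runs on a
// CUDA device with engine "CUDNN" resolves to the classes above.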
namespace {
REGISTER_CUDNN_OPERATOR(Dropout, CuDNNDropoutOp);
REGISTER_CUDNN_OPERATOR(DropoutGrad, CuDNNDropoutGradientOp);
} // namespace

#endif // CUDNN_VERSION_MIN(7,0,0)

} // namespace caffe2
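
For orientation, here is a minimal sketch (not part of the file above) of one way these operators are typically invoked: build an OperatorDef targeting a CUDA device with engine "CUDNN" and run it through a Workspace. This is illustrative only; it assumes the stock proto helpers CreateOperatorDef and MakeArgument from caffe2/utils/proto_utils.h, and the blob names "data", "out", "mask" and the function RunCudnnDropoutOnce are placeholders.

#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"

// Hypothetical driver: feeds a CUDA blob named "data" through the cuDNN
// Dropout op in training mode. Output 1 receives the cuDNN reserve space
// (the mask), which DropoutGrad consumes as input 1.
void RunCudnnDropoutOnce(caffe2::Workspace* ws) {
  caffe2::OperatorDef def = caffe2::CreateOperatorDef(
      "Dropout",
      "dropout_example",
      std::vector<std::string>{"data"},
      std::vector<std::string>{"out", "mask"},
      std::vector<caffe2::Argument>{
          caffe2::MakeArgument<float>("ratio", 0.5f),
          caffe2::MakeArgument<int>("is_test", 0)});
  // PROTO_CUDA is the proto-level CUDA device type in merged PyTorch/Caffe2.
  def.mutable_device_option()->set_device_type(caffe2::PROTO_CUDA);
  def.set_engine("CUDNN"); // select the CUDNN-registered implementation
  CAFFE_ENFORCE(ws->RunOperatorOnce(def));
}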