Caffe2 - C++ API
A deep learning, cross-platform ML framework
dropout_op_cudnn.cc
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cudnn_wrappers.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"

namespace caffe2 {

// cudnnRestoreDropoutDescriptor is needed for correctness (the gradient op
// must reuse the forward op's RNG states rather than reseed them) and does
// not exist prior to cuDNN v7.
#if CUDNN_VERSION_MIN(7,0,0)
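
The guard matters because the gradient op must attach the forward op's already-initialized RNG states without reseeding them: cudnnSetDropoutDescriptor (re)initializes the states buffer it is handed, while cudnnRestoreDropoutDescriptor only points the descriptor at an existing buffer. A minimal sketch of the difference, assuming a live cudnnHandle_t named handle and a device allocation states of at least states_bytes bytes (both hypothetical, not part of this file):

// Sketch only: 'handle' and 'states' are assumed to exist already.
cudnnDropoutDescriptor_t desc;
CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&desc));
size_t states_bytes = 0;
CUDNN_ENFORCE(cudnnDropoutGetStatesSize(handle, &states_bytes));
// First use: seeds and initializes the states buffer (launches a kernel).
CUDNN_ENFORCE(cudnnSetDropoutDescriptor(
    desc, handle, 0.5f, states, states_bytes, /*seed=*/1ULL));
// Later, e.g. in a gradient op: reattach the same buffer without reseeding,
// so the RNG stream continues where the forward pass left it.
CUDNN_ENFORCE(cudnnRestoreDropoutDescriptor(
    desc, handle, 0.5f, states, states_bytes, /*seed=*/1ULL));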
27 
28 class CuDNNDropoutOp final : public Operator<CUDAContext> {
29  public:
30  USE_OPERATOR_FUNCTIONS(CUDAContext);
31 
32  CuDNNDropoutOp(const OperatorDef& operator_def, Workspace* ws)
33  : Operator<CUDAContext>(operator_def, ws),
34  cudnn_wrapper_(&context_),
35  ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
36  is_test_(
37  OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
38  states_initialized_(false),
39  random_seed_(operator_def.device_option().random_seed()) {
40  CAFFE_ENFORCE_GE(ratio_, 0);
41  CAFFE_ENFORCE_LT(ratio_, 1);
42  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
43 
44  CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_));
45  CUDNN_ENFORCE(cudnnDropoutGetStatesSize(
46  cudnn_wrapper_.inline_cudnn_handle(),
47  reinterpret_cast<size_t*>(&states_size_in_bytes_)));
48 
49  if (!is_test_) {
50  scratch_blob_ = ws->CreateBlob(scratch_blob_name(operator_def.output(1)));
51  CAFFE_ENFORCE(scratch_blob_);
52  }
53  }
54 
  ~CuDNNDropoutOp() noexcept {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_));
  }

  template <typename T, typename M>
  bool DoRunWithType();

  bool RunOnDevice() override;

  static string scratch_blob_name(const string& mask_blob_name) {
    return "cudnn_dropout_scratch_" + mask_blob_name;
  }

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnDropoutDescriptor_t dropout_desc_;

  vector<TIndex> cudnn_input_dims_;

  float ratio_;
  bool is_test_;

  Blob* scratch_blob_ = nullptr;

  size_t states_size_in_bytes_, reserve_space_size_in_bytes_;

  // Input: X; Outputs: Y, mask_and_states.

  // Track whether the RNG states have been initialized - this only needs
  // to happen once.
  bool states_initialized_;

  // Random seed, taken from the operator's DeviceOption.
  unsigned long long random_seed_;
};
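
Two scratch buffers are easy to conflate here: the states buffer (sized once via cudnnDropoutGetStatesSize, holding the RNG state, kept in a shared workspace blob) and the reserve space (sized per shape via cudnnDropoutGetReserveSpaceSize, holding the sampled mask, exposed as Output(1)). The scratch_blob_name convention is the rendezvous point between the forward and gradient ops; a minimal sketch, using a hypothetical mask blob named "mask":

Workspace ws;
// The forward op creates the states blob...
Blob* fwd = ws.CreateBlob(CuDNNDropoutOp::scratch_blob_name("mask"));
// ...and the gradient op later looks up the very same blob.
Blob* bwd = ws.GetBlob(CuDNNDropoutOp::scratch_blob_name("mask"));
CAFFE_ENFORCE(fwd == bwd); // both resolve to "cudnn_dropout_scratch_mask"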

class CuDNNDropoutGradientOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);

  CuDNNDropoutGradientOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws),
        cudnn_wrapper_(&context_),
        ratio_(OperatorBase::GetSingleArgument<float>("ratio", 0.5)),
        is_test_(
            OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
        states_initialized_(false),
        random_seed_(operator_def.device_option().random_seed()) {
    CAFFE_ENFORCE_GE(ratio_, 0);
    CAFFE_ENFORCE_LT(ratio_, 1);
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));

    CUDNN_ENFORCE(cudnnCreateDropoutDescriptor(&dropout_desc_));
    CUDNN_ENFORCE(cudnnDropoutGetStatesSize(
        cudnn_wrapper_.inline_cudnn_handle(), &states_size_in_bytes_));

    // Share the RNG states scratch blob with the forward op; Input(1) is
    // the mask blob that the forward op produced as Output(1).
    scratch_blob_ =
        ws->GetBlob(CuDNNDropoutOp::scratch_blob_name(operator_def.input(1)));
    CAFFE_ENFORCE(scratch_blob_);
  }

  ~CuDNNDropoutGradientOp() noexcept {
    CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
    CUDNN_ENFORCE(cudnnDestroyDropoutDescriptor(dropout_desc_));
  }

  template <typename T, typename M>
  bool DoRunWithType();

  bool RunOnDevice() override;

 protected:
  CuDNNWrapper cudnn_wrapper_;
  cudnnTensorDescriptor_t data_desc_;
  cudnnDropoutDescriptor_t dropout_desc_;

  vector<TIndex> cudnn_input_dims_;

  Blob* scratch_blob_;

  float ratio_;
  bool is_test_;

  size_t states_size_in_bytes_, reserve_space_size_in_bytes_;

  // Inputs: dY, mask_and_states; Output: dX.

  // Only need to initialize the states once (their size is static).
  bool states_initialized_;

  unsigned long long random_seed_;
};

template <typename T, typename M>
bool CuDNNDropoutOp::DoRunWithType() {
  const auto& X = Input(0);
  auto* Y = Output(0);

  // Dropout is elementwise, so the input is flattened to one dimension for
  // the cuDNN tensor descriptor.
  auto size_prod = 1;
  for (auto dim : X.dims()) {
    size_prod *= dim;
  }

  // Now actually run the computation.
  if (is_test_) {
    // In test mode dropout is the identity: just copy X into Y.
    if (Y != &X) {
      context_.Copy<T, CUDAContext, CUDAContext>(
          X.size(), X.template data<T>(), Y->template mutable_data<T>());
    }
    return true;
  } else {
    auto* mask = Output(1);
    // Reshape tensor descriptors if necessary.
    if (X.dims() != cudnn_input_dims_) {
      CAFFE_ENFORCE(scratch_blob_);
      Tensor<CUDAContext>* states =
          scratch_blob_->GetMutable<Tensor<CUDAContext>>();
      cudnn_input_dims_ = X.dims();
      CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
          data_desc_,
          GetCudnnTensorFormat(StorageOrder::NCHW),
          cudnnTypeWrapper<T>::type,
          size_prod,
          1,
          1,
          1));

      // Ask cuDNN how much reserve space (per-call mask storage) this
      // shape requires.
      CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize(
          data_desc_, &reserve_space_size_in_bytes_));

      // Both sizes are in bytes; the tensors hold raw uint8_t storage.
      mask->Resize(reserve_space_size_in_bytes_);
      states->Resize(states_size_in_bytes_);

      if (!states_initialized_) {
        // Set the dropout descriptor; note that the states data must be
        // allocated before acquiring the mutex.
        uint8_t* states_data = states->mutable_data<uint8_t>();
        {
          // Need to protect the handle, as this clashes with NCCL.
          std::lock_guard<std::mutex> lk(CUDAContext::mutex());
          CUDNN_ENFORCE(cudnnSetDropoutDescriptor(
              dropout_desc_,
              cudnn_wrapper_.inline_cudnn_handle(),
              ratio_,
              states_data,
              states_size_in_bytes_,
              random_seed_));
        }
        states_initialized_ = true;
      }
    }
    CUDNN_ENFORCE(cudnnDropoutForward(
        cudnn_wrapper_.inline_cudnn_handle(),
        dropout_desc_,
        data_desc_,
        X.template data<T>(),
        data_desc_,
        Y->template mutable_data<T>(),
        mask->mutable_data<uint8_t>(),
        reserve_space_size_in_bytes_));
  }
  return true;
}
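
Note how the shape handling sidesteps cuDNN's 4D descriptor requirement: dropout is elementwise, so the whole input is described as an N x 1 x 1 x 1 tensor with N = size_prod (and since size_prod deduces to int, inputs beyond INT_MAX elements would overflow). The same trick in isolation, with an assumed [2, 3, 4, 5] input:

cudnnTensorDescriptor_t desc;
CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&desc));
// Layout is irrelevant for an elementwise op, so a [2, 3, 4, 5] tensor is
// presented to cuDNN as [120, 1, 1, 1].
CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
    desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 2 * 3 * 4 * 5, 1, 1, 1));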

bool CuDNNDropoutOp::RunOnDevice() {
  // Dispatch based on the contents of the input tensor.
  const auto& X = Input(0);
  auto* Y = Output(0);
  Y->ResizeLike(X);

  if (X.IsType<float>()) {
    return DoRunWithType<float, float>();
  } else if (X.IsType<float16>()) {
    return DoRunWithType<float16, float>();
  }
  return false;
}

template <typename T, typename M>
bool CuDNNDropoutGradientOp::DoRunWithType() {
  const auto& dY = Input(0);
  const auto& mask = Input(1);
  const Tensor<CUDAContext>& states =
      scratch_blob_->Get<Tensor<CUDAContext>>();
  auto* dX = Output(0);

  auto size_prod = 1;
  for (auto dim : dY.dims()) {
    size_prod *= dim;
  }

  if (!states_initialized_) {
    // Restore the dropout descriptor from the states that the forward op
    // wrote; unlike cudnnSetDropoutDescriptor, this does not reinitialize
    // the RNG states, so the backward pass sees the same mask stream.
    {
      // Need to protect the handle, as this clashes with NCCL.
      std::lock_guard<std::mutex> lk(CUDAContext::mutex());
      CUDNN_ENFORCE(cudnnRestoreDropoutDescriptor(
          dropout_desc_,
          cudnn_wrapper_.inline_cudnn_handle(),
          ratio_,
          const_cast<uint8_t*>(states.data<uint8_t>()),
          states_size_in_bytes_,
          random_seed_));
    }
    states_initialized_ = true;
  }

  if (dY.dims() != cudnn_input_dims_) {
    cudnn_input_dims_ = dY.dims();
    CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
        data_desc_,
        GetCudnnTensorFormat(StorageOrder::NCHW),
        cudnnTypeWrapper<T>::type,
        size_prod,
        1,
        1,
        1));

    // Get the reserve space size we need for this shape.
    CUDNN_ENFORCE(cudnnDropoutGetReserveSpaceSize(
        data_desc_, &reserve_space_size_in_bytes_));
  }

  // Run the computation; the mask is the reserve space that
  // cudnnDropoutForward wrote and must be passed back unmodified.
  void* mask_data = const_cast<void*>(mask.raw_data());
  CUDNN_ENFORCE(cudnnDropoutBackward(
      cudnn_wrapper_.inline_cudnn_handle(),
      dropout_desc_,
      data_desc_,
      dY.data<T>(),
      data_desc_,
      dX->template mutable_data<T>(),
      mask_data,
      reserve_space_size_in_bytes_));
  return true;
}
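
The contract behind the mask plumbing: cudnnDropoutBackward must receive the exact reserve-space buffer that cudnnDropoutForward wrote, unmodified, which is why the forward op surfaces it as a regular output and the gradient op feeds it straight back in. Compressed into one hypothetical call sequence (handles, descriptors, and buffers assumed to be set up already):

// Forward samples a mask and records it in 'reserve'...
CUDNN_ENFORCE(cudnnDropoutForward(
    handle, dropout_desc, x_desc, x, y_desc, y, reserve, reserve_bytes));
// ...and backward replays that exact buffer to scale dY into dX.
CUDNN_ENFORCE(cudnnDropoutBackward(
    handle, dropout_desc, dy_desc, dy, dx_desc, dx, reserve, reserve_bytes));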

bool CuDNNDropoutGradientOp::RunOnDevice() {
  // Dispatch based on the contents of the input tensor.
  const auto& dY = Input(0);
  auto* dX = Output(0);
  dX->ResizeLike(dY);

  if (dY.IsType<float>()) {
    return DoRunWithType<float, float>();
  } else if (dY.IsType<float16>()) {
    return DoRunWithType<float16, float>();
  }
  return false;
}

namespace {
REGISTER_CUDNN_OPERATOR(Dropout, CuDNNDropoutOp);
REGISTER_CUDNN_OPERATOR(DropoutGrad, CuDNNDropoutGradientOp);
} // namespace
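
REGISTER_CUDNN_OPERATOR publishes these classes under the "CUDNN" engine, so a model opts in by setting the engine on an ordinary Dropout operator rather than by using a new op type. A sketch of such an OperatorDef via the generated protobuf API (blob names are placeholders):

OperatorDef def;
def.set_type("Dropout");
def.set_engine("CUDNN");
def.add_input("X");
def.add_output("Y");
def.add_output("mask"); // Output(1), required when is_test == 0
auto* ratio = def.add_arg();
ratio->set_name("ratio");
ratio->set_f(0.5f);
auto* is_test = def.add_arg();
is_test->set_name("is_test");
is_test->set_i(0);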

#endif // CUDNN_VERSION_MIN(7,0,0)

} // namespace caffe2
Copyright (c) 2016-present, Facebook, Inc.