// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// spatial_batch_norm_op_cudnn.cc
17 #include <cfloat>
18 
19 #include "caffe2/core/context_gpu.h"
20 #include "caffe2/core/cudnn_wrappers.h"
21 #include "caffe2/operators/spatial_batch_norm_op.h"
22 #include "caffe2/utils/math.h"
23 
// Note: Instead of directly failing, we will choose to not build this operator
// if cudnn version is not high enough.
// (Compile-time guard: the cudnnBatchNormalization* API used below first
// appeared in cuDNN v5.)
static_assert(CUDNN_VERSION >= 5000,
              "CudnnSpatialBN requires cudnn version 5.0 or above.");
28 
29 namespace caffe2 {
30 
31 class CudnnSpatialBNOp final : public SpatialBNOp<CUDAContext> {
32  public:
33  USE_OPERATOR_FUNCTIONS(CUDAContext);
34  CudnnSpatialBNOp(const OperatorDef& operator_def, Workspace* ws)
35  : SpatialBNOp<CUDAContext>(operator_def, ws), cudnn_wrapper_(&context_) {
36  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
37  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bn_param_desc_));
38  if (epsilon_ <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
39  LOG(ERROR) << "Provided epsilon is smaller than "
40  << "CUDNN_BN_MIN_EPSILON. Setting it to "
41  << "CUDNN_BN_MIN_EPSILON instead.";
42  }
43  epsilon_ = std::max(epsilon_, CUDNN_BN_MIN_EPSILON);
44 #if CUDNN_VERSION_MIN(7,0,0)
45  mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
46 #else
47  mode_ = CUDNN_BATCHNORM_SPATIAL;
48 #endif
49  }
50 
51  ~CudnnSpatialBNOp() {
52  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
53  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bn_param_desc_));
54  }
55 
56  template <typename T, typename M>
57  bool DoRunWithType();
58  bool RunOnDevice() override;
59 
60  protected:
61  CuDNNWrapper cudnn_wrapper_;
62  cudnnTensorDescriptor_t data_desc_;
63  cudnnTensorDescriptor_t bn_param_desc_;
64  vector<TIndex> cudnn_input_dims_;
65 
66  cudnnBatchNormMode_t mode_;
67 };
68 
69 class CudnnSpatialBNGradientOp final : public SpatialBNGradientOp<CUDAContext> {
70  public:
71  USE_OPERATOR_FUNCTIONS(CUDAContext);
72  CudnnSpatialBNGradientOp(const OperatorDef& operator_def, Workspace* ws)
73  : SpatialBNGradientOp<CUDAContext>(operator_def, ws),
74  cudnn_wrapper_(&context_) {
75  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
76  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&bn_param_desc_));
77  if (epsilon_ <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
78  LOG(ERROR) << "Provided epsilon is smaller than "
79  << "CUDNN_BN_MIN_EPSILON. Setting it to "
80  << "CUDNN_BN_MIN_EPSILON instead.";
81  }
82  epsilon_ = std::max(epsilon_, CUDNN_BN_MIN_EPSILON);
83 #if CUDNN_VERSION_MIN(7,0,0)
84  mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
85 #else
86  mode_ = CUDNN_BATCHNORM_SPATIAL;
87 #endif
88  }
89 
91  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
92  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(bn_param_desc_));
93  }
94 
95  template <typename T, typename M>
96  bool DoRunWithType();
97 
98  bool RunOnDevice() override;
99 
100  protected:
101  CuDNNWrapper cudnn_wrapper_;
102  cudnnTensorDescriptor_t data_desc_;
103  cudnnTensorDescriptor_t bn_param_desc_;
104  vector<TIndex> cudnn_input_dims_;
105 
106  cudnnBatchNormMode_t mode_;
107 };
108 
109 
111 // Implementations
113 
114 template <typename T, typename M>
115 bool CudnnSpatialBNOp::DoRunWithType() {
116 
117  // QoL
118  typedef typename cudnnTypeWrapper<T>::BNParamType BNParamType;
119 
120  const auto& X = Input(INPUT);
121  const auto& scale = Input(SCALE);
122  const auto& bias = Input(BIAS);
123 
124  CAFFE_ENFORCE_GE(X.ndim(), 3);
125  const int N = X.dim32(0);
126  const int C = X.ndim() > 3
127  ? (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(X.ndim() - 1))
128  : (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(2));
129  const int H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
130  const int W = X.ndim() > 3
131  ? (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2))
132  : 1;
133  const int D = X.ndim() > 4
134  ? (order_ == StorageOrder::NCHW ? X.dim32(4) : X.dim32(3))
135  : 1;
136  CAFFE_ENFORCE_EQ(scale.ndim(), 1);
137  CAFFE_ENFORCE_EQ(bias.ndim(), 1);
138  CAFFE_ENFORCE_EQ(scale.dim32(0), C);
139  CAFFE_ENFORCE_EQ(bias.dim32(0), C);
140  // See if we need to reshape.
141  if (X.dims() != cudnn_input_dims_) {
142  VLOG(1) << "Setting descriptors.";
143  cudnn_input_dims_ = X.dims();
144  if (order_ == StorageOrder::NCHW) {
145  vector<int> dims = {N, C, H, W, D};
146  vector<int> strides = {C * H * W * D, H * W * D, W * D, D, 1};
147  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
148  data_desc_,
150  X.ndim() > 3 ? X.ndim() : 4,
151  dims.data(),
152  strides.data()));
153  } else {
154  vector<int> dims = {N, C, H, W, D};
155  vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
156  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
157  data_desc_,
159  X.ndim() > 3 ? X.ndim() : 4,
160  dims.data(),
161  strides.data()));
162  }
163  CUDNN_ENFORCE(cudnnDeriveBNTensorDescriptor(
164  bn_param_desc_, data_desc_, mode_));
165  }
166 
167  // Now, depending on whether we are running test or not, we have two paths.
168  if (is_test_) {
169  // Run inference mode.
170  const auto& est_mean = Input(EST_MEAN);
171  const auto& est_var = Input(EST_VAR);
172  CAFFE_ENFORCE_EQ(est_mean.ndim(), 1);
173  CAFFE_ENFORCE_EQ(est_var.ndim(), 1);
174  CAFFE_ENFORCE_EQ(est_mean.dim32(0), C);
175  CAFFE_ENFORCE_EQ(est_var.dim32(0), C);
176 
177  auto* Y = Output(OUTPUT);
178  Y->ResizeLike(X);
179  CUDNN_ENFORCE(cudnnBatchNormalizationForwardInference(
180  cudnn_wrapper_.inline_cudnn_handle(),
181  // Note: PERSISTENT not implemented for inference
182  CUDNN_BATCHNORM_SPATIAL,
185  data_desc_,
186  X.template data<T>(),
187  data_desc_,
188  Y->template mutable_data<T>(),
189  bn_param_desc_,
190  scale.template data<BNParamType>(),
191  bias.template data<BNParamType>(),
192  est_mean.template data<BNParamType>(),
193  est_var.template data<BNParamType>(),
194  epsilon_));
195  } else {
196  // Run training mode.
197  auto* Y = Output(OUTPUT);
198  Y->ResizeLike(X);
199  // obtain running mean and running inv var, and see if we need to
200  // initialize them.
201  auto* running_mean = Output(RUNNING_MEAN);
202  auto* running_var = Output(RUNNING_VAR);
203  double this_factor = 1. - momentum_;
204  BNParamType* running_mean_data = nullptr;
205  BNParamType* running_var_data = nullptr;
206  if (!running_mean->size()) {
207  // If the input mean and var are not initialized yet, this is the first
208  // run and we will initialize the storage.
209  VLOG(1) << "Initializing running mean and var.";
210  // Need to do initialization
211  running_mean->Resize(C);
212  running_var->Resize(C);
213  running_mean_data = running_mean->template mutable_data<BNParamType>();
214  running_var_data = running_var->template mutable_data<BNParamType>();
215  // In principle, setting this_momentum to 1 will wipe existing data.
216  // This has a caveat that if cudnn does not deal with 0*NaN cases we
217  // will be having an issue. Thus we choose a safe path by explicitly
218  // setting zero.
219  math::Set<BNParamType, CUDAContext>(C, 0, running_mean_data, &context_);
220  math::Set<BNParamType, CUDAContext>(C, 0, running_var_data, &context_);
221  } else {
222  // Does not need to do initialization.
223  CAFFE_ENFORCE_EQ(running_mean->ndim(), 1);
224  CAFFE_ENFORCE_EQ(running_var->ndim(), 1);
225  CAFFE_ENFORCE_EQ(running_mean->dim32(0), C);
226  CAFFE_ENFORCE_EQ(running_var->dim32(0), C);
227  running_mean_data = running_mean->template mutable_data<BNParamType>();
228  running_var_data = running_var->template mutable_data<BNParamType>();
229  }
230  // Save the mean and inv var results.
231  auto* save_mean = Output(SAVED_MEAN);
232  auto* save_var = Output(SAVED_INV_VAR);
233  save_mean->Resize(C);
234  save_var->Resize(C);
235  void* save_mean_data = save_mean->template mutable_data<BNParamType>();
236  void* save_var_data = save_var->template mutable_data<BNParamType>();
237 
238  CUDNN_ENFORCE(cudnnBatchNormalizationForwardTraining(
239  cudnn_wrapper_.inline_cudnn_handle(),
240  mode_,
243  data_desc_,
244  X.template data<T>(),
245  data_desc_,
246  Y->template mutable_data<T>(),
247  bn_param_desc_,
248  scale.template data<BNParamType>(),
249  bias.template data<BNParamType>(),
250  this_factor,
251  running_mean_data,
252  running_var_data,
253  epsilon_,
254  save_mean_data,
255  save_var_data));
256  }
257  return true;
258 }
259 
260 bool CudnnSpatialBNOp::RunOnDevice() {
261  if (Input(0).IsType<float>()) {
262  return DoRunWithType<float,float>();
263  } else if (Input(0).IsType<float16>()) {
264  return DoRunWithType<float16,float>();
265  } else {
266  LOG(FATAL) << "Unsupported input types";
267  }
268  return true;
269 }
270 
271 template <typename T, typename M>
272 bool CudnnSpatialBNGradientOp::DoRunWithType() {
273  // QoL
274  typedef typename cudnnTypeWrapper<T>::BNParamType BNParamType;
275 
276  const auto& X = Input(INPUT);
277  const auto& scale = Input(SCALE);
278  const auto& dY = Input(OUTPUT_GRAD);
279 
280  CAFFE_ENFORCE_GE(X.ndim(), 3);
281  const int N = X.dim32(0);
282  const int C = X.ndim() > 3
283  ? (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(X.ndim() - 1))
284  : (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(2));
285  const int H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
286  const int W = X.ndim() > 3
287  ? (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2))
288  : 1;
289  const int D = X.ndim() > 4
290  ? (order_ == StorageOrder::NCHW ? X.dim32(4) : X.dim32(3))
291  : 1;
292  CAFFE_ENFORCE_EQ(scale.ndim(), 1);
293  CAFFE_ENFORCE_EQ(scale.dim32(0), C);
294  // See if we need to reshape.
295  if (X.dims() != cudnn_input_dims_) {
296  if (order_ == StorageOrder::NCHW) {
297  vector<int> dims = {N, C, H, W, D};
298  vector<int> strides = {C * H * W * D, H * W * D, W * D, D, 1};
299  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
300  data_desc_,
302  X.ndim() > 3 ? X.ndim() : 4,
303  dims.data(),
304  strides.data()));
305  } else {
306  vector<int> dims = {N, C, H, W, D};
307  vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};
308  CUDNN_ENFORCE(cudnnSetTensorNdDescriptor(
309  data_desc_,
311  X.ndim() > 3 ? X.ndim() : 4,
312  dims.data(),
313  strides.data()));
314  }
315  CUDNN_ENFORCE(cudnnDeriveBNTensorDescriptor(
316  bn_param_desc_, data_desc_, mode_));
317  }
318 
319  auto* dX = Output(INPUT_GRAD);
320  auto* dScale = Output(SCALE_GRAD);
321  auto* dBias = Output(BIAS_GRAD);
322  dX->ResizeLike(X);
323  dScale->ResizeLike(scale);
324  dBias->ResizeLike(scale);
325 
326  const auto& saved_mean = Input(SAVED_MEAN);
327  const auto& saved_var = Input(SAVED_INV_VAR);
328  const void* saved_mean_data = saved_mean.template data<BNParamType>();
329  const void* saved_var_data = saved_var.template data<BNParamType>();
330 
331  CUDNN_ENFORCE(cudnnBatchNormalizationBackward(
332  cudnn_wrapper_.inline_cudnn_handle(),
333  mode_,
338  data_desc_,
339  X.template data<T>(),
340  data_desc_,
341  dY.template data<T>(),
342  data_desc_,
343  dX->template mutable_data<T>(),
344  bn_param_desc_,
345  scale.template data<BNParamType>(),
346  dScale->template mutable_data<BNParamType>(),
347  dBias->template mutable_data<BNParamType>(),
348  epsilon_,
349  saved_mean_data,
350  saved_var_data));
351  return true;
352 }
353 
354 bool CudnnSpatialBNGradientOp::RunOnDevice() {
355  if (Input(0).IsType<float>()) {
356  return DoRunWithType<float,float>();
357  } else if (Input(0).IsType<float16>()) {
358  return DoRunWithType<float16,float>();
359  } else {
360  LOG(FATAL) << "Unsupported input types";
361  }
362  return true;
363 }
364 
// Since there is no default implementation for spatial batch normalization,
// we will register the cudnn version as the default as well.
REGISTER_CUDA_OPERATOR(SpatialBN, CudnnSpatialBNOp);
REGISTER_CUDA_OPERATOR(SpatialBNGradient, CudnnSpatialBNGradientOp);

// Also register under the explicit CUDNN engine so callers can request it.
REGISTER_CUDNN_OPERATOR(SpatialBN, CudnnSpatialBNOp);
REGISTER_CUDNN_OPERATOR(SpatialBNGradient, CudnnSpatialBNGradientOp);
372 } // namespace caffe2
// Workspace is a class that holds all the related objects created during
// runtime: (1) all blobs... (see workspace.h)
// Copyright (c) 2016-present, Facebook, Inc.
// CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
// cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn
// type in a template function (see common_cudnn.h)