// Caffe2 - C++ API: a deep learning, cross-platform ML framework.
// File: relu_op_cudnn.cc — cuDNN-backed ReLU forward and gradient operators.
17 #include "caffe2/core/context_gpu.h"
18 #include "caffe2/core/cudnn_wrappers.h"
19 #include "caffe2/core/operator.h"
20 #include "caffe2/core/types.h"
21 
22 namespace caffe2 {
23 
24 class CuDNNReluOp final : public Operator<CUDAContext> {
25  public:
26  CuDNNReluOp(const OperatorDef& operator_def, Workspace* ws)
27  : Operator<CUDAContext>(operator_def, ws),
28  cudnn_wrapper_(&context_),
29  order_(StringToStorageOrder(
30  OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
31  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
32  CUDNN_ENFORCE(cudnnCreateActivationDescriptor(&activ_desc_));
33  CUDNN_ENFORCE(cudnnSetActivationDescriptor(
34  activ_desc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
35  }
36 
37  ~CuDNNReluOp() {
38  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
39  CUDNN_ENFORCE(cudnnDestroyActivationDescriptor(activ_desc_));
40  }
41 
42  template <typename T>
43  bool DoRunWithType() {
44  const auto& X = Input(0);
45  auto* Y = Output(0);
46 
47  // Return if X is empty
48  if (X.size() == 0) {
49  Y->mutable_data<T>();
50  return true;
51  }
52 
53  // See if we need to reshape.
54  if (X.dims() != cudnn_input_dims_) {
55  VLOG(1) << "Setting descriptors.";
56  cudnn_input_dims_ = X.dims();
57  int C = 1, H = 1, W = 1;
58  if (X.ndim() == 4) {
59  // Normal 4-dimensional tensors for images.
60  C = (order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(3));
61  H = (order_ == StorageOrder::NCHW ? X.dim32(2) : X.dim32(1));
62  W = (order_ == StorageOrder::NCHW ? X.dim32(3) : X.dim32(2));
63  } else {
64  // If X is not 4-dimensional, we will simply use H = 1 and W = 1
65  // and wrap everything into C.
66  C = X.size() / X.dim32(0);
67  }
68  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
69  data_desc_,
70  GetCudnnTensorFormat(order_),
72  X.dim32(0),
73  C,
74  H,
75  W));
76  }
77  CUDNN_ENFORCE(cudnnActivationForward(
78  cudnn_wrapper_.inline_cudnn_handle(),
79  activ_desc_,
81  data_desc_,
82  X.template data<T>(),
84  data_desc_,
85  Y->template mutable_data<T>()));
86  return true;
87  }
88 
89  bool RunOnDevice() override {
90  // dispatch based on contents of tensor(s)
91  const auto& X = Input(0);
92  auto* Y = Output(0);
93  Y->ResizeLike(X);
94 
95  if (X.IsType<float>()) {
96  return DoRunWithType<float>();
97  } else if (X.IsType<float16>()) {
98  return DoRunWithType<float16>();
99  } else {
100  LOG(FATAL) << "Unsupported input types";
101  }
102  return true;
103  }
104 
105  protected:
106  CuDNNWrapper cudnn_wrapper_;
107  cudnnTensorDescriptor_t data_desc_;
108  cudnnActivationDescriptor_t activ_desc_;
109  vector<TIndex> cudnn_input_dims_;
110  StorageOrder order_;
111 };
112 
113 
114 // Note: You can see that in CuDNNReluGradientOp, we abused the cudnn interface
115 // by passing in the output tensor for both bottom and top. This is dependent on
116 // the assumption that the Relu gradient actually does not rely on the bottom
117 // data, or it treats input=0 the same way as input<0. This is of course not
118 // very safe, but we have been running in this way in Caffe for a while so it
119 // *might* be safe to assume so.
120 class CuDNNReluGradientOp final : public Operator<CUDAContext> {
121  public:
122  CuDNNReluGradientOp(const OperatorDef& operator_def, Workspace* ws)
123  : Operator<CUDAContext>(operator_def, ws),
124  cudnn_wrapper_(&context_),
125  order_(StringToStorageOrder(
126  OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
127  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&data_desc_));
128  CUDNN_ENFORCE(cudnnCreateActivationDescriptor(&activ_desc_));
129  CUDNN_ENFORCE(cudnnSetActivationDescriptor(
130  activ_desc_, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0));
131  }
132 
134  CUDNN_ENFORCE(cudnnDestroyTensorDescriptor(data_desc_));
135  CUDNN_ENFORCE(cudnnDestroyActivationDescriptor(activ_desc_));
136  }
137 
138  template <typename T>
139  bool DoRunWithType() {
140  const auto& Y = Input(0);
141  const auto& dY = Input(1);
142  auto* dX = Output(0);
143 
144  // Return if Y is empty
145  if (Y.size() == 0) {
146  dX->mutable_data<T>();
147  return true;
148  }
149 
150  // See if we need to reshape.
151  if (Y.dims() != cudnn_input_dims_) {
152  VLOG(1) << "Setting descriptors.";
153  cudnn_input_dims_ = Y.dims();
154  int C = 1, H = 1, W = 1;
155  if (Y.ndim() == 4) {
156  // Normal 4-dimensional tensors for images.
157  C = (order_ == StorageOrder::NCHW ? Y.dim32(1) : Y.dim32(3));
158  H = (order_ == StorageOrder::NCHW ? Y.dim32(2) : Y.dim32(1));
159  W = (order_ == StorageOrder::NCHW ? Y.dim32(3) : Y.dim32(2));
160  } else {
161  // If Y is not 4-dimensional, we will simply use H = 1 and W = 1
162  // and wrap everything into C.
163  C = Y.size() / Y.dim32(0);
164  }
165  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
166  data_desc_,
167  GetCudnnTensorFormat(order_),
169  Y.dim32(0),
170  C,
171  H,
172  W));
173  }
174  CUDNN_ENFORCE(cudnnActivationBackward(
175  cudnn_wrapper_.inline_cudnn_handle(),
176  activ_desc_,
178  data_desc_,
179  Y.template data<T>(),
180  data_desc_,
181  dY.template data<T>(),
182  data_desc_,
183  // Note: strictly speaking, we should be using the input data in this
184  // case, but for the ReLU case we rely on the underlying implementation
185  // that only the output is needed to calculate the Relu gradient. This
186  // will enable us to do memory optimization for in-place relu. To
187  // ensure this is correct, a unit test is provided at
188  // caffe2/python/operator_test/relu_op_test.py
189  Y.template data<T>(),
191  data_desc_,
192  dX->template mutable_data<T>()));
193  return true;
194  }
195 
196  bool RunOnDevice() override {
197  const auto& Y = Input(0);
198  auto* dX = Output(0);
199  dX->ResizeLike(Y);
200 
201  if (Y.IsType<float>()) {
202  return DoRunWithType<float>();
203  } else if (Y.IsType<float16>()) {
204  return DoRunWithType<float16>();
205  } else {
206  LOG(FATAL) << "Unsupported input types";
207  }
208  return true;
209  }
210 
211  protected:
212  CuDNNWrapper cudnn_wrapper_;
213  cudnnTensorDescriptor_t data_desc_;
214  cudnnActivationDescriptor_t activ_desc_;
215  vector<TIndex> cudnn_input_dims_;
216  StorageOrder order_;
217  // Input: Y, dY; Output: dX
218 };
219 
220 namespace {
221 REGISTER_CUDNN_OPERATOR(Relu, CuDNNReluOp);
222 REGISTER_CUDNN_OPERATOR(ReluGradient, CuDNNReluGradientOp);
223 } // namespace
224 } // namespace caffe2
// Reference notes (from the Doxygen symbol index):
// - cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order):
//   a wrapper function that converts the Caffe storage order to the cuDNN
//   storage-order enum values. Defined in common_cudnn.h:199.
// - Workspace: a class that holds all the related objects created during
//   runtime, e.g. all blobs. Defined in workspace.h:63.
// - Copyright (c) 2016-present, Facebook, Inc.
// - CuDNNWrapper: a class that wraps the cudnn handles and cudnn workspaces.
// - cudnnHandle_t inline_cudnn_handle(): returns the inline cudnn handle that
//   executes on the current thread's cuda_stream.
// - cudnnTypeWrapper: a wrapper class that allows us to refer to the cudnn
//   type in a template function. Defined in common_cudnn.h:127.