// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// prelu_op.cc
1 #include "caffe2/operators/prelu_op.h"
2 #include "caffe2/utils/eigen_utils.h"
3 #include "caffe2/utils/math.h"
4 
5 #include "caffe2/core/types.h"
6 #include "caffe2/utils/cpu_neon.h"
7 
8 namespace caffe2 {
9 
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
namespace {

// Pointwise PReLU with a single shared slope `w`:
//   out[i] = in[i] > 0 ? in[i] : in[i] * w
// NEON-vectorized. Loads are aligned to the input pointer after a scalar
// prologue; stores assume the output may be unaligned. `in`/`out` may alias
// exactly (in-place) since each element is read before it is written.
void runNeonPrelu(float* out, const float* in, int size, float w) {
  float32x4_t vZero = vdupq_n_f32(0.0f);
  float32x4_t vW = vdupq_n_f32(w);

  // Number of floats per 128-bit NEON vector (4).
  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);

  // Too small for even one vector: purely scalar fallback.
  if (size < kVecSizeInFloat) {
    for (int i = 0; i < size; ++i) {
      float v = in[i];
      out[i] = v > 0 ? v : v * w;
    }

    return;
  }

  // We want to load aligned from the input, but assume the output is unaligned
  // prologue = floats to process scalar until `in + prologue` is 16-byte
  // aligned. Note: if `in` is already aligned this still does a full
  // kVecSizeInFloat scalar iterations (harmless, slightly suboptimal).
  int prologue =
      kVecSizeInFloat -
      // remainder in floats
      (((uintptr_t) in) % (sizeof(float32x4_t))) / sizeof(float);

  int i = 0;

  // Prologue loop
  for (; i < prologue; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }

  // The loop is manually unrolled by 6; seems to be the limit for
  // armv7 to avoid register spills
  constexpr int kUnroll = 6;
  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;

  int remainder = size - prologue;
  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;

  // Main vector loop: 6 x 4 floats per iteration.
  for (; i < vectorizable; i += kFloatsPerLoop) {
    float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
    float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
    float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
    float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
    float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
    float32x4_t v5 = vld1q_f32_aligned(in + i + 20);

    // Per-lane mask: all-ones where v > 0.
    uint32x4_t gz0 = vcgtq_f32(v0, vZero);
    uint32x4_t gz1 = vcgtq_f32(v1, vZero);
    uint32x4_t gz2 = vcgtq_f32(v2, vZero);
    uint32x4_t gz3 = vcgtq_f32(v3, vZero);
    uint32x4_t gz4 = vcgtq_f32(v4, vZero);
    uint32x4_t gz5 = vcgtq_f32(v5, vZero);

    // Negative-branch value, computed unconditionally and selected below.
    float32x4_t v0neg = vmulq_f32(v0, vW);
    float32x4_t v1neg = vmulq_f32(v1, vW);
    float32x4_t v2neg = vmulq_f32(v2, vW);
    float32x4_t v3neg = vmulq_f32(v3, vW);
    float32x4_t v4neg = vmulq_f32(v4, vW);
    float32x4_t v5neg = vmulq_f32(v5, vW);

    // v0 > 0 ? v0 : v0 * w
    v0 = vbslq_f32(gz0, v0, v0neg);
    v1 = vbslq_f32(gz1, v1, v1neg);
    v2 = vbslq_f32(gz2, v2, v2neg);
    v3 = vbslq_f32(gz3, v3, v3neg);
    v4 = vbslq_f32(gz4, v4, v4neg);
    v5 = vbslq_f32(gz5, v5, v5neg);

    // Unaligned stores to the output.
    vst1q_f32(out + i + 0, v0);
    vst1q_f32(out + i + 4, v1);
    vst1q_f32(out + i + 8, v2);
    vst1q_f32(out + i + 12, v3);
    vst1q_f32(out + i + 16, v4);
    vst1q_f32(out + i + 20, v5);
  }

  // Epilogue: scalar tail for the remaining (< kFloatsPerLoop) elements.
  for (; i < size; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }
}

} // namespace
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
96 
97 template <>
98 bool PReluOp<float, CPUContext>::RunOnDevice() {
99  const auto& X = Input(0);
100  const auto& W = Input(1);
101 
102  auto* Y = Output(0, X.sizes(), at::dtype<float>());
103  const auto* Xdata = X.template data<float>();
104  const auto* Wdata = W.template data<float>();
105  auto* Ydata = Y->template mutable_data<float>();
106 
107  const auto C = order_ == StorageOrder::NCHW ? X.size(1) : X.size(X.dim() - 1);
108  const auto C_shared = (W.numel() == 1);
109 
110  if (!C_shared) {
111  CAFFE_ENFORCE_EQ(C, W.numel());
112  }
113 
114  if (C_shared) {
115 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
116  // The function is completely pointwise
117  runNeonPrelu(Ydata, Xdata, X.size(), Wdata[0]);
118 #else
119  ConstEigenVectorMap<float> Xvec(Xdata, X.numel());
120  EigenVectorMap<float> Yvec(Ydata, Y->numel());
121  Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
122 #endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
123  return true;
124  }
125 
126  // non-shared case.
127  switch (order_) {
128  case StorageOrder::NCHW: {
129  const auto N = X.size(0);
130  const auto dim = X.size_from_dim(2);
131 
132 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
133  // Pointwise for each channel
134  for (int n = 0; n < N; ++n) {
135  for (int c = 0; c < C; ++c) {
136  runNeonPrelu(Ydata + (n * C + c) * dim,
137  Xdata + (n * C + c) * dim,
138  dim, Wdata[c]);
139  }
140  }
141 #else
142  int nc = 0;
143  for (int n = 0; n < N; ++n) {
144  for (int c = 0; c < C; ++c) {
145  ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
146  EigenVectorMap<float>(Ydata + nc * dim, dim) =
147  Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
148  nc++;
149  }
150  }
151 #endif
152  break;
153  }
154  case StorageOrder::NHWC: {
155  // Lay out matrix as (NHW, C) and multiply by C
156  const auto NHW = X.numel() / C;
157  ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
158  ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
159  EigenArrayMap<float> Ymat(Ydata, C, NHW);
160  Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
161  break;
162  }
163  default:
164  CAFFE_THROW("Unknown storage order: ", order_);
165  }
166  return true;
167 }
168 
169 template <>
170 bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
171  auto& Y = Input(0);
172  auto& dY = Input(1);
173  auto& X = Input(2);
174  auto& W = Input(3);
175 
176  CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");
177 
178  DCHECK_EQ(dY.numel(), Y.numel());
179  auto* dX = Output(0, Y.sizes(), at::dtype<float>());
180  auto* dW = Output(1, W.sizes(), at::dtype<float>());
181 
182  const auto C = order_ == StorageOrder::NCHW ? X.size(1) : X.size(X.dim() - 1);
183  const auto C_shared = (W.numel() == 1);
184 
185  const float* Ydata = Y.data<float>();
186  const float* dYdata = dY.data<float>();
187  const float* Xdata = X.data<float>();
188  const float* Wdata = W.data<float>();
189  float* dXdata = dX->template mutable_data<float>();
190  float* dWdata = dW->template mutable_data<float>();
191 
192  // non-shared case.
193  switch (order_) {
194  case StorageOrder::NCHW: {
195  const auto dim = X.size_from_dim(2);
196  const auto div_factor = C_shared ? C : 1;
197  for (auto c = 0; c < W.numel(); ++c) {
198  dWdata[c] = 0;
199  }
200 
201  for (int i = 0; i < Y.numel(); ++i) {
202  if (Xdata[i] <= 0) {
203  int c = (i / dim) % C / div_factor;
204  dWdata[c] += dYdata[i] * Xdata[i];
205  }
206  }
207 
208  for (int i = 0; i < Y.numel(); ++i) {
209  if (Xdata[i] > 0) {
210  dXdata[i] = dYdata[i];
211  } else {
212  int c = (i / dim) % C / div_factor;
213  dXdata[i] = Wdata[c] * dYdata[i];
214  }
215  }
216  break;
217  }
218  case StorageOrder::NHWC: {
219  const auto NHW = X.numel() / C;
220  ConstEigenVectorArrayMap<float> Wvec(Wdata, W.numel());
221  EigenVectorArrayMap<float> dWvec(dWdata, dW->numel());
222 
223  ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
224  ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
225  ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
226  EigenArrayMap<float> dXmat(dXdata, C, NHW);
227 
228  if (C_shared) {
229  dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
230  dWdata[0] =
231  (Xmat > 0)
232  .select(
233  Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
234  dYmat * Xmat)
235  .sum();
236  } else {
237  dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
238  dWvec = (Xmat > 0)
239  .select(
240  Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path.
241  dYmat * Xmat)
242  .rowwise()
243  .sum();
244  }
245  break;
246  }
247  default:
248  CAFFE_THROW("Unknown storage order: ", order_);
249  }
250 
251  return true;
252 }
253 
// Register the CPU implementations of the forward and gradient operators.
REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    PReluGradient,
    PReluGradientOp<float, CPUContext>);
258 
259 // Input: X, Slope, output: Y
260 OPERATOR_SCHEMA(PRelu)
261  .NumInputs(2)
262  .NumOutputs(1)
263  .AllowInplace({{0, 0}})
264  .IdenticalTypeAndShapeOfInput(0)
265  .SetDoc(R"DOC(
266 
267 The *PRelu* op takes input data tensor $X$, an input slope tensor $slope$, and produces one output tensor $Y$ of the same shape as $X.$ The op performs the element wise *PRelu* operation, defined as
268 
269 $$y=prelu(x) =\begin{cases}slope * x & x < 0\\x & otherwise\end{cases}$$
270 
271 Note, is slope is size 1, the value is shared across the channels, otherwise $X$ and $slope$ must be the same shape. See [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852) for more information.
272 
273 Github Links:
274 
275 - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/prelu_op.h
276 - https://github.com/pytorch/pytorch/blob/master/caffe2/operators/prelu_op.cc
277 
278 
279 <details>
280 
281 <summary> <b>Example</b> </summary>
282 
283 **Code**
284 
285 ```
286 
287 workspace.ResetWorkspace()
288 
289 op = core.CreateOperator(
290  "PRelu",
291  ["X","Slope"],
292  ["Y"],
293 )
294 
295 workspace.FeedBlob("X", np.random.randn(3, 3).astype(np.float32))
296 print("X:\n", workspace.FetchBlob("X"), "\n")
297 
298 workspace.FeedBlob("Slope", np.array([0.1]).astype(np.float32))
299 print("Slope:\n", workspace.FetchBlob("Slope"), "\n")
300 
301 workspace.RunOperatorOnce(op)
302 print("Y:\n", workspace.FetchBlob("Y"))
303 
304 ```
305 
306 **Result**
307 
308 ```
309 
310 X:
311  [[ 0.3957382 -0.19725518 -0.26991343]
312  [ 1.5513182 -0.27427664 -0.14584002]
313  [-0.4121164 0.9292345 0.96426094]]
314 
315 Slope:
316  [0.1]
317 
318 Y:
319  [[ 0.3957382 -0.01972552 -0.02699134]
320  [ 1.5513182 -0.02742766 -0.014584 ]
321  [-0.04121164 0.9292345 0.96426094]]
322 
323 ```
324 
325 </details>
326 
327 
328 )DOC")
329  .Input(0, "X", "Input tensor of data to be operated on.")
330  .Input(
331  1,
332  "Slope",
333  "1D input slope tensor. If `Slope` is of size 1, the value is shared across different channels")
334  .Output(0, "Y", "Output tensor, with same shape as $X$.")
335  .InheritOnnxSchema();
336 
337 // Input: Y, dY, output: dX
338 GRADIENT_OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC(
339 
340 PReluGradient takes both Y and dY and uses this to update dX and dW according
341 to the chain rule and derivatives of the rectified linear function.
342 
343 )DOC");
344 
// Gradient maker: emits a single PReluGradient op wired as
// inputs (Y, dY, X, Slope) -> outputs (dX, dSlope).
class GetPReluGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        def_.type() + "Gradient",
        "",
        // O(0) = Y, GO(0) = dY, I(0) = X, I(1) = Slope
        vector<string>{O(0), GO(0), I(0), I(1)},
        // GI(0) = dX, GI(1) = dSlope
        vector<string>{GI(0), GI(1)});
  }
};
REGISTER_GRADIENT(PRelu, GetPReluGradient);
356 
357 } // namespace caffe2
// (doxygen cross-reference residue, preserved as a comment)
// A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
// Definition: blob.h:13
// Definition: static.cpp:64