Caffe2 - C++ API
A deep learning, cross-platform ML framework
prelu_op.cc

// Copyright (c) 2016-present, Facebook, Inc.
#include "caffe2/operators/prelu_op.h"
#include "caffe2/utils/math.h"

#include "caffe2/core/types.h"
#include "caffe2/utils/cpu_neon.h"

namespace caffe2 {

#ifdef __ARM_NEON__
namespace {

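// Branch-free NEON implementation of pointwise PReLU with a single shared
// slope w: out[i] = in[i] > 0 ? in[i] : in[i] * w. Small inputs take a scalar
// path; otherwise a scalar prologue walks up to a 16-byte boundary so the
// main loop can use aligned loads, and a scalar epilogue handles the tail.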
void runNeonPrelu(float* out, const float* in, int size, float w) {
  float32x4_t vZero = vdupq_n_f32(0.0f);
  float32x4_t vW = vdupq_n_f32(w);

  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);

  if (size < kVecSizeInFloat) {
    for (int i = 0; i < size; ++i) {
      float v = in[i];
      out[i] = v > 0 ? v : v * w;
    }

    return;
  }

  // We want to load aligned from the input, but assume the output is unaligned
  int prologue =
      kVecSizeInFloat -
      // remainder in floats
      (((uintptr_t)in) % (sizeof(float32x4_t))) / sizeof(float);
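  // Example (hypothetical address): if in == 0x1004, the remainder is one
  // float, so prologue == 3 and the scalar loop below stops at the 16-byte
  // boundary 0x1010. If in is already aligned, prologue == kVecSizeInFloat
  // and one full vector's worth of elements is handled in scalar code.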

  int i = 0;

  // Prologue loop
  for (; i < prologue; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }

  // The loop is manually unrolled by 6; seems to be the limit for
  // armv7 to avoid register spills
  constexpr int kUnroll = 6;
  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
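  // With four floats per 128-bit vector, each iteration of the unrolled loop
  // processes 6 * 4 = 24 floats (96 bytes).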

  int remainder = size - prologue;
  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;

  for (; i < vectorizable; i += kFloatsPerLoop) {
    float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
    float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
    float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
    float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
    float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
    float32x4_t v5 = vld1q_f32_aligned(in + i + 20);

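    // vcgtq_f32 yields an all-ones lane where v > 0 and zero otherwise; the
    // masks feed the branch-free selects below.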
    uint32x4_t gz0 = vcgtq_f32(v0, vZero);
    uint32x4_t gz1 = vcgtq_f32(v1, vZero);
    uint32x4_t gz2 = vcgtq_f32(v2, vZero);
    uint32x4_t gz3 = vcgtq_f32(v3, vZero);
    uint32x4_t gz4 = vcgtq_f32(v4, vZero);
    uint32x4_t gz5 = vcgtq_f32(v5, vZero);

    float32x4_t v0neg = vmulq_f32(v0, vW);
    float32x4_t v1neg = vmulq_f32(v1, vW);
    float32x4_t v2neg = vmulq_f32(v2, vW);
    float32x4_t v3neg = vmulq_f32(v3, vW);
    float32x4_t v4neg = vmulq_f32(v4, vW);
    float32x4_t v5neg = vmulq_f32(v5, vW);

    // v0 > 0 ? v0 : v0 * w
    v0 = vbslq_f32(gz0, v0, v0neg);
    v1 = vbslq_f32(gz1, v1, v1neg);
    v2 = vbslq_f32(gz2, v2, v2neg);
    v3 = vbslq_f32(gz3, v3, v3neg);
    v4 = vbslq_f32(gz4, v4, v4neg);
    v5 = vbslq_f32(gz5, v5, v5neg);

    vst1q_f32(out + i + 0, v0);
    vst1q_f32(out + i + 4, v1);
    vst1q_f32(out + i + 8, v2);
    vst1q_f32(out + i + 12, v3);
    vst1q_f32(out + i + 16, v4);
    vst1q_f32(out + i + 20, v5);
  }

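  // Scalar epilogue for the tail that did not fill a full unrolled iteration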
  for (; i < size; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }
}

} // namespace
#endif // __ARM_NEON__

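// Forward pass: Y = max(X, 0) + min(X, 0) * W applied elementwise; the slope
// W is either a single shared scalar or has one entry per channel (dim 1 for
// NCHW, the last dim for NHWC).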
template <>
bool PReluOp<float, CPUContext>::RunOnDevice() {
  const auto& X = Input(0);
  const auto& W = Input(1);
  auto* Y = Output(0);
  Y->ResizeLike(X);
  const auto* Xdata = X.template data<float>();
  const auto* Wdata = W.template data<float>();
  auto* Ydata = Y->template mutable_data<float>();

  const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
  const auto C_shared = (W.size() == 1);

  if (!C_shared) {
    CAFFE_ENFORCE_EQ(C, W.size());
  }

  if (C_shared) {
#ifdef __ARM_NEON__
    // The function is completely pointwise
    runNeonPrelu(Ydata, Xdata, X.size(), Wdata[0]);
#else
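    // Eigen fallback: max(x, 0) + min(x, 0) * w equals PReLU elementwise,
    // since exactly one of the two terms is nonzero for any x.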
    ConstEigenVectorMap<float> Xvec(Xdata, X.size());
    EigenVectorMap<float> Yvec(Ydata, Y->size());
    Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
#endif // __ARM_NEON__
    return true;
  }

  // Non-shared case: one slope per channel.
  switch (order_) {
    case StorageOrder::NCHW: {
      const auto N = X.dim(0);
      const auto dim = X.size_from_dim(2);

#ifdef __ARM_NEON__
      // Pointwise for each channel
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          runNeonPrelu(
              Ydata + (n * C + c) * dim,
              Xdata + (n * C + c) * dim,
              dim,
              Wdata[c]);
        }
      }
#else
      int nc = 0;
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
          EigenVectorMap<float>(Ydata + nc * dim, dim) =
              Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
          nc++;
        }
      }
#endif
      break;
    }
    case StorageOrder::NHWC: {
      // View the data as a C x NHW column-major matrix: each column holds the
      // C channel values of one spatial position, scaled per-channel by Wvec.
      const auto NHW = X.size() / C;
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
      EigenArrayMap<float> Ymat(Ydata, C, NHW);
      Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }
  return true;
}

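// Backward pass: dX = dY where X > 0, and W * dY elsewhere; dW accumulates
// dY * X over positions where X <= 0 (per channel, or a single sum when the
// slope is shared).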
template <>
bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
  auto& Y = Input(0);
  auto& dY = Input(1);
  auto& X = Input(2);
  auto& W = Input(3);

  CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");
  auto* dX = Output(0);
  auto* dW = Output(1);

  DCHECK_EQ(dY.size(), Y.size());
  dX->ResizeLike(Y);
  dW->ResizeLike(W);

  const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
  const auto C_shared = (W.size() == 1);

  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  const float* Xdata = X.data<float>();
  const float* Wdata = W.data<float>();
  float* dXdata = dX->mutable_data<float>();
  float* dWdata = dW->mutable_data<float>();

  switch (order_) {
    case StorageOrder::NCHW: {
      const auto dim = X.size_from_dim(2);
      // With a shared slope, every element maps to channel 0 via div_factor.
      const auto div_factor = C_shared ? C : 1;
      for (auto c = 0; c < W.size(); ++c) {
        dWdata[c] = 0;
      }

      for (int i = 0; i < Y.size(); ++i) {
        if (Xdata[i] <= 0) {
          int c = (i / dim) % C / div_factor;
          dWdata[c] += dYdata[i] * Xdata[i];
        }
      }

      for (int i = 0; i < Y.size(); ++i) {
        if (Xdata[i] > 0) {
          dXdata[i] = dYdata[i];
        } else {
          int c = (i / dim) % C / div_factor;
          dXdata[i] = Wdata[c] * dYdata[i];
        }
      }
      break;
    }
    case StorageOrder::NHWC: {
      const auto NHW = X.size() / C;
      ConstEigenVectorArrayMap<float> Wvec(Wdata, W.size());
      EigenVectorArrayMap<float> dWvec(dWdata, dW->size());

      ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
      ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      EigenArrayMap<float> dXmat(dXdata, C, NHW);

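      // Note: where Xmat > 0, Xmat.cwiseMin(0.0f) is exactly zero, so the
      // select feeds zeros into the sum on the positive branch and dW only
      // accumulates dY * X over non-positive inputs.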
      if (C_shared) {
        dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
        dWdata[0] =
            (Xmat > 0)
                .select(
                    Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path
                    dYmat * Xmat)
                .sum();
      } else {
        dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
        dWvec = (Xmat > 0)
                    .select(
                        Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path
                        dYmat * Xmat)
                    .rowwise()
                    .sum();
      }
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }

  return true;
}

REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(PReluGradient, PReluGradientOp<float, CPUContext>);

// Input: X, Slope; output: Y
OPERATOR_SCHEMA(PRelu)
    .NumInputs(2)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShapeOfInput(0)
    .SetDoc(R"DOC(

PRelu takes an input data tensor (Tensor<T>) and a slope tensor as input, and
produces one output tensor (Tensor<T>) where the function
`f(x) = slope * x for x < 0`, `f(x) = x for x >= 0` is applied to the data
tensor elementwise.

)DOC")
    .Input(0, "X", "Input tensor")
    .Input(
        1,
        "Slope",
        "1D slope tensor. If `Slope` is of size 1, the value is shared "
        "across different channels")
    .Output(0, "Y", "Output tensor of the same shape as X");
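
// Example: with a shared slope of 0.25, an input of {-4, -1, 0, 2} yields
// {-1, -0.25, 0, 2}.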

// Input: Y, dY, X, W; output: dX, dW
OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC(

PReluGradient takes Y, dY, X, and the slope W, and uses these to compute dX
and dW according to the chain rule and the derivative of the parametric
rectified linear function.

)DOC");

class GetPReluGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        def_.type() + "Gradient",
        "",
        vector<string>{O(0), GO(0), I(0), I(1)},
        vector<string>{GI(0), GI(1)});
  }
};
REGISTER_GRADIENT(PRelu, GetPReluGradient);

} // namespace caffe2