Caffe2 - C++ API
A deep learning, cross-platform ML framework
prelu_op.cc

// Copyright (c) 2016-present, Facebook, Inc.
#include "caffe2/operators/prelu_op.h"
#include "caffe2/utils/math.h"

#include "caffe2/core/types.h"
#include "caffe2/utils/cpu_neon.h"

namespace caffe2 {

#ifdef __ARM_NEON__
namespace {

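// Branch-free NEON implementation of pointwise PReLU with a single shared
// slope w: out[i] = in[i] > 0 ? in[i] : in[i] * w. Small inputs take a scalar
// path; otherwise a scalar prologue walks up to a 16-byte boundary so the
// main loop can use aligned loads, and a scalar epilogue handles the tail.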
void runNeonPrelu(float* out, const float* in, int size, float w) {
  float32x4_t vZero = vdupq_n_f32(0.0f);
  float32x4_t vW = vdupq_n_f32(w);

  constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);

  if (size < kVecSizeInFloat) {
    for (int i = 0; i < size; ++i) {
      float v = in[i];
      out[i] = v > 0 ? v : v * w;
    }

    return;
  }

  // We want to load aligned from the input, but assume the output is unaligned
  int prologue =
      kVecSizeInFloat -
      // remainder in floats
      (((uintptr_t)in) % (sizeof(float32x4_t))) / sizeof(float);
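  // Example (hypothetical address): if in == 0x1004, the remainder is one
  // float, so prologue == 3 and the scalar loop below stops at the 16-byte
  // boundary 0x1010. If in is already aligned, prologue == kVecSizeInFloat
  // and one full vector's worth of elements is handled in scalar code.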

  int i = 0;

  // Prologue loop
  for (; i < prologue; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }

  // The loop is manually unrolled by 6; seems to be the limit for
  // armv7 to avoid register spills
  constexpr int kUnroll = 6;
  constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
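  // With four floats per 128-bit vector, each iteration of the unrolled loop
  // processes 6 * 4 = 24 floats (96 bytes).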

  int remainder = size - prologue;
  int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;

  for (; i < vectorizable; i += kFloatsPerLoop) {
    float32x4_t v0 = vld1q_f32_aligned(in + i + 0);
    float32x4_t v1 = vld1q_f32_aligned(in + i + 4);
    float32x4_t v2 = vld1q_f32_aligned(in + i + 8);
    float32x4_t v3 = vld1q_f32_aligned(in + i + 12);
    float32x4_t v4 = vld1q_f32_aligned(in + i + 16);
    float32x4_t v5 = vld1q_f32_aligned(in + i + 20);

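    // vcgtq_f32 yields an all-ones lane where v > 0 and zero otherwise; the
    // masks feed the branch-free selects below.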
    uint32x4_t gz0 = vcgtq_f32(v0, vZero);
    uint32x4_t gz1 = vcgtq_f32(v1, vZero);
    uint32x4_t gz2 = vcgtq_f32(v2, vZero);
    uint32x4_t gz3 = vcgtq_f32(v3, vZero);
    uint32x4_t gz4 = vcgtq_f32(v4, vZero);
    uint32x4_t gz5 = vcgtq_f32(v5, vZero);

    float32x4_t v0neg = vmulq_f32(v0, vW);
    float32x4_t v1neg = vmulq_f32(v1, vW);
    float32x4_t v2neg = vmulq_f32(v2, vW);
    float32x4_t v3neg = vmulq_f32(v3, vW);
    float32x4_t v4neg = vmulq_f32(v4, vW);
    float32x4_t v5neg = vmulq_f32(v5, vW);

    // v0 > 0 ? v0 : v0 * w
    v0 = vbslq_f32(gz0, v0, v0neg);
    v1 = vbslq_f32(gz1, v1, v1neg);
    v2 = vbslq_f32(gz2, v2, v2neg);
    v3 = vbslq_f32(gz3, v3, v3neg);
    v4 = vbslq_f32(gz4, v4, v4neg);
    v5 = vbslq_f32(gz5, v5, v5neg);

    vst1q_f32(out + i + 0, v0);
    vst1q_f32(out + i + 4, v1);
    vst1q_f32(out + i + 8, v2);
    vst1q_f32(out + i + 12, v3);
    vst1q_f32(out + i + 16, v4);
    vst1q_f32(out + i + 20, v5);
  }

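  // Scalar epilogue for the tail that did not fill a full unrolled iteration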
  for (; i < size; ++i) {
    float v = in[i];
    out[i] = v > 0 ? v : v * w;
  }
}

} // namespace
#endif // __ARM_NEON__

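// Forward pass: Y = max(X, 0) + min(X, 0) * W applied elementwise; the slope
// W is either a single shared scalar or has one entry per channel (dim 1 for
// NCHW, the last dim for NHWC).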
template <>
bool PReluOp<float, CPUContext>::RunOnDevice() {
  const auto& X = Input(0);
  const auto& W = Input(1);
  auto* Y = Output(0);
  Y->ResizeLike(X);
  const auto* Xdata = X.template data<float>();
  const auto* Wdata = W.template data<float>();
  auto* Ydata = Y->template mutable_data<float>();

  const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
  const auto C_shared = (W.size() == 1);

  if (!C_shared) {
    CAFFE_ENFORCE_EQ(C, W.size());
  }

  if (C_shared) {
#ifdef __ARM_NEON__
    // The function is completely pointwise
    runNeonPrelu(Ydata, Xdata, X.size(), Wdata[0]);
#else
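    // Eigen fallback: max(x, 0) + min(x, 0) * w equals PReLU elementwise,
    // since exactly one of the two terms is nonzero for any x.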
    ConstEigenVectorMap<float> Xvec(Xdata, X.size());
    EigenVectorMap<float> Yvec(Ydata, Y->size());
    Yvec = Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[0];
#endif // __ARM_NEON__
    return true;
  }

  // Non-shared case: one slope per channel.
  switch (order_) {
    case StorageOrder::NCHW: {
      const auto N = X.dim(0);
      const auto dim = X.size_from_dim(2);

#ifdef __ARM_NEON__
      // Pointwise for each channel
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          runNeonPrelu(
              Ydata + (n * C + c) * dim,
              Xdata + (n * C + c) * dim,
              dim,
              Wdata[c]);
        }
      }
#else
      int nc = 0;
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          ConstEigenVectorMap<float> Xvec(Xdata + nc * dim, dim);
          EigenVectorMap<float>(Ydata + nc * dim, dim) =
              Xvec.cwiseMax(0.f) + Xvec.cwiseMin(0.f) * Wdata[c];
          nc++;
        }
      }
#endif
      break;
    }
    case StorageOrder::NHWC: {
      // View the data as a C x NHW column-major matrix: each column holds the
      // C channel values of one spatial position, scaled per-channel by Wvec.
      const auto NHW = X.size() / C;
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      ConstEigenVectorArrayMap<float> Wvec(Wdata, C);
      EigenArrayMap<float> Ymat(Ydata, C, NHW);
      Ymat = (Xmat > 0).select(Xmat, Xmat.colwise() * Wvec);
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }
  return true;
}

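// Backward pass: dX = dY where X > 0, and W * dY elsewhere; dW accumulates
// dY * X over positions where X <= 0 (per channel, or a single sum when the
// slope is shared).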
template <>
bool PReluGradientOp<float, CPUContext>::RunOnDevice() {
  auto& Y = Input(0);
  auto& dY = Input(1);
  auto& X = Input(2);
  auto& W = Input(3);

  CAFFE_ENFORCE(&Y != &X, "Cannot backpropagate through an in-place PReLU");
  auto* dX = Output(0);
  auto* dW = Output(1);

  DCHECK_EQ(dY.size(), Y.size());
  dX->ResizeLike(Y);
  dW->ResizeLike(W);

  const auto C = order_ == StorageOrder::NCHW ? X.dim(1) : X.dim(X.ndim() - 1);
  const auto C_shared = (W.size() == 1);

  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  const float* Xdata = X.data<float>();
  const float* Wdata = W.data<float>();
  float* dXdata = dX->mutable_data<float>();
  float* dWdata = dW->mutable_data<float>();

  switch (order_) {
    case StorageOrder::NCHW: {
      const auto dim = X.size_from_dim(2);
      // With a shared slope, every element maps to channel 0 via div_factor.
      const auto div_factor = C_shared ? C : 1;
      for (auto c = 0; c < W.size(); ++c) {
        dWdata[c] = 0;
      }

      for (int i = 0; i < Y.size(); ++i) {
        if (Xdata[i] <= 0) {
          int c = (i / dim) % C / div_factor;
          dWdata[c] += dYdata[i] * Xdata[i];
        }
      }

      for (int i = 0; i < Y.size(); ++i) {
        if (Xdata[i] > 0) {
          dXdata[i] = dYdata[i];
        } else {
          int c = (i / dim) % C / div_factor;
          dXdata[i] = Wdata[c] * dYdata[i];
        }
      }
      break;
    }
    case StorageOrder::NHWC: {
      const auto NHW = X.size() / C;
      ConstEigenVectorArrayMap<float> Wvec(Wdata, W.size());
      EigenVectorArrayMap<float> dWvec(dWdata, dW->size());

      ConstEigenArrayMap<float> Ymat(Ydata, C, NHW);
      ConstEigenArrayMap<float> dYmat(dYdata, C, NHW);
      ConstEigenArrayMap<float> Xmat(Xdata, C, NHW);
      EigenArrayMap<float> dXmat(dXdata, C, NHW);

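      // Note: where Xmat > 0, Xmat.cwiseMin(0.0f) is exactly zero, so the
      // select feeds zeros into the sum on the positive branch and dW only
      // accumulates dY * X over non-positive inputs.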
      if (C_shared) {
        dXmat = (Xmat > 0).select(dYmat, dYmat * Wdata[0]);
        dWdata[0] =
            (Xmat > 0)
                .select(
                    Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path
                    dYmat * Xmat)
                .sum();
      } else {
        dXmat = (Xmat > 0).select(dYmat, dYmat.colwise() * Wvec);
        dWvec = (Xmat > 0)
                    .select(
                        Xmat.cwiseMin(0.0f), // zero gradients on the 'if' path
                        dYmat * Xmat)
                    .rowwise()
                    .sum();
      }
      break;
    }
    default:
      CAFFE_THROW("Unknown storage order: ", order_);
  }

  return true;
}

REGISTER_CPU_OPERATOR(PRelu, PReluOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(PReluGradient, PReluGradientOp<float, CPUContext>);

// Input: X, Slope; output: Y
OPERATOR_SCHEMA(PRelu)
    .NumInputs(2)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .IdenticalTypeAndShapeOfInput(0)
    .SetDoc(R"DOC(

PRelu takes an input data tensor (Tensor<T>) and a slope tensor as input, and
produces one output tensor (Tensor<T>) where the function
`f(x) = slope * x for x < 0`, `f(x) = x for x >= 0` is applied to the data
tensor elementwise.

)DOC")
    .Input(0, "X", "Input tensor")
    .Input(
        1,
        "Slope",
        "1D slope tensor. If `Slope` is of size 1, the value is shared "
        "across different channels")
    .Output(0, "Y", "Output tensor of the same shape as X");
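
// Example: with a shared slope of 0.25, an input of {-4, -1, 0, 2} yields
// {-1, -0.25, 0, 2}.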

// Input: Y, dY, X, W; output: dX, dW
OPERATOR_SCHEMA(PReluGradient).NumInputs(4).NumOutputs(2).SetDoc(R"DOC(

PReluGradient takes Y, dY, X, and the slope W, and uses these to compute dX
and dW according to the chain rule and the derivative of the parametric
rectified linear function.

)DOC");

class GetPReluGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        def_.type() + "Gradient",
        "",
        vector<string>{O(0), GO(0), I(0), I(1)},
        vector<string>{GI(0), GI(1)});
  }
};
REGISTER_GRADIENT(PRelu, GetPReluGradient);

} // namespace caffe2