Caffe2 - C++ API
A deep learning, cross-platform ML framework
pool_op_util.cc
1 #include "caffe2/operators/pool_op_util.h"
2 
3 #include "caffe2/utils/eigen_utils.h"
4 
5 namespace caffe2 {
6 namespace pool_op_util {
7 
8 namespace {
9 
10 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
11 
// Vectorizes 4x4p0s0 average pooling for ARM NEON.
//
// Computes non-overlapping 4x4 average pooling (kernel 4x4, stride 4, zero
// padding) over a single inputH x inputW plane, writing an
// (inputH / 4) x (inputW / 4) output plane. Callers are expected to gate on
// IsNeon4x4p0s0Eligible, which checks divisibility of the input dims by 4
// and float32x4_t alignment of both pointers.
void AvgPoolNeon4x4p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 4;
  constexpr int kKernelWidth = 4;
  // Reciprocal of the pool area (1/16): multiply each tile sum once instead
  // of dividing per output value.
  constexpr float kDiv = (1.0f / ((float)kKernelHeight * (float)kKernelWidth));

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    //
    // Manually unroll by 4 (kUnroll): each inner-loop iteration consumes 16
    // input columns (four 4x4 tiles) and emits four pooled values at once.
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        float32x4_t out = {};

        {
          // Load the four rows of one 4x4 tile, reduce all 16 values to a
          // single sum, and place it in lane 0 of `out`.
          // NOTE(review): vld1q_f32_aligned and horizontal_sum_f32 appear to
          // be project-provided helpers, not standard NEON intrinsics --
          // confirm their contract in the caffe2 utility headers.
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 0);
        }
        curInput += kLoadSizeFloat;

        {
          // Second 4x4 tile -> lane 1.
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 1);
        }
        curInput += kLoadSizeFloat;

        {
          // Third 4x4 tile -> lane 2.
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 2);
        }
        curInput += kLoadSizeFloat;

        {
          // Fourth 4x4 tile -> lane 3.
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 3);
        }
        curInput += kLoadSizeFloat;

        // Scale the four tile sums by 1/16 and store four outputs at once.
        out = vmulq_f32(out, vdupq_n_f32(kDiv));
        vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
      }
    }
  } else {
    //
    // Not unrolled: one 4x4 tile (one output value) per inner iteration.
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth) {
        const float* curInput = inputRow + w;

        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
        float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
        float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
        outputRow[w / kKernelWidth] = v0;
      }
    }
  }
}
105 
// Vectorizes 2x2p0s0 max pooling for ARM NEON.
// (The original comment said "average" -- this routine computes the max.)
//
// Computes non-overlapping 2x2 max pooling (kernel 2x2, stride 2, zero
// padding) over a single inputH x inputW plane, writing an
// (inputH / 2) x (inputW / 2) output plane. Callers are expected to gate on
// IsNeon2x2p0s0Eligible (input dims divisible by 4, float32x4_t alignment).
void MaxPoolNeon2x2p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 2;
  constexpr int kKernelWidth = 2;

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        // Each sub-block below reduces a 2x4 input patch (two 2x2 windows)
        // to two pooled values: vmaxq_f32 takes the row-wise max, then
        // vpmax_f32 takes the pairwise max of adjacent columns.
        float32x2_t hmax_0, hmax_1, hmax_2, hmax_3;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_0 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_1 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_2 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_3 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;

        // Pack the four half-vectors into two full vectors and store eight
        // pooled outputs in two aligned writes.
        float32x4_t out_0 = vcombine_f32(hmax_0, hmax_1);
        float32x4_t out_1 = vcombine_f32(hmax_2, hmax_3);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 0], out_0);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 4], out_1);
      }
    }
  } else {
    // Not unrolled: each iteration still loads a 2x4 patch (two 2x2 windows)
    // and stores two pooled outputs -- hence the step of kKernelWidth * 2.
    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth * 2) {
        const float* curInput = inputRow + w;
        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
        float32x2_t hmax = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        vst1_f32(&outputRow[w / kKernelWidth], hmax);
      }
    }
  }
}
179 
180 #endif
181 
182 } // namespace
183 
// Returns true iff the NEON 4x4p0s0 average-pool fast path may be used.
//
// The vectorized kernel handles exactly: a 4x4 kernel with 4x4 stride, zero
// padding, unit dilation, input dims divisible by the output dims and by 4,
// and input/output buffers aligned for float32x4_t access. On non-NEON
// builds this always returns false.
bool IsNeon4x4p0s0Eligible(
    const int input_h,
    const int input_w,
    const int output_h,
    const int output_w,
    const int kh,
    const int kw,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int dilation_h,
    const int dilation_w,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Reject as soon as any precondition of the vectorized kernel fails.
  if (kh != 4 || kw != 4) {
    return false;
  }
  if (stride_h != 4 || stride_w != 4) {
    return false;
  }
  if (pad_t != 0 || pad_l != 0 || pad_b != 0 || pad_r != 0) {
    return false;
  }
  if (dilation_h != 1 || dilation_w != 1) {
    return false;
  }
  if (input_h % output_h != 0 || input_w % output_w != 0) {
    return false;
  }
  // Divisibility by 4 should already be implied by the checks above, but
  // verify explicitly to be safe.
  if (input_h % 4 != 0 || input_w % 4 != 0) {
    return false;
  }
  // Both buffers must be aligned for float32x4_t loads/stores.
  return isPointerAligned(X, sizeof(float32x4_t)) &&
      isPointerAligned(Y, sizeof(float32x4_t));
#else
  (void)input_h;
  (void)input_w;
  (void)output_h;
  (void)output_w;
  (void)kh;
  (void)kw;
  (void)stride_h;
  (void)stride_w;
  (void)pad_t;
  (void)pad_l;
  (void)pad_b;
  (void)pad_r;
  (void)dilation_h;
  (void)dilation_w;
  (void)X;
  (void)Y;
  return false;
#endif
}
242 
// Returns true iff the NEON 2x2p0s0 max-pool fast path may be used.
//
// The vectorized kernel handles exactly: a 2x2 kernel with 2x2 stride, zero
// padding, unit dilation, input dims divisible by the output dims and by 4,
// and input/output buffers aligned for float32x4_t access. On non-NEON
// builds this always returns false.
bool IsNeon2x2p0s0Eligible(
    const int input_h,
    const int input_w,
    const int output_h,
    const int output_w,
    const int kh,
    const int kw,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int dilation_h,
    const int dilation_w,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Reject as soon as any precondition of the vectorized kernel fails.
  if (kh != 2 || kw != 2) {
    return false;
  }
  if (stride_h != 2 || stride_w != 2) {
    return false;
  }
  if (pad_t != 0 || pad_l != 0 || pad_b != 0 || pad_r != 0) {
    return false;
  }
  if (dilation_h != 1 || dilation_w != 1) {
    return false;
  }
  if (input_h % output_h != 0 || input_w % output_w != 0) {
    return false;
  }
  // Divisibility by 4 should already be implied by the checks above, but
  // verify explicitly to be safe.
  if (input_h % 4 != 0 || input_w % 4 != 0) {
    return false;
  }
  // Both buffers must be aligned for float32x4_t loads/stores.
  return isPointerAligned(X, sizeof(float32x4_t)) &&
      isPointerAligned(Y, sizeof(float32x4_t));
#else
  (void)input_h;
  (void)input_w;
  (void)output_h;
  (void)output_w;
  (void)kh;
  (void)kw;
  (void)stride_h;
  (void)stride_w;
  (void)pad_t;
  (void)pad_l;
  (void)pad_b;
  (void)pad_r;
  (void)dilation_h;
  (void)dilation_w;
  (void)X;
  (void)Y;
  return false;
#endif
}
301 
// Runs the NEON 4x4p0s0 average-pool kernel over an NCHW tensor by pooling
// each of the N * C spatial planes independently. X is N x C x H x W;
// Y receives N x C x (H / 4) x (W / 4). No-op on non-NEON builds.
void RunNeonAveragePool4x4p0s0NCHW(
    int N,
    int C,
    int H,
    int W,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  const int input_plane_size = H * W;
  const int output_plane_size = (H / 4) * (W / 4);
  const int num_planes = N * C;
  // Planes are laid out back to back in NCHW order, so a single flat loop
  // over all N * C planes is equivalent to the nested batch/channel loops.
  for (int p = 0; p < num_planes; ++p) {
    AvgPoolNeon4x4p0s0Plane(
        H, W, X + p * input_plane_size, Y + p * output_plane_size);
  }
#else
  (void)N;
  (void)C;
  (void)H;
  (void)W;
  (void)X;
  (void)Y;
#endif
}
330 
// Runs the NEON 2x2p0s0 max-pool kernel over an NCHW tensor by pooling each
// of the N * C spatial planes independently. X is N x C x H x W; Y receives
// N x C x (H / 2) x (W / 2). No-op on non-NEON builds.
void RunNeonMaxPool2x2p0s0NCHW(
    int N,
    int C,
    int H,
    int W,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  const int input_plane_size = H * W;
  const int output_plane_size = (H / 2) * (W / 2);
  const int num_planes = N * C;
  // Planes are laid out back to back in NCHW order, so a single flat loop
  // over all N * C planes is equivalent to the nested batch/channel loops.
  for (int p = 0; p < num_planes; ++p) {
    MaxPoolNeon2x2p0s0Plane(
        H, W, X + p * input_plane_size, Y + p * output_plane_size);
  }
#else
  (void)N;
  (void)C;
  (void)H;
  (void)W;
  (void)X;
  (void)Y;
#endif
}
359 
360 } // namespace pool_op_util
361 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13
Definition: static.cpp:64