Caffe2 - C++ API
A deep learning, cross-platform ML framework
pool_op_util.cc
1 #include "caffe2/operators/pool_op_util.h"
2 
3 #include "caffe2/utils/eigen_utils.h"
4 
5 namespace caffe2 {
6 namespace pool_op_util {
7 
8 namespace {
9 
10 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
11 
// Vectorizes 4x4p0s0 average pooling for ARM NEON.
//
// Computes non-overlapping 4x4 average pooling (kernel 4x4, stride 4, zero
// padding) over a single inputH x inputW plane, writing an
// (inputH / 4) x (inputW / 4) output plane. Callers are expected to gate on
// IsNeon4x4p0s0Eligible, which checks divisibility of the input dims by 4
// and float32x4_t alignment of both pointers.
void AvgPoolNeon4x4p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 4;
  constexpr int kKernelWidth = 4;
  // Reciprocal of the pool area (1/16): multiply each tile sum once instead
  // of dividing per output value.
  constexpr float kDiv = (1.0f / ((float)kKernelHeight * (float)kKernelWidth));

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    //
    // Manually unroll by 4 (kUnroll): each inner-loop iteration consumes 16
    // input columns (four 4x4 tiles) and emits four pooled values at once.
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        float32x4_t out = {};

        {
          // Load the four rows of one 4x4 tile, reduce all 16 values to a
          // single sum, and place it in lane 0 of `out`.
          // NOTE(review): vld1q_f32_aligned and horizontal_sum_f32 appear to
          // be project-provided helpers, not standard NEON intrinsics --
          // confirm their contract in the caffe2 utility headers.
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 0);
        }
        curInput += kLoadSizeFloat;

        {
          // Second 4x4 tile -> lane 1.
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 1);
        }
        curInput += kLoadSizeFloat;

        {
          // Third 4x4 tile -> lane 2.
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 2);
        }
        curInput += kLoadSizeFloat;

        {
          // Fourth 4x4 tile -> lane 3.
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 3);
        }
        curInput += kLoadSizeFloat;

        // Scale the four tile sums by 1/16 and store four outputs at once.
        out = vmulq_f32(out, vdupq_n_f32(kDiv));
        vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
      }
    }
  } else {
    //
    // Not unrolled: one 4x4 tile (one output value) per inner iteration.
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth) {
        const float* curInput = inputRow + w;

        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
        float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
        float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
        outputRow[w / kKernelWidth] = v0;
      }
    }
  }
}
105 
// Vectorizes 2x2p0s0 max pooling for ARM NEON.
// (The original comment said "average" -- this routine computes the max.)
//
// Computes non-overlapping 2x2 max pooling (kernel 2x2, stride 2, zero
// padding) over a single inputH x inputW plane, writing an
// (inputH / 2) x (inputW / 2) output plane. Callers are expected to gate on
// IsNeon2x2p0s0Eligible (input dims divisible by 4, float32x4_t alignment).
void MaxPoolNeon2x2p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 2;
  constexpr int kKernelWidth = 2;

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        // Each sub-block below reduces a 2x4 input patch (two 2x2 windows)
        // to two pooled values: vmaxq_f32 takes the row-wise max, then
        // vpmax_f32 takes the pairwise max of adjacent columns.
        float32x2_t hmax_0, hmax_1, hmax_2, hmax_3;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_0 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_1 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_2 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_3 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;

        // Pack the four half-vectors into two full vectors and store eight
        // pooled outputs in two aligned writes.
        float32x4_t out_0 = vcombine_f32(hmax_0, hmax_1);
        float32x4_t out_1 = vcombine_f32(hmax_2, hmax_3);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 0], out_0);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 4], out_1);
      }
    }
  } else {
    // Not unrolled: each iteration still loads a 2x4 patch (two 2x2 windows)
    // and stores two pooled outputs -- hence the step of kKernelWidth * 2.
    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth * 2) {
        const float* curInput = inputRow + w;
        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
        float32x2_t hmax = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        vst1_f32(&outputRow[w / kKernelWidth], hmax);
      }
    }
  }
}
179 
180 #endif
181 
182 } // namespace
183 
// Returns true iff the NEON 4x4p0s0 average-pool fast path may be used.
//
// The vectorized kernel handles exactly: a 4x4 kernel with 4x4 stride, zero
// padding, unit dilation, input dims divisible by the output dims and by 4,
// and input/output buffers aligned for float32x4_t access. On non-NEON
// builds this always returns false.
bool IsNeon4x4p0s0Eligible(
    const int input_h,
    const int input_w,
    const int output_h,
    const int output_w,
    const int kh,
    const int kw,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int dilation_h,
    const int dilation_w,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Reject as soon as any precondition of the vectorized kernel fails.
  if (kh != 4 || kw != 4) {
    return false;
  }
  if (stride_h != 4 || stride_w != 4) {
    return false;
  }
  if (pad_t != 0 || pad_l != 0 || pad_b != 0 || pad_r != 0) {
    return false;
  }
  if (dilation_h != 1 || dilation_w != 1) {
    return false;
  }
  if (input_h % output_h != 0 || input_w % output_w != 0) {
    return false;
  }
  // Divisibility by 4 should already be implied by the checks above, but
  // verify explicitly to be safe.
  if (input_h % 4 != 0 || input_w % 4 != 0) {
    return false;
  }
  // Both buffers must be aligned for float32x4_t loads/stores.
  return isPointerAligned(X, sizeof(float32x4_t)) &&
      isPointerAligned(Y, sizeof(float32x4_t));
#else
  (void)input_h;
  (void)input_w;
  (void)output_h;
  (void)output_w;
  (void)kh;
  (void)kw;
  (void)stride_h;
  (void)stride_w;
  (void)pad_t;
  (void)pad_l;
  (void)pad_b;
  (void)pad_r;
  (void)dilation_h;
  (void)dilation_w;
  (void)X;
  (void)Y;
  return false;
#endif
}
242 
// Returns true iff the NEON 2x2p0s0 max-pool fast path may be used.
//
// The vectorized kernel handles exactly: a 2x2 kernel with 2x2 stride, zero
// padding, unit dilation, input dims divisible by the output dims and by 4,
// and input/output buffers aligned for float32x4_t access. On non-NEON
// builds this always returns false.
bool IsNeon2x2p0s0Eligible(
    const int input_h,
    const int input_w,
    const int output_h,
    const int output_w,
    const int kh,
    const int kw,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int dilation_h,
    const int dilation_w,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Reject as soon as any precondition of the vectorized kernel fails.
  if (kh != 2 || kw != 2) {
    return false;
  }
  if (stride_h != 2 || stride_w != 2) {
    return false;
  }
  if (pad_t != 0 || pad_l != 0 || pad_b != 0 || pad_r != 0) {
    return false;
  }
  if (dilation_h != 1 || dilation_w != 1) {
    return false;
  }
  if (input_h % output_h != 0 || input_w % output_w != 0) {
    return false;
  }
  // Divisibility by 4 should already be implied by the checks above, but
  // verify explicitly to be safe.
  if (input_h % 4 != 0 || input_w % 4 != 0) {
    return false;
  }
  // Both buffers must be aligned for float32x4_t loads/stores.
  return isPointerAligned(X, sizeof(float32x4_t)) &&
      isPointerAligned(Y, sizeof(float32x4_t));
#else
  (void)input_h;
  (void)input_w;
  (void)output_h;
  (void)output_w;
  (void)kh;
  (void)kw;
  (void)stride_h;
  (void)stride_w;
  (void)pad_t;
  (void)pad_l;
  (void)pad_b;
  (void)pad_r;
  (void)dilation_h;
  (void)dilation_w;
  (void)X;
  (void)Y;
  return false;
#endif
}
301 
// Runs the NEON 4x4p0s0 average-pool kernel over an NCHW tensor by pooling
// each of the N * C spatial planes independently. X is N x C x H x W;
// Y receives N x C x (H / 4) x (W / 4). No-op on non-NEON builds.
void RunNeonAveragePool4x4p0s0NCHW(
    int N,
    int C,
    int H,
    int W,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  const int input_plane_size = H * W;
  const int output_plane_size = (H / 4) * (W / 4);
  const int num_planes = N * C;
  // Planes are laid out back to back in NCHW order, so a single flat loop
  // over all N * C planes is equivalent to the nested batch/channel loops.
  for (int p = 0; p < num_planes; ++p) {
    AvgPoolNeon4x4p0s0Plane(
        H, W, X + p * input_plane_size, Y + p * output_plane_size);
  }
#else
  (void)N;
  (void)C;
  (void)H;
  (void)W;
  (void)X;
  (void)Y;
#endif
}
330 
// Runs the NEON 2x2p0s0 max-pool kernel over an NCHW tensor by pooling each
// of the N * C spatial planes independently. X is N x C x H x W; Y receives
// N x C x (H / 2) x (W / 2). No-op on non-NEON builds.
void RunNeonMaxPool2x2p0s0NCHW(
    int N,
    int C,
    int H,
    int W,
    const float* X,
    float* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  const int input_plane_size = H * W;
  const int output_plane_size = (H / 2) * (W / 2);
  const int num_planes = N * C;
  // Planes are laid out back to back in NCHW order, so a single flat loop
  // over all N * C planes is equivalent to the nested batch/channel loops.
  for (int p = 0; p < num_planes; ++p) {
    MaxPoolNeon2x2p0s0Plane(
        H, W, X + p * input_plane_size, Y + p * output_plane_size);
  }
#else
  (void)N;
  (void)C;
  (void)H;
  (void)W;
  (void)X;
  (void)Y;
#endif
}
359 
360 } // namespace pool_op_util
361 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13
Definition: static.cpp:64