1 #include "caffe2/operators/pool_op_util.h" 3 #include "caffe2/utils/eigen_utils.h" 6 namespace pool_op_util {
// AvgPoolNeon4x4p0s0Plane: average-pools a single inputH x inputW plane of
// floats using non-overlapping 4x4 windows, writing one float per window into
// `output` (row-major, inputW / 4 values per output row).
// Only compiled when ARM NEON is available.
// NOTE(review): the parameter list is not visible here; the body reads
// inputH, inputW, input and output, and the caller passes (H, W, X_ptr, Y_ptr).
10 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 13 void AvgPoolNeon4x4p0s0Plane(
// Window geometry; kDiv is the reciprocal of the window area (1/16), so the
// average is computed as sum * kDiv.
18 constexpr
int kKernelHeight = 4;
19 constexpr
int kKernelWidth = 4;
20 constexpr
float kDiv = (1.0f / ((float)kKernelHeight * (
float)kKernelWidth));
// The fast path handles kUnroll windows per inner-loop iteration.
23 constexpr
int kUnroll = 4;
// Number of floats held by one float32x4_t register.
24 constexpr
int kLoadSizeFloat = (
sizeof(float32x4_t) /
sizeof(
float));
// Input columns consumed by one iteration of the unrolled inner loop.
25 constexpr
int kLoadCols = kUnroll * kLoadSizeFloat;
// Fast path: the row width is a whole number of kLoadCols-column groups, so
// four adjacent windows can be reduced and stored as one vector of outputs.
27 if (inputW % kLoadCols == 0) {
// Walk the plane one band of kKernelHeight input rows at a time.
32 for (
int h = 0; h < inputH; h += kKernelHeight) {
33 float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
34 const float* curInput = input + h * inputW;
36 for (
int w = 0; w < inputW; w += kLoadCols) {
// NOTE(review): the declaration of `out` is not visible in this excerpt.
// Window 0: load the window's four rows (rows are inputW floats apart), reduce
// them to a scalar and place it in lane 0 of `out`.
// horizontal_sum_f32 is defined elsewhere; presumably it sums every lane of
// its four arguments - TODO confirm.
40 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
41 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
42 float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
43 float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
44 float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
45 out = vsetq_lane_f32(v0, out, 0);
// Step to the next window in the same band of rows.
47 curInput += kLoadSizeFloat;
// Window 1 -> lane 1.
50 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
51 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
52 float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
53 float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
54 float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
55 out = vsetq_lane_f32(v0, out, 1);
57 curInput += kLoadSizeFloat;
// Window 2 -> lane 2.
60 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
61 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
62 float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
63 float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
64 float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
65 out = vsetq_lane_f32(v0, out, 2);
67 curInput += kLoadSizeFloat;
// Window 3 -> lane 3.
70 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
71 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
72 float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
73 float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
74 float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
75 out = vsetq_lane_f32(v0, out, 3);
77 curInput += kLoadSizeFloat;
// Turn the four window sums into averages with a single vector multiply, then
// store all four outputs at once.
79 out = vmulq_f32(out, vdupq_n_f32(kDiv));
80 vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
// Fallback path (row width not a multiple of kLoadCols): one window per
// iteration, each result written as a scalar.
88 for (
int h = 0; h < inputH; h += kKernelHeight) {
89 const float* inputRow = input + h * inputW;
90 float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
92 for (
int w = 0; w < inputW; w += kKernelWidth) {
93 const float* curInput = inputRow + w;
// Load the window's four rows, sum them and scale by kDiv to get the average.
95 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
96 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
97 float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
98 float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
99 float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
100 outputRow[w / kKernelWidth] = v0;
// MaxPoolNeon2x2p0s0Plane: max-pools a single inputH x inputW plane of floats
// using non-overlapping 2x2 windows, writing inputW / 2 values per output row.
// NOTE(review): the parameter list is not visible here; the body reads
// inputH, inputW, input and output, and the caller passes (H, W, X_ptr, Y_ptr).
107 void MaxPoolNeon2x2p0s0Plane(
// Window geometry.
112 constexpr
int kKernelHeight = 2;
113 constexpr
int kKernelWidth = 2;
// The fast path issues kUnroll vector loads per input row per iteration.
116 constexpr
int kUnroll = 4;
// Number of floats held by one float32x4_t register.
117 constexpr
int kLoadSizeFloat = (
sizeof(float32x4_t) /
sizeof(
float));
// Input columns consumed by one iteration of the unrolled inner loop.
118 constexpr
int kLoadCols = kUnroll * kLoadSizeFloat;
// Fast path: the row width is a whole number of kLoadCols-column groups.
120 if (inputW % kLoadCols == 0) {
// Walk the plane one band of kKernelHeight input rows at a time.
121 for (
int h = 0; h < inputH; h += kKernelHeight) {
122 float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
123 const float* curInput = input + h * inputW;
125 for (
int w = 0; w < inputW; w += kLoadCols) {
// Each hmax_i holds the maxima of two adjacent 2x2 windows.
126 float32x2_t hmax_0, hmax_1, hmax_2, hmax_3;
// Columns 0-3: take the element-wise max of the two rows, then the pairwise
// max of adjacent lanes, which yields one value per 2x2 window.
128 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
129 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
130 float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
131 hmax_0 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
// Step to the next four input columns.
133 curInput += kLoadSizeFloat;
// Columns 4-7 -> hmax_1.
135 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
136 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
137 float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
138 hmax_1 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
140 curInput += kLoadSizeFloat;
// Columns 8-11 -> hmax_2.
142 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
143 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
144 float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
145 hmax_2 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
147 curInput += kLoadSizeFloat;
// Columns 12-15 -> hmax_3.
149 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
150 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
151 float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
152 hmax_3 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
154 curInput += kLoadSizeFloat;
// Pack the eight window maxima into two vectors and store them back to back
// (offsets +0 and +4 within this group's slice of the output row).
156 float32x4_t out_0 = vcombine_f32(hmax_0, hmax_1);
157 float32x4_t out_1 = vcombine_f32(hmax_2, hmax_3);
158 vst1q_f32_aligned(&outputRow[w / kKernelWidth + 0], out_0);
159 vst1q_f32_aligned(&outputRow[w / kKernelWidth + 4], out_1);
// Fallback path (row width not a multiple of kLoadCols): one vector load per
// row per iteration, i.e. two windows and two outputs at a time.
164 for (
int h = 0; h < inputH; h += kKernelHeight) {
165 const float* inputRow = input + h * inputW;
166 float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
// Advances four input columns (two windows) per step.
168 for (
int w = 0; w < inputW; w += kKernelWidth * 2) {
169 const float* curInput = inputRow + w;
170 float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
171 float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
172 float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
173 float32x2_t hmax = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
// Two-float store of the pair of window maxima.
174 vst1_f32(&outputRow[w / kKernelWidth], hmax);
// IsNeon4x4p0s0Eligible: reports whether a pooling configuration satisfies
// every precondition checked below (4x4 kernel, 4x4 stride, zero padding,
// unit dilation, divisible sizes, aligned pointers).
// NOTE(review): most of the parameter list and the non-NEON branch are not
// visible here; presumably the function returns false without NEON - confirm.
184 bool IsNeon4x4p0s0Eligible(
197 const int dilation_h,
198 const int dilation_w,
// Kernel and stride must both be exactly 4x4.
201 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 211 const bool kernel_ok = (kh == 4) && (kw == 4);
212 const bool stride_ok = (stride_h == 4) && (stride_w == 4);
// All four paddings must be zero. The start of this statement (presumably the
// `pad_ok` declaration used in the return below) is not visible here.
214 (pad_t == 0) && (pad_l == 0) && (pad_b == 0) && (pad_r == 0);
// No dilation.
215 const bool dilation_ok = (dilation_h == 1) && (dilation_w == 1);
// Input extents must be whole multiples of the output extents.
216 const bool output_ok = (input_h % output_h == 0) && (input_w % output_w == 0);
// Input extents must be whole multiples of 4.
217 const bool input_ok = (input_w % 4 == 0) && (input_h % 4 == 0);
// Both buffers are checked against sizeof(float32x4_t). isPointerAligned is
// defined elsewhere; presumably it tests the address against that byte
// alignment - TODO confirm.
218 const bool align_ok = isPointerAligned(X,
sizeof(float32x4_t)) &&
219 isPointerAligned(Y,
sizeof(float32x4_t));
// Eligible only when every individual check passes.
220 return kernel_ok && stride_ok && pad_ok && dilation_ok && output_ok &&
221 input_ok && align_ok;
// IsNeon2x2p0s0Eligible: reports whether a pooling configuration satisfies
// every precondition checked below (2x2 kernel, 2x2 stride, zero padding,
// unit dilation, divisible sizes, aligned pointers).
// NOTE(review): most of the parameter list and the non-NEON branch are not
// visible here; presumably the function returns false without NEON - confirm.
243 bool IsNeon2x2p0s0Eligible(
256 const int dilation_h,
257 const int dilation_w,
// Kernel and stride must both be exactly 2x2.
260 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 270 const bool kernel_ok = (kh == 2) && (kw == 2);
271 const bool stride_ok = (stride_h == 2) && (stride_w == 2);
// All four paddings must be zero. The start of this statement (presumably the
// `pad_ok` declaration used in the return below) is not visible here.
273 (pad_t == 0) && (pad_l == 0) && (pad_b == 0) && (pad_r == 0);
// No dilation.
274 const bool dilation_ok = (dilation_h == 1) && (dilation_w == 1);
// Input extents must be whole multiples of the output extents.
275 const bool output_ok = (input_h % output_h == 0) && (input_w % output_w == 0);
// Input extents must be whole multiples of 4 (not 2).
// NOTE(review): the height requirement looks stricter than a 2-row kernel
// step would need - confirm whether this is intentional.
276 const bool input_ok = (input_w % 4 == 0) && (input_h % 4 == 0);
// Both buffers are checked against sizeof(float32x4_t). isPointerAligned is
// defined elsewhere; presumably it tests the address against that byte
// alignment - TODO confirm.
277 const bool align_ok = isPointerAligned(X,
sizeof(float32x4_t)) &&
278 isPointerAligned(Y,
sizeof(float32x4_t));
// Eligible only when every individual check passes.
279 return kernel_ok && stride_ok && pad_ok && dilation_ok && output_ok &&
280 input_ok && align_ok;
// RunNeonAveragePool4x4p0s0NCHW: applies AvgPoolNeon4x4p0s0Plane once per
// (i, j) pair of the N x C loop nest below, i.e. to N * C planes of size H x W.
// NOTE(review): the parameter list, the declaration of Y_ptr and the pointer
// advancement between planes are not visible here; presumably X_ptr / Y_ptr
// are bumped by X_stride / Y_stride after each call - confirm.
302 void RunNeonAveragePool4x4p0s0NCHW(
// Element counts of one input plane and of one 4x4-pooled output plane.
309 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 310 const int X_stride = H * W;
311 const int Y_stride = (H / 4) * (W / 4);
312 const float* X_ptr = X;
// Outer loop runs N times, inner loop runs C times.
314 for (
int i = 0; i < N; ++i) {
315 for (
int j = 0; j < C; ++j) {
316 AvgPoolNeon4x4p0s0Plane(H, W, X_ptr, Y_ptr);
// RunNeonMaxPool2x2p0s0NCHW: applies MaxPoolNeon2x2p0s0Plane once per (i, j)
// pair of the N x C loop nest below, i.e. to N * C planes of size H x W.
// NOTE(review): the parameter list, the declaration of Y_ptr and the pointer
// advancement between planes are not visible here; presumably X_ptr / Y_ptr
// are bumped by X_stride / Y_stride after each call - confirm.
331 void RunNeonMaxPool2x2p0s0NCHW(
// Element counts of one input plane and of one 2x2-pooled output plane.
338 #if defined(__ARM_NEON__) || defined(__ARM_NEON) 339 const int X_stride = H * W;
340 const int Y_stride = (H / 2) * (W / 2);
341 const float* X_ptr = X;
// Outer loop runs N times, inner loop runs C times.
343 for (
int i = 0; i < N; ++i) {
344 for (
int j = 0; j < C; ++j) {
345 MaxPoolNeon2x2p0s0Plane(H, W, X_ptr, Y_ptr);
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...