Caffe2 - C++ API
A deep learning, cross-platform ML framework
pool_op.cc
// TODO(ataei): reduce the apparent redundancy of all the code below.
#include "caffe2/operators/pool_op.h"
#include "caffe2/utils/cpu_neon.h"
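
// The NEON helpers used below (isPointerAligned, vld1q_f32_aligned,
// vst1q_f32_aligned, horizontal_sum_f32) are assumed to come from
// cpu_neon.h. A minimal sketch of their assumed semantics (illustrative,
// not the actual implementation):
//
//   // true if p is aligned to `align` bytes
//   bool isPointerAligned(const void* p, size_t align) {
//     return reinterpret_cast<uintptr_t>(p) % align == 0;
//   }
//
//   // sums all 16 lanes of four float32x4_t registers into one float
//   float horizontal_sum_f32(
//       float32x4_t v0, float32x4_t v1, float32x4_t v2, float32x4_t v3) {
//     v0 = vaddq_f32(vaddq_f32(v0, v1), vaddq_f32(v2, v3));
//     float32x2_t v = vadd_f32(vget_low_f32(v0), vget_high_f32(v0));
//     return vget_lane_f32(vpadd_f32(v, v), 0);
//   }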

namespace caffe2 {

using std::max;
using std::min;

namespace {

#ifdef __ARM_NEON__

bool isNeon4x4p0s0Eligible(
    int inputH,
    int inputW,
    int outputH,
    int outputW,
    int kH,
    int kW,
    int strideH,
    int strideW,
    int padT,
    int padL,
    int padB,
    int padR,
    int dilationH,
    int dilationW,
    const float* input,
    float* output) {
  // Use this kernel only if:
  //   Kernel size is 4x4
  //   Stride is 4x4
  //   Padding is 0
  //   Dilation is 1
  //   Output height and width evenly divide input height and width
  //   Input height and width are divisible by 4 (should be implied by
  //   all of the above, but just check again)
  //   Input and output pointers are aligned by float32x4_t

  bool kernelOk = (kH == 4) && (kW == 4);
  bool strideOk = (strideH == 4) && (strideW == 4);
  bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
  bool dilationOk = (dilationH == 1) && (dilationW == 1);

  bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
  bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
  bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
      isPointerAligned(output, sizeof(float32x4_t));

  return kernelOk && strideOk && padOk && dilationOk && outputOk && inputOk &&
      alignOk;
}

// Vectorizes 4x4p0s0 average pooling for ARM NEON
void avgPoolNeon4x4p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 4;
  constexpr int kKernelWidth = 4;
  constexpr float kDiv = (1.0f / ((float)kKernelHeight * (float)kKernelWidth));

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    //
    // Manually unroll by 4 (kUnroll)
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        float32x4_t out = {};

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 0);
        }
        curInput += kLoadSizeFloat;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 1);
        }
        curInput += kLoadSizeFloat;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 2);
        }
        curInput += kLoadSizeFloat;

        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
          float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
          float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3);
          out = vsetq_lane_f32(v0, out, 3);
        }
        curInput += kLoadSizeFloat;

        out = vmulq_f32(out, vdupq_n_f32(kDiv));
        vst1q_f32_aligned(&outputRow[w / kKernelWidth], out);
      }
    }
  } else {
    //
    // Not unrolled
    //

    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth) {
        const float* curInput = inputRow + w;

        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t v0_2 = vld1q_f32_aligned(curInput + 2 * inputW);
        float32x4_t v0_3 = vld1q_f32_aligned(curInput + 3 * inputW);
        float v0 = horizontal_sum_f32(v0_0, v0_1, v0_2, v0_3) * kDiv;
        outputRow[w / kKernelWidth] = v0;
      }
    }
  }
}
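
// Shape sketch (illustrative): a 16x16 input plane yields a 4x4 output. In
// the unrolled path above, one inner-loop iteration reads a 4-row by
// 16-column strip as four groups of four aligned loads; each group covers
// one 4x4 tile, horizontal_sum_f32 collapses its 16 elements into one lane
// of `out`, and the final vmulq_f32 by 1/16 turns the four tile sums into
// four adjacent averaged outputs written with a single store.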

void runNeonAveragePool4x4p0s0NCHW(
    int N,
    int C,
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  // We only have the 4x4p0s0 implementation at present, which is
  // checked at a higher level
  int outputH = inputH / 4;
  int outputW = inputW / 4;

  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      const float* curInput = input + (n * C + c) * inputH * inputW;
      float* curOutput = output + (n * C + c) * outputH * outputW;

      avgPoolNeon4x4p0s0Plane(inputH, inputW, curInput, curOutput);
    }
  }
}

bool isNeon2x2p0s0Eligible(
    int inputH,
    int inputW,
    int outputH,
    int outputW,
    int kH,
    int kW,
    int strideH,
    int strideW,
    int padT,
    int padL,
    int padB,
    int padR,
    int dilationH,
    int dilationW,
    const float* input,
    float* output) {
  // Use this kernel only if:
  //   Kernel size is 2x2
  //   Stride is 2x2
  //   Padding is 0
  //   Dilation is 1
  //   Output height and width evenly divide input height and width
  //   Input height and width are divisible by 4 (should be implied by
  //   all of the above, but just check again)
  //   Input and output pointers are aligned by float32x4_t

  bool kernelOk = (kH == 2) && (kW == 2);
  bool strideOk = (strideH == 2) && (strideW == 2);
  bool padOk = (padT == 0) && (padL == 0) && (padB == 0) && (padR == 0);
  bool dilationOk = (dilationH == 1) && (dilationW == 1);

  bool outputOk = ((inputH % outputH) == 0) && ((inputW % outputW) == 0);
  bool inputOk = (inputW % 4 == 0) && (inputH % 4 == 0);
  bool alignOk = isPointerAligned(input, sizeof(float32x4_t)) &&
      isPointerAligned(output, sizeof(float32x4_t));

  return kernelOk && strideOk && padOk && dilationOk && outputOk && inputOk &&
      alignOk;
}

// Vectorizes 2x2p0s0 max pooling for ARM NEON
void maxPoolNeon2x2p0s0Plane(
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  constexpr int kKernelHeight = 2;
  constexpr int kKernelWidth = 2;

  // Handle portion that can be unrolled by 4
  constexpr int kUnroll = 4;
  constexpr int kLoadSizeFloat = (sizeof(float32x4_t) / sizeof(float));
  constexpr int kLoadCols = kUnroll * kLoadSizeFloat;

  if (inputW % kLoadCols == 0) {
    for (int h = 0; h < inputH; h += kKernelHeight) {
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);
      const float* curInput = input + h * inputW;

      for (int w = 0; w < inputW; w += kLoadCols) {
        float32x2_t hmax_0, hmax_1, hmax_2, hmax_3;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_0 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_1 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_2 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;
        {
          float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
          float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
          float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
          hmax_3 = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        }
        curInput += kLoadSizeFloat;

        float32x4_t out_0 = vcombine_f32(hmax_0, hmax_1);
        float32x4_t out_1 = vcombine_f32(hmax_2, hmax_3);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 0], out_0);
        vst1q_f32_aligned(&outputRow[w / kKernelWidth + 4], out_1);
      }
    }
  } else {
    // Not unrolled
    for (int h = 0; h < inputH; h += kKernelHeight) {
      const float* inputRow = input + h * inputW;
      float* outputRow = output + (h / kKernelHeight) * (inputW / kKernelWidth);

      for (int w = 0; w < inputW; w += kKernelWidth * 2) {
        const float* curInput = inputRow + w;
        float32x4_t v0_0 = vld1q_f32_aligned(curInput + 0 * inputW);
        float32x4_t v0_1 = vld1q_f32_aligned(curInput + 1 * inputW);
        float32x4_t vmax = vmaxq_f32(v0_0, v0_1);
        float32x2_t hmax = vpmax_f32(vget_low_f32(vmax), vget_high_f32(vmax));
        vst1_f32(&outputRow[w / kKernelWidth], hmax);
      }
    }
  }
}
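
// Lane sketch (illustrative): vmaxq_f32 takes the vertical max of two input
// rows, yielding [m0 m1 m2 m3] for columns w..w+3; vpmax_f32 then reduces
// adjacent pairs, yielding [max(m0,m1), max(m2,m3)], which are exactly the
// two 2x2 max-pooling outputs covering those four columns.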

void runNeonMaxPool2x2p0s0NCHW(
    int N,
    int C,
    int inputH,
    int inputW,
    const float* input,
    float* output) {
  // We only have the 2x2p0s0 implementation at present, which is
  // checked at a higher level
  int outputH = inputH / 2;
  int outputW = inputW / 2;

  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      const float* curInput = input + (n * C + c) * inputH * inputW;
      float* curOutput = output + (n * C + c) * outputH * outputW;
      maxPoolNeon2x2p0s0Plane(inputH, inputW, curInput, curOutput);
    }
  }
}
#endif // __ARM_NEON__

} // namespace

template <typename T>
class AveragePool {
 public:
  static float initialize() {
    return 0.0;
  }

  static void process(
      const int x_col,
      const int y_col,
      ConstEigenMatrixMap<float>& x_mat,
      EigenMatrixMap<float>& y_mat) {
    y_mat.col(y_col) += x_mat.col(x_col);
  }

  static void process(const T& x_data, T& y_data) {
    y_data += x_data;
  }

  static void finalize(const int size, T& y_data) {
    y_data /= size;
  }

  static void
  finalize(const int size, const int col, EigenMatrixMap<float>& y_mat) {
    y_mat.col(col) /= size;
  }

  static bool runSpecialized(
      int N,
      int C,
      int inputH,
      int inputW,
      int outputH,
      int outputW,
      int kH,
      int kW,
      int strideH,
      int strideW,
      int padT,
      int padL,
      int padB,
      int padR,
      int dilationH,
      int dilationW,
      const float* input,
      float* output) {
#ifdef __ARM_NEON__
    if (isNeon4x4p0s0Eligible(
            inputH,
            inputW,
            outputH,
            outputW,
            kH,
            kW,
            strideH,
            strideW,
            padT,
            padL,
            padB,
            padR,
            dilationH,
            dilationW,
            input,
            output)) {
      runNeonAveragePool4x4p0s0NCHW(N, C, inputH, inputW, input, output);
      return true;
    }
#else
    (void)N;
    (void)C;
    (void)inputH;
    (void)inputW;
    (void)outputH;
    (void)outputW;
    (void)kH;
    (void)kW;
    (void)strideH;
    (void)strideW;
    (void)padT;
    (void)padL;
    (void)padB;
    (void)padR;
    (void)dilationH;
    (void)dilationW;
    (void)input;
    (void)output;
#endif
    return false;
  }
};

template <typename T>
class MaxPool {
 public:
  static float initialize() {
    return std::numeric_limits<float>::lowest();
  }

  static void process(
      const int x_col,
      const int y_col,
      ConstEigenMatrixMap<float>& x_mat,
      EigenMatrixMap<float>& y_mat) {
    y_mat.col(y_col) = y_mat.col(y_col).cwiseMax(x_mat.col(x_col));
  }

  static void process(const T& x_data, T& y_data) {
    if (x_data > y_data) {
      y_data = x_data;
    }
  }

  static void finalize(const int /*size*/, T& /*y_data*/) {}

  static void finalize(
      const int /*size*/,
      const int /*col*/,
      EigenMatrixMap<float>& /*y_mat*/) {}

  static bool runSpecialized(
      int N,
      int C,
      int inputH,
      int inputW,
      int outputH,
      int outputW,
      int kH,
      int kW,
      int strideH,
      int strideW,
      int padT,
      int padL,
      int padB,
      int padR,
      int dilationH,
      int dilationW,
      const float* input,
      float* output) {
#ifdef __ARM_NEON__
    if (isNeon2x2p0s0Eligible(
            inputH,
            inputW,
            outputH,
            outputW,
            kH,
            kW,
            strideH,
            strideW,
            padT,
            padL,
            padB,
            padR,
            dilationH,
            dilationW,
            input,
            output)) {
      runNeonMaxPool2x2p0s0NCHW(N, C, inputH, inputW, input, output);
      return true;
    }
#else
    (void)N;
    (void)C;
    (void)inputH;
    (void)inputW;
    (void)outputH;
    (void)outputW;
    (void)kH;
    (void)kW;
    (void)strideH;
    (void)strideW;
    (void)padT;
    (void)padL;
    (void)padB;
    (void)padR;
    (void)dilationH;
    (void)dilationW;
    (void)input;
    (void)output;
#endif
    return false;
  }
};
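
// The policy classes above plug into the generic loops below through the
// initialize/process/finalize protocol. A minimal standalone sketch of the
// same pattern (hedged; illustrative only, not part of this file):
//
//   template <typename PoolType>
//   float pool1d(const float* x, int n) {
//     float acc = PoolType::initialize();
//     for (int i = 0; i < n; ++i) {
//       PoolType::process(x[i], acc); // sum for average, running max for max
//     }
//     PoolType::finalize(n, acc); // divides by n for average, no-op for max
//     return acc;
//   }
//
// With AveragePool<float> this returns the mean of x[0..n); with
// MaxPool<float> it returns the maximum.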

template <typename T, class Context, typename PoolType>
bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto* Y = Output(0);
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, X.dim32(1));
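  // SetOutputSize computes the pooled spatial dimensions. For reference, the
  // conventional formula per axis is (assuming it matches ConvPoolOpBase):
  //   out = (in + pad_head + pad_tail - dilation * (kernel - 1) - 1) / stride + 1
  // e.g. in = 8, kernel = 2, stride = 2, pad = 0, dilation = 1 gives out = 4.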

  const float* Xdata = X.template data<float>();
  float* Ydata = Y->template mutable_data<float>();
  // The main loop
  int channels = X.dim32(1);
  int height = X.dim32(2);
  int width = kernel_.size() > 1 ? X.dim32(3) : 1;
  int depth = kernel_.size() > 2 ? X.dim32(4) : 1;
  int pooled_height = Y->dim32(2);
  int pooled_width = kernel_.size() > 1 ? Y->dim32(3) : 1;
  int pooled_depth = kernel_.size() > 2 ? Y->dim32(4) : 1;

  // We specialize certain variants on ARM for vectorization
  if (kernel_.size() == 2 &&
      PoolType::runSpecialized(
          X.dim32(0),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          Y->dim32(2),
          Y->dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l(),
          pad_b(),
          pad_r(),
          dilation_h(),
          dilation_w(),
          Xdata,
          Ydata)) {
    return true;
  }

  switch (kernel_.size()) {
    case 1:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            T Yh = PoolType::initialize();
            for (int h = hstart; h < hend; ++h) {
              PoolType::process(Xdata[h], Yh);
            }
            PoolType::finalize(hend - hstart, Yh);
            Ydata[ph] = Yh;
          }
          // Do offset.
          Xdata += height;
          Ydata += pooled_height;
        }
      }
      break;
    case 2:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              const int pool_index = ph * pooled_width + pw;
              T Yh = PoolType::initialize();
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  const int input_index = h * width + w;
                  PoolType::process(Xdata[input_index], Yh);
                }
              }
              PoolType::finalize((hend - hstart) * (wend - wstart), Yh);
              Ydata[pool_index] = Yh;
            }
          }
          // Do offset.
          Xdata += height * width;
          Ydata += pooled_height * pooled_width;
        }
      }
      break;
    case 3:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int c = 0; c < channels; ++c) {
          for (int ph = 0; ph < pooled_height; ++ph) {
            int hstart = ph * stride_h() - pad_t();
            int hend = min(hstart + kernel_h(), height);
            hstart = max(hstart, 0);
            for (int pw = 0; pw < pooled_width; ++pw) {
              int wstart = pw * stride_w() - pad_l();
              int wend = min(wstart + kernel_w(), width);
              wstart = max(wstart, 0);
              for (int pd = 0; pd < pooled_depth; ++pd) {
                int dstart = pd * stride_[2] - pads_[2];
                int dend = min(dstart + kernel_[2], depth);
                dstart = max(dstart, 0);
                const int pool_index =
                    ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
                T Yh = PoolType::initialize();
                for (int h = hstart; h < hend; ++h) {
                  for (int w = wstart; w < wend; ++w) {
                    for (int d = dstart; d < dend; ++d) {
                      const int input_index = h * width * depth + w * depth + d;
                      PoolType::process(Xdata[input_index], Yh);
                    }
                  }
                }
                PoolType::finalize(
                    (hend - hstart) * (wend - wstart) * (dend - dstart), Yh);
                Ydata[pool_index] = Yh;
              }
            }
          }
          // Do offset.
          Xdata += height * width * depth;
          Ydata += pooled_height * pooled_width * pooled_depth;
        }
      }
      break;
    default:
      CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
      return false;
  }
  return true;
}
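
// Index sketch (illustrative): in the NCHW loops above, Xdata and Ydata are
// advanced by one plane (height * width [* depth]) per channel, so the
// per-plane index h * width + w stays plane-local. For a 1x1x4x4 input with
// kernel 2 and stride 2, Ydata[0] pools X rows {0,1} x cols {0,1}, Ydata[1]
// pools rows {0,1} x cols {2,3}, and so on.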

template <typename T, class Context, typename PoolType>
bool PoolOp<T, Context, PoolType>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto* Y = Output(0);
  int height = X.dim32(1);
  int width = kernel_.size() > 1 ? X.dim32(2) : 1;
  int depth = kernel_.size() > 2 ? X.dim32(3) : 1;
  int channels = X.dim32(X.ndim() - 1);
  ConvPoolOpBase<Context>::SetOutputSize(X, Y, channels);

  EigenMatrixMap<float> Ymat(
      Y->template mutable_data<float>(), channels, Y->size() / channels);
  ConstEigenMatrixMap<float> Xmat(
      X.template data<float>(), channels, X.size() / channels);
  int pooled_height = Y->dim32(1);
  int pooled_width = kernel_.size() > 1 ? Y->dim32(2) : 1;
  int pooled_depth = kernel_.size() > 2 ? Y->dim32(3) : 1;
  // The main loop
  switch (kernel_.size()) {
    case 1:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int ph = 0; ph < pooled_height; ++ph) {
          int hstart = ph * stride_h() - pad_t();
          int hend = min(hstart + kernel_h(), height);
          hstart = max(hstart, 0);
          const int y_col = n * pooled_height + ph;
          Ymat.col(y_col).setConstant(PoolType::initialize());
          for (int h = hstart; h < hend; ++h) {
            const int x_col = n * height + h;
            PoolType::process(x_col, y_col, Xmat, Ymat);
          }
          PoolType::finalize((hend - hstart), y_col, Ymat);
        }
      }
      break;
    case 2:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int ph = 0; ph < pooled_height; ++ph) {
          int hstart = ph * stride_h() - pad_t();
          int hend = min(hstart + kernel_h(), height);
          hstart = max(hstart, 0);
          for (int pw = 0; pw < pooled_width; ++pw) {
            int wstart = pw * stride_w() - pad_l();
            int wend = min(wstart + kernel_w(), width);
            wstart = max(wstart, 0);
            const int y_col = (n * pooled_height + ph) * pooled_width + pw;
            Ymat.col(y_col).setConstant(PoolType::initialize());
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
                const int x_col = (n * height + h) * width + w;
                PoolType::process(x_col, y_col, Xmat, Ymat);
              }
            }
            PoolType::finalize((hend - hstart) * (wend - wstart), y_col, Ymat);
          }
        }
      }
      break;
    case 3:
      for (int n = 0; n < X.dim32(0); ++n) {
        for (int ph = 0; ph < pooled_height; ++ph) {
          int hstart = ph * stride_h() - pad_t();
          int hend = min(hstart + kernel_h(), height);
          hstart = max(hstart, 0);
          for (int pw = 0; pw < pooled_width; ++pw) {
            int wstart = pw * stride_w() - pad_l();
            int wend = min(wstart + kernel_w(), width);
            wstart = max(wstart, 0);
            for (int pd = 0; pd < pooled_depth; ++pd) {
              int dstart = pd * stride_[2] - pads_[2];
              int dend = min(dstart + kernel_[2], depth);
              dstart = max(dstart, 0);
              const int y_col = ((n * pooled_height + ph) * pooled_width + pw) *
                      pooled_depth +
                  pd;
              Ymat.col(y_col).setConstant(PoolType::initialize());
              for (int h = hstart; h < hend; ++h) {
                for (int w = wstart; w < wend; ++w) {
                  for (int d = dstart; d < dend; ++d) {
                    const int x_col =
                        ((n * height + h) * width + w) * depth + d;
                    PoolType::process(x_col, y_col, Xmat, Ymat);
                  }
                }
              }
              PoolType::finalize(
                  (hend - hstart) * (wend - wstart) * (dend - dstart),
                  y_col,
                  Ymat);
            }
          }
        }
      }
      break;
    default:
      CAFFE_THROW("Unsupported pooling size : ", kernel_.size());
      return false;
  }
  return true;
}
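
// Layout sketch (illustrative): in NHWC the tensor is viewed as a
// channels x (size / channels) Eigen matrix, so each column holds the C
// channel values of one spatial location. Pooling then becomes column-wise
// accumulation, y_mat.col(y_col) += x_mat.col(x_col) for average or
// cwiseMax for max, processing all channels of a window position at once.
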
const char* kAveragePoolDoc = R"DOC(
consumes an input blob X and applies average pooling across the
blob according to kernel sizes, stride sizes, and pad lengths defined by the
ConvPoolOpBase operator. Average pooling consists of averaging all values of a
subset of the input tensor according to the kernel size and downsampling the
data into the output blob Y for further processing.
)DOC";

const char* kMaxPoolDoc = R"DOC(
consumes an input blob X and applies max pooling across the
blob according to kernel sizes, stride sizes, and pad lengths defined by the
ConvPoolOpBase operator. Max pooling consists of taking the maximum value of a
subset of the input tensor according to the kernel size and downsampling the
data into the output blob Y for further processing.
)DOC";

std::function<void(OpSchema&)> AveragePoolDocGenerator(const char* dim) {
  return [=](OpSchema& schema) {
    string doc = "AveragePool{dim} {pool_doc}";
    ReplaceAll(doc, "{dim}", dim);
    ReplaceAll(doc, "{pool_doc}", kAveragePoolDoc);
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; dimensions depend on "
        "whether the NCHW or NHWC operators are being used. For example, in "
        "the former, the input has size (N x C x H x W), where N is the batch "
        "size, C is the number of channels, and H and W are the height and the "
        "width of the data. The corresponding permutation of dimensions is "
        "used in the latter case.");
    schema.Output(
        0,
        "Y",
        "Output data tensor from average pooling across the input "
        "tensor. Dimensions will vary based on various kernel, stride, and pad "
        "sizes.");
  };
}

std::function<void(OpSchema&)> MaxPoolDocGenerator(const char* dim) {
  return [=](OpSchema& schema) {
    string doc = "MaxPool{dim} {pool_doc}";
    ReplaceAll(doc, "{dim}", dim);
    ReplaceAll(doc, "{pool_doc}", kMaxPoolDoc);
    schema.SetDoc(doc);
    schema.Input(
        0,
        "X",
        "Input data tensor from the previous operator; dimensions depend on "
        "whether the NCHW or NHWC operators are being used. For example, in "
        "the former, the input has size (N x C x H x W), where N is the batch "
        "size, C is the number of channels, and H and W are the height and the "
        "width of the data. The corresponding permutation of dimensions is "
        "used in the latter case.");
    schema.Output(
        0,
        "Y",
        "Output data tensor from max pooling across the input "
        "tensor. Dimensions will vary based on various kernel, stride, and pad "
        "sizes.");
  };
}
REGISTER_CPU_OPERATOR(
    AveragePool,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator(""));

REGISTER_CPU_OPERATOR(
    AveragePool1D,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool1D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator("1D"));

REGISTER_CPU_OPERATOR(
    AveragePool2D,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool2D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator("2D"));

REGISTER_CPU_OPERATOR(
    AveragePool3D,
    PoolOp<float, CPUContext, AveragePool<float>>);

OPERATOR_SCHEMA(AveragePool3D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(AveragePoolDocGenerator("3D"));

REGISTER_CPU_OPERATOR(MaxPool, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator(""));

REGISTER_CPU_OPERATOR(MaxPool1D, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool1D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("1D"));

REGISTER_CPU_OPERATOR(MaxPool2D, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool2D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("2D"));

REGISTER_CPU_OPERATOR(MaxPool3D, PoolOp<float, CPUContext, MaxPool<float>>);

OPERATOR_SCHEMA(MaxPool3D)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction(
        ConvPoolOpBase<CPUContext>::TensorInferenceForPool)
    .FillUsing(MaxPoolDocGenerator("3D"));
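
// Example usage (a hedged sketch; the argument names assume the standard
// ConvPoolOpBase arguments "kernel" and "stride"):
//
//   op {
//     type: "MaxPool"
//     input: "X"
//     output: "Y"
//     arg { name: "kernel" i: 2 }
//     arg { name: "stride" i: 2 }
//   }
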
} // namespace caffe2