Caffe2 - C++ API
A deep learning, cross-platform ML framework
pool_dnnlowp_op_avx2.cc
1 #include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h"
2 
3 #include <immintrin.h>
4 #include <cmath>
5 
6 namespace caffe2 {
7 
8 using namespace std;
9 
// Max-pools a single image of a uint8 tensor using AVX2.
//
// Xdata and Ydata are indexed as [n][h][w][c] with channels innermost; only
// the image selected by `n` is read from Xdata and written to Ydata.
//
// For each output position (ph, pw) the pooling window starts at
// (ph * stride_h - pad_t, pw * stride_w - pad_l), spans kernel_h x kernel_w
// input positions, and is clipped to [0, height) x [0, width). Every output
// channel receives the largest input byte inside the clipped window. A window
// that is empty after clipping produces 0, because the running maximum starts
// at zero (the smallest unsigned byte value).
//
// Channels are processed 32 at a time with 256-bit unsigned-byte max using
// unaligned loads and stores; the trailing (channels % 32) channels are
// handled by a scalar loop.
void max_pool_avx2(
    const uint8_t* Xdata,
    int n,
    int height,
    int width,
    int channels,
    int pooled_height,
    int pooled_width,
    int kernel_h,
    int kernel_w,
    int stride_h,
    int stride_w,
    int pad_t,
    int pad_l,
    uint8_t* Ydata) {
  constexpr int VLEN = 32;

  // All element offsets are computed in 64 bits. Products such as
  // n * height * width * channels can exceed INT_MAX for large tensors, and
  // signed int overflow is undefined behavior.
  const int64_t in_row_size = static_cast<int64_t>(width) * channels;
  const int64_t in_image_size = in_row_size * height;
  const int64_t out_image_size =
      static_cast<int64_t>(pooled_height) * pooled_width * channels;

  const uint8_t* Xdata_temp = Xdata + n * in_image_size;
  uint8_t* Ydata_temp = Ydata + n * out_image_size;

  // Number of channels covered by whole 32-byte vectors.
  const int vec_channels = channels / VLEN * VLEN;

  for (int ph = 0; ph < pooled_height; ++ph) {
    int hstart = ph * stride_h - pad_t;
    const int hend = hstart + kernel_h < height ? hstart + kernel_h : height;
    hstart = hstart > 0 ? hstart : 0;

    for (int pw = 0; pw < pooled_width; ++pw) {
      int wstart = pw * stride_w - pad_l;
      const int wend = wstart + kernel_w < width ? wstart + kernel_w : width;
      wstart = wstart > 0 ? wstart : 0;

      uint8_t* Yh = Ydata_temp +
          (static_cast<int64_t>(ph) * pooled_width + pw) * channels;

      // vectorized loop: 32 channels per iteration
      for (int c = 0; c < vec_channels; c += VLEN) {
        __m256i Y_v = _mm256_setzero_si256();
        for (int h = hstart; h < hend; ++h) {
          const uint8_t* Xrow = Xdata_temp + h * in_row_size;
          for (int w = wstart; w < wend; ++w) {
            const uint8_t* Xpix =
                Xrow + static_cast<int64_t>(w) * channels + c;
            Y_v = _mm256_max_epu8(
                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(Xpix)),
                Y_v);
          }
        }
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(Yh + c), Y_v);
      }

      // remainder: start the tail channels at zero, then fold in the window
      for (int c = vec_channels; c < channels; ++c) {
        Yh[c] = 0;
      }
      for (int h = hstart; h < hend; ++h) {
        const uint8_t* Xrow = Xdata_temp + h * in_row_size;
        for (int w = wstart; w < wend; ++w) {
          const uint8_t* Xpix = Xrow + static_cast<int64_t>(w) * channels;
          for (int c = vec_channels; c < channels; ++c) {
            Yh[c] = Xpix[c] > Yh[c] ? Xpix[c] : Yh[c];
          }
        }
      }
    } // pw loop
  } // ph loop
}
69 
70 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13