1 #include "caffe2/quantization/server/pool_dnnlowp_op_avx2.h" 25 const uint8_t* Xdata_temp = Xdata + n * height * width * channels;
26 uint8_t* Ydata_temp = Ydata + n * pooled_height * pooled_width * channels;
27 for (
int ph = 0; ph < pooled_height; ++ph) {
28 int hstart = ph * stride_h - pad_t;
29 int hend = hstart + kernel_h < height ? hstart + kernel_h : height;
30 hstart = hstart > 0 ? hstart : 0;
31 for (
int pw = 0; pw < pooled_width; ++pw) {
32 int wstart = pw * stride_w - pad_l;
33 int wend = wstart + kernel_w < width ? wstart + kernel_w : width;
34 wstart = wstart > 0 ? wstart : 0;
36 uint8_t* Yh = Ydata_temp + (ph * pooled_width + pw) * channels;
37 constexpr
int VLEN = 32;
39 for (
int c = 0; c < channels / VLEN * VLEN; c += VLEN) {
40 __m256i Y_v = _mm256_setzero_si256();
41 for (
int h = hstart; h < hend; ++h) {
42 for (
int w = wstart; w < wend; ++w) {
43 const int input_idx = (h * width + w) * channels + c;
44 Y_v = _mm256_max_epu8(
46 reinterpret_cast<const __m256i*>(Xdata_temp + input_idx)),
50 _mm256_storeu_si256(reinterpret_cast<__m256i*>(Yh + c), Y_v);
54 for (
int c = channels / VLEN * VLEN; c < channels; ++c) {
57 for (
int h = hstart; h < hend; ++h) {
58 for (
int w = wstart; w < wend; ++w) {
59 for (
int c = channels / VLEN * VLEN; c < channels; ++c) {
60 const int input_idx = (h * width + w) * channels + c;
62 Xdata_temp[input_idx] > Yh[c] ? Xdata_temp[input_idx] : Yh[c];
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...