Caffe2 - C++ API
A deep learning, cross platform ML framework
relu_dnnlowp_op_avx2.cc
1 #include <algorithm>
2 #include <cstdint>
3 
4 #include <immintrin.h>
5 
6 namespace caffe2 {
7 
8 namespace internal {
9 
10 template <typename T>
11 void ReluAVX2(const int N, const int zero_point, const T* X, T* Y);
12 
13 template <>
14 void ReluAVX2<uint8_t>(
15  const int N,
16  const int zero_point,
17  const uint8_t* X,
18  uint8_t* Y) {
19  constexpr int kVLen = 32;
20  const int n = N / kVLen * kVLen;
21  const int r = N % kVLen;
22  const __m256i zero_v = _mm256_set1_epi8(static_cast<uint8_t>(zero_point));
23  for (int i = 0; i < n; i += kVLen) {
24  __m256i cur_v = _mm256_max_epu8(
25  _mm256_loadu_si256(reinterpret_cast<const __m256i*>(X + i)), zero_v);
26  _mm256_storeu_si256(reinterpret_cast<__m256i*>(Y + i), cur_v);
27  }
28  for (int i = 0; i < r; ++i) {
29  Y[n + i] = std::max(X[n + i], static_cast<uint8_t>(zero_point));
30  }
31 }
32 
33 template <>
34 void ReluAVX2<uint16_t>(
35  const int N,
36  const int zero_point,
37  const uint16_t* X,
38  uint16_t* Y) {
39  constexpr int kVLen = 16;
40  const int n = N / kVLen * kVLen;
41  const int r = N % kVLen;
42  const __m256i zero_v = _mm256_set1_epi16(static_cast<uint16_t>(zero_point));
43  for (int i = 0; i < n; i += kVLen) {
44  __m256i cur_v = _mm256_max_epu16(
45  _mm256_loadu_si256(reinterpret_cast<const __m256i*>(X + i)), zero_v);
46  _mm256_storeu_si256(reinterpret_cast<__m256i*>(Y + i), cur_v);
47  }
48  for (int i = 0; i < r; ++i) {
49  Y[n + i] = std::max(X[n + i], static_cast<uint16_t>(zero_point));
50  }
51 }
52 
53 } // namespace internal
54 
55 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13