Caffe2 - C++ API
A deep learning, cross platform ML framework
fully_connected_fake_lowp_op_avx2.cc
#include <immintrin.h>

#include <cstddef>
#include <cstring>
2 
3 namespace caffe2 {
4 
// Truncates every fp32 value to bfloat16 precision (1 sign, 8 exponent,
// 7 mantissa bits) by zeroing the low 16 bits of its encoding. The output is
// still stored as fp32 and no rounding is performed.
//
// source: input array holding `size` floats.
// size:   number of elements to process; 0 is a no-op.
// dest:   output array with room for `size` floats.
void fp32_to_bfp16(const float* source, size_t size, float* dest) {
  // Keeps the sign, the exponent and the top 7 mantissa bits.
  constexpr int mask = static_cast<int>(0xFFFF0000u);
  // Build the mask in the integer domain and reinterpret the register rather
  // than reading an int object through a float pointer.
  const __m256 wmask = _mm256_castsi256_ps(_mm256_set1_epi32(mask));

  // A size_t index is used on purpose: an int counter would overflow once
  // size exceeds INT_MAX.
  const size_t vec_end = size - (size % 8);
  for (size_t i = 0; i < vec_end; i += 8) {
    const __m256 data = _mm256_loadu_ps(&source[i]);
    _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data));
  }

  // Remaining (size % 8) elements are handled one lane at a time.
  const __m128 tail_mask = _mm256_castps256_ps128(wmask);
  for (size_t i = vec_end; i < size; ++i) {
    _mm_store_ss(&dest[i], _mm_and_ps(tail_mask, _mm_load_ss(&source[i])));
  }
}
22 
// Truncates every fp32 value to a 24-bit format (1 sign, 8 exponent,
// 15 mantissa bits) by zeroing the low 8 bits of its encoding. The output is
// still stored as fp32 and no rounding is performed.
//
// source: input array holding `size` floats.
// size:   number of elements to process; 0 is a no-op.
// dest:   output array with room for `size` floats.
void fp32_to_bfp24(const float* source, size_t size, float* dest) {
  // Keeps the sign, the exponent and the top 15 mantissa bits.
  constexpr int mask = static_cast<int>(0xFFFFFF00u);
  // Build the mask in the integer domain and reinterpret the register rather
  // than reading an int object through a float pointer.
  const __m256 wmask = _mm256_castsi256_ps(_mm256_set1_epi32(mask));

  // A size_t index is used on purpose: an int counter would overflow once
  // size exceeds INT_MAX.
  const size_t vec_end = size - (size % 8);
  for (size_t i = 0; i < vec_end; i += 8) {
    const __m256 data = _mm256_loadu_ps(&source[i]);
    _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data));
  }

  // Remaining (size % 8) elements are handled one lane at a time.
  const __m128 tail_mask = _mm256_castps256_ps128(wmask);
  for (size_t i = vec_end; i < size; ++i) {
    _mm_store_ss(&dest[i], _mm_and_ps(tail_mask, _mm_load_ss(&source[i])));
  }
}
40 
// Truncates every fp32 value to a 14-bit format (1 sign, 8 exponent,
// 5 mantissa bits) by zeroing the low 18 bits of its encoding. The output is
// still stored as fp32 and no rounding is performed.
//
// source: input array holding `size` floats.
// size:   number of elements to process; 0 is a no-op.
// dest:   output array with room for `size` floats.
void fp32_to_bfp14(const float* source, size_t size, float* dest) {
  // Keeps the sign, the exponent and the top 5 mantissa bits.
  constexpr int mask = static_cast<int>(0xFFFC0000u);
  // Build the mask in the integer domain and reinterpret the register rather
  // than casting away const and reading an int through a float pointer.
  const __m256 wmask = _mm256_castsi256_ps(_mm256_set1_epi32(mask));

  // A size_t index is used on purpose: an int counter would overflow once
  // size exceeds INT_MAX.
  const size_t vec_end = size - (size % 8);
  for (size_t i = 0; i < vec_end; i += 8) {
    const __m256 data = _mm256_loadu_ps(&source[i]);
    _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data));
  }

  // Remaining (size % 8) elements are handled one lane at a time.
  const __m128 tail_mask = _mm256_castps256_ps128(wmask);
  for (size_t i = vec_end; i < size; ++i) {
    _mm_store_ss(&dest[i], _mm_and_ps(tail_mask, _mm_load_ss(&source[i])));
  }
}
58 
// Scalar (non-SIMD) version of the bfloat16 truncation: zeroes the low 16 bits
// of every fp32 encoding, leaving 1 sign, 8 exponent and 7 mantissa bits.
// No rounding is performed.
//
// source: input array holding `size` floats.
// size:   number of elements to process; 0 is a no-op.
// dest:   output array with room for `size` floats.
void fp32_to_bfp16_scalar(const float* source, size_t size, float* dest) {
  constexpr unsigned int mask = 0xFFFF0000u;
  static_assert(
      sizeof(unsigned int) == sizeof(float),
      "bit masking assumes a 32-bit float and a 32-bit unsigned int");

  for (size_t i = 0; i < size; ++i) {
    // memcpy moves the bit pattern without dereferencing a float object
    // through an integer pointer (and without casting away const).
    unsigned int bits;
    std::memcpy(&bits, &source[i], sizeof(bits));
    bits &= mask;
    std::memcpy(&dest[i], &bits, sizeof(bits));
  }
}
65 
// Rounds every fp32 value to IEEE half precision and widens it back to fp32,
// so `dest` holds fp32 values that are exactly representable as fp16.
// Requires F16C support (the cvtps_ph / cvtph_ps intrinsics).
//
// source: input array holding `size` floats.
// size:   number of elements to process; 0 is a no-op.
// dest:   output array with room for `size` floats.
void fp32_to_fp16(const float* source, size_t size, float* dest) {
  // A size_t index is used on purpose: an int counter would overflow once
  // size exceeds INT_MAX.
  const size_t vec_end = size - (size % 8);
  for (size_t i = 0; i < vec_end; i += 8) {
    // Rounding immediate 0 selects round-to-nearest-even.
    const __m128i half = _mm256_cvtps_ph(_mm256_loadu_ps(&source[i]), 0);
    _mm256_storeu_ps(&dest[i], _mm256_cvtph_ps(half));
  }

  // Remaining (size % 8) elements: convert lane 0 of a 128-bit register,
  // using the same rounding immediate as the vector loop.
  for (size_t i = vec_end; i < size; ++i) {
    const __m128i half = _mm_cvtps_ph(_mm_load_ss(&source[i]), 0);
    _mm_store_ss(&dest[i], _mm_cvtph_ps(half));
  }
}
79 
// Converts every fp32 value to bfloat16 precision with rounding: the raw
// 32-bit encoding is treated as an integer, 1 << 15 (half of the discarded
// range) is added, and the low 16 bits are cleared. The output is still
// stored as fp32. Requires AVX2 for the 256-bit integer add.
//
// NOTE: NaN and infinity get no special treatment; the integer add can carry
// into the exponent (and, for NaN payloads with the high mantissa bits set,
// into the sign bit).
//
// source: input array holding `size` floats.
// size:   number of elements to process; 0 is a no-op.
// dest:   output array with room for `size` floats.
void fp32_to_bfp16_round(const float* source, size_t size, float* dest) {
  constexpr int offset = 0x00008000; // 1 << 15
  constexpr int mask = static_cast<int>(0xFFFF0000u);

  const __m256i woffset = _mm256_set1_epi32(offset);
  const __m256i wmask = _mm256_set1_epi32(mask);

  // A size_t index is used on purpose: an int counter would overflow once
  // size exceeds INT_MAX.
  const size_t vec_end = size - (size % 8);
  for (size_t i = 0; i < vec_end; i += 8) {
    const __m256i bits =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&source[i]));
    const __m256i rounded = _mm256_add_epi32(bits, woffset);
    _mm256_storeu_si256(
        reinterpret_cast<__m256i*>(&dest[i]), _mm256_and_si256(wmask, rounded));
  }

  // Remaining (size % 8) elements go through lane 0 of a 128-bit register.
  // This replaces an aligned 256-bit store into a buffer that was only
  // 8-byte aligned, and avoids reading a float through an int pointer.
  const __m128i tail_offset = _mm256_castsi256_si128(woffset);
  const __m128i tail_mask = _mm256_castsi256_si128(wmask);
  for (size_t i = vec_end; i < size; ++i) {
    const __m128i bits = _mm_castps_si128(_mm_load_ss(&source[i]));
    const __m128i rounded = _mm_add_epi32(bits, tail_offset);
    _mm_store_ss(
        &dest[i], _mm_castsi128_ps(_mm_and_si128(tail_mask, rounded)));
  }
}
104 
105 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13