// Truncates every fp32 value in `source` to 16 significant bits and writes the
// result, still stored as fp32, to `dest`.
//
// Only the upper 16 bits of each 32-bit pattern survive (sign, 8 exponent bits
// and the top 7 mantissa bits); the low 16 bits are cleared, so the value is
// rounded toward zero.
//
//   source  input array of `size` floats (no alignment requirement)
//   size    number of elements to convert; 0 is a no-op
//   dest    output array of `size` floats; may be the same array as `source`
void fp32_to_bfp16(const float* source, size_t size, float* dest) {
  // Build the mask in the integer domain and bit-cast the register, instead of
  // reading an int object through a float pointer.
  constexpr int kMask = static_cast<int>(0xFFFF0000u);
  const __m256 wmask = _mm256_castsi256_ps(_mm256_set1_epi32(kMask));

  // The index is size_t: an int index would overflow for sizes above INT_MAX.
  const size_t vec_end = size - (size % 8);
  for (size_t i = 0; i < vec_end; i += 8) {
    const __m256 data = _mm256_loadu_ps(source + i);
    _mm256_storeu_ps(dest + i, _mm256_and_ps(wmask, data));
  }

  // Remaining (size % 8) elements: broadcast one value, mask it, and take
  // lane 0 of the result.
  for (size_t i = vec_end; i < size; ++i) {
    const __m256 data = _mm256_and_ps(wmask, _mm256_set1_ps(source[i]));
    dest[i] = _mm_cvtss_f32(_mm256_castps256_ps128(data));
  }
}
// Truncates every fp32 value in `source` to 24 significant bits and writes the
// result, still stored as fp32, to `dest`.
//
// Only the upper 24 bits of each 32-bit pattern survive (sign, 8 exponent bits
// and the top 15 mantissa bits); the low 8 bits are cleared, so the value is
// rounded toward zero.
//
//   source  input array of `size` floats (no alignment requirement)
//   size    number of elements to convert; 0 is a no-op
//   dest    output array of `size` floats; may be the same array as `source`
void fp32_to_bfp24(const float* source, size_t size, float* dest) {
  // Build the mask in the integer domain and bit-cast the register, instead of
  // reading an int object through a float pointer.
  constexpr int kMask = static_cast<int>(0xFFFFFF00u);
  const __m256 wmask = _mm256_castsi256_ps(_mm256_set1_epi32(kMask));

  // The index is size_t: an int index would overflow for sizes above INT_MAX.
  const size_t vec_end = size - (size % 8);
  for (size_t i = 0; i < vec_end; i += 8) {
    const __m256 data = _mm256_loadu_ps(source + i);
    _mm256_storeu_ps(dest + i, _mm256_and_ps(wmask, data));
  }

  // Remaining (size % 8) elements: broadcast one value, mask it, and take
  // lane 0 of the result.
  for (size_t i = vec_end; i < size; ++i) {
    const __m256 data = _mm256_and_ps(wmask, _mm256_set1_ps(source[i]));
    dest[i] = _mm_cvtss_f32(_mm256_castps256_ps128(data));
  }
}
42 void fp32_to_bfp14(
const float* source,
size_t size,
float* dest) {
44 constexpr
int mask = 0xFFFC0000;
45 __m256 wmask = _mm256_broadcast_ss((
float*)(&mask));
47 for (
auto i = 0; i < (size / 8) * 8; i += 8) {
48 __m256 data = _mm256_loadu_ps(&source[i]);
49 _mm256_storeu_ps(&dest[i], _mm256_and_ps(wmask, data));
51 for (
auto i = (size / 8) * 8; i < size; i++) {
52 alignas(64)
float tmp[8];
53 __m256 data = _mm256_and_ps(wmask, _mm256_set1_ps(source[i]));
54 _mm256_store_ps(tmp, data);
// Scalar version of the 16-bit truncation: for each element, keeps the upper
// 16 bits of the fp32 bit pattern (sign, 8 exponent bits, top 7 mantissa bits)
// and clears the low 16 bits, i.e. rounds toward zero.
//
//   source  input array of `size` floats
//   size    number of elements to convert; 0 is a no-op
//   dest    output array of `size` floats; may be the same array as `source`
void fp32_to_bfp16_scalar(const float* source, size_t size, float* dest) {
  constexpr int mask = static_cast<int>(0xFFFF0000u);
  // The index is size_t: an int index would overflow for sizes above INT_MAX.
  for (size_t i = 0; i < size; ++i) {
    // NOTE(review): accessing float storage through an int pointer is a
    // strict-aliasing pun kept from the original. std::memcpy (or
    // std::bit_cast) is the conforming form but needs an extra include.
    const int bits = *reinterpret_cast<const int*>(source + i);
    *reinterpret_cast<int*>(dest + i) = bits & mask;
  }
}
// Rounds every fp32 value in `source` to half precision and back, writing the
// result (stored as fp32) to `dest`.
//
// Each value goes through _mm256_cvtps_ph with rounding-control immediate 0
// (round to nearest, ties to even) and is widened again with _mm256_cvtph_ps,
// so the output holds exactly the values representable in fp16.
//
//   source  input array of `size` floats (no alignment requirement)
//   size    number of elements to convert; 0 is a no-op
//   dest    output array of `size` floats; may be the same array as `source`
void fp32_to_fp16(const float* source, size_t size, float* dest) {
  // The index is size_t: an int index would overflow for sizes above INT_MAX.
  const size_t vec_end = size - (size % 8);
  for (size_t i = 0; i < vec_end; i += 8) {
    const __m128i half = _mm256_cvtps_ph(_mm256_loadu_ps(source + i), 0);
    _mm256_storeu_ps(dest + i, _mm256_cvtph_ps(half));
  }

  // Remaining (size % 8) elements: broadcast one value, convert the full
  // register, and take lane 0 of the widened result.
  for (size_t i = vec_end; i < size; ++i) {
    const __m128i half = _mm256_cvtps_ph(_mm256_set1_ps(source[i]), 0);
    dest[i] = _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_cvtph_ps(half)));
  }
}
// Rounds every fp32 value in `source` to 16 significant bits and writes the
// result, still stored as fp32, to `dest`.
//
// The rounding is done on the raw 32-bit pattern: 0x8000 (half of the lowest
// kept bit) is added as an integer, then the low 16 bits are cleared. For
// finite values this rounds the magnitude to the nearest 16-bit pattern, with
// ties going away from zero; a carry out of the mantissa moves into the
// exponent field. There is no special handling of NaN or infinity.
//
//   source  input array of `size` floats (no alignment requirement)
//   size    number of elements to convert; 0 is a no-op
//   dest    output array of `size` floats; may be the same array as `source`
void fp32_to_bfp16_round(const float* source, size_t size, float* dest) {
  constexpr int offset = 0x00008000;
  constexpr int mask = static_cast<int>(0xFFFF0000u);

  const __m256i woffset = _mm256_set1_epi32(offset);
  const __m256i wmask = _mm256_set1_epi32(mask);

  // The index is size_t: an int index would overflow for sizes above INT_MAX.
  const size_t vec_end = size - (size % 8);
  for (size_t i = 0; i < vec_end; i += 8) {
    const __m256i biased = _mm256_add_epi32(
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(source + i)),
        woffset);
    _mm256_storeu_si256(
        reinterpret_cast<__m256i*>(dest + i), _mm256_and_si256(wmask, biased));
  }

  // Remaining (size % 8) elements: broadcast one value, reinterpret the
  // register as integers (no pointer pun, no temporary buffer whose alignment
  // has to match the store), round, and take lane 0 of the result.
  for (size_t i = vec_end; i < size; ++i) {
    const __m256i biased = _mm256_add_epi32(
        _mm256_castps_si256(_mm256_set1_ps(source[i])), woffset);
    const __m256 rounded =
        _mm256_castsi256_ps(_mm256_and_si256(wmask, biased));
    dest[i] = _mm_cvtss_f32(_mm256_castps256_ps128(rounded));
  }
}
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...