// Small positive constant. In quantize_and_compress__base it is added to the
// quantization step (`gap`) before the step is inverted, so the reciprocal
// stays finite when the step is zero (maximum == minimum).
19 static constexpr
double QEPSILON = 1e-8;
// Quantizes `input_data` to `bitwidth`-bit codes and packs them into
// `output_data`.
//
// Output layout written by the visible code:
//   output_data[0]            bitwidth
//   output_data[1]            tail (number of unused value slots, see below)
//   two floats from offset 2  minimum and maximum input element
//   output_data[10 + i]       packed payload
//
// NOTE(review): this listing is incomplete. The parameter lines between
// `input_data` and `random_buffer` are missing; the body uses `output_data`,
// `input_size` and `bitwidth`, which are presumably declared there. The
// statement that selects between the two packing loops below is also missing.
21 void quantize_and_compress__base(
22 const float* input_data,
27 const float* random_buffer) {
// Number of quantized values that fit into one payload byte.
28 uint64_t data_per_byte = 8 / bitwidth;
// tail = number of slots needed to pad input_size up to a multiple of
// data_per_byte (0 when it already is a multiple).
29 uint64_t tail = input_size % data_per_byte;
30 tail = tail ? data_per_byte - tail : 0;
// ceil(input_size / data_per_byte); used as the stride of the packing loops.
31 uint64_t segment_size = (input_size + data_per_byte - 1) / data_per_byte;
// Scan for the value range of the input.
// NOTE(review): the left-hand sides of the two conditional expressions inside
// the loop are not visible here; presumably they assign minimum_element and
// maximum_element respectively. `auto i = 0` deduces `int`.
34 float minimum_element = INFINITY, maximum_element = -INFINITY;
35 for (
auto i = 0; i < input_size; ++i) {
37 input_data[i] < minimum_element ? input_data[i] : minimum_element;
39 input_data[i] > maximum_element ? input_data[i] : maximum_element;
// Header: bitwidth, tail, then min/max stored as floats starting at offset 2.
// NOTE(review): the floats are written through a reinterpret_cast'ed pointer
// at offset 2; whether that address is suitably aligned depends on the
// caller's buffer - confirm, or consider memcpy.
41 output_data[0] = bitwidth;
42 output_data[1] = tail;
43 reinterpret_cast<float*
>(output_data + 2)[0] = minimum_element;
44 reinterpret_cast<float*
>(output_data + 2)[1] = maximum_element;
// gap is the width of one quantization level; gap_inverse maps a value offset
// from the minimum to a (fractional) level index. QEPSILON keeps the division
// finite when gap is 0.
46 float gap = (maximum_element - minimum_element) / ((1 << bitwidth) - 1.0f);
47 float gap_inverse = 1. / (gap + QEPSILON);
// Largest representable code for this bitwidth.
48 uint8_t max_q = (1 << bitwidth) - 1;
// Bit offset inside each payload byte at which the current codes are placed.
49 uint64_t bit_start = 0;
// First packing loop: rounding uses random_buffer.
// The input is walked in chunks of segment_size; element start + i is OR-ed
// into payload byte 10 + i at bit offset bit_start.
// NOTE(review): the tail of the `stride` expression and the declaration of
// `i` are missing from this listing.
51 for (
int start = 0; start < input_size; start += segment_size) {
52 uint64_t stride = start + segment_size <= input_size ? segment_size
55 for (; i < stride; ++i) {
56 float fval = input_data[start + i];
57 float thetimes = (fval - minimum_element) * gap_inverse;
// Adds the per-element value from random_buffer before flooring.
// NOTE(review): presumably random_buffer holds values in [0, 1), making this
// stochastic rounding - confirm against the caller.
58 float rounded = floor(thetimes + random_buffer[start + i]);
// Clamp to [0, max_q] (the middle operand of this conditional is missing from
// the listing).
59 rounded = rounded < static_cast<float>(max_q)
61 : static_cast<float>(max_q);
62 rounded = rounded > 0.0f ? rounded : 0.0f;
63 uint8_t qval = rounded;
// The existing byte content is read and OR-ed with the shifted code.
// NOTE(review): this presumably relies on the payload bytes being zero before
// the call - confirm.
65 uint8_t orval = output_data[10 + i];
66 output_data[10 + i] = orval |
static_cast<uint8_t
>(qval << bit_start);
// NOTE(review): presumably executed once per chunk, after the inner loop; the
// inner loop's closing brace is not visible in this listing.
68 bit_start += bitwidth;
// Second packing loop: same traversal and packing, but the level index is
// clamped to [0, max_q] first and then rounded with nearbyint; random_buffer
// is not read.
71 for (
int start = 0; start < input_size; start += segment_size) {
72 uint64_t stride = start + segment_size <= input_size ? segment_size
75 for (; i < stride; ++i) {
76 float fval = input_data[start + i];
77 float thetimes = (fval - minimum_element) * gap_inverse;
78 thetimes = thetimes < static_cast<float>(max_q)
80 : static_cast<float>(max_q);
81 thetimes = thetimes > 0.0f ? thetimes : 0.0f;
82 uint8_t qval = nearbyint(thetimes);
84 uint8_t orval = output_data[10 + i];
85 output_data[10 + i] = orval |
static_cast<uint8_t
>(qval << bit_start);
87 bit_start += bitwidth;
// Declares quantize_and_compress__avx2 with the same function type as
// quantize_and_compress__base. No definition appears in this listing.
92 decltype(quantize_and_compress__base) quantize_and_compress__avx2;
// Entry point for quantization.
// NOTE(review): most of this definition is missing from the listing. Only the
// first and last parameters and two argument-list fragments that begin with
// `quantize_and_compress,` are visible. By analogy with
// decompress_and_dequantize, the fragments presumably belong to an AVX2_DO(...)
// call followed by a BASE_DO(...) call - confirm against the full file.
93 void quantize_and_compress(
94 const
float* input_data,
99 const
float* random_buffer) {
101 quantize_and_compress,
109 quantize_and_compress,
// Unpacks a buffer with the layout written by quantize_and_compress__base and
// writes the dequantized floats to `output_data`.
//
// Input layout read by the visible code:
//   input_data[0]             bitwidth
//   input_data[1]             tail
//   two floats from offset 2  minimum and maximum element
//   input_data[10 + i]        packed payload
//
// `input_size` is the total size of input_data in bytes, including the
// 10-byte header (the payload length is computed as input_size - 10).
//
// NOTE(review): this listing is incomplete. The parameter line between
// `input_data` and `input_size` is missing (presumably the `output_data`
// pointer used below), as are the declaration of `gap` and of the inner loop
// index `i`.
118 void decompress_and_dequantize__base(
119 const uint8_t* input_data,
121 uint64_t input_size) {
// Value range stored in the header, read through a reinterpret_cast'ed pointer
// at byte offset 2.
// NOTE(review): alignment of that address depends on the caller's buffer -
// confirm, or consider memcpy.
123 const float minimum_element =
124 reinterpret_cast<const float*
>(input_data + 2)[0];
125 const float maximum_element =
126 reinterpret_cast<const float*
>(input_data + 2)[1];
127 const uint64_t bitwidth = input_data[0];
// Width of one quantization level plus a further addend.
// NOTE(review): the line naming the variable and the term after the trailing
// `+` are missing from the listing; the result is presumably the `gap` used
// below.
129 (maximum_element - minimum_element) / ((1 << bitwidth) - 1.f) +
132 const uint64_t tail = input_data[1];
// Payload bytes times values-per-byte, minus the padding slots.
// NOTE(review): no check is visible for input_size < 10 (the unsigned
// subtraction would wrap) or for bitwidth == 0 (division by zero).
134 const uint64_t output_size = (input_size - 10) * (8 / bitwidth) - tail;
// Bit offset inside each payload byte from which the current codes are read.
136 uint64_t bit_start = 0;
// Number of payload bytes; also the stride of the outer loop.
137 const uint64_t segment_size = input_size - 10;
// Each outer iteration reads one bitwidth-wide field from the payload bytes
// and fills up to segment_size consecutive outputs starting at `start`.
// NOTE(review): `start` is a signed int compared against uint64_t values.
138 for (
int start = 0; start < output_size; start += segment_size) {
// The last chunk may be shorter than segment_size.
139 uint64_t stride = start + segment_size <= output_size ? segment_size
140 : output_size - start;
// Mask with the low `bitwidth` bits set.
141 uint8_t mask = (1 << bitwidth) - 1;
143 for (; i < stride; ++i) {
// code * gap + minimum maps the code back into the original value range.
144 output_data[start + i] =
145 ((input_data[10 + i] >> bit_start) & mask) * gap + minimum_element;
// NOTE(review): presumably executed once per chunk, after the inner loop; the
// inner loop's closing brace is not visible in this listing.
147 bit_start += bitwidth;
// Declares decompress_and_dequantize__avx2 with the same function type as
// decompress_and_dequantize__base. No definition appears in this listing.
151 decltype(decompress_and_dequantize__base) decompress_and_dequantize__avx2;
// Entry point for decompression. Invokes the AVX2_DO macro and then the
// BASE_DO macro with the same function-name stem and arguments.
// NOTE(review): both macros are defined outside this listing. Presumably
// AVX2_DO calls the __avx2 variant and returns when AVX2 is usable, and BASE_DO
// falls back to the __base variant - confirm. The parameter line between
// `input_data` and `input_size` (presumably `output_data`) is missing here.
152 void decompress_and_dequantize(
153 const uint8_t* input_data,
155 uint64_t input_size) {
156 AVX2_DO(decompress_and_dequantize, input_data, output_data, input_size);
157 BASE_DO(decompress_and_dequantize, input_data, output_data, input_size);
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...