1 #include "caffe2/operators/fused_rowwise_random_quantization_ops.h" 2 #include <c10/util/Registry.h> 3 #include "caffe2/utils/math.h" 7 #define IS_LITTLE_ENDIAN \ 9 const int32_t kValue = 1; \ 10 return reinterpret_cast<const uint8_t*>(&kValue)[0] == 1; \ 13 template <
class Context>
14 bool FloatToFusedRandRowwiseQuantizedOp<Context>::RunOnDevice() {
15 CAFFE_ENFORCE(IS_LITTLE_ENDIAN,
"Unsupported endianness");
17 const auto& input = Input(DATA_FLOAT);
22 "Expect input to be a matrix. Reshape the input tensor to a matrix for usage.");
24 const auto input_rows = input.size(0);
25 const auto input_columns = input.size(1);
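  // Each output row is a 10-byte header followed by the packed quantized
  // values:
  //   |bitwidth|tail|min|max|data...
  //    1 B      1 B  4 B  4 B
  // One data byte holds 8/bitwidth quantized values ("buckets"); the
  // trailing `tail` buckets of a row are unused.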
  size_t data_per_byte = 8 / bitwidth_;
  // Bytes needed to store one row of packed quantized data.
  size_t segment_size = (input_columns + data_per_byte - 1) / data_per_byte;
  const std::vector<int64_t> output_dimensions = {
      input_rows, 10 + static_cast<int64_t>(segment_size)};
  auto* output =
      Output(DATA_FUSED_QUANTIZED, output_dimensions, at::dtype<uint8_t>());

  const auto* input_data = input.template data<float>();
  auto* output_data = output->template mutable_data<uint8_t>();
  const size_t output_columns = static_cast<size_t>(output->size(1));
  // Zero-fill so unused tail buckets in the last data bytes stay zero.
  memset(output_data, 0, output->numel());
  if (random_) {
    random_buffer_.resize(input_columns);
  }

  for (size_t row = 0; row < input_rows; ++row) {
    if (random_) {
#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
      int status = vsRngUniform(
          VSL_RNG_METHOD_UNIFORM_STD, vslStream_, input_columns,
          random_buffer_.data(), 0.0f, 1.0f);
      if (status != VSL_ERROR_OK) {
        LOG(WARNING) << "vsRngUniform returns " << status;
      }
#else
      // Fall back to the standard-library RNG when MKL is unavailable.
      for (int i = 0; i < input_columns; ++i) {
        random_buffer_[i] = (*dis_)(gen_);
      }
#endif
    }

    math::quantize_and_compress(
        input_data + row * input_columns,
        output_data + row * output_columns,
        input_columns, bitwidth_, random_, random_buffer_.data());
  }

  return true;
}
template <class Context>
bool FusedRandRowwiseQuantizedToFloatOp<Context>::RunOnDevice() {
  CAFFE_ENFORCE(IS_LITTLE_ENDIAN, "Unsupported endianness");

  const auto& input = Input(DATA_FUSED_QUANTIZED);

  CAFFE_ENFORCE_EQ(input.dim(), 2, "Expect input to be a matrix.");
  CAFFE_ENFORCE_GE(
      input.numel(), 4, "Expect input to have size greater than or equal to 4.");
  const auto input_rows = input.size(0);
  const auto input_columns = input.size(1);
  const auto* input_data = input.template data<uint8_t>();
  const size_t bitwidth = input_data[0];
  CAFFE_ENFORCE(
      bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
      "Unsupported bitwidth");
  const size_t tail = input_data[1];
  const size_t output_columns = (input_columns - 10) * (8 / bitwidth) - tail;
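  // Note on the size arithmetic above: each row carries a 10-byte header,
  // each remaining byte packs 8/bitwidth values, and the final `tail`
  // buckets are padding, so the expression recovers the original row width.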
  const std::vector<int64_t> output_dimensions = {
      input_rows, static_cast<int64_t>(output_columns)};
  auto* output = Output(DATA_FLOAT, output_dimensions, at::dtype<float>());
  auto* output_data = output->template mutable_data<float>();
  for (size_t row = 0; row < input_rows; ++row) {
    math::decompress_and_dequantize(
        input_data + row * input_columns,
        output_data + row * output_columns,
        input_columns);
  }

  return true;
}

#undef IS_LITTLE_ENDIAN

REGISTER_CPU_OPERATOR(
    FloatToFusedRandRowwiseQuantized,
    FloatToFusedRandRowwiseQuantizedOp<CPUContext>);
OPERATOR_SCHEMA(FloatToFusedRandRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      ArgumentHelper helper(def);
      auto bitwidth = helper.GetSingleArgument<int32_t>("bitwidth", 8);
      size_t data_per_byte = 8 / bitwidth;
      vector<TensorShape> out;
      TensorShape X = in[0];
      X.set_dims(1, 10 + (X.dims(1) + data_per_byte - 1) / data_per_byte);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
    .SetDoc(R"DOC(
Applies row-wise stochastic/random quantization by determining the range of
each row in the input matrix, and then quantizing each element to one of the
two closest discrete levels by randomly drawing from a Bernoulli distribution.
The method is extended from TernGrad [1], which randomly quantizes gradients
to three levels to reduce communication in distributed training.
The format of each row (x) in the output matrix is [bitwidth][tail][min][max][data]:
bitwidth[1 Byte]: bitwidth per data [1, 2, 4 or 8];
tail[1 Byte]: the number of unused buckets [1-8] (one byte is split into
8/bitwidth buckets and each bucket stores one low-precision datum in bitwidth bits);
min[4 Bytes]: the minimum floating-point value min(x);
max[4 Bytes]: the maximum floating-point value max(x);
data: the quantized data.
The quantization is uniform with levels q = min + (max-min)/(2^bitwidth - 1)*[0:1:2^bitwidth].
During stochastic/random quantization x' = Quantize(x), for q_j < x_i <= q_{j+1},
we draw the quantized value x'_i from a Bernoulli distribution with
P(x'_i = q_{j+1}) = (x_i - q_j)/(q_{j+1} - q_j), and
P(x'_i = q_j) = (q_{j+1} - x_i)/(q_{j+1} - q_j).
[1] proved that E{x'_i} = x_i, i.e., the approximation is unbiased.
More details are in the paper.
For example, suppose the targeted bitwidth = 2 and x = [0.3, -1.4, -0.6, 0.9, 1.0];
then tail = 3, min = -1.4, max = 1.0, and q = [-1.4, -0.6, 0.2, 1.0].
x_1 = 0.3 will be quantized to x'_1 = 0.2 with probability 7/8 and to
x'_1 = 1.0 with probability 1/8.
The storage format of the quantized data is: [x'_1|x'_3|x'_5|xxx]-[x'_2|x'_4|xxx|xxx].
In general, an input row is split into multiple segments. One segment is a
continuous subarray of the row, and its length is the number of bytes storing
quantized data in the output matrix.
The b-th bucket of the i-th byte stores the i-th datum of the b-th segment of
the input row.

[1] Wen, Wei, Cong Xu, Feng Yan, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li.
"TernGrad: Ternary gradients to reduce communication in distributed deep learning."
In Advances in Neural Information Processing Systems, pp. 1508-1518. 2017.
)DOC")
    .Input(0, "input", "Float32 input data")
    .Output(0, "output", "Fused bitwidth, tail, min, max and quantized data")
    .Arg("bitwidth", "How many bits to quantize per data value (defaults to 8).")
    .Arg("random", "Whether to quantize randomly (defaults to True). False is used for unit tests.");
NO_GRADIENT(FloatToFusedRandRowwiseQuantized);
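// A worked check of the doc's example above (illustrative only, not used by
// the operator): with bitwidth = 2, min = -1.4, and max = 1.0, the level
// spacing is (max - min) / (2^2 - 1) = 0.8, so q = [-1.4, -0.6, 0.2, 1.0].
// For x_1 = 0.3 the bracketing levels are q_j = 0.2 and q_{j+1} = 1.0, so
//   P(x'_1 = 1.0) = (0.3 - 0.2) / (1.0 - 0.2) = 1/8,
//   P(x'_1 = 0.2) = (1.0 - 0.3) / (1.0 - 0.2) = 7/8,
// and E{x'_1} = 0.2 * 7/8 + 1.0 * 1/8 = 0.3 = x_1, i.e. unbiased,
// matching [1].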
REGISTER_CPU_OPERATOR(
    FusedRandRowwiseQuantizedToFloat,
    FusedRandRowwiseQuantizedToFloatOp<CPUContext>);
OPERATOR_SCHEMA(FusedRandRowwiseQuantizedToFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>&) {
      vector<TensorShape> out;
      for (int i = 0; i < def.output_size(); i++) {
        TensorShape ts;
        ts.set_unknown_shape(true);
        ts.set_data_type(TensorProto_DataType_FLOAT);
        out.push_back(ts);
      }
      return out;
    })
    .SetDoc(R"DOC(
De-quantizes the result of the FloatToFusedRandRowwiseQuantized operator.
Refer to the FloatToFusedRandRowwiseQuantized operator for details.
)DOC")
    .Input(
        0,
        "quantized_input",
        "Fused bitwidth, tail, min, max and quantized data")
    .Output(0, "float_input", "Float32 data");
NO_GRADIENT(FusedRandRowwiseQuantizedToFloat);
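// A minimal round-trip sketch of the two operators (a sketch only, assuming
// a CPU workspace; the blob names "X", "X_q", and "X_dq" are hypothetical):
//
//   Workspace ws;
//   // ... fill blob "X" with a 2-D float TensorCPU ...
//   OperatorDef q = CreateOperatorDef(
//       "FloatToFusedRandRowwiseQuantized", "", {"X"}, {"X_q"});
//   AddArgument<int>("bitwidth", 2, &q);
//   CreateOperator(q, &ws)->Run();
//   OperatorDef dq = CreateOperatorDef(
//       "FusedRandRowwiseQuantizedToFloat", "", {"X_q"}, {"X_dq"});
//   CreateOperator(dq, &ws)->Run();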
} // namespace caffe2