1 #include "caffe2/operators/fused_rowwise_8bit_conversion_ops.h" 2 #include "c10/util/Registry.h" 7 void convertfp32fp32(
float* dst,
const float* src,
size_t N) {
8 memcpy(dst, src,
sizeof(
float) * N);
// Widen fp16 input values to fp32, one element at a time.
void convertfp16fp32(float* dst, const at::Half* src, size_t N) {
  for (size_t i = 0; i < N; i++) {
    dst[i] = src[i];
  }
}
// Narrow fp32 values to fp16, one element at a time.
void convertfp32fp16(at::Half* dst, const float* src, size_t N) {
  for (size_t i = 0; i < N; i++) {
    dst[i] = src[i];
  }
}

} // namespace
REGISTER_CPU_OPERATOR(
    FloatToFused8BitRowwiseQuantized,
    FloatToFused8BitRowwiseQuantizedOp<float, convertfp32fp32, CPUContext>);
OPERATOR_SCHEMA(FloatToFused8BitRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      // Each row grows by 8 bytes: a 4-byte fp32 scale and a 4-byte fp32 bias.
      X.set_dims(1, X.dims(1) + 8);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
    .SetDoc(R"DOC(
Applies 8-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to an 8-bit number between 0 and
255. To later de-quantize values, the scale (range / 255) and offset
(bias) are stored alongside the data. More precisely, the last 8 bytes
of each row in the output matrix store the scale as a 32-bit float,
followed by the bias as a 32-bit float, and all preceding bytes in the
row encode single quantized values.
)DOC")
    .Input(0, "input", "Float32 input data")
    .Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(FloatToFused8BitRowwiseQuantized);
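
// Illustrative sketch (editor's addition): a hypothetical, standalone helper
// that is not referenced by the registered operators, showing how a single
// row could be quantized into the fused layout described above -- quantized
// uint8 values followed by a 4-byte fp32 scale and a 4-byte fp32 bias (the
// row minimum) in the last 8 bytes. The output buffer is assumed to hold
// cols + 8 bytes, and <algorithm>, <cstdint>, and <cstring> are assumed to
// be reachable through the headers included above.
namespace {
void QuantizeRowSketch(const float* row, size_t cols, uint8_t* out) {
  const float min_v = *std::min_element(row, row + cols);
  const float max_v = *std::max_element(row, row + cols);
  const float range = max_v - min_v;
  // Guard against a zero range so the division below stays well defined.
  const float scale = range == 0.0f ? 1.0f : range / 255.0f;
  for (size_t i = 0; i < cols; ++i) {
    // Map each value into [0, 255] relative to the row minimum.
    out[i] = static_cast<uint8_t>((row[i] - min_v) / scale + 0.5f);
  }
  // Append the fused scale and bias in the last 8 bytes of the row.
  memcpy(out + cols, &scale, sizeof(float));
  memcpy(out + cols + sizeof(float), &min_v, sizeof(float));
}
} // namespace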
REGISTER_CPU_OPERATOR(
    HalfFloatToFused8BitRowwiseQuantized,
    FloatToFused8BitRowwiseQuantizedOp<at::Half, convertfp16fp32, CPUContext>);
OPERATOR_SCHEMA(HalfFloatToFused8BitRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      // Each row grows by 8 bytes: a 4-byte fp32 scale and a 4-byte fp32 bias.
      X.set_dims(1, X.dims(1) + 8);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
    .SetDoc(R"DOC(
Applies 8-bit row-wise quantization by determining the range
(maximum - minimum) and offset (minimum value) of each row in the input
matrix, and then scaling each element to an 8-bit number between 0 and
255. To later de-quantize values, the scale (range / 255) and offset
(bias) are stored alongside the data. More precisely, the last 8 bytes
of each row in the output matrix store the scale as a 32-bit float,
followed by the bias as a 32-bit float, and all preceding bytes in the
row encode single quantized values.
)DOC")
    .Input(0, "input", "Float16 input data")
    .Output(0, "output", "Fused scale, bias and quantized data");
NO_GRADIENT(HalfFloatToFused8BitRowwiseQuantized);
REGISTER_CPU_OPERATOR(
    Fused8BitRowwiseQuantizedToFloat,
    Fused8BitRowwiseQuantizedToFloatOp<float, convertfp32fp32, CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedToFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      // Each row shrinks by the 8 bytes that held the fused scale and bias.
      X.set_dims(1, X.dims(1) - 8);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT);
      return out;
    })
    .SetDoc(R"DOC(
De-quantizes the result of the
FloatToFused8BitRowwiseQuantized operator. The input is expected to
encode the scale as a 32-bit float in the second to the last 4 bytes of each
row, followed by the bias as a 32-bit float in the last 4 bytes, and the
quantized values in the preceding bytes of the row. The output is a
matrix containing only the values, but de-quantized. De-quantization is
performed by multiplying each value by its row's scale and then adding
the bias. The de-quantized values will thus not be exactly equal to
the original, un-quantized floating point values.
)DOC")
    .Input(
        0,
        "scale_bias_quantized_input",
        "Fused scale, bias and quantized data")
    .Output(0, "float_output", "Float32 data");
NO_GRADIENT(Fused8BitRowwiseQuantizedToFloat);
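
// Illustrative sketch (editor's addition): a hypothetical, standalone helper
// that is not referenced by the registered operators, showing how one fused
// row could be de-quantized under the layout described above -- quantized
// uint8 values followed by a 4-byte fp32 scale and a 4-byte fp32 bias in the
// last 8 bytes of the row. <cstdint> and <cstring> are assumed to be
// reachable through the headers included above.
namespace {
void DequantizeRowSketch(const uint8_t* fused_row, size_t fused_cols, float* out) {
  // The trailing 8 bytes hold the scale and bias, not quantized values.
  const size_t cols = fused_cols - 2 * sizeof(float);
  float scale;
  float bias;
  memcpy(&scale, fused_row + cols, sizeof(float));
  memcpy(&bias, fused_row + cols + sizeof(float), sizeof(float));
  for (size_t i = 0; i < cols; ++i) {
    // Reverse the quantization: scale each byte back up and re-apply the
    // row's offset (bias).
    out[i] = fused_row[i] * scale + bias;
  }
}
} // namespace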
REGISTER_CPU_OPERATOR(
    Fused8BitRowwiseQuantizedToHalfFloat,
    Fused8BitRowwiseQuantizedToFloatOp<at::Half, convertfp32fp16, CPUContext>);
OPERATOR_SCHEMA(Fused8BitRowwiseQuantizedToHalfFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& /* def */,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape X = in[0];
      // Each row shrinks by the 8 bytes that held the fused scale and bias.
      X.set_dims(1, X.dims(1) - 8);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_FLOAT16);
      return out;
    })
    .SetDoc(R"DOC(
De-quantizes the result of the
HalfFloatToFused8BitRowwiseQuantized operator. The input is expected to
encode the scale as a 32-bit float in the second to the last 4 bytes of each
row, followed by the bias as a 32-bit float in the last 4 bytes, and the
quantized values in the preceding bytes of the row. The output is a
matrix containing only the values, but de-quantized. De-quantization is
performed by multiplying each value by its row's scale and then adding
the bias. The de-quantized values will thus not be exactly equal to
the original, un-quantized floating point values.
)DOC")
    .Input(
        0,
        "scale_bias_quantized_input",
        "Fused scale, bias and quantized data")
    .Output(0, "float16_output", "Float16 data");
NO_GRADIENT(Fused8BitRowwiseQuantizedToHalfFloat);

} // namespace caffe2