#include "fbgemm_pack_op.h"

#include "caffe2/core/tensor_int8.h"

#include "caffe2_dnnlowp_utils.h"

C10_DECLARE_int32(caffe2_dnnlowp_nbits_in_non_outlier);
namespace caffe2 {

using namespace std;
using dnnlowp::TensorQuantizationParams;
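// Quantizes a weight blob into W_quantized and fills one set of
// TensorQuantizationParams per quantization group; group g covers
// M / qparams.size() output channels of kernel_dim weights each.
//
// Weights are kept in the signed range, so an already-quantized (unsigned)
// blob is shifted by signed_min = -2^(precision - 1): with 8-bit weights,
// signed_min = -128 and a stored value of 200 becomes 200 + (-128) = 72.
// Float blobs instead get per-group scale/zero_point chosen from their
// min/max before being quantized.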
template <typename T>
void QuantizeWeight(
    const Blob& blob,
    int kernel_dim,
    int M,
    vector<TensorQuantizationParams>& qparams,
    vector<typename make_signed<T>::type>& W_quantized,
    dnnlowp::QuantizationFactory* qfactory) {
  using T_signed = typename make_signed<T>::type;

  const auto& filter = blob.IsType<int8::Int8TensorCPU>()
      ? blob.Get<int8::Int8TensorCPU>().t
      : blob.Get<TensorCPU>();

  W_quantized.resize(filter.numel());

  int signed_min = -(1 << (qfactory->GetWeightPrecision() - 1));
  if (blob.IsType<int8::Int8TensorCPU>()) {
    // Already quantized as unsigned values; reuse the stored qparams and just
    // shift data and zero_point into the signed range.
    qparams[0].scale = blob.Get<int8::Int8TensorCPU>().scale;
    qparams[0].zero_point =
        blob.Get<int8::Int8TensorCPU>().zero_point + signed_min;

    const T* W_data = filter.data<T>();
    for (auto i = 0; i < filter.numel(); ++i) {
      W_quantized[i] = W_data[i] + signed_min;
    }
  } else {
    for (int g = 0; g < qparams.size(); ++g) {
      size_t offset = g * (M / qparams.size()) * kernel_dim;
      qparams[g] = qfactory->ChooseQuantizationParams(
          filter.data<float>() + offset,
          (M / qparams.size()) * kernel_dim,
          true /*weight*/);
      qparams[g].zero_point += signed_min;

      fbgemm::Quantize<T_signed>(
          filter.data<float>() + offset,
          W_quantized.data() + offset,
          (M / qparams.size()) * kernel_dim,
          qparams[g]);
    }
  }
}
template void QuantizeWeight<uint8_t>(
    const Blob& blob, int kernel_dim, int M,
    vector<TensorQuantizationParams>& qparams,
    vector<int8_t>& W_quantized,
    dnnlowp::QuantizationFactory* qfactory);

template void QuantizeWeight<uint16_t>(
    const Blob& blob, int kernel_dim, int M,
    vector<TensorQuantizationParams>& qparams,
    vector<int16_t>& W_quantized,
    dnnlowp::QuantizationFactory* qfactory);
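// Pre-computes, for each output column j of the weight matrix,
//   col_offsets[j] = sum_k W[j * num_rows + k] - zero_point_g * num_rows,
// where zero_point_g is the zero point of j's quantization group. The
// quantized GEMM later subtracts activation_zero_point * col_offsets[j] from
// column j of the int32 accumulators to cancel the activation zero-point
// contribution.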
template <typename T>
void ComputeColumnOffsets(
    int num_rows,
    int num_cols,
    const T* W,
    const vector<TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets) {
  col_offsets.resize(num_cols);
  int num_quant_groups = qparams.size();
  for (int g = 0; g < num_quant_groups; ++g) {
    int j_begin = g * (num_cols / num_quant_groups);
    int j_end = j_begin + (num_cols / num_quant_groups);
    for (int j = j_begin; j < j_end; ++j) {
      int32_t sum = 0;
      for (int k = 0; k < num_rows; ++k) {
        sum += W[j * num_rows + k];
      }
      col_offsets[j] = sum - qparams[g].zero_point * num_rows;
    }
  }
}
template void ComputeColumnOffsets<int8_t>(
    int num_rows, int num_cols, const int8_t* W,
    const vector<TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets);

template void ComputeColumnOffsets<int16_t>(
    int num_rows, int num_cols, const int16_t* W,
    const vector<TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets);
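// Splits the quantized weights for the DNNLOWP_ACC16 engine: values that fit
// in nbits_in_non_outlier bits stay in the dense matrix and are accumulated
// in int16, while "outliers" are moved into a CompressedSparseColumn matrix
// (and zeroed in W_quantized) to be handled with int32 accumulation. For
// example, nbits_in_non_outlier = 7 keeps the dense range at [-64, 63].
// Returns a heap-allocated matrix; callers take ownership (they pass it to
// W_outlier.reset() below).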
fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
    int groups,
    int kernel_dim,
    int M,
    int nbits_in_non_outlier,
    vector<int8_t>& W_quantized) {
  // First pass: count how many weights fall outside the nbits_in_non_outlier
  // range.
  int outlier_cnt = 0;
  for (int group_id = 0; group_id < groups; ++group_id) {
    for (int i = 0; i < (M / groups) * kernel_dim; ++i) {
      int8_t w = W_quantized[group_id * (M / groups) * kernel_dim + i];
      bool is_outlier = nbits_in_non_outlier == 0 ||
          w < -(1 << (nbits_in_non_outlier - 1)) ||
          w >= (1 << (nbits_in_non_outlier - 1));
      if (is_outlier) {
        ++outlier_cnt;
      }
    }
  }

  fbgemm::CompressedSparseColumn* Wq_outlier =
      new fbgemm::CompressedSparseColumn(kernel_dim, M);
  Wq_outlier->RowIdx().resize(outlier_cnt);
  Wq_outlier->Values().resize(outlier_cnt);

  // Second pass: move outliers into the sparse matrix and zero them out in
  // the dense matrix.
  outlier_cnt = 0;
  for (int group_id = 0; group_id < groups; ++group_id) {
    for (int j = 0; j < M / groups; ++j) {
      Wq_outlier->ColPtr()[group_id * (M / groups) + j] = outlier_cnt;

      for (int k = 0; k < kernel_dim; ++k) {
        int8_t w = W_quantized[(group_id * (M / groups) + j) * kernel_dim + k];
        bool is_outlier = nbits_in_non_outlier == 0 ||
            w < -(1 << (nbits_in_non_outlier - 1)) ||
            w >= (1 << (nbits_in_non_outlier - 1));
        if (is_outlier) {
          CAFFE_ENFORCE_LE(k, numeric_limits<int16_t>::max());
          Wq_outlier->RowIdx()[outlier_cnt] = k;
          Wq_outlier->Values()[outlier_cnt] = w;
          ++outlier_cnt;

          W_quantized[(group_id * (M / groups) + j) * kernel_dim + k] = 0;
        }
      }
    }
  }
  Wq_outlier->ColPtr()[M] = outlier_cnt;

  return Wq_outlier;
}
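// Quantizes the bias to int32 with scale = in_scale * filter_scale of its
// group and zero_point = 0, i.e. b_q[i] = round(b[i] / (in_scale * w_scale_g)),
// so it can be added directly to the int32 accumulators of the quantized op.
// A pre-quantized Int8TensorCPU bias must already carry matching qparams
// (zero point 0, scale equal to in_scale * filter_scale within a small
// tolerance) and its int32 data is copied as-is.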
static void QuantizeConvBias(
    const Blob& blob,
    int M,
    const TensorQuantizationParams& in_qparams,
    const vector<TensorQuantizationParams>& filter_qparams,
    vector<int32_t>& b_quantized) {
  const auto& bias = blob.IsType<int8::Int8TensorCPU>()
      ? blob.Get<int8::Int8TensorCPU>().t
      : blob.Get<TensorCPU>();
  if (blob.IsType<int8::Int8TensorCPU>()) {
    TensorQuantizationParams bias_qparams;
    bias_qparams.scale = blob.Get<int8::Int8TensorCPU>().scale;
    bias_qparams.zero_point = blob.Get<int8::Int8TensorCPU>().zero_point;
    CAFFE_ENFORCE_LE(
        std::abs(
            bias_qparams.scale - in_qparams.scale * filter_qparams[0].scale),
        1e-4);
    CAFFE_ENFORCE_EQ(bias_qparams.zero_point, 0);
    b_quantized.resize(bias.numel());
    b_quantized.assign(
        bias.data<int32_t>(), bias.data<int32_t>() + bias.numel());
  } else {
    const float* bdata = bias.data<float>();
    b_quantized.resize(bias.numel());
    for (int g = 0; g < filter_qparams.size(); ++g) {
      int i_begin = g * (M / filter_qparams.size());
      int i_end = i_begin + (M / filter_qparams.size());
      for (int i = i_begin; i < i_end; ++i) {
        b_quantized[i] = fbgemm::Quantize<int32_t>(
            bdata[i],
            0, // zero_point
            in_qparams.scale * filter_qparams[g].scale,
            32, // result_precision
            true /* signed */);
      }
    }
  }
}
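// Packs FC weights (and, optionally, bias) ahead of time into an
// Int8FCDNNLowPPackedWeightBlob. Arguments: "axis_w" selects the axis that
// splits the weight into output x input dims (default 1),
// "quantize_channelwise" enables per-output-channel qparams (implied by the
// DNNLOWP_ROWWISE engine), and "nbits_in_non_outlier" is only consulted for
// the DNNLOWP_ACC16 engine.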
FullyConnectedDNNLowPPackWeightOp::FullyConnectedDNNLowPPackWeightOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : DNNLowPOp<uint8_t, FCFp32Op>(operator_def, ws),
      axis_w_(this->GetSingleArgument<int32_t>("axis_w", 1)),
      quantize_channelwise_(
          this->GetSingleArgument<bool>("quantize_channelwise", false)) {
  if (this->debug_def().engine() == "DNNLOWP_ROWWISE") {
    quantize_channelwise_ = true;
  }
  if (this->debug_def().engine() == "DNNLOWP_ACC16") {
    nbits_in_non_outlier_ = this->GetSingleArgument<int>(
        "nbits_in_non_outlier", FLAGS_caffe2_dnnlowp_nbits_in_non_outlier);
  }
}
bool FullyConnectedDNNLowPPackWeightOp::RunOnDevice() {
  const auto& filter = InputTensorCPU_(0);
  const auto canonical_axis_w = filter.canonical_axis_index(axis_w_);
  const auto K = filter.size_from_dim(canonical_axis_w);
  const auto N = filter.size_to_dim(canonical_axis_w);

  auto* Y = this->Output<Int8FCDNNLowPPackedWeightBlob>(0);

  Y->original_tensor.ResizeLike(filter);

  Y->qparams.resize(quantize_channelwise_ ? N : 1);

  vector<int8_t> W_quantized;
  QuantizeWeight<uint8_t>(
      InputBlob(0), K, N, Y->qparams, W_quantized, qfactory_.get());

  if (this->InputIsType<int8::Int8TensorCPU>(0) && quantize_channelwise_) {
    static int log_occurences = 0;
    if (log_occurences < 32) {
      ++log_occurences;
      LOG(WARNING) << "Cannot do row-wise quantization for "
                      "pre-quantized weight "
                   << this->debug_def().input(0);
    }
  }
  // Pre-compute column offsets. This should happen before ExtractOutlierMatrix
  // because W_quantized is changed in ExtractOutlierMatrix.
  Y->column_offsets.reset(new vector<int32_t>());
  ComputeColumnOffsets(
      K, N, W_quantized.data(), Y->qparams, *Y->column_offsets);

  if (this->debug_def().engine() == "DNNLOWP_ACC16") {
    if (nbits_in_non_outlier_ < 8) {
      Y->W_outlier.reset(
          ExtractOutlierMatrix(1, K, N, nbits_in_non_outlier_, W_quantized));
      int outlier_cnt = Y->W_outlier->ColPtr()[N];

      LOG(INFO) << "Proportion of outlier for FC layer with weight blob "
                << this->debug_def().input(0) << " is "
                << static_cast<float>(outlier_cnt) / W_quantized.size();
      LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_;
    }
    Y->nbits_in_non_outlier = nbits_in_non_outlier_;
    Y->W_acc16.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
        fbgemm::matrix_op_t::Transpose,
        K,
        N,
        W_quantized.data(),
        K, // leading dimension
        nullptr, // packed buffer (fbgemm allocates internally)
        1)); // single group
  } else {
    Y->W.reset(new fbgemm::PackBMatrix<int8_t>(
        fbgemm::matrix_op_t::Transpose,
        K,
        N,
        W_quantized.data(),
        K, // leading dimension
        nullptr, // packed buffer
        1)); // single group
  }
  if (InputSize() >= 2) {
    TensorQuantizationParams in_qparams;
    CAFFE_ENFORCE(HasSingleArgumentOfType<float>("in_scale"));
    in_qparams.scale = GetSingleArgument<float>("in_scale", 0);
    Y->bias.reset(new vector<int32_t>());
    QuantizeConvBias(InputBlob(1), N, in_qparams, Y->qparams, *Y->bias);
  } else {
    Y->bias = nullptr;
  }

  return true;
}
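// Packs convolution weights (and, optionally, bias) ahead of time into an
// Int8ConvDNNLowPPackedWeightBlob. "quantize_groupwise" enables per-group
// qparams; "nbits_in_non_outlier" is only consulted for the DNNLOWP_ACC16
// engine.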
ConvDNNLowPPackWeightOp::ConvDNNLowPPackWeightOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : ConvPoolDNNLowPOpBase<uint8_t, ConvFp32Op>(operator_def, ws),
      quantize_groupwise_(
          this->GetSingleArgument<bool>("quantize_groupwise", false)) {
  if (this->debug_def().engine() == "DNNLOWP_ACC16") {
    nbits_in_non_outlier_ = this->GetSingleArgument<int>(
        "nbits_in_non_outlier", FLAGS_caffe2_dnnlowp_nbits_in_non_outlier);
  }
}
bool ConvDNNLowPPackWeightOp::TakeDepthWise3x3FastPath_() {
  const auto& filter = this->InputTensorCPU_(FILTER);
  // Number of output channels.
  int M = filter.dim32(0);
  // Number of input channels per group (filter is in KRSC layout).
  int C_per_group = filter.dim32(filter.dim() - 1);
  return this->debug_def().engine() != "DNNLOWP_ACC16" && group_ == M &&
      C_per_group == 1 && group_ % 8 == 0 && this->kernel_.size() == 2 &&
      kernel_h() == 3 && kernel_w() == 3 && stride_h() == stride_w() &&
      (stride_h() == 1 || stride_h() == 2) && dilation_h() == 1 &&
      dilation_w() == 1 && pad_t() == 1 && pad_b() == 1 && pad_l() == 1 &&
      pad_r() == 1 && GetCpuId().avx2();
}
bool ConvDNNLowPPackWeightOp::TakeDepthWise3x3x3FastPath_() {
  const auto& filter = this->InputTensorCPU_(FILTER);
  // Number of output channels.
  int M = filter.dim32(0);
  // Number of input channels per group.
  int C_per_group = filter.dim32(filter.dim() - 1);
  bool ret = this->debug_def().engine() != "DNNLOWP_ACC16" && group_ == M &&
      C_per_group == 1 && group_ % 8 == 0 && this->kernel_.size() == 3 &&
      this->kernel_[0] == 3 && this->kernel_[1] == 3 && this->kernel_[2] == 3 &&
      this->stride_[0] == this->stride_[1] &&
      this->stride_[0] == this->stride_[2] &&
      (this->stride_[0] == 1 || this->stride_[0] == 2) &&
      this->dilation_[0] == 1 && this->dilation_[1] == 1 &&
      this->dilation_[2] == 1 &&
      accumulate(
          this->pads_.begin(), this->pads_.end(), 1, multiplies<int>()) == 1 &&
      GetCpuId().avx2();
  return ret;
}
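// fbgemm also has an optimized kernel for grouped 2D convolutions. The check
// below builds a conv_param_t from the layer configuration and defers to
// fbgemm::fbgemmOptimizedGConv to decide whether that kernel applies.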
bool ConvDNNLowPPackWeightOp::TakeGConvFastPath_() {
  if (this->debug_def().engine() == "DNNLOWP_ACC16" ||
      this->kernel_.size() != 2) {
    return false;
  }

  auto& filter = InputTensorCPU_(FILTER);
  const int M = filter.dim32(0), C = filter.dim32(filter.dim() - 1) * group_;
  fbgemm::conv_param_t<> conv_p(
      1, // dummy mini-batch
      C,
      M,
      {1, 1}, // dummy image height/width
      group_,
      {this->kernel_[0], this->kernel_[1]},
      {this->stride_[0], this->stride_[1]},
      {this->pads_[0], this->pads_[1], this->pads_[2], this->pads_[3]});

  return fbgemm::fbgemmOptimizedGConv(conv_p);
}
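// Packs the conv weight, which is expected in KRSC layout
// (M x kernel dims x C_per_group), so kernel_dim = C_per_group * prod(kernel).
// Depending on engine and shape this produces an ACC16 dense + outlier pair,
// a packed depthwise 3x3 or 3x3x3 matrix, a groupwise-conv packing, or a
// generic PackBMatrix, plus column offsets and an optional quantized bias.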
bool ConvDNNLowPPackWeightOp::RunOnDevice() {
  const auto& filter = InputTensorCPU_(FILTER);

  auto* Y = this->Output<Int8ConvDNNLowPPackedWeightBlob>(0);

  Y->original_tensor.ResizeLike(filter);

  // Number of output channels.
  int M = filter.dim32(0);
  // Number of input channels per group.
  int C_per_group = filter.dim32(filter.dim() - 1);

  int kernel_dims_size = 1;
  for (int i = 0; i < filter.dim() - 2; ++i) {
    kernel_dims_size *= filter.dim32(i + 1);
  }
  int kernel_dim = C_per_group * kernel_dims_size;

  vector<int8_t> W_quantized;
  Y->qparams.resize(quantize_groupwise_ ? group_ : 1);
  QuantizeWeight<uint8_t>(
      InputBlob(FILTER), kernel_dim, M, Y->qparams, W_quantized,
      qfactory_.get());

  if (this->InputIsType<int8::Int8TensorCPU>(FILTER) && quantize_groupwise_) {
    static int log_occurences = 0;
    if (log_occurences < 32) {
      ++log_occurences;
      LOG(WARNING) << "Cannot do group-wise quantization for "
                      "pre-quantized weight "
                   << this->debug_def().input(0);
    }
  }
  // Pre-compute column offsets. This should happen before ExtractOutlierMatrix
  // because W_quantized is changed in ExtractOutlierMatrix.
  Y->column_offsets.reset(new vector<int32_t>());
  ComputeColumnOffsets(
      kernel_dim, M, W_quantized.data(), Y->qparams, *Y->column_offsets);

  if (this->debug_def().engine() == "DNNLOWP_ACC16" &&
      nbits_in_non_outlier_ > 0) {
    if (nbits_in_non_outlier_ < 8) {
      Y->W_outlier.reset(ExtractOutlierMatrix(
          group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized));
      int outlier_cnt = Y->W_outlier->ColPtr()[M];

      LOG(INFO) << "Proportion of outlier for Conv layer with weight blob "
                << this->debug_def().input(0) << " is "
                << static_cast<float>(outlier_cnt) / W_quantized.size();
      LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_;
    }
    Y->nbits_in_non_outlier = nbits_in_non_outlier_;
    Y->W_acc16.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
        fbgemm::matrix_op_t::Transpose,
        group_ * kernel_dim,
        M / group_,
        W_quantized.data(),
        kernel_dim, // leading dimension
        nullptr, // packed buffer (fbgemm allocates internally)
        group_));
  } else if (TakeDepthWise3x3FastPath_()) {
    Y->W_depthwise_3x3.reset(
        new fbgemm::Packed3x3ConvMatrix(group_, W_quantized.data()));
  } else if (TakeDepthWise3x3x3FastPath_()) {
    Y->W_depthwise_3x3x3.reset(
        new fbgemm::Packed3x3x3ConvMatrix(group_, W_quantized.data()));
  } else if (TakeGConvFastPath_()) {
    fbgemm::conv_param_t<> conv_p(
        1, // dummy mini-batch
        group_ * C_per_group,
        M,
        {1, 1}, // dummy image height/width
        group_,
        {this->kernel_[0], this->kernel_[1]},
        {this->stride_[0], this->stride_[1]},
        {this->pads_[0], this->pads_[1], this->pads_[2], this->pads_[3]});

    Y->W_gconv.reset(new fbgemm::PackWeightMatrixForGConv<int8_t>(
        fbgemm::matrix_op_t::Transpose, conv_p, W_quantized.data()));
  } else {
    Y->W.reset(new fbgemm::PackBMatrix<int8_t>(
        fbgemm::matrix_op_t::Transpose,
        group_ * kernel_dim,
        M / group_,
        W_quantized.data(),
        kernel_dim, // leading dimension
        nullptr, // packed buffer
        group_));
  }
  if (InputSize() >= 2) {
    TensorQuantizationParams in_qparams;
    CAFFE_ENFORCE(HasSingleArgumentOfType<float>("in_scale"));
    in_qparams.scale = GetSingleArgument<float>("in_scale", 0);
    Y->bias.reset(new vector<int32_t>());
    QuantizeConvBias(InputBlob(BIAS), M, in_qparams, Y->qparams, *Y->bias);
  } else {
    Y->bias = nullptr;
  }

  return true;
}
CAFFE_KNOWN_TYPE(Int8FCDNNLowPPackedWeightBlob);
CAFFE_KNOWN_TYPE(Int8ConvDNNLowPPackedWeightBlob);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FCPackWeight,
    DNNLOWP,
    FullyConnectedDNNLowPPackWeightOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FCPackWeight,
    DNNLOWP_ACC16,
    FullyConnectedDNNLowPPackWeightOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FCPackWeight,
    DNNLOWP_ROWWISE,
    FullyConnectedDNNLowPPackWeightOp);
OPERATOR_SCHEMA(Int8FCPackWeight)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .SetDoc(R"DOC(Prepack weight for Int8FC)DOC")
    .Input(0, "W", "Weight tensor in KRSC layout")
    .Input(1, "b", "Bias tensor")
    .Output(0, "W_q", "Weight/bias tensor in a packed format");
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8ConvPackWeight,
    DNNLOWP,
    ConvDNNLowPPackWeightOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8ConvPackWeight,
    DNNLOWP_ACC16,
    ConvDNNLowPPackWeightOp);
OPERATOR_SCHEMA(Int8ConvPackWeight)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .SetDoc(R"DOC(Prepack weight for Int8Conv)DOC")
    .Input(0, "W", "Weight tensor in KRSC layout")
    .Input(1, "b", "Bias tensor")
    .Output(0, "W_q", "Weight/bias tensor in a packed format");

} // namespace caffe2