#include "fully_connected_dnnlowp_acc16_op.h"

#include <fbgemm/src/RefImplementations.h>

#include "fbgemm_pack_op.h"

C10_DECLARE_int32(caffe2_dnnlowp_nbits_in_non_outlier);
C10_DECLARE_int32(caffe2_dnnlowp_copy_to_32bit_frequency);

namespace caffe2 {

using namespace std;
using namespace dnnlowp;
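// DNNLOWP_ACC16 accelerates the FC GEMM by accumulating most products in
// 16 bits. Two knobs trade accuracy for speed:
//  - nbits_in_non_outlier: weights representable in this many bits stay in
//    the dense 16-bit-accumulation GEMM; the remaining "outlier" weights are
//    handled by a sparse path with 32-bit accumulation.
//  - copy_to_32bit_frequency: how often 16-bit partial sums are spilled into
//    32-bit accumulators to avoid saturation.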
FullyConnectedDNNLowPAcc16Op::FullyConnectedDNNLowPAcc16Op(
    const OperatorDef& operator_def,
    Workspace* ws)
    : FullyConnectedDNNLowPOp<uint8_t>(operator_def, ws),
      nbits_in_non_outlier_(this->template GetSingleArgument<int>(
          "nbits_in_non_outlier",
          FLAGS_caffe2_dnnlowp_nbits_in_non_outlier)),
      copy_to_32bit_frequency_(this->template GetSingleArgument<int>(
          "copy_to_32bit_frequency",
          FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency)) {}
bool FullyConnectedDNNLowPAcc16Op::RunOnDevice() {
  this->ParseDNNLowPOperatorArguments_();

  if (!GetQuantizationParameters_()) {
    return false;
  }
  const auto& X = InputTensorCPU_(0);
  const auto& W = InputTensorCPU_(1);
  auto* Y = OutputTensorCPU_(0);
  const auto canonical_axis = X.canonical_axis_index(axis_);
  const auto M = X.size_to_dim(canonical_axis);
  const auto K = X.size_from_dim(canonical_axis);
  const auto canonical_axis_w = W.canonical_axis_index(axis_w_);
  const int N = W.size_to_dim(canonical_axis_w);
  vector<uint8_t> X_temp;
  const uint8_t* Xdata =
      QuantizeInputIfNeeded<uint8_t>(this, 0, in_qparams_[0], X_temp);
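  // QuantizeInputIfNeeded quantizes X on the fly when it arrives as float;
  // if X is already quantized, Xdata aliases the input buffer directly.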
  if (this->quantize_channelwise_) {
    LOG(WARNING) << "FC with 16-bit accumulation doesn't work with "
                    "per-channel quantization";
  }

  // Pack W if needed
  if (!Wq_acc16_packed_ || !is_weight_constant_) {
    if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
      // The weight blob is already packed; reuse its packed matrices.
      const auto& packed_filter =
          this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
      Wq_outlier_ = packed_filter.W_outlier;
      Wq_acc16_packed_ = packed_filter.W_acc16;
      if (nbits_in_non_outlier_ != packed_filter.nbits_in_non_outlier) {
        LOG(WARNING)
            << "nbits_in_non_outlier in packed weight "
            << packed_filter.nbits_in_non_outlier
            << " doesn't match the nbits_in_non_outlier specified in operator "
            << nbits_in_non_outlier_;
      }
    } else {
      if (!Wq_acc16_packed_ && nbits_in_non_outlier_ < 8) {
        static int log_occurrences = 0;
        if (log_occurrences < 32) {
          ++log_occurrences;
          LOG(WARNING) << "FC DNNLOWP_ACC16 using outlier-aware quantization";
        }

        // Separate out the outlier weights into a sparse matrix.
        CAFFE_ENFORCE(!W_quantized_.empty());
        Wq_outlier_.reset(
            ExtractOutlierMatrix(1, K, N, nbits_in_non_outlier_, W_quantized_));
        int outlier_cnt = Wq_outlier_->ColPtr()[N];

        LOG(INFO) << "Proportion of outlier for FC layer with weight blob "
                  << this->debug_def().input(1) << " is "
                  << static_cast<float>(outlier_cnt) / W_quantized_.size();
        LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_;
        LOG(INFO) << "copy_to_32bit_frequency " << copy_to_32bit_frequency_;
      }
      Wq_acc16_packed_.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
          fbgemm::matrix_op_t::Transpose,
          K,
          N,
          reinterpret_cast<const int8_t*>(W_quantized_.data()),
          K)); // leading dimension

      if (is_weight_constant_) {
        // Only the packed copy is needed from now on.
        vector<T_signed>().swap(W_quantized_);
      }
    }
  }
  Y_shape_cache_ = X.sizes().vec();
  Y_shape_cache_.resize(canonical_axis + 1);
  Y_shape_cache_[canonical_axis] = N;
  Y->Resize(Y_shape_cache_);
  // Main GEMM.
  using namespace fbgemm;
  Y_int32_.resize(Y->size());
  uint8_t* Ydata = GetQuantizedOutputData_();
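  // Dispatch: if nbits_in_non_outlier_ > 0, run the dense acc16 GEMM
  // (optionally fused with a sparse GEMM over the outlier weights), producing
  // either requantized uint8 or dequantized float output; if it is 0, every
  // weight is an outlier and only the sparse path runs.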
  if (nbits_in_non_outlier_ > 0) {
    int row_offset_size_per_thread =
        PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
    int x_pack_buf_size_per_thread =
        PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
    this->row_offsets_.resize(row_offset_size_per_thread);
    this->X_pack_buf_.resize(x_pack_buf_size_per_thread);

    PackAWithRowOffset<uint8_t, int16_t> packA(
        matrix_op_t::NoTranspose,
        M,
        K,
        Xdata,
        K,
        X_pack_buf_.data(),
        1, // number of groups
        row_offsets_.data());
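    // Alongside packing, packA computes the per-row sums of X's quantized
    // values; the requantization epilogue uses them to apply the
    // filter-zero-point correction term.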
    if (!dequantize_output_) {
      // Quantized (uint8) output: requantize the int32 accumulators.
      DoNothing<> doNothingObj{};
      ReQuantizeOutput<false /* FUSE_RELU */> reqObj(
          doNothingObj,
          this->requantization_multipliers_.data(),
          out_qparams_.zero_point,
          column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
          this->filter_zero_points_.data(),
          packA.getRowOffsetBuffer(),
          column_offsets_->empty() ? nullptr : column_offsets_->data(),
          this->b_quantized_data_,
          N); // number of columns per quantization group
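      // The epilogue encoded by reqObj subtracts the zero-point correction
      // terms (activation zero point times column offsets, filter zero points
      // times row offsets), adds the quantized bias, scales by the
      // requantization multiplier, and shifts by the output zero point; see
      // fbgemm's ReQuantizeOutput for the authoritative definition.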
      if (nbits_in_non_outlier_ < 8) {
        // Fuse the sparse outlier GEMM (SpMDM) into the epilogue of the
        // dense acc16 GEMM.
        DoSpmdmOnInpBuffer<
            typename ReQuantizeOutput<false>::outType,
            int32_t,
            ReQuantizeOutput<false>>
            spmdmObj(reqObj, Xdata, K, *Wq_outlier_);

        fbgemmPacked(
            packA,
            *Wq_acc16_packed_,
            Ydata,
            Y_int32_.data(),
            N,
            spmdmObj,
            0, // thread_id
            1); // num_threads
      } else {
        fbgemmPacked(
            packA,
            *Wq_acc16_packed_,
            Ydata,
            Y_int32_.data(),
            N,
            reqObj,
            0, // thread_id
            1); // num_threads
      }
    } else {
      // Float output: dequantize while writing the result.
      DoNothing<float, float> doNothingObj{};
      ReQuantizeForFloat<false /* FUSE_RELU */> reqObj(
          doNothingObj,
          in_qparams_[0].scale,
          this->filter_scales_.data(),
          column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
          this->filter_zero_points_.data(),
          packA.getRowOffsetBuffer(),
          column_offsets_->empty() ? nullptr : column_offsets_->data(),
          this->b_dequantized_data_,
          N); // number of columns per quantization group
      if (nbits_in_non_outlier_ < 8) {
        DoSpmdmOnInpBuffer<
            typename ReQuantizeForFloat<false>::outType,
            int32_t,
            ReQuantizeForFloat<false>>
            spmdmObj(reqObj, Xdata, K, *Wq_outlier_);

        fbgemmPacked(
            packA,
            *Wq_acc16_packed_,
            Y->mutable_data<float>(),
            Y_int32_.data(),
            N,
            spmdmObj,
            0, // thread_id
            1); // num_threads
      } else {
        fbgemmPacked(
            packA,
            *Wq_acc16_packed_,
            Y->mutable_data<float>(),
            Y_int32_.data(),
            N,
            reqObj,
            0, // thread_id
            1); // num_threads
      }
    }
  } else {
    // nbits_in_non_outlier_ == 0: every weight is an outlier, so only the
    // sparse 32-bit path runs.
    block_type_t block{0, static_cast<int>(M), 0, static_cast<int>(N)};
    Wq_outlier_->SpMDM(
        block, Xdata, K, false /* accumulate */, Y_int32_.data(), N);
    if (dequantize_output_) {
      float* Ydata_float = Output(0)->template mutable_data<float>();

#pragma omp parallel for
      for (int i = 0; i < M; ++i) {
        int32_t row_offset = 0;
        for (int k = 0; k < K; ++k) {
          row_offset += Xdata[i * K + k];
        }

        for (int j = 0; j < N; ++j) {
          int quant_group = this->quantize_channelwise_ ? j : 0;
          Y_int32_[i * N + j] -=
              row_offset * this->filter_qparams_[quant_group].zero_point;
          if (!column_offsets_->empty()) {
            Y_int32_[i * N + j] -=
                in_qparams_[0].zero_point * (*column_offsets_)[j];
          }
          Ydata_float[i * N + j] = Y_int32_[i * N + j] * in_qparams_[0].scale *
                  this->filter_qparams_[quant_group].scale +
              b_dequantized_data_[j];
        }
      }
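      // The loop above applies the usual affine dequantization identity:
      //   Y_float ~ x_scale * w_scale *
      //       (acc32 - w_zp * rowsum(X_i) - x_zp * colsum(W_j)) + bias_j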
    } else {
      // Quantized output: add offsets/bias and requantize to uint8.
#pragma omp parallel for
      for (int i = 0; i < M; ++i) {
        int32_t row_offset = 0;
        for (int k = 0; k < K; ++k) {
          row_offset += Xdata[i * K + k];
        }
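        // requantize_u8acc32_ref is fbgemm's scalar reference implementation;
        // per element it roughly computes
        //   clamp(nearbyint(C_mult * (acc32 - x_zp * col_offset_j
        //       - w_zp * row_offset_i + bias_j)) + out_zp, 0, 255)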
        requantize_u8acc32_ref(
            1,
            N,
            N,
            Y_int32_.data() + i * N,
            Ydata + i * N,
            this->requantization_multipliers_.data(),
            out_qparams_.zero_point,
            column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
            this->filter_zero_points_.data(),
            &row_offset,
            column_offsets_->empty() ? nullptr : column_offsets_->data(),
            b_quantized_->data(),
            N); // number of columns per quantization group
      }
    }
  }
  if (!dequantize_output_) {
    PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
  }
  MeasureQuantizationError_();

  return true;
}
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP_ACC16,
    FullyConnectedDNNLowPAcc16Op);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FC,
    DNNLOWP_ACC16,
    FullyConnectedDNNLowPAcc16Op);

} // namespace caffe2
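// Illustrative usage: either registration is selected by setting the
// operator's engine in the NetDef, e.g.
//   OperatorDef def;
//   def.set_type("Int8FC");
//   def.set_engine("DNNLOWP_ACC16");
//   def.add_arg()->CopyFrom(MakeArgument("nbits_in_non_outlier", 7));
// (MakeArgument is caffe2's proto_utils helper; the argument value shown
// here is arbitrary.)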