Caffe2 - C++ API
A deep learning, cross-platform ML framework
fully_connected_dnnlowp_acc16_op.cc
1 #include "fully_connected_dnnlowp_acc16_op.h"
2 
3 #include <fbgemm/src/RefImplementations.h>
4 
5 #include "fbgemm_pack_op.h"
6 
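// Defaults for the two DNNLOWP_ACC16 knobs used below: nbits_in_non_outlier
// is how many low-order bits of each quantized weight stay in the dense
// matrix that is accumulated in 16 bits (weights that need more bits are
// treated as "outliers" and moved to a sparse matrix accumulated in 32 bits);
// copy_to_32bit_frequency (logged below) controls how often fbgemm spills the
// 16-bit accumulators into 32-bit ones to avoid saturation.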
C10_DECLARE_int32(caffe2_dnnlowp_nbits_in_non_outlier);
C10_DECLARE_int32(caffe2_dnnlowp_copy_to_32bit_frequency);

namespace caffe2 {

FullyConnectedDNNLowPAcc16Op::FullyConnectedDNNLowPAcc16Op(
    const OperatorDef& operator_def,
    Workspace* ws)
    : FullyConnectedDNNLowPOp<uint8_t>(operator_def, ws),
      nbits_in_non_outlier_(this->template GetSingleArgument<int>(
          "nbits_in_non_outlier",
          FLAGS_caffe2_dnnlowp_nbits_in_non_outlier)),
      copy_to_32bit_frequency_(this->template GetSingleArgument<int>(
          "copy_to_32bit_frequency",
          FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency)) {}

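// Quantize the input, pack the weight (splitting out outliers when
// nbits_in_non_outlier_ < 8), run the GEMM with 16-bit accumulation plus the
// sparse outlier product, and finally requantize (or dequantize) the output.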
bool FullyConnectedDNNLowPAcc16Op::RunOnDevice() {
  using namespace std;
  using namespace dnnlowp;

  this->ParseDNNLowPOperatorArguments_();

  // Get quantization parameters
  if (!GetQuantizationParameters_()) {
    return false;
  }

  const auto& X = InputTensorCPU_(0);
  const auto& W = InputTensorCPU_(1);
  auto* Y = OutputTensorCPU_(0);
  const auto canonical_axis = X.canonical_axis_index(axis_);
  const auto M = X.size_to_dim(canonical_axis);
  const auto K = X.size_from_dim(canonical_axis);
  const auto canonical_axis_w = W.canonical_axis_index(axis_w_);
  const int N = W.size_to_dim(canonical_axis_w);

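  // Shapes: X is M x K, W is N x K (packed transposed below), Y is M x N.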
  // Quantize X
  vector<uint8_t> X_temp;
  const uint8_t* Xdata =
      QuantizeInputIfNeeded<uint8_t>(this, 0, in_qparams_[0], X_temp);

  if (this->quantize_channelwise_) {
    LOG(WARNING) << "FC with 16-bit accumulation doesn't work with per-channel "
                    "quantization yet.";
  }

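  // The weight either arrives pre-packed as an Int8FCDNNLowPPackedWeightBlob
  // (outlier and acc16 matrices already split), or is packed here on first
  // use: with nbits_in_non_outlier_ < 8, large-magnitude entries are pulled
  // into a sparse outlier matrix so the remaining dense part can be
  // accumulated safely in 16 bits.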
  // Pack W if needed
  if (!Wq_acc16_packed_ || !is_weight_constant_) {
    if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
      // If the input is already packed
      const auto& packed_filter =
          this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
      Wq_outlier_ = packed_filter.W_outlier;
      Wq_acc16_packed_ = packed_filter.W_acc16;

      if (nbits_in_non_outlier_ != packed_filter.nbits_in_non_outlier) {
        LOG(WARNING)
            << "nbits_in_non_outlier in packed weight "
            << packed_filter.nbits_in_non_outlier
            << " doesn't match nbits_in_non_outlier specified in operator "
            << nbits_in_non_outlier_;
      }
    } else {
      if (!Wq_acc16_packed_ && nbits_in_non_outlier_ < 8) {
        static int log_occurrences = 0;
        if (log_occurrences < 32) {
          ++log_occurrences;
          LOG(WARNING) << "FC DNNLOWP_ACC16 using outlier-aware quantization";
        }

        // Separate out outliers
        CAFFE_ENFORCE(!W_quantized_.empty());

        Wq_outlier_.reset(
            ExtractOutlierMatrix(1, K, N, nbits_in_non_outlier_, W_quantized_));
        int outlier_cnt = Wq_outlier_->ColPtr()[N];

        LOG(INFO) << "Proportion of outliers for FC layer with weight blob "
                  << this->debug_def().input(1) << " is "
                  << (float)outlier_cnt / W_quantized_.size();

        LOG(INFO) << "copy_to_32bit_frequency " << copy_to_32bit_frequency_;
      }

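      // Pack the dense (non-outlier) part of W for fbgemm; the int16_t
      // accumulation type selects the 16-bit-accumulation kernels.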
      Wq_acc16_packed_.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
          fbgemm::matrix_op_t::Transpose,
          K,
          N,
          reinterpret_cast<const int8_t*>(W_quantized_.data()),
          K));

      if (is_weight_constant_) {
        vector<T_signed>().swap(W_quantized_);
      }
    }
  }

  Y_shape_cache_ = X.sizes().vec();
  Y_shape_cache_.resize(canonical_axis + 1);
  Y_shape_cache_[canonical_axis] = N;
  Y->Resize(Y_shape_cache_);

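  // With nbits_in_non_outlier_ > 0 the dense part of W goes through
  // fbgemmPacked with 16-bit accumulation, and the sparse outlier product is
  // fused into the requantization epilogue via DoSpmdmOnInpBuffer; with 0,
  // everything is handled by the sparse path further below.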
  using namespace fbgemm;
  // main GEMM
  // TODO: omp parallelization
  Y_int32_.resize(Y->size());
  uint8_t* Ydata = GetQuantizedOutputData_();
  if (nbits_in_non_outlier_ > 0) {
    int row_offset_size_per_thread =
        PackAWithRowOffset<uint8_t, int16_t>::rowOffsetBufferSize();
    int x_pack_buf_size_per_thread =
        PackAWithRowOffset<uint8_t, int16_t>::packedBufferSize();
    this->row_offsets_.resize(row_offset_size_per_thread);
    this->X_pack_buf_.resize(x_pack_buf_size_per_thread);

    // TODO: use PackAMatrix if filter_qparams_[0].zero_point == 0
    PackAWithRowOffset<uint8_t, int16_t> packA(
        matrix_op_t::NoTranspose,
        M,
        K,
        Xdata,
        K,
        X_pack_buf_.data(),
        1, // group
        row_offsets_.data());

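    // packA also computes per-row sums of Xdata (getRowOffsetBuffer()), which
    // the epilogues below use to cancel the weight zero-point contribution.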
    if (!dequantize_output_) {
      DoNothing<> doNothingObj{};
      ReQuantizeOutput<false /* fuse relu */> reqObj(
          doNothingObj,
          this->requantization_multipliers_.data(),
          out_qparams_.zero_point,
          column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
          this->filter_zero_points_.data(),
          packA.getRowOffsetBuffer(),
          column_offsets_->empty() ? nullptr : column_offsets_->data(),
          this->b_quantized_data_,
          N); // ncols per quant group

      if (nbits_in_non_outlier_ < 8) {
        DoSpmdmOnInpBuffer<
            typename ReQuantizeOutput<false /* fuse relu */>::outType,
            int32_t,
            ReQuantizeOutput<false /* fuse relu */>>
            spmdmObj(reqObj, Xdata, K, *Wq_outlier_);

        fbgemmPacked(
            packA,
            *Wq_acc16_packed_,
            Ydata,
            Y_int32_.data(),
            N,
            spmdmObj,
            0, // thread_id
            1); // num_threads
      } else {
        fbgemmPacked(
            packA,
            *Wq_acc16_packed_,
            Ydata,
            Y_int32_.data(),
            N,
            reqObj,
            0, // thread_id
            1); // num_threads
      }
    } else {
      DoNothing<float, float> doNothingObj{};
      ReQuantizeForFloat<false /* fuse relu */> reqObj(
          doNothingObj,
          in_qparams_[0].scale,
          this->filter_scales_.data(),
          column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
          this->filter_zero_points_.data(),
          packA.getRowOffsetBuffer(),
          column_offsets_->empty() ? nullptr : column_offsets_->data(),
          this->b_dequantized_data_,
          N); // ncols per quant group

      if (nbits_in_non_outlier_ < 8) {
        DoSpmdmOnInpBuffer<
            typename ReQuantizeForFloat<false /* fuse relu */>::outType,
            int32_t,
            ReQuantizeForFloat<false /* fuse relu */>>
            spmdmObj(reqObj, Xdata, K, *Wq_outlier_);

        fbgemmPacked(
            packA,
            *Wq_acc16_packed_,
            Y->mutable_data<float>(),
            Y_int32_.data(),
            N,
            spmdmObj,
            0, // thread_id
            1); // num_threads
      } else {
        fbgemmPacked(
            packA,
            *Wq_acc16_packed_,
            Y->mutable_data<float>(),
            Y_int32_.data(),
            N,
            reqObj,
            0, // thread_id
            1); // num_threads
      }
    }
  } else {
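    // nbits_in_non_outlier_ == 0: every weight lives in the sparse outlier
    // matrix, so the whole product comes from SpMDM and the offset
    // corrections and (re)quantization are done manually below.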
    block_type_t block{0, static_cast<int>(M), 0, static_cast<int>(N)};
    Wq_outlier_->SpMDM(
        block, Xdata, K, false /* accumulate */, Y_int32_.data(), N);

    if (dequantize_output_) {
      float* Ydata_float = Output(0)->template mutable_data<float>();

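      // Subtract the zero-point correction terms (row_offset * W_zero_point
      // and X_zero_point * column_offset), then dequantize:
      // Y_float = Y_int32 * X_scale * W_scale + bias.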
#pragma omp parallel for
      for (int i = 0; i < M; ++i) {
        int32_t row_offset = 0;
        for (int k = 0; k < K; ++k) {
          row_offset += Xdata[i * K + k];
        }

        for (int j = 0; j < N; ++j) {
          int quant_group = this->quantize_channelwise_ ? j : 0;
          Y_int32_[i * N + j] -=
              row_offset * this->filter_qparams_[quant_group].zero_point;
          if (!column_offsets_->empty()) {
            Y_int32_[i * N + j] -=
                in_qparams_[0].zero_point * (*column_offsets_)[j];
          }
          Ydata_float[i * N + j] = Y_int32_[i * N + j] * in_qparams_[0].scale *
                  filter_qparams_[quant_group].scale +
              b_dequantized_data_[j];
        }
      }
    } else {
      // Add offsets/bias, and requantize
#pragma omp parallel for
      for (int i = 0; i < M; ++i) {
        int32_t row_offset = 0;
        for (int k = 0; k < K; ++k) {
          row_offset += Xdata[i * K + k];
        }

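        // fbgemm reference requantization: applies the row/column offset
        // corrections, adds the quantized bias, scales by the requantization
        // multiplier, and saturates to uint8 with the output zero point.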
        requantize_u8acc32_ref(
            1,
            N,
            N,
            Y_int32_.data() + i * N,
            Ydata + i * N,
            this->requantization_multipliers_.data(),
            out_qparams_.zero_point,
            column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
            this->filter_zero_points_.data(),
            &row_offset,
            column_offsets_->empty() ? nullptr : column_offsets_->data(),
            b_quantized_->data(),
            N); // ncols per quant group
      }
    }
  }

  if (!dequantize_output_) {
    PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
  }
  MeasureQuantizationError_();

  return true;
}

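// Expose this implementation as the DNNLOWP_ACC16 engine for both FC and
// Int8FC.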
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP_ACC16,
    FullyConnectedDNNLowPAcc16Op);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FC,
    DNNLOWP_ACC16,
    FullyConnectedDNNLowPAcc16Op);

} // namespace caffe2