Caffe2 - C++ API
A deep learning, cross platform ML framework
fbgemm_pack_op.cc
#include "fbgemm_pack_op.h"

#include "caffe2/core/tensor_int8.h"

#include "caffe2_dnnlowp_utils.h"

C10_DECLARE_int32(caffe2_dnnlowp_nbits_in_non_outlier);

namespace caffe2 {

using namespace std;
using dnnlowp::TensorQuantizationParams;

// Helper functions

template <typename T>
void QuantizeWeight(
    const Blob& blob,
    int kernel_dim,
    int M,
    vector<TensorQuantizationParams>& qparams,
    vector<typename make_signed<T>::type>& W_quantized,
    dnnlowp::QuantizationFactory* qfactory) {
  using T_signed = typename make_signed<T>::type;

  const auto& filter = blob.IsType<int8::Int8TensorCPU>()
      ? blob.Get<int8::Int8TensorCPU>().t
      : blob.Get<TensorCPU>();

  W_quantized.resize(filter.numel());

  int signed_min = -(1 << (qfactory->GetWeightPrecision() - 1));
  if (blob.IsType<int8::Int8TensorCPU>()) {
    qparams[0].scale = blob.Get<int8::Int8TensorCPU>().scale;
    qparams[0].zero_point =
        blob.Get<int8::Int8TensorCPU>().zero_point + signed_min;

    const T* W_data = filter.data<T>();
    for (auto i = 0; i < filter.numel(); ++i) {
      W_quantized[i] = W_data[i] + signed_min;
    }
  } else {
    for (int g = 0; g < qparams.size(); ++g) {
      size_t offset = g * (M / qparams.size()) * kernel_dim;
      qparams[g] = qfactory->ChooseQuantizationParams(
          filter.data<float>() + offset,
          (M / qparams.size()) * kernel_dim,
          true /*weight*/);

      // qparams[g] is computed for unsigned type.
      // Adjust for the fact that weight will actually use signed.
      qparams[g].zero_point += signed_min;

      fbgemm::Quantize<T_signed>(
          filter.data<float>() + offset,
          W_quantized.data() + offset,
          (M / qparams.size()) * kernel_dim,
          qparams[g]);
    }
  }
}
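
// Worked example of the signed adjustment above (illustrative numbers, not
// from this file): with the default 8-bit weight precision,
// signed_min = -(1 << 7) = -128. A pre-quantized uint8 weight of 200 with
// zero_point 128 becomes the int8 value 200 + (-128) = 72 with
// zero_point 128 + (-128) = 0, i.e. the same real value expressed in the
// signed range that fbgemm expects.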

template void QuantizeWeight<uint8_t>(
    const Blob& blob,
    int kernel_dim,
    int M,
    vector<TensorQuantizationParams>& qparams,
    vector<int8_t>& W_quantized,
    dnnlowp::QuantizationFactory* qfactory);

template void QuantizeWeight<uint16_t>(
    const Blob& blob,
    int kernel_dim,
    int M,
    vector<TensorQuantizationParams>& qparams,
    vector<int16_t>& W_quantized,
    dnnlowp::QuantizationFactory* qfactory);

// TODO reuse col_offsets_with_zero_pt_s8acc32_ref in fbgemm
// RefImplementations.cc. We can't do this now because W_quantized is
// not transposed here.
template <typename T>
void ComputeColumnOffsets(
    int num_rows,
    int num_cols,
    const T* W,
    const vector<TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets) {
  col_offsets.resize(num_cols);
  int num_quant_groups = qparams.size();
  for (int g = 0; g < num_quant_groups; ++g) {
    int j_begin = g * (num_cols / num_quant_groups);
    int j_end = j_begin + (num_cols / num_quant_groups);
    for (int j = j_begin; j < j_end; ++j) {
      int32_t sum = 0;
      for (int k = 0; k < num_rows; ++k) {
        sum += W[j * num_rows + k];
      }
      col_offsets[j] = sum - qparams[g].zero_point * num_rows;
    }
  }
}
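
// For each column j (an output channel), the loop above computes
//   col_offsets[j] = sum_k W[j][k] - weight_zero_point * num_rows,
// the per-column term the DNNLOWP requantization later uses to cancel the
// contribution of the activation zero_point in the int8 GEMM. Small
// illustrative case: num_rows = 2, column weights {3, -5}, weight
// zero_point 1 gives col_offsets[j] = (3 - 5) - 1 * 2 = -4.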

template void ComputeColumnOffsets<int8_t>(
    int num_rows,
    int num_cols,
    const int8_t* W,
    const vector<TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets);

template void ComputeColumnOffsets<int16_t>(
    int num_rows,
    int num_cols,
    const int16_t* W,
    const vector<TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets);

fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
    int groups,
    int kernel_dim,
    int M,
    int nbits_in_non_outlier,
    vector<int8_t>& W_quantized) {
  int outlier_cnt = 0;
  for (int group_id = 0; group_id < groups; ++group_id) {
    for (int i = 0; i < (M / groups) * kernel_dim; ++i) {
      int8_t w = W_quantized[group_id * (M / groups) * kernel_dim + i];
      bool is_outlier = nbits_in_non_outlier == 0 ||
          w < -(1 << (nbits_in_non_outlier - 1)) ||
          w >= (1 << (nbits_in_non_outlier - 1));
      if (is_outlier) {
        ++outlier_cnt;
      }
    }
  }

  fbgemm::CompressedSparseColumn* Wq_outlier =
      new fbgemm::CompressedSparseColumn(kernel_dim, M);
  Wq_outlier->RowIdx().resize(outlier_cnt);
  Wq_outlier->Values().resize(outlier_cnt);

  outlier_cnt = 0;
  for (int group_id = 0; group_id < groups; ++group_id) {
    for (int j = 0; j < M / groups; ++j) {
      Wq_outlier->ColPtr()[group_id * (M / groups) + j] = outlier_cnt;

      for (int k = 0; k < kernel_dim; ++k) {
        int8_t w = W_quantized[(group_id * (M / groups) + j) * kernel_dim + k];
        bool is_outlier = nbits_in_non_outlier == 0 ||
            w < -(1 << (nbits_in_non_outlier - 1)) ||
            w >= (1 << (nbits_in_non_outlier - 1));
        if (is_outlier) {
          CAFFE_ENFORCE_LE(k, numeric_limits<int16_t>::max());
          Wq_outlier->RowIdx()[outlier_cnt] = k;
          Wq_outlier->Values()[outlier_cnt] = w;
          ++outlier_cnt;

          W_quantized[(group_id * (M / groups) + j) * kernel_dim + k] = 0;
        }
      }
    }
  } // for each group
  Wq_outlier->ColPtr()[M] = outlier_cnt;

  return Wq_outlier;
}
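
// Sketch of what the split above achieves (numbers are illustrative): with
// nbits_in_non_outlier = 7 the "dense" range is [-64, 63]. A quantized
// weight of 75 is an outlier, so it is recorded in the compressed sparse
// column matrix Wq_outlier (one column per output channel) and zeroed out in
// W_quantized. The dense remainder can then be multiplied with 16-bit
// accumulation, while the sparse outlier part is handled separately with
// 32-bit accumulation.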

// FIXME: code duplication with ConvDNNLowPOp::QuantizeBias_
static void QuantizeConvBias(
    const Blob& blob,
    int M,
    const TensorQuantizationParams& in_qparams,
    const vector<TensorQuantizationParams>& filter_qparams,
    vector<int32_t>& b_quantized) {
  const auto& bias = blob.IsType<int8::Int8TensorCPU>()
      ? blob.Get<int8::Int8TensorCPU>().t
      : blob.Get<TensorCPU>();
  if (blob.IsType<int8::Int8TensorCPU>()) {
    TensorQuantizationParams bias_qparams;
    bias_qparams.scale = blob.Get<int8::Int8TensorCPU>().scale;
    bias_qparams.zero_point = blob.Get<int8::Int8TensorCPU>().zero_point;
    CAFFE_ENFORCE_LE(
        std::abs(
            bias_qparams.scale - in_qparams.scale * filter_qparams[0].scale),
        1e-4);
    CAFFE_ENFORCE_EQ(bias_qparams.zero_point, 0);
    b_quantized.resize(bias.numel());
    b_quantized.assign(
        bias.data<int32_t>(), bias.data<int32_t>() + bias.numel());
  } else {
    const float* bdata = bias.data<float>();
    b_quantized.resize(bias.numel());
    for (int g = 0; g < filter_qparams.size(); ++g) {
      int i_begin = g * (M / filter_qparams.size());
      int i_end = i_begin + (M / filter_qparams.size());
      for (int i = i_begin; i < i_end; ++i) {
        b_quantized[i] = fbgemm::Quantize<int32_t>(
            bdata[i],
            0,
            in_qparams.scale * filter_qparams[g].scale,
            32,
            true /* signed */);
      }
    }
  }
}
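
// Bias quantization uses zero_point 0 and scale = input_scale * weight_scale,
// so the int32 bias can be added directly to the int32 accumulator. Worked
// example with made-up numbers: bias 0.5, input scale 0.02, weight scale 0.01
// gives a quantized bias of round(0.5 / (0.02 * 0.01)) = 2500.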

// FullyConnectedDNNLowPPackWeightOp

FullyConnectedDNNLowPPackWeightOp::FullyConnectedDNNLowPPackWeightOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : DNNLowPOp<uint8_t, FCFp32Op>(operator_def, ws),
      axis_w_(this->GetSingleArgument<int32_t>("axis_w", 1)),
      quantize_channelwise_(
          this->GetSingleArgument<bool>("quantize_channelwise", false)) {
  if (this->debug_def().engine() == "DNNLOWP_ROWWISE") {
    quantize_channelwise_ = true;
  }
  if (this->debug_def().engine() == "DNNLOWP_ACC16") {
    nbits_in_non_outlier_ = this->GetSingleArgument<int>(
        "nbits_in_non_outlier", FLAGS_caffe2_dnnlowp_nbits_in_non_outlier);
  }
}

bool FullyConnectedDNNLowPPackWeightOp::RunOnDevice() {
  const auto& filter = InputTensorCPU_(0);
  const auto canonical_axis_w = filter.canonical_axis_index(axis_w_);
  const auto K = filter.size_from_dim(canonical_axis_w);
  const auto N = filter.size_to_dim(canonical_axis_w);

  auto* Y = this->Output<Int8FCDNNLowPPackedWeightBlob>(0);

  // Create a tensor with the same shape; this new tensor shouldn't actually
  // allocate memory for the tensor. It is just a convenient way to pass
  // tensor shape information.
  Y->original_tensor.ResizeLike(filter);

  Y->qparams.resize(quantize_channelwise_ ? N : 1);

  vector<int8_t> W_quantized;
  QuantizeWeight<uint8_t>(
      InputBlob(0), K, N, Y->qparams, W_quantized, qfactory_.get());

  if (this->InputIsType<int8::Int8TensorCPU>(0) && quantize_channelwise_) {
    static int log_occurences = 0;
    if (log_occurences < 32) {
      ++log_occurences;
      LOG(WARNING) << "Cannot do row-wise quantization for "
                      "pre-quantized weight "
                   << this->debug_def().input(0);
    }
  }

  // Pre-compute column offsets.
  // This must happen before ExtractOutlierMatrix because W_quantized is
  // modified in ExtractOutlierMatrix.
  Y->column_offsets.reset(new vector<int32_t>());
  ComputeColumnOffsets(
      K, N, W_quantized.data(), Y->qparams, *Y->column_offsets);

  if (this->debug_def().engine() == "DNNLOWP_ACC16") {
    if (nbits_in_non_outlier_ < 8) {
      Y->W_outlier.reset(
          ExtractOutlierMatrix(1, K, N, nbits_in_non_outlier_, W_quantized));
      int outlier_cnt = Y->W_outlier->ColPtr()[N];

      LOG(INFO) << "Proportion of outlier for FC layer with weight blob "
                << this->debug_def().input(0) << " is "
                << static_cast<float>(outlier_cnt) / W_quantized.size();
      LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_;
    }

    Y->nbits_in_non_outlier = nbits_in_non_outlier_;
    Y->W_acc16.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
        fbgemm::matrix_op_t::Transpose,
        K,
        N,
        W_quantized.data(),
        K,
        nullptr, // pmat
        1)); // group
  } else {
    Y->W.reset(new fbgemm::PackBMatrix<int8_t>(
        fbgemm::matrix_op_t::Transpose,
        K,
        N,
        W_quantized.data(),
        K,
        nullptr, // pmat
        1)); // group
  }

  // Quantize bias
  if (InputSize() >= 2) {
    TensorQuantizationParams in_qparams;
    CAFFE_ENFORCE(HasSingleArgumentOfType<float>("in_scale"));
    in_qparams.scale = GetSingleArgument<float>("in_scale", 0);
    Y->bias.reset(new vector<int32_t>());
    QuantizeConvBias(InputBlob(1), N, in_qparams, Y->qparams, *Y->bias);
  } else {
    Y->bias = nullptr;
  }

  return true;
}
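
// A minimal sketch (not part of this file) of how Int8FCPackWeight is
// typically wired into a NetDef; the blob names and the in_scale value are
// hypothetical:
//
//   op {
//     type: "Int8FCPackWeight"
//     engine: "DNNLOWP"
//     input: "W"                  # fp32 or Int8TensorCPU weight
//     input: "b"                  # optional bias; requires in_scale below
//     output: "W_packed"
//     arg { name: "quantize_channelwise" i: 1 }
//     arg { name: "in_scale" f: 0.02 }  # scale of the FC input activation
//   }
//
// The packed output is then passed as the weight input of the corresponding
// Int8FC operator run with the same engine.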

// ConvDNNLowPPackWeightOp

ConvDNNLowPPackWeightOp::ConvDNNLowPPackWeightOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : ConvPoolDNNLowPOpBase<uint8_t, ConvFp32Op>(operator_def, ws),
      quantize_groupwise_(
          this->GetSingleArgument<bool>("quantize_groupwise", false)) {
  if (this->debug_def().engine() == "DNNLOWP_ACC16") {
    nbits_in_non_outlier_ = this->GetSingleArgument<int>(
        "nbits_in_non_outlier", FLAGS_caffe2_dnnlowp_nbits_in_non_outlier);
  }
}

bool ConvDNNLowPPackWeightOp::TakeDepthWise3x3FastPath_() {
  const auto& filter = this->InputTensorCPU_(FILTER);
  // The number of output channels
  int M = filter.dim32(0);
  // The number of input channels per group
  int C_per_group = filter.dim32(filter.dim() - 1);
  return this->debug_def().engine() != "DNNLOWP_ACC16" && group_ == M &&
      C_per_group == 1 && group_ % 8 == 0 && this->kernel_.size() == 2 &&
      kernel_h() == 3 && kernel_w() == 3 && stride_h() == stride_w() &&
      (stride_h() == 1 || stride_h() == 2) && dilation_h() == 1 &&
      dilation_w() == 1 && pad_t() == 1 && pad_b() == 1 && pad_l() == 1 &&
      pad_r() == 1 && GetCpuId().avx2();
}

bool ConvDNNLowPPackWeightOp::TakeDepthWise3x3x3FastPath_() {
  const auto& filter = this->InputTensorCPU_(FILTER);
  // The number of output channels
  int M = filter.dim32(0);
  // The number of input channels per group
  int C_per_group = filter.dim32(filter.dim() - 1);
  bool ret = this->debug_def().engine() != "DNNLOWP_ACC16" && group_ == M &&
      C_per_group == 1 && group_ % 8 == 0 && this->kernel_.size() == 3 &&
      this->kernel_[0] == 3 && this->kernel_[1] == 3 && this->kernel_[2] == 3 &&
      this->stride_[0] == this->stride_[1] &&
      this->stride_[0] == this->stride_[2] &&
      (this->stride_[0] == 1 || this->stride_[0] == 2) &&
      this->dilation_[0] == 1 && this->dilation_[1] == 1 &&
      this->dilation_[2] == 1 &&
      accumulate(
          this->pads_.begin(), this->pads_.end(), 1, multiplies<int>()) == 1 &&
      GetCpuId().avx2();
  return ret;
}
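
// Example of a convolution that satisfies the 3x3 depthwise fast path above
// (illustrative): a 2D depthwise conv with M = group_ = 32, C_per_group = 1,
// a 3x3 kernel, stride 1, dilation 1, and padding 1 on all sides, running on
// an AVX2-capable CPU with an engine other than DNNLOWP_ACC16.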

bool ConvDNNLowPPackWeightOp::TakeGConvFastPath_() {
  if (this->debug_def().engine() == "DNNLOWP_ACC16" ||
      this->kernel_.size() != 2) {
    return false;
  }

  auto& filter = InputTensorCPU_(FILTER);
  const int M = filter.dim32(0), C = filter.dim32(filter.dim() - 1) * group_;
  fbgemm::conv_param_t<> conv_p(
      1,
      C,
      M,
      {1, 1},
      group_,
      {this->kernel_[0], this->kernel_[1]},
      {this->stride_[0], this->stride_[1]},
      {this->pads_[0], this->pads_[1], this->pads_[2], this->pads_[3]});

  return fbgemm::fbgemmOptimizedGConv(conv_p);
}

bool ConvDNNLowPPackWeightOp::RunOnDevice() {
  const auto& filter = InputTensorCPU_(FILTER);

  auto* Y = this->Output<Int8ConvDNNLowPPackedWeightBlob>(0);
  // Create a tensor with the same shape; this new tensor shouldn't actually
  // allocate memory for the tensor. It is just a convenient way to pass
  // tensor shape information.
  Y->original_tensor.ResizeLike(filter);

  // Assume KRSC layout
  // The number of output channels
  int M = filter.dim32(0);
  // The number of input channels per group
  int C_per_group = filter.dim32(filter.dim() - 1);

  int kernel_dims_size = 1;
  for (int i = 0; i < filter.dim() - 2; ++i) {
    kernel_dims_size *= filter.dim32(i + 1);
  }
  int kernel_dim = C_per_group * kernel_dims_size;
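
  // Example of the KRSC bookkeeping above (illustrative shapes): a 3x3 conv
  // with 64 output channels, 32 input channels, and group_ = 1 has a filter
  // of shape [64, 3, 3, 32], so M = 64, C_per_group = 32,
  // kernel_dims_size = 9, and kernel_dim = 32 * 9 = 288 (the reduction
  // dimension of the im2col GEMM that the packed matrix is used in).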

  vector<int8_t> W_quantized;
  Y->qparams.resize(quantize_groupwise_ ? group_ : 1);
  QuantizeWeight<uint8_t>(
      InputBlob(FILTER),
      kernel_dim,
      M,
      Y->qparams,
      W_quantized,
      qfactory_.get());

  if (this->InputIsType<int8::Int8TensorCPU>(FILTER) && quantize_groupwise_) {
    static int log_occurences = 0;
    if (log_occurences < 32) {
      ++log_occurences;
      LOG(WARNING) << "Cannot do group-wise quantization for "
                      "pre-quantized weight "
                   << this->debug_def().input(0);
    }
  }

  // Pre-compute column offsets.
  // This must happen before ExtractOutlierMatrix because W_quantized is
  // modified in ExtractOutlierMatrix.
  Y->column_offsets.reset(new vector<int32_t>());
  ComputeColumnOffsets(
      kernel_dim, M, W_quantized.data(), Y->qparams, *Y->column_offsets);

  // When nbits_in_non_outlier == 0, we fall back to acc32
  if (this->debug_def().engine() == "DNNLOWP_ACC16" &&
      nbits_in_non_outlier_ > 0) {
    if (nbits_in_non_outlier_ < 8) {
      Y->W_outlier.reset(ExtractOutlierMatrix(
          group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized));
      int outlier_cnt = Y->W_outlier->ColPtr()[M];

      LOG(INFO) << "Proportion of outlier for Conv layer with weight blob "
                << this->debug_def().input(0) << " is "
                << static_cast<float>(outlier_cnt) / W_quantized.size();
      LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_;
    }

    Y->nbits_in_non_outlier = nbits_in_non_outlier_;
    Y->W_acc16.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
        fbgemm::matrix_op_t::Transpose,
        group_ * kernel_dim,
        M / group_,
        W_quantized.data(),
        kernel_dim,
        nullptr, // pmat
        group_));
  } else if (TakeDepthWise3x3FastPath_()) {
    Y->W_depthwise_3x3.reset(
        new fbgemm::Packed3x3ConvMatrix(group_, W_quantized.data()));
  } else if (TakeDepthWise3x3x3FastPath_()) {
    Y->W_depthwise_3x3x3.reset(
        new fbgemm::Packed3x3x3ConvMatrix(group_, W_quantized.data()));
  } else if (TakeGConvFastPath_()) {
    fbgemm::conv_param_t<> conv_p(
        1,
        group_ * C_per_group,
        M,
        {1, 1},
        group_,
        {this->kernel_[0], this->kernel_[1]},
        {this->stride_[0], this->stride_[1]},
        {this->pads_[0], this->pads_[1], this->pads_[2], this->pads_[3]});

    Y->W_gconv.reset(new fbgemm::PackWeightMatrixForGConv<int8_t>(
        fbgemm::matrix_op_t::Transpose, conv_p, W_quantized.data()));
  } else {
    Y->W.reset(new fbgemm::PackBMatrix<int8_t>(
        fbgemm::matrix_op_t::Transpose,
        group_ * kernel_dim,
        M / group_,
        W_quantized.data(),
        kernel_dim,
        nullptr, // pmat
        group_));
  }

  // Quantize bias
  if (InputSize() >= 2) {
    TensorQuantizationParams in_qparams;
    CAFFE_ENFORCE(HasSingleArgumentOfType<float>("in_scale"));
    in_qparams.scale = GetSingleArgument<float>("in_scale", 0);
    Y->bias.reset(new vector<int32_t>());
    QuantizeConvBias(InputBlob(BIAS), M, in_qparams, Y->qparams, *Y->bias);
  } else {
    Y->bias = nullptr;
  }

  return true;
}

// Explicitly register TypeMeta
CAFFE_KNOWN_TYPE(Int8FCDNNLowPPackedWeightBlob);
CAFFE_KNOWN_TYPE(Int8ConvDNNLowPPackedWeightBlob);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FCPackWeight,
    DNNLOWP,
    FullyConnectedDNNLowPPackWeightOp);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FCPackWeight,
    DNNLOWP_ACC16,
    FullyConnectedDNNLowPPackWeightOp);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FCPackWeight,
    DNNLOWP_ROWWISE,
    FullyConnectedDNNLowPPackWeightOp);

OPERATOR_SCHEMA(Int8FCPackWeight)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .SetDoc(R"DOC(Prepack weight for Int8FC)DOC")
    .Input(0, "W", "Weight tensor in KRSC layout")
    .Input(1, "b", "Bias tensor")
    .Output(0, "W_q", "Weight/bias tensor in a packed format");

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8ConvPackWeight,
    DNNLOWP,
    ConvDNNLowPPackWeightOp);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8ConvPackWeight,
    DNNLOWP_ACC16,
    ConvDNNLowPPackWeightOp);

OPERATOR_SCHEMA(Int8ConvPackWeight)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .SetDoc(R"DOC(Prepack weight for Int8Conv)DOC")
    .Input(0, "W", "Weight tensor in KRSC layout")
    .Input(1, "b", "Bias tensor")
    .Output(0, "W_q", "Weight/bias tensor in a packed format");

} // namespace caffe2