Caffe2 - C++ API
A deep learning, cross platform ML framework
fully_connected_dnnlowp_op.cc
1 #include "fully_connected_dnnlowp_op.h"
2 
3 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
4 #include <chrono>
5 #endif
6 
7 #include "caffe2/core/flags.h"
8 #include "caffe2/core/tensor_int8.h"
9 #include "caffe2/utils/cpuid.h"
10 #include "fbgemm_pack_matrix_cache.h"
11 #include "fbgemm_pack_op.h"
12 #include "mmio.h"
13 
14 C10_DEFINE_bool(
15  caffe2_dnnlowp_enforce_default_operators,
16  false,
17  "When true, enforce to use the default Caffe2 operators inside DNNLOWP"
18  "instead of using its own implementation that uses AVX2 instructions"
19  "(currently only honored by FC)");
20 
21 C10_DECLARE_bool(caffe2_dnnlowp_dump_tensors);
22 
23 namespace caffe2 {
24 
25 using namespace std;
26 
// Constructor: reads the quantization-related operator arguments
// ("axis", "axis_w", "quantize_channelwise", "constant_weight") and
// allocates the shared buffers for the quantized bias and the per-column
// weight offsets. The ROWWISE engines force channelwise quantization
// regardless of the "quantize_channelwise" argument.
template <typename T>
FullyConnectedDNNLowPOp<T>::FullyConnectedDNNLowPOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : BaseType(operator_def, ws),
      axis_(this->template GetSingleArgument<int32_t>("axis", 1)),
      axis_w_(this->template GetSingleArgument<int32_t>("axis_w", 1)),
      quantize_channelwise_(this->template GetSingleArgument<bool>(
          "quantize_channelwise",
          false)),
      b_quantized_(make_shared<vector<int32_t>>()),
      column_offsets_(make_shared<vector<int32_t>>()),
      is_weight_constant_(
          this->template GetSingleArgument<bool>("constant_weight", true)) {
  // Non-constant weights disable all the caching done in
  // GetQuantizationParameters_(); log it since it is unusual.
  if (!is_weight_constant_) {
    LOG(INFO) << operator_def.output(0) << " is_weight_constant "
              << is_weight_constant_;
  }
  // Row-wise engines quantize each output channel separately.
  if (this->debug_def().engine() == "DNNLOWP_ROWWISE" ||
      this->debug_def().engine() == "DNNLOWP_ROWWISE_16") {
    quantize_channelwise_ = true;
  }

  VLOG(2) << "DNNLOWP FC with output " << operator_def.output(0);
}
52 
53 template <typename T>
54 bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
55  using namespace std;
56  using namespace dnnlowp;
57 
58  this->ParseDNNLowPOperatorArguments_();
59 
60  if ((!GetCpuId().avx2() || FLAGS_caffe2_dnnlowp_enforce_default_operators) &&
61  dequantize_output_) {
62  if (!GetCpuId().avx2()) {
63  static int log_occurences = 0;
64  if (log_occurences < 32) {
65  ++log_occurences;
66  LOG(WARNING)
67  << "Falling back to the default Caffe2 operator because AVX2 "
68  "instruction is not available";
69  }
70  } else {
71  static int log_occurences = 0;
72  if (log_occurences < 32) {
73  ++log_occurences;
74  LOG(WARNING) << "Falling back to the default Caffe2 operator because "
75  "dnnlowp_enforce_default_caffe2_operators option is on";
76  }
77  }
78 
79  Fp32Op_()->DequantizeInput();
80  FullyConnectedOp<CPUContext>* fp32_op = Fp32Op_()->Get();
81  if (!fp32_op->RunOnDevice()) {
82  return false;
83  }
84 
85  auto* Y_ref = fp32_op->Output(0);
86  auto* Y = OutputTensorCPU_(0);
87  Y->ResizeLike(*Y_ref);
88  fp32_op->context_.CopyItemsSameDevice(
89  Y_ref->dtype(),
90  Y_ref->size(),
91  Y_ref->raw_data(),
92  Y->raw_mutable_data(Y_ref->dtype()));
93  return true;
94  }
95 
96 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
97  chrono::time_point<chrono::system_clock> t_very_begin, t_begin, t_end;
98  /* if (VLOG_IS_ON(3)) */
99  {
100  t_begin = chrono::system_clock::now();
101  t_very_begin = t_begin;
102  }
103 #endif
104 
105  // Get quantization parameters
106  if (!GetQuantizationParameters_()) {
107  return false;
108  }
109 
110 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
111  /* if (VLOG_IS_ON(3)) */
112  {
113  t_end = chrono::system_clock::now();
114  double dt = chrono::duration<double>(t_end - t_begin).count();
115  LOG(INFO) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3
116  << " ms";
117  t_begin = chrono::system_clock::now();
118  }
119 #endif
120 
121  const auto& X = InputTensorCPU_(0);
122  const auto& W = InputTensorCPU_(1);
123  auto* Y = OutputTensorCPU_(0);
124  const auto canonical_axis = X.canonical_axis_index(axis_);
125  const auto M = X.size_to_dim(canonical_axis);
126  const auto K = X.size_from_dim(canonical_axis);
127  const auto canonical_axis_w = W.canonical_axis_index(axis_w_);
128  const int N = W.size_to_dim(canonical_axis_w);
129 
130  const T_signed* Wdata = W_quantized_.data();
131 
132  Y_shape_cache_ = X.sizes().vec();
133  Y_shape_cache_.resize(canonical_axis + 1);
134  Y_shape_cache_[canonical_axis] = N;
135  Y->Resize(Y_shape_cache_);
136 
137  const T* Xdata = nullptr;
138  vector<T> X_temp;
139 
140 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
141  /* if (VLOG_IS_ON(1)) */
142  {
143  t_end = chrono::system_clock::now();
144  double dt = chrono::duration<double>(t_end - t_begin).count();
145  LOG(INFO) << "@PERF this=" << this << " initialize parameters: " << dt * 1e3
146  << " ms";
147  t_begin = chrono::system_clock::now();
148  }
149 #endif
150 
151  if (Wq_packed_) {
152  // fast path to use fbgemm
153  using namespace fbgemm;
154 
155  if (X.template IsType<T>() || !dequantize_output_) {
156  // Only when input and output are float, we don't need input to be
157  // quantized.
158 
159 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
160  /* if (VLOG_IS_ON(1)) */
161  { t_begin = chrono::system_clock::now(); }
162 #endif
163 
164  Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);
165 
166 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
167  /* if (VLOG_IS_ON(1)) */
168  {
169  t_end = chrono::system_clock::now();
170  double dt = chrono::duration<double>(t_end - t_begin).count();
171  LOG(INFO) << "@PERF this=" << this
172  << " input quantization: " << dt * 1e3 << " ms";
173  t_begin = chrono::system_clock::now();
174  }
175 #endif
176  }
177 
178 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
179  /* if (VLOG_IS_ON(1)) */
180  { t_begin = chrono::system_clock::now(); }
181 #endif
182 
183  if (!dequantize_output_) {
184  Y_int32_.resize(Y->size());
185  DoNothing<> doNothingObj{};
186 
187  if (quantize_channelwise_ || filter_qparams_[0].zero_point) {
188  row_offsets_.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
189  X_pack_buf_.resize(PackAWithRowOffset<uint8_t>::packedBufferSize());
190 
191  PackAWithRowOffset<uint8_t> packA(
192  matrix_op_t::NoTranspose,
193  M,
194  K,
195  reinterpret_cast<const uint8_t*>(Xdata),
196  K,
197  X_pack_buf_.data(), // buffer for packed matrix
198  1, // group
199  row_offsets_.data());
200 
201  if (quantize_channelwise_) {
202  ReQuantizeOutput<
203  false /* FUSE_RELU */,
204  QuantizationGranularity::OUT_CHANNEL>
205  outputProcObj(
206  doNothingObj,
207  requantization_multipliers_.data(),
208  out_qparams_.zero_point,
209  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
210  filter_zero_points_.data(),
211  packA.getRowOffsetBuffer(),
212  column_offsets_->empty() ? nullptr : column_offsets_->data(),
213  b_quantized_data_,
214  N);
215 
216  fbgemmPacked(
217  packA,
218  *Wq_packed_,
219  reinterpret_cast<uint8_t*>(
220  OutputTensorCPU_(0)->template mutable_data<T>()),
221  Y_int32_.data(),
222  N,
223  outputProcObj,
224  0, // thread_id
225  1); // num_threads
226  } else {
227  ReQuantizeOutput<false /* FUSE_RELU */> outputProcObj(
228  doNothingObj,
229  requantization_multipliers_.data(),
230  out_qparams_.zero_point,
231  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
232  filter_zero_points_.data(),
233  packA.getRowOffsetBuffer(),
234  column_offsets_->empty() ? nullptr : column_offsets_->data(),
235  b_quantized_data_,
236  N);
237 
238  fbgemmPacked(
239  packA,
240  *Wq_packed_,
241  reinterpret_cast<uint8_t*>(
242  OutputTensorCPU_(0)->template mutable_data<T>()),
243  Y_int32_.data(),
244  N,
245  outputProcObj,
246  0, // thread_id
247  1); // num_threads
248  }
249  } else {
250  X_pack_buf_.resize(PackAMatrix<uint8_t>::packedBufferSize());
251 
252  PackAMatrix<uint8_t> packA(
253  matrix_op_t::NoTranspose,
254  M,
255  K,
256  reinterpret_cast<const uint8_t*>(Xdata),
257  K,
258  X_pack_buf_.data(), // buffer for packed matrix
259  1); // group
260 
261  ReQuantizeOutput<false /* FUSE_RELU */> outputProcObj(
262  doNothingObj,
263  requantization_multipliers_.data(),
264  out_qparams_.zero_point,
265  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
266  filter_zero_points_.data(),
267  nullptr,
268  column_offsets_->empty() ? nullptr : column_offsets_->data(),
269  b_quantized_data_,
270  N);
271 
272  fbgemmPacked(
273  packA,
274  *Wq_packed_,
275  reinterpret_cast<uint8_t*>(
276  OutputTensorCPU_(0)->template mutable_data<T>()),
277  Y_int32_.data(),
278  N,
279  outputProcObj,
280  0, // thread_id
281  1); // num_threads
282  }
283  } else {
284  // dequantize_output
285  float* Y_data = OutputTensorCPU_(0)->template mutable_data<float>();
286 
287  if (!X.template IsType<T>()) {
288  // Both input and output are float
289  row_offsets_.resize(
290  PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
291  X_pack_buf_.resize(
292  PackAWithQuantRowOffset<uint8_t>::packedBufferSize());
293  PackAWithQuantRowOffset<uint8_t> packA(
294  matrix_op_t::NoTranspose,
295  M,
296  K,
297  X.template data<float>(),
298  K,
299  X_pack_buf_.data(), // buffer for packed matrix
300  in_qparams_[0].scale,
301  in_qparams_[0].zero_point,
302  1, // groups
303  row_offsets_.data());
304 
305  DoNothing<float, float> doNothingObj{};
306 
307  if (quantize_channelwise_) {
308  ReQuantizeForFloat<
309  false /* FUSE_RELU*/,
310  QuantizationGranularity::OUT_CHANNEL>
311  outputProcObj(
312  doNothingObj,
313  in_qparams_[0].scale,
314  filter_scales_.data(),
315  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
316  filter_zero_points_.data(),
317  packA.getRowOffsetBuffer(),
318  column_offsets_->empty() ? nullptr : column_offsets_->data(),
319  b_dequantized_data_, // bias
320  N);
321 
322  fbgemmPacked(
323  packA,
324  *Wq_packed_,
325  Y_data,
326  reinterpret_cast<int32_t*>(Y_data),
327  N,
328  outputProcObj,
329  0, // thread_id
330  1); // num_threads
331  } else {
332  ReQuantizeForFloat<false /* FUSE_RELU*/> outputProcObj(
333  doNothingObj,
334  in_qparams_[0].scale,
335  filter_scales_.data(),
336  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
337  filter_zero_points_.data(),
338  packA.getRowOffsetBuffer(),
339  column_offsets_->empty() ? nullptr : column_offsets_->data(),
340  b_dequantized_data_, // bias
341  N);
342 
343  fbgemmPacked(
344  packA,
345  *Wq_packed_,
346  Y_data,
347  reinterpret_cast<int32_t*>(Y_data),
348  N,
349  outputProcObj,
350  0, // thread_id
351  1); // num_threads
352  }
353  } else {
354  // Input quantized and output float
355  row_offsets_.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
356  X_pack_buf_.resize(PackAWithRowOffset<uint8_t>::packedBufferSize());
357  PackAWithRowOffset<uint8_t> packA(
358  matrix_op_t::NoTranspose,
359  M,
360  K,
361  reinterpret_cast<const uint8_t*>(Xdata),
362  K,
363  X_pack_buf_.data(), // buffer for packed matrix
364  1, // group
365  row_offsets_.data());
366 
367  DoNothing<float, float> doNothingObj{};
368 
369  if (quantize_channelwise_) {
370  ReQuantizeForFloat<
371  false /* FUSE_RELU*/,
372  QuantizationGranularity::OUT_CHANNEL>
373  outputProcObj(
374  doNothingObj,
375  in_qparams_[0].scale,
376  filter_scales_.data(),
377  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
378  filter_zero_points_.data(),
379  packA.getRowOffsetBuffer(),
380  column_offsets_->empty() ? nullptr : column_offsets_->data(),
381  b_dequantized_data_, // bias
382  N);
383 
384  fbgemmPacked(
385  packA,
386  *Wq_packed_,
387  Y_data,
388  reinterpret_cast<int32_t*>(Y_data),
389  N,
390  outputProcObj,
391  0, // thread_id
392  1); // num_threads
393  } else {
394  ReQuantizeForFloat<false /* FUSE_RELU*/> outputProcObj(
395  doNothingObj,
396  in_qparams_[0].scale,
397  filter_scales_.data(),
398  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
399  filter_zero_points_.data(),
400  packA.getRowOffsetBuffer(),
401  column_offsets_->empty() ? nullptr : column_offsets_->data(),
402  b_dequantized_data_, // bias
403  N);
404 
405  fbgemmPacked(
406  packA,
407  *Wq_packed_,
408  Y_data,
409  reinterpret_cast<int32_t*>(Y_data),
410  N,
411  outputProcObj,
412  0, // thread_id
413  1); // num_threads
414  }
415  }
416  } // dequantize_output
417  } else {
418  // Quantize X
419 
420 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
421  /* if (VLOG_IS_ON(1)) */
422  { t_begin = chrono::system_clock::now(); }
423 #endif
424 
425  Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);
426 
427 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
428  /* if (VLOG_IS_ON(1)) */
429  {
430  t_end = chrono::system_clock::now();
431  double dt = chrono::duration<double>(t_end - t_begin).count();
432  LOG(INFO) << "@PERF this=" << this << " input quantization: " << dt * 1e3
433  << " ms";
434  t_begin = chrono::system_clock::now();
435  }
436 #endif
437 
438  Y_int32_.resize(Y->size());
439  for (int i = 0; i < M; ++i) {
440  for (int j = 0; j < N; ++j) {
441  int32_t sum = 0;
442  for (int k = 0; k < K; ++k) {
443  int w = Wdata[j * K + k];
444  sum += Xdata[i * K + k] * w;
445  }
446  Y_int32_[i * N + j] = sum;
447  } // for each output element
448  } // for each row
449  }
450 
451  if (FLAGS_caffe2_dnnlowp_dump_tensors) {
452  // Dump input activation
453  StoreMatrixInMatrixMarketFormat(M, K, Xdata, this->debug_def().input(0));
454 
455  // Dump weight
456  StoreMatrixInMatrixMarketFormat(N, K, Wdata, this->debug_def().input(1));
457  }
458 
459 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
460  /* if (VLOG_IS_ON(1)) */
461  {
462  t_end = chrono::system_clock::now();
463  double dt = chrono::duration<double>(t_end - t_begin).count();
464  LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
465 
466  t_begin = chrono::system_clock::now();
467  }
468 #endif
469 
470  // Adjust with bias and zero_point and then requantize
471  // See batch_matmul_dnnlowp_op.cc to why we compute column_offsets,
472  // row_offset, and const_offset in this way.
473  if (dequantize_output_) {
474  if (!Wq_packed_) {
475  float* Ydata = OutputTensorCPU_(0)->template mutable_data<float>();
476 
477  for (int i = 0; i < M; ++i) {
478  int32_t row_offset = 0;
479  for (int k = 0; k < K; ++k) {
480  row_offset += Xdata[i * K + k];
481  }
482 
483  for (int j = 0; j < N; ++j) {
484  if (!column_offsets_->empty()) {
485  Y_int32_[i * N + j] -=
486  in_qparams_[0].zero_point * (*column_offsets_)[j];
487  }
488  int quant_group = quantize_channelwise_ ? j : 0;
489  Y_int32_[i * N + j] -=
490  row_offset * filter_qparams_[quant_group].zero_point;
491  Ydata[i * N + j] = Y_int32_[i * N + j] * in_qparams_[0].scale *
492  filter_qparams_[quant_group].scale +
493  b_dequantized_data_[j];
494  }
495  }
496  }
497  } else {
498  if (!Wq_packed_) {
499  T* Ydata = GetQuantizedOutputData_();
500  for (int i = 0; i < M; ++i) {
501  int32_t row_offset = 0;
502  for (int k = 0; k < K; ++k) {
503  row_offset += Xdata[i * K + k];
504  }
505 
506  for (int j = 0; j < N; ++j) {
507  if (!column_offsets_->empty()) {
508  // empty column offset means it's folded into bias
509  Y_int32_[i * N + j] -=
510  in_qparams_[0].zero_point * (*column_offsets_)[j];
511  }
512  int quant_group = quantize_channelwise_ ? j : 0;
513  Y_int32_[i * N + j] -=
514  row_offset * filter_qparams_[quant_group].zero_point;
515  Y_int32_[i * N + j] += b_quantized_data_[j];
516 
517  Ydata[i * N + j] = fbgemm::Requantize<T>(
518  Y_int32_[i * N + j], requantization_params_[quant_group]);
519  }
520  }
521  }
522 
523  PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
524  }
525 
526  MeasureQuantizationError_();
527 
528 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
529  /* if (VLOG_IS_ON(1)) */
530  {
531  t_end = chrono::system_clock::now();
532  double dt = chrono::duration<double>(t_end - t_begin).count();
533  LOG(INFO) << "@PERF this=" << this
534  << " bias-offset-requantization: " << dt * 1e3 << " ms";
535 
536  t_end = chrono::system_clock::now();
537  double ops = 2. * M * N * K;
538  dt = chrono::duration<double>(t_end - t_very_begin).count();
539  double gops = ops / dt / 1e9;
540  LOG(INFO) << "@PERF this=" << this
541  << " output=" << this->debug_def().output(0) << " " << M << "x"
542  << N << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
543  }
544 #endif
545 
546  return true;
547 }
548 
549 template <typename T>
550 bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
551  using namespace dnnlowp;
552 
553 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
554  chrono::time_point<chrono::system_clock> t_begin, t_end;
555  /* if (VLOG_IS_ON(1)) */
556  { t_begin = chrono::system_clock::now(); }
557 #endif
558 
559  // Choose quantization for X
560  in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
561 
562 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
563  /* if (VLOG_IS_ON(1)) */
564  {
565  t_end = chrono::system_clock::now();
566  double dt = chrono::duration<double>(t_end - t_begin).count();
567  LOG(INFO) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf "
568  << dt * 1e3 << " ms";
569  t_begin = chrono::system_clock::now();
570  }
571 #endif
572 
573  // Quantize W
574  const auto& X = InputTensorCPU_(0);
575  const auto& W = InputTensorCPU_(1);
576  const auto canonical_axis = X.canonical_axis_index(axis_);
577  const auto K = X.size_from_dim(canonical_axis);
578  const auto canonical_axis_w = W.canonical_axis_index(axis_w_);
579  const int N = W.size_to_dim(canonical_axis_w);
580 
581  int signed_min = -(1 << (qfactory_->GetWeightPrecision() - 1));
582  if (is_weight_constant_) {
583  bool fast_path = is_same<T, uint8_t>::value && GetCpuId().avx2() &&
584  this->debug_def().engine() != "DNNLOWP_ACC16";
585 
586  if ((fast_path && !Wq_packed_) || (!fast_path && W_quantized_.empty())) {
587  if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
588  const auto& packed_filter =
589  this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
590  filter_qparams_ = packed_filter.qparams;
591  if (quantize_channelwise_) {
592  CAFFE_ENFORCE_EQ(filter_qparams_.size(), N);
593  } else {
594  CAFFE_ENFORCE_EQ(filter_qparams_.size(), 1);
595  }
596  } else {
597  filter_qparams_.resize(quantize_channelwise_ ? N : 1);
598  QuantizeWeight<T>(
599  InputBlob(1), K, N, filter_qparams_, W_quantized_, qfactory_.get());
600  }
601 
602  filter_scales_.resize(filter_qparams_.size());
603  filter_zero_points_.resize(filter_qparams_.size());
604  requantization_params_.resize(filter_qparams_.size());
605  requantization_multipliers_.resize(filter_qparams_.size());
606  for (int i = 0; i < filter_qparams_.size(); ++i) {
607  filter_scales_[i] = filter_qparams_[i].scale;
608  filter_zero_points_[i] = filter_qparams_[i].zero_point;
609  }
610 
611  if (fast_path) {
612  // fast path using fbgemm
613  if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
614  const auto& packed_filter =
615  this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
616  Wq_packed_ = packed_filter.W;
617  } else {
618  Wq_packed_ = GetOrCreateFbgemmPackBMatrix<int32_t>(
619  fbgemm::matrix_op_t::Transpose,
620  K,
621  N,
622  W.raw_data(),
623  reinterpret_cast<const int8_t*>(W_quantized_.data()),
624  K); // ld
625  }
626  } else {
627  string reason;
628  if (!is_same<T, uint8_t>::value) {
629  reason = "fbgemm only supports 8-bit integers";
630  } else if (!GetCpuId().avx2()) {
631  reason = "fbgemm only supports AVX2";
632  } else if (this->debug_def().engine() == "DNNLOWP_ACC16") {
633  reason = "";
634  } else {
635  assert(false);
636  }
637  if (!reason.empty()) {
638  LOG(WARNING) << "Conv with weight " << this->debug_def().input(1)
639  << " falls back to slow path because " << reason;
640  }
641  }
642  }
643  } // is_weight_constant_
644  else {
645  // !is_weight_constant_
646  filter_qparams_.resize(1);
647  filter_qparams_[0] = GetInputTensorQuantizationParamsOf(
648  this, 1, qfactory_.get(), true /*weight*/);
649  filter_qparams_[0].zero_point += signed_min;
650 
651  W_quantized_.resize(W.size());
652  fbgemm::Quantize<T_signed>(
653  W.template data<float>(),
654  W_quantized_.data(),
655  W_quantized_.size(),
656  filter_qparams_[0]);
657  }
658 
659 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
660  /* if (VLOG_IS_ON(1)) */
661  {
662  t_end = chrono::system_clock::now();
663  double dt = chrono::duration<double>(t_end - t_begin).count();
664  LOG(INFO) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms";
665  t_begin = chrono::system_clock::now();
666  }
667 #endif
668 
669  // Pre-compute column_offset
670  // If input tensor doesn't use dynamic quantization, we fold column_offsets_
671  // into bias.
672  bool first_invocation = !b_quantized_data_ && !b_dequantized_data_;
673  bool fold_col_offset_into_bias =
674  this->template InputIsType<int8::Int8TensorCPU>(0) && !dequantize_output_;
675  if (!is_weight_constant_ ||
676  (first_invocation && !fold_col_offset_into_bias)) {
677  if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
678  const auto& packed_filter =
679  this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
680  column_offsets_ = packed_filter.column_offsets;
681  } else {
682  ComputeColumnOffsets<T_signed>(
683  K, N, W_quantized_.data(), filter_qparams_, *column_offsets_);
684  }
685  }
686 
687 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
688  /* if (VLOG_IS_ON(1)) */
689  {
690  t_end = chrono::system_clock::now();
691  double dt = chrono::duration<double>(t_end - t_begin).count();
692  LOG(INFO) << "@PERF this=" << this << " Calculate column offset "
693  << dt * 1e3 << " ms";
694  t_begin = chrono::system_clock::now();
695  }
696 #endif
697 
698  // Quantize bias
699  if (!is_weight_constant_ || (!b_quantized_data_ && !b_dequantized_data_) ||
700  in_qparams_[0].scale != in_qparams0_scale_old_ ||
701  in_qparams_[0].zero_point != in_qparams0_zero_point_old_) {
702  if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1) &&
703  this->template Input<Int8FCDNNLowPPackedWeightBlob>(1).bias.get()) {
704  const auto& packed_filter =
705  this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
706  CAFFE_ENFORCE(!dequantize_output_);
707  b_quantized_ = packed_filter.bias;
708  b_quantized_data_ = b_quantized_->data();
709  } else {
710  const auto& bias = InputTensorCPU_(2);
711  if (this->template InputIsType<int8::Int8TensorCPU>(2)) {
712  TensorQuantizationParams bias_qparams;
713  bias_qparams.scale = this->template Input<int8::Int8TensorCPU>(2).scale;
714  bias_qparams.zero_point =
715  this->template Input<int8::Int8TensorCPU>(2).zero_point;
716  CAFFE_ENFORCE_LE(
717  std::abs(
718  bias_qparams.scale -
719  in_qparams_[0].scale * filter_qparams_[0].scale),
720  1e-4);
721  CAFFE_ENFORCE_EQ(bias_qparams.zero_point, 0);
722  b_quantized_data_ = bias.template data<int32_t>();
723  if (dequantize_output_) {
724  b_dequantized_.resize(N);
725  for (int j = 0; j < N; ++j) {
726  b_dequantized_[j] = fbgemm::Dequantize<int32_t>(
727  b_quantized_data_[j], in_qparams_[2]);
728  }
729  b_dequantized_data_ = b_dequantized_.data();
730  }
731  } else {
732  b_dequantized_data_ = bias.template data<float>();
733  if (!dequantize_output_) {
734  b_quantized_->resize(N);
735  for (int j = 0; j < N; ++j) {
736  (*b_quantized_)[j] = fbgemm::Quantize<int32_t>(
737  b_dequantized_data_[j],
738  0,
739  in_qparams_[0].scale * filter_qparams_[0].scale,
740  32);
741  }
742  b_quantized_data_ = b_quantized_->data();
743  }
744  }
745  }
746  in_qparams0_scale_old_ = in_qparams_[0].scale;
747  in_qparams0_zero_point_old_ = in_qparams_[0].zero_point;
748 
749  // If column_offsets_ is empty even when we need column_offsets (asymmetric
750  // quantization in input), it means we need to fuse column_offsets to bias.
751  if (in_qparams_[0].zero_point && column_offsets_->empty() &&
752  b_quantized_data_) {
753  if (b_quantized_->empty()) {
754  b_quantized_->assign(b_quantized_data_, b_quantized_data_ + N);
755  b_quantized_data_ = b_quantized_->data();
756  }
757  vector<int32_t>* column_offset_ptr;
758  vector<int32_t> column_offset_temp;
759  if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
760  const auto& packed_filter =
761  this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
762  column_offset_ptr = packed_filter.column_offsets.get();
763  } else {
764  column_offset_temp.resize(N);
765  ComputeColumnOffsets<T_signed>(
766  K, N, W_quantized_.data(), filter_qparams_, column_offset_temp);
767  column_offset_ptr = &column_offset_temp;
768  }
769  for (int i = 0; i < N; ++i) {
770  (*b_quantized_)[i] -=
771  in_qparams_[0].zero_point * (*column_offset_ptr)[i];
772  }
773  }
774 
775  CAFFE_ENFORCE(
776  (dequantize_output_ && b_dequantized_data_) ||
777  (!dequantize_output_ && b_quantized_data_));
778  }
779 
780  if (Wq_packed_ && !FLAGS_caffe2_dnnlowp_dump_tensors) {
781  // From here, W_quantized_ is not used anymore when we have Wq_packed_
782  vector<T_signed>().swap(W_quantized_);
783  }
784 
785 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
786  /* if (VLOG_IS_ON(1)) */
787  {
788  t_end = chrono::system_clock::now();
789  double dt = chrono::duration<double>(t_end - t_begin).count();
790  LOG(INFO) << "@PERF this=" << this << " Quantize bias " << dt * 1e3
791  << " ms";
792  t_begin = chrono::system_clock::now();
793  }
794 #endif
795 
796  if (!dequantize_output_ && !requantization_param_selected_) {
797  GetOutputQuantizationParams_();
798 
799  for (int i = 0; i < filter_qparams_.size(); ++i) {
800  float real_multiplier =
801  in_qparams_[0].scale * filter_qparams_[i].scale / out_qparams_.scale;
802  requantization_params_[i] = qfactory_->ChooseRequantizationMultiplier(
803  real_multiplier, out_qparams_);
804  requantization_multipliers_[i] =
805  requantization_params_[i].real_multiplier;
806  }
807  requantization_param_selected_ = true;
808  } else {
809  if (measure_quantization_error_) {
810  // to measure quantization error, run ref impl.
811  Fp32Op_()->DequantizeInput();
812  Fp32Op_()->Get()->RunOnDevice();
813  }
814  }
815 
816 #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
817  /* if (VLOG_IS_ON(1)) */
818  {
819  t_end = chrono::system_clock::now();
820  double dt = chrono::duration<double>(t_end - t_begin).count();
821  LOG(INFO) << "@PERF this=" << this << " GetOutputQuantizationParams "
822  << dt * 1e3 << " ms";
823  t_begin = chrono::system_clock::now();
824  }
825 #endif
826 
827  return true;
828 }
829 
// Default DNNLOWP engines: 8-bit and 16-bit quantized FC.
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP,
    FullyConnectedDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP_16,
    FullyConnectedDNNLowPOp<uint16_t>);

// Int8FC variant of the same 8-bit op.
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FC,
    DNNLOWP,
    FullyConnectedDNNLowPOp<uint8_t>);

// Row-wise engines: same op classes, but the constructor forces
// quantize_channelwise_ for these engine names.
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP_ROWWISE,
    FullyConnectedDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP_ROWWISE_16,
    FullyConnectedDNNLowPOp<uint16_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FC,
    DNNLOWP_ROWWISE,
    FullyConnectedDNNLowPOp<uint8_t>);
858 } // namespace caffe2
Definition: any.cpp:108
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13
Definition: OpClasses.h:566