#include "fully_connected_dnnlowp_op.h"

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
#include <chrono>
#endif

#include "caffe2/core/flags.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/utils/cpuid.h"
#include "fbgemm_pack_matrix_cache.h"
#include "fbgemm_pack_op.h"

C10_DEFINE_bool(
    caffe2_dnnlowp_enforce_default_operators,
    false,
    "When true, enforce to use the default Caffe2 operators inside DNNLOWP "
    "instead of using its own implementation that uses AVX2 instructions "
    "(currently only honored by FC)");

C10_DECLARE_bool(caffe2_dnnlowp_dump_tensors);

namespace caffe2 {

using namespace std;
using namespace dnnlowp;
template <typename T>
FullyConnectedDNNLowPOp<T>::FullyConnectedDNNLowPOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : BaseType(operator_def, ws),
      axis_(this->template GetSingleArgument<int32_t>("axis", 1)),
      axis_w_(this->template GetSingleArgument<int32_t>("axis_w", 1)),
      quantize_channelwise_(this->template GetSingleArgument<bool>(
          "quantize_channelwise",
          false)),
      b_quantized_(make_shared<vector<int32_t>>()),
      column_offsets_(make_shared<vector<int32_t>>()),
      is_weight_constant_(
          this->template GetSingleArgument<bool>("constant_weight", true)) {
  if (!is_weight_constant_) {
    LOG(INFO) << operator_def.output(0) << " is_weight_constant "
              << is_weight_constant_;
  }
  if (this->debug_def().engine() == "DNNLOWP_ROWWISE" ||
      this->debug_def().engine() == "DNNLOWP_ROWWISE_16") {
    quantize_channelwise_ = true;
  }

  VLOG(2) << "DNNLOWP FC with output " << operator_def.output(0);
}
template <typename T>
bool FullyConnectedDNNLowPOp<T>::RunOnDevice() {
  using namespace std;
  using namespace dnnlowp;

  this->ParseDNNLowPOperatorArguments_();

  if ((!GetCpuId().avx2() || FLAGS_caffe2_dnnlowp_enforce_default_operators) &&
      dequantize_output_) {
    if (!GetCpuId().avx2()) {
      static int log_occurrences = 0;
      if (log_occurrences < 32) {
        ++log_occurrences;
        LOG(WARNING)
            << "Falling back to the default Caffe2 operator because AVX2 "
               "instructions are not available";
      }
    } else {
      static int log_occurrences = 0;
      if (log_occurrences < 32) {
        ++log_occurrences;
        LOG(WARNING) << "Falling back to the default Caffe2 operator because "
                        "dnnlowp_enforce_default_caffe2_operators option is on";
      }
    }

    Fp32Op_()->DequantizeInput();
    FullyConnectedOp<CPUContext>* fp32_op = Fp32Op_()->Get();
    if (!fp32_op->RunOnDevice()) {
      return false;
    }

    auto* Y_ref = fp32_op->Output(0);
    auto* Y = OutputTensorCPU_(0);
    Y->ResizeLike(*Y_ref);
    fp32_op->context_.CopyItemsSameDevice(
        Y_ref->dtype(),
        Y_ref->size(),
        Y_ref->raw_data(),
        Y->raw_mutable_data(Y_ref->dtype()));
    return true;
  }
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  chrono::time_point<chrono::system_clock> t_very_begin, t_begin, t_end;
  t_begin = chrono::system_clock::now();
  t_very_begin = t_begin;
#endif

  if (!GetQuantizationParameters_()) {
    return false;
  }

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  {
    t_end = chrono::system_clock::now();
    double dt = chrono::duration<double>(t_end - t_begin).count();
    LOG(INFO) << "@PERF this=" << this << " get_quant_params: " << dt * 1e3
              << " ms";
    t_begin = chrono::system_clock::now();
  }
#endif
  const auto& X = InputTensorCPU_(0);
  const auto& W = InputTensorCPU_(1);
  auto* Y = OutputTensorCPU_(0);
  const auto canonical_axis = X.canonical_axis_index(axis_);
  const auto M = X.size_to_dim(canonical_axis);
  const auto K = X.size_from_dim(canonical_axis);
  const auto canonical_axis_w = W.canonical_axis_index(axis_w_);
  const int N = W.size_to_dim(canonical_axis_w);
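  // M: number of rows of X (the batch dimension), K: inner / input-feature
  // dimension, N: number of output channels (W is stored as an N x K matrix).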
  const T_signed* Wdata = W_quantized_.data();

  Y_shape_cache_ = X.sizes().vec();
  Y_shape_cache_.resize(canonical_axis + 1);
  Y_shape_cache_[canonical_axis] = N;
  Y->Resize(Y_shape_cache_);

  const T* Xdata = nullptr;
  vector<T> X_temp;
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  {
    t_end = chrono::system_clock::now();
    double dt = chrono::duration<double>(t_end - t_begin).count();
    LOG(INFO) << "@PERF this=" << this << " initialize parameters: "
              << dt * 1e3 << " ms";
    t_begin = chrono::system_clock::now();
  }
#endif
  if (Wq_packed_ &&
      !(this->template InputIsType<int8::Int8TensorCPU>(0) &&
        dequantize_output_)) {
    // Fast path that uses fbgemm's packed GEMM kernels.
    using namespace fbgemm;

    if (X.template IsType<T>() || !dequantize_output_) {
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
      { t_begin = chrono::system_clock::now(); }
#endif

      Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
      {
        t_end = chrono::system_clock::now();
        double dt = chrono::duration<double>(t_end - t_begin).count();
        LOG(INFO) << "@PERF this=" << this << " input quantization: "
                  << dt * 1e3 << " ms";
        t_begin = chrono::system_clock::now();
      }
#endif
    }

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
    { t_begin = chrono::system_clock::now(); }
#endif
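    // The fbgemm output pipelines below fold everything that follows the raw
    // int32 GEMM into the epilogue: the per-row offsets of the quantized
    // input, the per-column offsets of the quantized weight, the bias, and
    // the requantization multiplier. Schematically (not fbgemm's exact
    // internals):
    //   Y_q[i][j] = requantize(sum_k X_q[i][k] * W_q[j][k]
    //                          - X_zero_point * col_offset[j]
    //                          - W_zero_point[j] * row_offset[i]
    //                          + b_q[j])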
    if (!dequantize_output_) {
      Y_int32_.resize(Y->size());
      DoNothing<> doNothingObj{};

      if (quantize_channelwise_ || filter_qparams_[0].zero_point) {
        row_offsets_.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
        X_pack_buf_.resize(PackAWithRowOffset<uint8_t>::packedBufferSize());

        PackAWithRowOffset<uint8_t> packA(
            matrix_op_t::NoTranspose,
            M,
            K,
            reinterpret_cast<const uint8_t*>(Xdata),
            K,
            X_pack_buf_.data(), // buffer for packed matrix
            1, // group
            row_offsets_.data());
        if (quantize_channelwise_) {
          ReQuantizeOutput<
              false /* FUSE_RELU */,
              QuantizationGranularity::OUT_CHANNEL>
              outputProcObj(
                  doNothingObj,
                  requantization_multipliers_.data(),
                  out_qparams_.zero_point,
                  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
                  filter_zero_points_.data(),
                  packA.getRowOffsetBuffer(),
                  column_offsets_->empty() ? nullptr : column_offsets_->data(),
                  b_quantized_data_,
                  N);

          fbgemmPacked(
              packA, *Wq_packed_,
              reinterpret_cast<uint8_t*>(
                  OutputTensorCPU_(0)->template mutable_data<T>()),
              Y_int32_.data(), N, outputProcObj,
              0 /* thread_id */, 1 /* num_threads */);
        } else {
          ReQuantizeOutput<false /* FUSE_RELU */> outputProcObj(
              doNothingObj,
              requantization_multipliers_.data(),
              out_qparams_.zero_point,
              column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
              filter_zero_points_.data(),
              packA.getRowOffsetBuffer(),
              column_offsets_->empty() ? nullptr : column_offsets_->data(),
              b_quantized_data_,
              N);

          fbgemmPacked(
              packA, *Wq_packed_,
              reinterpret_cast<uint8_t*>(
                  OutputTensorCPU_(0)->template mutable_data<T>()),
              Y_int32_.data(), N, outputProcObj,
              0 /* thread_id */, 1 /* num_threads */);
        }
      } else {
        X_pack_buf_.resize(PackAMatrix<uint8_t>::packedBufferSize());

        PackAMatrix<uint8_t> packA(
            matrix_op_t::NoTranspose,
            M,
            K,
            reinterpret_cast<const uint8_t*>(Xdata),
            K,
            X_pack_buf_.data(), // buffer for packed matrix
            1); // group

        ReQuantizeOutput<false /* FUSE_RELU */> outputProcObj(
            doNothingObj,
            requantization_multipliers_.data(),
            out_qparams_.zero_point,
            column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
            filter_zero_points_.data(),
            nullptr, // row offsets are not needed when the weight zero point is 0
            column_offsets_->empty() ? nullptr : column_offsets_->data(),
            b_quantized_data_,
            N);

        fbgemmPacked(
            packA, *Wq_packed_,
            reinterpret_cast<uint8_t*>(
                OutputTensorCPU_(0)->template mutable_data<T>()),
            Y_int32_.data(), N, outputProcObj,
            0 /* thread_id */, 1 /* num_threads */);
      }
    } else {
      // dequantize_output_: the GEMM result is written out as float.
      float* Y_data = OutputTensorCPU_(0)->template mutable_data<float>();

      if (!X.template IsType<T>()) {
        // Both input and output are float.
        row_offsets_.resize(
            PackAWithQuantRowOffset<uint8_t>::rowOffsetBufferSize());
        X_pack_buf_.resize(
            PackAWithQuantRowOffset<uint8_t>::packedBufferSize());

        PackAWithQuantRowOffset<uint8_t> packA(
            matrix_op_t::NoTranspose,
            M,
            K,
            X.template data<float>(),
            K,
            X_pack_buf_.data(), // buffer for packed matrix
            in_qparams_[0].scale,
            in_qparams_[0].zero_point,
            1, // groups
            row_offsets_.data());

        DoNothing<float, float> doNothingObj{};
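        // ReQuantizeForFloat dequantizes the int32 accumulators directly to
        // float, roughly Y[i][j] = in_scale * filter_scale[j] * corrected_acc
        // + bias[j], where corrected_acc has the zero-point cross terms
        // removed (schematic, not fbgemm's exact internals).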
        if (quantize_channelwise_) {
          ReQuantizeForFloat<
              false /* FUSE_RELU */,
              QuantizationGranularity::OUT_CHANNEL>
              outputProcObj(
                  doNothingObj,
                  in_qparams_[0].scale,
                  filter_scales_.data(),
                  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
                  filter_zero_points_.data(),
                  packA.getRowOffsetBuffer(),
                  column_offsets_->empty() ? nullptr : column_offsets_->data(),
                  b_dequantized_data_, // bias
                  N);

          fbgemmPacked(
              packA, *Wq_packed_, Y_data,
              reinterpret_cast<int32_t*>(Y_data), N, outputProcObj,
              0 /* thread_id */, 1 /* num_threads */);
        } else {
          ReQuantizeForFloat<false /* FUSE_RELU */> outputProcObj(
              doNothingObj,
              in_qparams_[0].scale,
              filter_scales_.data(),
              column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
              filter_zero_points_.data(),
              packA.getRowOffsetBuffer(),
              column_offsets_->empty() ? nullptr : column_offsets_->data(),
              b_dequantized_data_, // bias
              N);

          fbgemmPacked(
              packA, *Wq_packed_, Y_data,
              reinterpret_cast<int32_t*>(Y_data), N, outputProcObj,
              0 /* thread_id */, 1 /* num_threads */);
        }
      } else {
        // Input is already quantized but the output should be float.
        row_offsets_.resize(PackAWithRowOffset<uint8_t>::rowOffsetBufferSize());
        X_pack_buf_.resize(PackAWithRowOffset<uint8_t>::packedBufferSize());

        PackAWithRowOffset<uint8_t> packA(
            matrix_op_t::NoTranspose,
            M,
            K,
            reinterpret_cast<const uint8_t*>(Xdata),
            K,
            X_pack_buf_.data(), // buffer for packed matrix
            1, // group
            row_offsets_.data());

        DoNothing<float, float> doNothingObj{};
        if (quantize_channelwise_) {
          ReQuantizeForFloat<
              false /* FUSE_RELU */,
              QuantizationGranularity::OUT_CHANNEL>
              outputProcObj(
                  doNothingObj,
                  in_qparams_[0].scale,
                  filter_scales_.data(),
                  column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
                  filter_zero_points_.data(),
                  packA.getRowOffsetBuffer(),
                  column_offsets_->empty() ? nullptr : column_offsets_->data(),
                  b_dequantized_data_, // bias
                  N);

          fbgemmPacked(
              packA, *Wq_packed_, Y_data,
              reinterpret_cast<int32_t*>(Y_data), N, outputProcObj,
              0 /* thread_id */, 1 /* num_threads */);
        } else {
          ReQuantizeForFloat<false /* FUSE_RELU */> outputProcObj(
              doNothingObj,
              in_qparams_[0].scale,
              filter_scales_.data(),
              column_offsets_->empty() ? 0 : in_qparams_[0].zero_point,
              filter_zero_points_.data(),
              packA.getRowOffsetBuffer(),
              column_offsets_->empty() ? nullptr : column_offsets_->data(),
              b_dequantized_data_, // bias
              N);

          fbgemmPacked(
              packA, *Wq_packed_, Y_data,
              reinterpret_cast<int32_t*>(Y_data), N, outputProcObj,
              0 /* thread_id */, 1 /* num_threads */);
        }
      }
    }
  } else {
    // Slow path: quantize X here and run a plain nested-loop GEMM on the
    // quantized values.
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
    { t_begin = chrono::system_clock::now(); }
#endif

    Xdata = QuantizeInputIfNeeded<T>(this, 0, in_qparams_[0], X_temp);

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
    {
      t_end = chrono::system_clock::now();
      double dt = chrono::duration<double>(t_end - t_begin).count();
      LOG(INFO) << "@PERF this=" << this << " input quantization: "
                << dt * 1e3 << " ms";
      t_begin = chrono::system_clock::now();
    }
#endif
    Y_int32_.resize(Y->size());
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < N; ++j) {
        int32_t sum = 0;
        for (int k = 0; k < K; ++k) {
          int w = Wdata[j * K + k];
          sum += Xdata[i * K + k] * w;
        }
        Y_int32_[i * N + j] = sum;
      } // for each output element
    } // for each row
  } // slow path
  if (FLAGS_caffe2_dnnlowp_dump_tensors) {
    // Dump the quantized input activation.
    StoreMatrixInMatrixMarketFormat(M, K, Xdata, this->debug_def().input(0));

    // Dump the quantized weight.
    StoreMatrixInMatrixMarketFormat(N, K, Wdata, this->debug_def().input(1));
  }
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  {
    t_end = chrono::system_clock::now();
    double dt = chrono::duration<double>(t_end - t_begin).count();
    LOG(INFO) << "@PERF this=" << this << " gemm: " << dt * 1e3 << " ms";
    t_begin = chrono::system_clock::now();
  }
#endif
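  // Adjust the raw int32 accumulators with the bias and the zero-point cross
  // terms, then requantize (or dequantize) them. This is only needed for the
  // slow path; the fbgemm fast path above already folds these corrections
  // into its output pipeline.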
  if (dequantize_output_) {
    if (!Wq_packed_) {
      float* Ydata = OutputTensorCPU_(0)->template mutable_data<float>();

      for (int i = 0; i < M; ++i) {
        int32_t row_offset = 0;
        for (int k = 0; k < K; ++k) {
          row_offset += Xdata[i * K + k];
        }

        for (int j = 0; j < N; ++j) {
          if (!column_offsets_->empty()) {
            Y_int32_[i * N + j] -=
                in_qparams_[0].zero_point * (*column_offsets_)[j];
          }
          int quant_group = quantize_channelwise_ ? j : 0;
          Y_int32_[i * N + j] -=
              row_offset * filter_qparams_[quant_group].zero_point;
          Ydata[i * N + j] = Y_int32_[i * N + j] * in_qparams_[0].scale *
                  filter_qparams_[quant_group].scale +
              b_dequantized_data_[j];
        }
      }
    }
  } else {
    if (!Wq_packed_) {
      T* Ydata = GetQuantizedOutputData_();
      for (int i = 0; i < M; ++i) {
        int32_t row_offset = 0;
        for (int k = 0; k < K; ++k) {
          row_offset += Xdata[i * K + k];
        }

        for (int j = 0; j < N; ++j) {
          if (!column_offsets_->empty()) {
            // An empty column_offsets_ means the column offsets are already
            // folded into the bias.
            Y_int32_[i * N + j] -=
                in_qparams_[0].zero_point * (*column_offsets_)[j];
          }
          int quant_group = quantize_channelwise_ ? j : 0;
          Y_int32_[i * N + j] -=
              row_offset * filter_qparams_[quant_group].zero_point;
          Y_int32_[i * N + j] += b_quantized_data_[j];

          Ydata[i * N + j] = fbgemm::Requantize<T>(
              Y_int32_[i * N + j], requantization_params_[quant_group]);
        }
      }
    }
    PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
  }

  MeasureQuantizationError_();
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  {
    t_end = chrono::system_clock::now();
    double dt = chrono::duration<double>(t_end - t_begin).count();
    LOG(INFO) << "@PERF this=" << this
              << " bias-offset-requantization: " << dt * 1e3 << " ms";

    t_end = chrono::system_clock::now();
    double ops = 2. * M * N * K;
    dt = chrono::duration<double>(t_end - t_very_begin).count();
    double gops = ops / dt / 1e9;
    LOG(INFO) << "@PERF this=" << this
              << " output=" << this->debug_def().output(0) << " " << M << "x"
              << N << "x" << K << ": " << dt * 1e3 << " ms " << gops << " gops";
  }
#endif

  return true;
}
template <typename T>
bool FullyConnectedDNNLowPOp<T>::GetQuantizationParameters_() {
  using namespace dnnlowp;

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  chrono::time_point<chrono::system_clock> t_begin, t_end;
  t_begin = chrono::system_clock::now();
#endif

  // Choose quantization parameters for the input activation.
  in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  {
    t_end = chrono::system_clock::now();
    double dt = chrono::duration<double>(t_end - t_begin).count();
    LOG(INFO) << "@PERF this=" << this << " GetInputTensorQuantizationParamsOf "
              << dt * 1e3 << " ms";
    t_begin = chrono::system_clock::now();
  }
#endif
  // Quantize W.
  const auto& X = InputTensorCPU_(0);
  const auto& W = InputTensorCPU_(1);
  const auto canonical_axis = X.canonical_axis_index(axis_);
  const auto K = X.size_from_dim(canonical_axis);
  const auto canonical_axis_w = W.canonical_axis_index(axis_w_);
  const int N = W.size_to_dim(canonical_axis_w);

  int signed_min = -(1 << (qfactory_->GetWeightPrecision() - 1));
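  // Weights are stored as signed integers (T_signed), so their quantized
  // range is shifted by signed_min relative to the unsigned activation
  // representation (e.g. signed_min == -128 for 8-bit weights).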
  if (is_weight_constant_) {
    bool fast_path = is_same<T, uint8_t>::value && GetCpuId().avx2() &&
        this->debug_def().engine() != "DNNLOWP_ACC16";

    if ((fast_path && !Wq_packed_) || (!fast_path && W_quantized_.empty())) {
      if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
        const auto& packed_filter =
            this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
        filter_qparams_ = packed_filter.qparams;
        if (quantize_channelwise_) {
          CAFFE_ENFORCE_EQ(filter_qparams_.size(), N);
        } else {
          CAFFE_ENFORCE_EQ(filter_qparams_.size(), 1);
        }
      } else {
        filter_qparams_.resize(quantize_channelwise_ ? N : 1);
        QuantizeWeight<T>(
            InputBlob(1), K, N, filter_qparams_, W_quantized_, qfactory_.get());
      }
      filter_scales_.resize(filter_qparams_.size());
      filter_zero_points_.resize(filter_qparams_.size());
      requantization_params_.resize(filter_qparams_.size());
      requantization_multipliers_.resize(filter_qparams_.size());
      for (int i = 0; i < filter_qparams_.size(); ++i) {
        filter_scales_[i] = filter_qparams_[i].scale;
        filter_zero_points_[i] = filter_qparams_[i].zero_point;
      }
      if (fast_path) {
        // Fast path: pre-pack W for fbgemm, or reuse the packing that came
        // with the packed weight blob.
        if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
          const auto& packed_filter =
              this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
          Wq_packed_ = packed_filter.W;
        } else {
          Wq_packed_ = GetOrCreateFbgemmPackBMatrix<int32_t>(
              fbgemm::matrix_op_t::Transpose,
              K,
              N,
              W.raw_data(),
              reinterpret_cast<const int8_t*>(W_quantized_.data()),
              K); // leading dimension
        }
      } else {
        string reason;
        if (!is_same<T, uint8_t>::value) {
          reason = "fbgemm only supports 8-bit integers";
        } else if (!GetCpuId().avx2()) {
          reason = "fbgemm only supports AVX2";
        } else if (this->debug_def().engine() == "DNNLOWP_ACC16") {
          reason = "";
        } else {
          assert(false);
        }
        if (!reason.empty()) {
          LOG(WARNING) << "FC with weight " << this->debug_def().input(1)
                       << " falls back to slow path because " << reason;
        }
      }
    }
  } else {
    // Non-constant weight: (re)quantize W on every invocation.
    filter_qparams_.resize(1);
    filter_qparams_[0] = GetInputTensorQuantizationParamsOf(
        this, 1, qfactory_.get(), true /* is_weight */);
    filter_qparams_[0].zero_point += signed_min;

    W_quantized_.resize(W.size());
    fbgemm::Quantize<T_signed>(
        W.template data<float>(),
        W_quantized_.data(),
        W_quantized_.size(),
        filter_qparams_[0]);
  }
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  {
    t_end = chrono::system_clock::now();
    double dt = chrono::duration<double>(t_end - t_begin).count();
    LOG(INFO) << "@PERF this=" << this << " Quantize W " << dt * 1e3 << " ms";
    t_begin = chrono::system_clock::now();
  }
#endif
  // Pre-compute the column offsets unless they will be folded into the bias.
  bool first_invocation = !b_quantized_data_ && !b_dequantized_data_;
  bool fold_col_offset_into_bias =
      this->template InputIsType<int8::Int8TensorCPU>(0) && !dequantize_output_;
  if (!is_weight_constant_ ||
      (first_invocation && !fold_col_offset_into_bias)) {
    if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
      const auto& packed_filter =
          this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
      column_offsets_ = packed_filter.column_offsets;
    } else {
      ComputeColumnOffsets<T_signed>(
          K, N, W_quantized_.data(), filter_qparams_, *column_offsets_);
    }
  }
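  // column_offsets_[j] is (roughly) the sum over k of W_q[j][k] minus the
  // weight zero point times K; it is the term that gets multiplied by the
  // input zero point when correcting the int32 accumulators.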
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  {
    t_end = chrono::system_clock::now();
    double dt = chrono::duration<double>(t_end - t_begin).count();
    LOG(INFO) << "@PERF this=" << this << " Calculate column offset "
              << dt * 1e3 << " ms";
    t_begin = chrono::system_clock::now();
  }
#endif
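  // Quantize the bias. To be added directly to the int32 accumulators, the
  // bias must be quantized with scale in_scale * filter_scale and zero point
  // 0, which is what the checks and the Quantize call below assume.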
  if (!is_weight_constant_ || (!b_quantized_data_ && !b_dequantized_data_) ||
      in_qparams_[0].scale != in_qparams0_scale_old_ ||
      in_qparams_[0].zero_point != in_qparams0_zero_point_old_) {
    if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1) &&
        this->template Input<Int8FCDNNLowPPackedWeightBlob>(1).bias.get()) {
      const auto& packed_filter =
          this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
      CAFFE_ENFORCE(!dequantize_output_);
      b_quantized_ = packed_filter.bias;
      b_quantized_data_ = b_quantized_->data();
    } else {
      const auto& bias = InputTensorCPU_(2);
      if (this->template InputIsType<int8::Int8TensorCPU>(2)) {
        TensorQuantizationParams bias_qparams;
        bias_qparams.scale = this->template Input<int8::Int8TensorCPU>(2).scale;
        bias_qparams.zero_point =
            this->template Input<int8::Int8TensorCPU>(2).zero_point;
        // The pre-quantized bias must use scale in_scale * filter_scale.
        CAFFE_ENFORCE_LE(
            std::abs(
                bias_qparams.scale -
                in_qparams_[0].scale * filter_qparams_[0].scale),
            1e-4);
        CAFFE_ENFORCE_EQ(bias_qparams.zero_point, 0);
        b_quantized_data_ = bias.template data<int32_t>();
        if (dequantize_output_) {
          b_dequantized_.resize(N);
          for (int j = 0; j < N; ++j) {
            b_dequantized_[j] = fbgemm::Dequantize<int32_t>(
                b_quantized_data_[j], in_qparams_[2]);
          }
          b_dequantized_data_ = b_dequantized_.data();
        }
      } else {
        b_dequantized_data_ = bias.template data<float>();
        if (!dequantize_output_) {
          b_quantized_->resize(N);
          for (int j = 0; j < N; ++j) {
            (*b_quantized_)[j] = fbgemm::Quantize<int32_t>(
                b_dequantized_data_[j],
                0, // zero point
                in_qparams_[0].scale * filter_qparams_[0].scale,
                32); // precision
          }
          b_quantized_data_ = b_quantized_->data();
        }
      }
    }
    in_qparams0_scale_old_ = in_qparams_[0].scale;
    in_qparams0_zero_point_old_ = in_qparams_[0].zero_point;
    // If column_offsets_ is empty but the input has a nonzero zero point, the
    // column offsets must be folded into the quantized bias instead.
    if (in_qparams_[0].zero_point && column_offsets_->empty() &&
        b_quantized_data_) {
      if (b_quantized_->empty()) {
        b_quantized_->assign(b_quantized_data_, b_quantized_data_ + N);
        b_quantized_data_ = b_quantized_->data();
      }

      vector<int32_t>* column_offset_ptr;
      vector<int32_t> column_offset_temp;
      if (this->template InputIsType<Int8FCDNNLowPPackedWeightBlob>(1)) {
        const auto& packed_filter =
            this->template Input<Int8FCDNNLowPPackedWeightBlob>(1);
        column_offset_ptr = packed_filter.column_offsets.get();
      } else {
        column_offset_temp.resize(N);
        ComputeColumnOffsets<T_signed>(
            K, N, W_quantized_.data(), filter_qparams_, column_offset_temp);
        column_offset_ptr = &column_offset_temp;
      }
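      // Fold the input-zero-point correction into the quantized bias:
      //   b_q[j] -= X_zero_point * col_offset[j]
      // so it does not have to be applied per output element later.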
      for (int i = 0; i < N; ++i) {
        (*b_quantized_)[i] -=
            in_qparams_[0].zero_point * (*column_offset_ptr)[i];
      }
    }
  }

  CAFFE_ENFORCE(
      (dequantize_output_ && b_dequantized_data_) ||
      (!dequantize_output_ && b_quantized_data_));
  if (Wq_packed_ && !FLAGS_caffe2_dnnlowp_dump_tensors) {
    // From here on, W_quantized_ is no longer needed once we have Wq_packed_.
    vector<T_signed>().swap(W_quantized_);
  }
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  {
    t_end = chrono::system_clock::now();
    double dt = chrono::duration<double>(t_end - t_begin).count();
    LOG(INFO) << "@PERF this=" << this << " Quantize bias " << dt * 1e3
              << " ms";
    t_begin = chrono::system_clock::now();
  }
#endif
  if (!dequantize_output_ && !requantization_param_selected_) {
    GetOutputQuantizationParams_();
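    // The requantization multiplier maps the int32 accumulator scale
    // (in_scale * filter_scale) to the output scale, schematically
    //   Y_q = clamp(round(real_multiplier * acc) + out_zero_point)
    // with the exact rounding chosen by the quantization factory.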
    for (int i = 0; i < filter_qparams_.size(); ++i) {
      float real_multiplier =
          in_qparams_[0].scale * filter_qparams_[i].scale / out_qparams_.scale;
      requantization_params_[i] = qfactory_->ChooseRequantizationMultiplier(
          real_multiplier, out_qparams_);
      requantization_multipliers_[i] =
          requantization_params_[i].real_multiplier;
    }
    requantization_param_selected_ = true;
  } else {
    if (measure_quantization_error_) {
      // To measure quantization error, run the fp32 reference implementation.
      Fp32Op_()->DequantizeInput();
      Fp32Op_()->Get()->RunOnDevice();
    }
  }
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  {
    t_end = chrono::system_clock::now();
    double dt = chrono::duration<double>(t_end - t_begin).count();
    LOG(INFO) << "@PERF this=" << this << " GetOutputQuantizationParams "
              << dt * 1e3 << " ms";
    t_begin = chrono::system_clock::now();
  }
#endif

  return true;
}
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP,
    FullyConnectedDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP_16,
    FullyConnectedDNNLowPOp<uint16_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FC,
    DNNLOWP,
    FullyConnectedDNNLowPOp<uint8_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP_ROWWISE,
    FullyConnectedDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    FC,
    DNNLOWP_ROWWISE_16,
    FullyConnectedDNNLowPOp<uint16_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FC,
    DNNLOWP_ROWWISE,
    FullyConnectedDNNLowPOp<uint8_t>);

} // namespace caffe2