1 #include "caffe2/quantization/server/group_norm_dnnlowp_op.h" 6 GroupNormDNNLowPOp<T>::GroupNormDNNLowPOp(
7 const OperatorDef& operator_def,
9 : BaseType(operator_def, ws),
10 OP_SINGLE_ARG(bool, OpSchema::Arg_IsTest, is_test_, true),
11 OP_SINGLE_ARG(int,
"group", group_, 32),
12 OP_SINGLE_ARG(float,
"epsilon", epsilon_, 1e-5),
13 order_(StringToStorageOrder(
14 this->template GetSingleArgument<
std::string>(
"order",
"NCHW"))),
15 OP_SINGLE_ARG(bool,
"is_param_constant", is_param_constant_, true) {
16 CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN);
17 if (!is_param_constant_) {
18 LOG(INFO) << operator_def.output(0) <<
" is_param_constant " 19 << is_param_constant_;
template <typename T>
bool GroupNormDNNLowPOp<T>::RunOnDevice() {
  this->ParseDNNLowPOperatorArguments_();
  if (!GetQuantizationParameters()) {
    return false;
  }
  return order_ == StorageOrder::NCHW ? RunOnDeviceWithOrderNCHW()
                                      : RunOnDeviceWithOrderNHWC();
}
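
// Gathers quantization parameters for X, gamma, and beta. If the output is to
// stay quantized, output qparams are chosen as well; if the output is
// dequantized and error measurement is on, the fp32 reference op is run so
// MeasureQuantizationError_() has a baseline to compare against.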
template <typename T>
bool GroupNormDNNLowPOp<T>::GetQuantizationParameters() {
  in_qparams_[INPUT] =
      GetInputTensorQuantizationParamsOf(this, INPUT, qfactory_.get());
  QuantizeGamma();
  QuantizeBeta();
  if (!dequantize_output_) {
    GetOutputQuantizationParams_();
  } else if (measure_quantization_error_) {
    // To measure quantization error, run the fp32 reference implementation.
    Fp32Op_()->DequantizeInput();
    Fp32Op_()->Get()->RunOnDevice();
  }
  return true;
}
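
// gamma arrives either as a pre-quantized Int8TensorCPU, whose zero point is
// folded out so that gamma is represented with zero_point == 0, or as fp32
// data that still needs to be quantized (handled by QuantizeGammaImpl).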
template <typename T>
void GroupNormDNNLowPOp<T>::QuantizeGamma() {
  if (is_param_constant_) {
    if (gamma_quantized_data_ == nullptr &&
        gamma_dequantized_data_ == nullptr) {
      const auto& gamma = InputTensorCPU_(GAMMA);
      const int C = gamma.size();
      gamma_quantized_.resize(C);
      gamma_quantized_data_ = gamma_quantized_.data();
      if (this->template InputIsType<int8::Int8TensorCPU>(GAMMA)) {
        const auto& gamma_int8 =
            this->template Input<int8::Int8TensorCPU>(GAMMA);
        auto& gamma_qparams = in_qparams_[GAMMA];
        gamma_qparams.scale = gamma_int8.scale;
        const T* gamma_data = gamma.template data<T>();
        EigenVectorArrayMap<int32_t>(gamma_quantized_.data(), C) =
            ConstEigenVectorArrayMap<T>(gamma_data, C)
                .template cast<int32_t>() -
            gamma_int8.zero_point;
        gamma_qparams.zero_point = 0;
        if (dequantize_output_) {
          gamma_dequantized_.resize(C);
          gamma_dequantized_data_ = gamma_dequantized_.data();
          fbgemm::Dequantize<int32_t>(
              gamma_quantized_data_,
              gamma_dequantized_.data(),
              C,
              gamma_qparams);
        }
      } else {
        QuantizeGammaImpl();
      }
    }
  } else {
    QuantizeGammaImpl();
  }
}
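
// Quantizes fp32 gamma to 32 bits with a symmetric (zero_point == 0)
// representation chosen by the quantization factory.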
template <typename T>
void GroupNormDNNLowPOp<T>::QuantizeGammaImpl() {
  const auto& gamma = InputTensorCPU_(GAMMA);
  const int C = gamma.size();
  auto& gamma_qparams = in_qparams_[GAMMA];
  gamma_qparams = GetInputTensorQuantizationParamsOf(
      this, GAMMA, qfactory_.get(), true /* is_weight */);
  gamma_qparams.zero_point = 0;
  gamma_quantized_.resize(C);
  gamma_quantized_data_ = gamma_quantized_.data();
  gamma_dequantized_data_ = gamma.template data<float>();

#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < C; ++i) {
    gamma_quantized_[i] = fbgemm::Quantize<int32_t>(
        gamma_dequantized_data_[i],
        gamma_qparams.zero_point,
        gamma_qparams.scale,
        32);
  }
}
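
// beta is kept with scale = X_scale * gamma_scale and zero_point == 0 so it
// can be folded directly into the fused bias term without extra rescaling;
// the result is cached and recomputed only when the input scale changes.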
template <typename T>
void GroupNormDNNLowPOp<T>::QuantizeBeta() {
  if (!is_param_constant_ ||
      (beta_quantized_data_ == nullptr && beta_dequantized_data_ == nullptr) ||
      cached_X_qparams_scale_ != in_qparams_[INPUT].scale) {
    const auto& beta = InputTensorCPU_(BETA);
    const int C = beta.size();
    const auto& X_qparams = in_qparams_[INPUT];
    const auto& gamma_qparams = in_qparams_[GAMMA];
    auto& beta_qparams = in_qparams_[BETA];
    if (this->template InputIsType<int8::Int8TensorCPU>(BETA)) {
      const auto& beta_int8 = this->template Input<int8::Int8TensorCPU>(BETA);
      beta_qparams.scale = beta_int8.scale;
      beta_qparams.zero_point = beta_int8.zero_point;
      CAFFE_ENFORCE_LE(
          std::abs(beta_qparams.scale - X_qparams.scale * gamma_qparams.scale),
          1e-4);
      CAFFE_ENFORCE_EQ(beta_qparams.zero_point, 0);
      beta_quantized_data_ = beta.template data<int32_t>();
      if (dequantize_output_) {
        beta_dequantized_.resize(C);
        beta_dequantized_data_ = beta_dequantized_.data();
        fbgemm::Dequantize<int32_t>(
            beta_quantized_data_, beta_dequantized_.data(), C, beta_qparams);
      }
    } else {
      beta_qparams.scale = X_qparams.scale * gamma_qparams.scale;
      beta_qparams.zero_point = 0;
      beta_quantized_.resize(C);
      beta_quantized_data_ = beta_quantized_.data();
      beta_dequantized_data_ = beta.template data<float>();

#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < C; ++i) {
        beta_quantized_[i] = fbgemm::Quantize<int32_t>(
            beta_dequantized_data_[i],
            beta_qparams.zero_point,
            beta_qparams.scale,
            32);
      }
    }
    cached_X_qparams_scale_ = in_qparams_[INPUT].scale;
  }
}
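
// Per-(sample, group) mean and inverse stddev computed entirely from the
// quantized input: sums and sums of squares are accumulated in 64-bit
// integers, the variance is dequantized with scale X_scale^2, and the inverse
// stddev is quantized again by ComputeQuantizedInvStd().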
template <typename T>
void GroupNormDNNLowPOp<T>::QuantizedGroupMomentsNCHW(
    const int N,
    const int G,
    const int K,
    const int HxW,
    const T* X,
    int32_t* mu,
    int32_t* rsig) {
  const int outer_size = N * G;
  const int inner_size = K * HxW;
  const auto& X_qparams = in_qparams_[INPUT];
  auto var_qparams = X_qparams;
  var_qparams.scale = X_qparams.scale * X_qparams.scale;
  var_qparams.zero_point = 0;
  rsig_dequantized_.resize(outer_size);

#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < outer_size; ++i) {
    int64_t sum = 0;
    int64_t sumsq = 0;
    if (GetCpuId().avx2()) {
      internal::VectorMomentsAVX2<T>(
          inner_size, X + i * inner_size, &sum, &sumsq);
    } else {
      ConstEigenVectorArrayMap<T> X_arr(X + i * inner_size, inner_size);
      sum = X_arr.template cast<int64_t>().sum();
      sumsq = X_arr.template cast<int64_t>().square().sum();
    }
    const float mean =
        static_cast<float>(sum) / static_cast<float>(inner_size);
    mu[i] = static_cast<int32_t>(std::round(mean)) - X_qparams.zero_point;
    const float var =
        static_cast<float>(sumsq) / static_cast<float>(inner_size) -
        mean * mean;
    rsig_dequantized_[i] = fbgemm::Dequantize<float>(var, var_qparams);
  }
  ComputeQuantizedInvStd(
      outer_size, rsig_dequantized_.data(), rsig_dequantized_.data(), rsig);
}
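
// NHWC variant: each group's K channels are strided across the HxW spatial
// positions, so moments are accumulated one K-sized slice at a time.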
template <typename T>
void GroupNormDNNLowPOp<T>::QuantizedGroupMomentsNHWC(
    const int N,
    const int G,
    const int K,
    const int HxW,
    const T* X,
    int32_t* mu,
    int32_t* rsig) {
  const int outer_size = N * G;
  const int inner_size = K * HxW;
  const auto& X_qparams = in_qparams_[INPUT];
  auto var_qparams = X_qparams;
  var_qparams.scale = X_qparams.scale * X_qparams.scale;
  var_qparams.zero_point = 0;
  rsig_dequantized_.resize(outer_size);

#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < outer_size; ++i) {
    const int n = i / G;
    const int g = i % G;
    int64_t sum = 0;
    int64_t sumsq = 0;
    for (int j = 0; j < HxW; ++j) {
      const T* X_ptr = X + ((n * HxW + j) * G + g) * K;
      if (GetCpuId().avx2()) {
        internal::VectorMomentsAVX2<T>(K, X_ptr, &sum, &sumsq);
      } else {
        ConstEigenVectorArrayMap<T> X_arr(X_ptr, K);
        sum += X_arr.template cast<int64_t>().sum();
        sumsq += X_arr.template cast<int64_t>().square().sum();
      }
    }
    const float mean =
        static_cast<float>(sum) / static_cast<float>(inner_size);
    mu[i] = static_cast<int32_t>(std::round(mean)) - X_qparams.zero_point;
    const float var =
        static_cast<float>(sumsq) / static_cast<float>(inner_size) -
        mean * mean;
    rsig_dequantized_[i] = fbgemm::Dequantize<float>(var, var_qparams);
  }
  ComputeQuantizedInvStd(
      outer_size, rsig_dequantized_.data(), rsig_dequantized_.data(), rsig);
}
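
// fp32 fallback paths used when the output is dequantized: X is dequantized
// up front and the moments are computed with the float math helpers.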
template <typename T>
void GroupNormDNNLowPOp<T>::DequantizedGroupMomentsNCHW(
    const int N, const int G, const int K, const int HxW,
    const T* X, float* mu, float* rsig) {
  const int C = G * K;
  const int size = N * C * HxW;
  const int outer_size = N * G;
  const int inner_size = K * HxW;
  X_dequantized_.resize(size);
  fbgemm::Dequantize<T>(X, X_dequantized_.data(), size, in_qparams_[INPUT]);
  const std::array<int, 2> X_dims = {outer_size, inner_size};
  const std::array<int, 2> Y_dims = {outer_size, 1};
  math::Moments<float, CPUContext>(
      2, X_dims.data(), Y_dims.data(), X_dequantized_.data(), mu, rsig,
      &context_);
  math::InvStd<float>(outer_size, epsilon_, rsig, rsig, &context_);
}

template <typename T>
void GroupNormDNNLowPOp<T>::DequantizedGroupMomentsNHWC(
    const int N, const int G, const int K, const int HxW,
    const T* X, float* mu, float* rsig) {
  const int C = G * K;
  const int size = N * C * HxW;
  const int outer_size = N * G;
  X_dequantized_.resize(size);
  fbgemm::Dequantize<T>(X, X_dequantized_.data(), size, in_qparams_[INPUT]);
  const std::array<int, 4> X_dims = {N, HxW, G, K};
  const std::array<int, 4> Y_dims = {N, 1, G, 1};
  math::Moments<float, CPUContext>(
      4, X_dims.data(), Y_dims.data(), X_dequantized_.data(), mu, rsig,
      &context_);
  math::InvStd<float>(outer_size, epsilon_, rsig, rsig, &context_);
}
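
// NCHW forward pass. Depending on dequantize_output_, either the whole
// pipeline (moments, fused scale/bias, affine transform) stays in integer
// arithmetic, or everything runs in fp32 on the dequantized input.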
template <typename T>
bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
  const auto& X = InputTensorCPU_(INPUT);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int HxW = X.size() / (N * C);
  const int G = group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  const int K = C / G;
  auto* Y = OutputTensorCPU_(0);
  Y->ResizeLike(X);
  std::vector<T> X_temp;
  const T* X_data = dnnlowp::QuantizeInputIfNeeded<T>(
      this, INPUT, in_qparams_[INPUT], X_temp);

  if (dequantize_output_) {
    float* Y_data = Y->template mutable_data<float>();
    mu_dequantized_.resize(N * G);
    rsig_dequantized_.resize(N * G);
    float* mu_data = mu_dequantized_.data();
    float* rsig_data = rsig_dequantized_.data();
    DequantizedGroupMomentsNCHW(N, G, K, HxW, X_data, mu_data, rsig_data);
    scale_dequantized_.resize(N * C);
    bias_dequantized_.resize(N * C);
    float* scale_data = scale_dequantized_.data();
    float* bias_data = bias_dequantized_.data();
    ComputeDequantizedFusedParams(
        N,
        G,
        K,
        mu_data,
        rsig_data,
        gamma_dequantized_data_,
        beta_dequantized_data_,
        scale_data,
        bias_data);
    AffineBatchChannelDequantizedNCHW(
        N, C, HxW, X_dequantized_.data(), scale_data, bias_data, Y_data);
  } else {
    T* Y_data = GetQuantizedOutputData_();
    mu_quantized_.resize(N * G);
    rsig_quantized_.resize(N * G);
    int32_t* mu_data = mu_quantized_.data();
    int32_t* rsig_data = rsig_quantized_.data();
    QuantizedGroupMomentsNCHW(N, G, K, HxW, X_data, mu_data, rsig_data);
    scale_quantized_.resize(N * C);
    bias_quantized_.resize(N * C);
    int32_t* scale_data = scale_quantized_.data();
    int32_t* bias_data = bias_quantized_.data();
    ComputeQuantizedFusedParams(
        N,
        G,
        K,
        mu_data,
        rsig_data,
        gamma_quantized_data_,
        beta_quantized_data_,
        scale_data,
        bias_data);
    AffineBatchChannelQuantizedNCHW(
        N, C, HxW, X_data, scale_data, bias_data, Y_data);
    dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
  }
  MeasureQuantizationError_();
  return true;
}
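
// Same structure as the NCHW path; only the layout-specific helpers differ.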
template <typename T>
bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
  const auto& X = InputTensorCPU_(INPUT);
  const int ndim = X.dim();
  const int N = X.dim32(0);
  const int C = X.dim32(ndim - 1);
  const int HxW = X.size() / (N * C);
  const int G = group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  const int K = C / G;
  auto* Y = OutputTensorCPU_(0);
  Y->ResizeLike(X);
  std::vector<T> X_temp;
  const T* X_data = dnnlowp::QuantizeInputIfNeeded<T>(
      this, INPUT, in_qparams_[INPUT], X_temp);

  if (dequantize_output_) {
    float* Y_data = Y->template mutable_data<float>();
    mu_dequantized_.resize(N * G);
    rsig_dequantized_.resize(N * G);
    float* mu_data = mu_dequantized_.data();
    float* rsig_data = rsig_dequantized_.data();
    DequantizedGroupMomentsNHWC(N, G, K, HxW, X_data, mu_data, rsig_data);
    scale_dequantized_.resize(N * C);
    bias_dequantized_.resize(N * C);
    float* scale_data = scale_dequantized_.data();
    float* bias_data = bias_dequantized_.data();
    ComputeDequantizedFusedParams(
        N,
        G,
        K,
        mu_data,
        rsig_data,
        gamma_dequantized_data_,
        beta_dequantized_data_,
        scale_data,
        bias_data);
    AffineBatchChannelDequantizedNHWC(
        N, C, HxW, X_dequantized_.data(), scale_data, bias_data, Y_data);
  } else {
    T* Y_data = GetQuantizedOutputData_();
    mu_quantized_.resize(N * G);
    rsig_quantized_.resize(N * G);
    int32_t* mu_data = mu_quantized_.data();
    int32_t* rsig_data = rsig_quantized_.data();
    QuantizedGroupMomentsNHWC(N, G, K, HxW, X_data, mu_data, rsig_data);
    scale_quantized_.resize(N * C);
    bias_quantized_.resize(N * C);
    int32_t* scale_data = scale_quantized_.data();
    int32_t* bias_data = bias_quantized_.data();
    ComputeQuantizedFusedParams(
        N,
        G,
        K,
        mu_data,
        rsig_data,
        gamma_quantized_data_,
        beta_quantized_data_,
        scale_data,
        bias_data);
    AffineBatchChannelQuantizedNHWC(
        N, C, HxW, X_data, scale_data, bias_data, Y_data);
    dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
  }
  MeasureQuantizationError_();
  return true;
}
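
// 1/sqrt(var + epsilon) is computed in fp32, then quantized to 32 bits with
// min/max-chosen qparams and the zero point forced to 0.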
template <typename T>
void GroupNormDNNLowPOp<T>::ComputeQuantizedInvStd(
    const int N,
    const float* var,
    float* rsig,
    int32_t* rsig_quantized) {
  math::InvStd<float, CPUContext>(N, epsilon_, var, rsig, &context_);
  rsig_qparams_ = qfactory_->ChooseQuantizationParams(
      rsig,
      N,
      dnnlowp::QuantizationFactory::MIN_MAX_QUANTIZATION,
      qfactory_->GetWeightPrecision(),
      qfactory_->GetPreserveWeightSparsity());
  rsig_qparams_.zero_point = 0;

#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < N; ++i) {
    rsig_quantized[i] = fbgemm::Quantize<int32_t>(
        rsig[i], rsig_qparams_.zero_point, rsig_qparams_.scale, 32);
  }
}
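
// Group norm computes Y = gamma * (X - mu) / sigma + beta. This folds it into
// a per-channel affine transform Y = scale * X + bias with
//   scale = gamma * rsig
//   bias  = beta - scale * mu
// (with X's zero point folded into the bias term), carried in a 32-bit
// internal domain of scale rsig_scale * gamma_scale * X_scale; beta is
// brought into that domain by requantizing with multiplier 1 / rsig_scale.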
template <typename T>
void GroupNormDNNLowPOp<T>::ComputeQuantizedFusedParams(
    const int N,
    const int G,
    const int K,
    const int32_t* mu,
    const int32_t* rsig,
    const int32_t* gamma,
    const int32_t* beta,
    int32_t* scale,
    int32_t* bias) {
  const int C = G * K;
  ConstEigenArrayMap<int32_t> gamma_arr(gamma, K, G);
  const auto& X_qparams = in_qparams_[INPUT];
  const auto& gamma_qparams = in_qparams_[GAMMA];
  internal_qparams_.scale =
      rsig_qparams_.scale * gamma_qparams.scale * X_qparams.scale;
  internal_qparams_.zero_point = 0;
  internal_qparams_.precision = 32;
  const float real_multiplier = 1.0f / rsig_qparams_.scale;
  const auto beta_requantization_params =
      qfactory_->ChooseRequantizationMultiplier(
          real_multiplier, internal_qparams_);
  for (int i = 0; i < C; ++i) {
    bias[i] = fbgemm::Requantize<int32_t>(
        beta[i],
        internal_qparams_.zero_point,
        beta_requantization_params.multiplier,
        beta_requantization_params.right_shift,
        internal_qparams_.precision,
        true /* signed */);
  }
  if (GetCpuId().avx2()) {
    internal::ComputeQuantizedFusedParamsAVX2(
        N, G, K, X_qparams.zero_point, mu, rsig, gamma, scale, bias);
  } else {
    ConstEigenArrayMap<int32_t> beta_arr(bias, K, G);
    // bias aliases beta_arr's storage for i == 0, so iterate in reverse to
    // read beta before it is overwritten.
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (int i = N - 1; i >= 0; --i) {
      EigenArrayMap<int32_t> scale_arr(scale + i * C, K, G);
      scale_arr = gamma_arr.rowwise() *
          ConstEigenVectorArrayMap<int32_t>(rsig + i * G, G).transpose();
      EigenArrayMap<int32_t>(bias + i * C, K, G) = beta_arr -
          scale_arr.rowwise() *
              (ConstEigenVectorArrayMap<int32_t>(mu + i * G, G).transpose() +
               X_qparams.zero_point);
    }
  }
}
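
// fp32 version of the same folding: scale = gamma * rsig and
// bias = beta - scale * mu, computed per (sample, channel).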
template <typename T>
void GroupNormDNNLowPOp<T>::ComputeDequantizedFusedParams(
    const int N, const int G, const int K,
    const float* mu, const float* rsig,
    const float* gamma, const float* beta,
    float* scale, float* bias) {
  const int C = G * K;
  ConstEigenArrayMap<float> gamma_arr(gamma, K, G);
  ConstEigenArrayMap<float> beta_arr(beta, K, G);

#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < N; ++i) {
    EigenArrayMap<float> scale_arr(scale + i * C, K, G);
    scale_arr = gamma_arr.rowwise() *
        ConstEigenVectorArrayMap<float>(rsig + i * G, G).transpose();
    EigenArrayMap<float>(bias + i * C, K, G) = beta_arr -
        scale_arr.rowwise() *
            ConstEigenVectorArrayMap<float>(mu + i * G, G).transpose();
  }
}
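
// Applies Y = scale * X + bias in int32 and requantizes to T with multiplier
// internal_scale / out_scale; an AVX2 fused kernel is used when available.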
template <typename T>
void GroupNormDNNLowPOp<T>::AffineBatchChannelQuantizedNCHW(
    const int N,
    const int C,
    const int HxW,
    const T* X,
    const int32_t* scale,
    const int32_t* bias,
    T* Y) {
  const float real_multiplier = internal_qparams_.scale / out_qparams_.scale;
  const auto out_requantization_params =
      qfactory_->ChooseRequantizationMultiplier(real_multiplier, out_qparams_);
  if (GetCpuId().avx2()) {
    internal::AffineBatchChannelAndRequantizeNCHWAVX2<T>(
        N, C, HxW, out_requantization_params, X, scale, bias, Y);
  } else {
    const int size = N * C * HxW;
    Y_int32_.resize(size);
    int32_t* Y_int32_data = Y_int32_.data();
    EigenArrayMap<int32_t>(Y_int32_data, HxW, N * C) =
        (ConstEigenArrayMap<T>(X, HxW, N * C)
             .template cast<int32_t>()
             .rowwise() *
         ConstEigenVectorArrayMap<int32_t>(scale, N * C).transpose())
            .rowwise() +
        ConstEigenVectorArrayMap<int32_t>(bias, N * C).transpose();
    fbgemm::Requantize<T>(Y_int32_data, Y, size, out_requantization_params);
  }
}
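
// NHWC variant of the quantized affine transform; scale and bias are indexed
// per channel within each image.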
template <typename T>
void GroupNormDNNLowPOp<T>::AffineBatchChannelQuantizedNHWC(
    const int N,
    const int C,
    const int HxW,
    const T* X,
    const int32_t* scale,
    const int32_t* bias,
    T* Y) {
  const int size = N * C * HxW;
  const int stride = HxW * C;
  const float real_multiplier = internal_qparams_.scale / out_qparams_.scale;
  const auto out_requantization_params =
      qfactory_->ChooseRequantizationMultiplier(real_multiplier, out_qparams_);
  if (GetCpuId().avx2()) {
    internal::AffineBatchChannelAndRequantizeNHWCAVX2<T>(
        N, C, HxW, out_requantization_params, X, scale, bias, Y);
  } else {
    Y_int32_.resize(size);

#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (int i = 0; i < N; ++i) {
      EigenArrayMap<int32_t>(Y_int32_.data() + i * stride, C, HxW) =
          (ConstEigenArrayMap<T>(X + i * stride, C, HxW)
               .template cast<int32_t>()
               .colwise() *
           ConstEigenVectorArrayMap<int32_t>(scale + i * C, C))
              .colwise() +
          ConstEigenVectorArrayMap<int32_t>(bias + i * C, C);
    }
    fbgemm::Requantize<T>(Y_int32_.data(), Y, size, out_requantization_params);
  }
}

template <typename T>
void GroupNormDNNLowPOp<T>::AffineBatchChannelDequantizedNCHW(
    const int N, const int C, const int HxW,
    const float* X, const float* scale, const float* bias, float* Y) {
  EigenArrayMap<float>(Y, HxW, N * C) =
      (ConstEigenArrayMap<float>(X, HxW, N * C).rowwise() *
       ConstEigenVectorArrayMap<float>(scale, N * C).transpose())
          .rowwise() +
      ConstEigenVectorArrayMap<float>(bias, N * C).transpose();
}

template <typename T>
void GroupNormDNNLowPOp<T>::AffineBatchChannelDequantizedNHWC(
    const int N, const int C, const int HxW,
    const float* X, const float* scale, const float* bias, float* Y) {
  const int stride = HxW * C;

#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < N; ++i) {
    EigenArrayMap<float>(Y + i * stride, C, HxW) =
        (ConstEigenArrayMap<float>(X + i * stride, C, HxW).colwise() *
         ConstEigenVectorArrayMap<float>(scale + i * C, C))
            .colwise() +
        ConstEigenVectorArrayMap<float>(bias + i * C, C);
  }
}

REGISTER_CPU_OPERATOR_WITH_ENGINE(GroupNorm, DNNLOWP, GroupNormDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Int8GroupNorm, DNNLOWP, GroupNormDNNLowPOp<uint8_t>);

OPERATOR_SCHEMA(Int8GroupNorm).NumInputs(3).NumOutputs({1, 3});

} // namespace caffe2