Caffe2 - C++ API
A deep learning, cross-platform ML framework
group_norm_dnnlowp_op.cc
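// DNNLOWP (quantized) implementation of the GroupNorm operator. Each group of
// K = C / G channels is normalized to zero mean and unit variance and then
// scaled and shifted by the per-channel gamma and beta parameters. Both NCHW
// and NHWC layouts are supported, and the output can be produced either in
// the quantized domain or dequantized back to fp32 (dequantize_output_).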
#include "caffe2/quantization/server/group_norm_dnnlowp_op.h"

namespace caffe2 {

template <typename T>
GroupNormDNNLowPOp<T>::GroupNormDNNLowPOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : BaseType(operator_def, ws),
      OP_SINGLE_ARG(bool, OpSchema::Arg_IsTest, is_test_, true),
      OP_SINGLE_ARG(int, "group", group_, 32),
      OP_SINGLE_ARG(float, "epsilon", epsilon_, 1e-5),
      order_(StringToStorageOrder(
          this->template GetSingleArgument<std::string>("order", "NCHW"))),
      OP_SINGLE_ARG(bool, "is_param_constant", is_param_constant_, true) {
  CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN);
  if (!is_param_constant_) {
    LOG(INFO) << operator_def.output(0) << " is_param_constant "
              << is_param_constant_;
  }
}

template <typename T>
bool GroupNormDNNLowPOp<T>::RunOnDevice() {
  this->ParseDNNLowPOperatorArguments_();
  if (!GetQuantizationParameters()) {
    return false;
  }
  return order_ == StorageOrder::NCHW ? RunOnDeviceWithOrderNCHW()
                                      : RunOnDeviceWithOrderNHWC();
}

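// Collects all quantization parameters needed for one run: the input (X)
// qparams, quantized gamma/beta, and, unless the output is dequantized, the
// output qparams. When measuring quantization error, the fp32 reference
// implementation is also run so its output can later be compared against the
// quantized one in MeasureQuantizationError_().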
template <typename T>
bool GroupNormDNNLowPOp<T>::GetQuantizationParameters() {
  // Choose quantization for X
  in_qparams_[INPUT] =
      GetInputTensorQuantizationParamsOf(this, INPUT, qfactory_.get());
  QuantizeGamma();
  QuantizeBeta();
  if (!dequantize_output_) {
    GetOutputQuantizationParams_();
  } else if (measure_quantization_error_) {
    // to measure quantization error, run ref impl.
    Fp32Op_()->DequantizeInput();
    Fp32Op_()->Get()->RunOnDevice();
  }
  return true;
}

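// gamma is kept as int32 with zero_point 0. If the GAMMA input is already an
// Int8TensorCPU, its zero_point is folded into the stored values; otherwise
// the fp32 gamma is quantized on the fly (QuantizeGammaImpl). For constant
// parameters the quantized values are computed once and cached.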
template <typename T>
void GroupNormDNNLowPOp<T>::QuantizeGamma() {
  if (is_param_constant_) {
    if (gamma_quantized_data_ == nullptr &&
        gamma_dequantized_data_ == nullptr) {
      const auto& gamma = InputTensorCPU_(GAMMA);
      const int C = gamma.size();
      gamma_quantized_.resize(C);
      gamma_quantized_data_ = gamma_quantized_.data();
      if (this->template InputIsType<int8::Int8TensorCPU>(GAMMA)) {
        const auto& gamma_int8 =
            this->template Input<int8::Int8TensorCPU>(GAMMA);
        auto& gamma_qparams = in_qparams_[GAMMA];
        gamma_qparams.scale = gamma_int8.scale;
        const T* gamma_data = gamma.template data<T>();
        EigenVectorArrayMap<int32_t>(gamma_quantized_.data(), C) =
            ConstEigenVectorArrayMap<T>(gamma_data, C)
                .template cast<int32_t>() -
            gamma_int8.zero_point;
        gamma_qparams.zero_point = 0;
        if (dequantize_output_) {
          gamma_dequantized_.resize(C);
          gamma_dequantized_data_ = gamma_dequantized_.data();
          fbgemm::Dequantize<int32_t>(
              gamma_quantized_data_,
              gamma_dequantized_.data(),
              C,
              gamma_qparams);
        }
      } else {
        QuantizeGammaImpl();
      }
    }
  } else {
    QuantizeGammaImpl();
  }
}

template <typename T>
void GroupNormDNNLowPOp<T>::QuantizeGammaImpl() {
  const auto& gamma = InputTensorCPU_(GAMMA);
  const int C = gamma.size();
  auto& gamma_qparams = in_qparams_[GAMMA];
  gamma_qparams = GetInputTensorQuantizationParamsOf(
      this, GAMMA, qfactory_.get(), true /* is_weight */);
  gamma_qparams.zero_point = 0;
  gamma_quantized_.resize(C);
  gamma_quantized_data_ = gamma_quantized_.data();
  gamma_dequantized_data_ = gamma.template data<float>();
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < C; ++i) {
    gamma_quantized_[i] = fbgemm::Quantize<int32_t>(
        gamma_dequantized_data_[i],
        gamma_qparams.zero_point,
        gamma_qparams.scale,
        32);
  }
}

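// beta is quantized to int32 with scale = X_scale * gamma_scale and
// zero_point 0 (this relationship is enforced when an Int8TensorCPU beta is
// provided). ComputeQuantizedFusedParams later requantizes it by
// 1 / rsig_scale so it matches the internal scale of the fused bias.
// Quantization is redone whenever the cached X scale changes.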
template <typename T>
void GroupNormDNNLowPOp<T>::QuantizeBeta() {
  if (!is_param_constant_ ||
      (beta_quantized_data_ == nullptr && beta_dequantized_data_ == nullptr) ||
      cached_X_qparams_scale_ != in_qparams_[INPUT].scale) {
    const auto& beta = InputTensorCPU_(BETA);
    const int C = beta.size();
    const auto& X_qparams = in_qparams_[INPUT];
    const auto& gamma_qparams = in_qparams_[GAMMA];
    auto& beta_qparams = in_qparams_[BETA];
    if (this->template InputIsType<int8::Int8TensorCPU>(BETA)) {
      const auto& beta_int8 = this->template Input<int8::Int8TensorCPU>(BETA);
      beta_qparams.scale = beta_int8.scale;
      beta_qparams.zero_point = beta_int8.zero_point;
      CAFFE_ENFORCE_LE(
          std::abs(beta_qparams.scale - X_qparams.scale * gamma_qparams.scale),
          1e-4);
      CAFFE_ENFORCE_EQ(beta_qparams.zero_point, 0);
      beta_quantized_data_ = beta.template data<int32_t>();
      if (dequantize_output_) {
        beta_dequantized_.resize(C);
        beta_dequantized_data_ = beta_dequantized_.data();
        fbgemm::Dequantize<int32_t>(
            beta_quantized_data_, beta_dequantized_.data(), C, beta_qparams);
      }
    } else {
      beta_qparams.scale = X_qparams.scale * gamma_qparams.scale;
      beta_qparams.zero_point = 0;
      beta_quantized_.resize(C);
      beta_quantized_data_ = beta_quantized_.data();
      beta_dequantized_data_ = beta.template data<float>();
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < C; ++i) {
        beta_quantized_[i] = fbgemm::Quantize<int32_t>(
            beta_dequantized_data_[i],
            beta_qparams.zero_point,
            beta_qparams.scale,
            32);
      }
    }
    cached_X_qparams_scale_ = in_qparams_[INPUT].scale;
  }
}

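// Per-(sample, group) moments computed directly on the quantized data: sums
// and sums of squares are accumulated in int64 (with an AVX2 fast path), the
// mean is rounded and shifted by the X zero_point to get a quantized mu, and
// the variance is dequantized (scale = X_scale^2) before being turned into a
// quantized inverse standard deviation by ComputeQuantizedInvStd.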
template <typename T>
void GroupNormDNNLowPOp<T>::QuantizedGroupMomentsNCHW(
    const int N,
    const int G,
    const int K,
    const int HxW,
    const T* X,
    int32_t* mu,
    int32_t* rsig) {
  const int outer_size = N * G;
  const int inner_size = K * HxW;
  const auto& X_qparams = in_qparams_[INPUT];
  auto var_qparams = X_qparams;
  var_qparams.scale = X_qparams.scale * X_qparams.scale;
  var_qparams.zero_point = 0;
  rsig_dequantized_.resize(outer_size);
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < outer_size; ++i) {
    int64_t sum = 0;
    int64_t sumsq = 0;
    if (GetCpuId().avx2()) {
      internal::VectorMomentsAVX2<T>(
          inner_size, X + i * inner_size, &sum, &sumsq);
    } else {
      ConstEigenVectorArrayMap<T> X_arr(X + i * inner_size, inner_size);
      sum = X_arr.template cast<int64_t>().sum();
      sumsq = X_arr.template cast<int64_t>().square().sum();
    }
    const float mean =
        static_cast<float>(sum) / static_cast<float>(inner_size);
    mu[i] = static_cast<int32_t>(std::round(mean)) - X_qparams.zero_point;
    const float var =
        static_cast<float>(sumsq) / static_cast<float>(inner_size) -
        mean * mean;
    rsig_dequantized_[i] = fbgemm::Dequantize<float>(var, var_qparams);
  }
  ComputeQuantizedInvStd(
      outer_size, rsig_dequantized_.data(), rsig_dequantized_.data(), rsig);
}

template <typename T>
void GroupNormDNNLowPOp<T>::QuantizedGroupMomentsNHWC(
    const int N,
    const int G,
    const int K,
    const int HxW,
    const T* X,
    int32_t* mu,
    int32_t* rsig) {
  const int outer_size = N * G;
  const int inner_size = K * HxW;
  const auto& X_qparams = in_qparams_[INPUT];
  auto var_qparams = X_qparams;
  var_qparams.scale = X_qparams.scale * X_qparams.scale;
  var_qparams.zero_point = 0;
  rsig_dequantized_.resize(outer_size);
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < outer_size; ++i) {
    const int n = i / G;
    const int g = i % G;
    int64_t sum = 0;
    int64_t sumsq = 0;
    for (int j = 0; j < HxW; ++j) {
      const T* X_ptr = X + ((n * HxW + j) * G + g) * K;
      if (GetCpuId().avx2()) {
        internal::VectorMomentsAVX2<T>(K, X_ptr, &sum, &sumsq);
      } else {
        ConstEigenVectorArrayMap<T> X_arr(X + ((n * HxW + j) * G + g) * K, K);
        sum += X_arr.template cast<int64_t>().sum();
        sumsq += X_arr.template cast<int64_t>().square().sum();
      }
    }
    const float mean =
        static_cast<float>(sum) / static_cast<float>(inner_size);
    mu[i] = static_cast<int32_t>(std::round(mean)) - X_qparams.zero_point;
    const float var =
        static_cast<float>(sumsq) / static_cast<float>(inner_size) -
        mean * mean;
    rsig_dequantized_[i] = fbgemm::Dequantize<float>(var, var_qparams);
  }
  ComputeQuantizedInvStd(
      outer_size, rsig_dequantized_.data(), rsig_dequantized_.data(), rsig);
}

template <typename T>
void GroupNormDNNLowPOp<T>::DequantizedGroupMomentsNCHW(
    const int N,
    const int G,
    const int K,
    const int HxW,
    const T* X,
    float* mu,
    float* rsig) {
  const int C = G * K;
  const int size = N * C * HxW;
  const int outer_size = N * G;
  const int inner_size = K * HxW;
  X_dequantized_.resize(size);
  fbgemm::Dequantize<T>(X, X_dequantized_.data(), size, in_qparams_[INPUT]);
  const std::array<int, 2> X_dims = {outer_size, inner_size};
  const std::array<int, 2> Y_dims = {outer_size, 1};
  math::Moments<float, CPUContext>(
      2,
      X_dims.data(),
      Y_dims.data(),
      X_dequantized_.data(),
      mu,
      rsig,
      &context_);
  math::InvStd<float>(outer_size, epsilon_, rsig, rsig, &context_);
}

template <typename T>
void GroupNormDNNLowPOp<T>::DequantizedGroupMomentsNHWC(
    const int N,
    const int G,
    const int K,
    const int HxW,
    const T* X,
    float* mu,
    float* rsig) {
  const int C = G * K;
  const int size = N * C * HxW;
  const int outer_size = N * G;
  X_dequantized_.resize(size);
  fbgemm::Dequantize<T>(X, X_dequantized_.data(), size, in_qparams_[INPUT]);
  const std::array<int, 4> X_dims = {N, HxW, G, K};
  const std::array<int, 4> Y_dims = {N, 1, G, 1};
  math::Moments<float, CPUContext>(
      4,
      X_dims.data(),
      Y_dims.data(),
      X_dequantized_.data(),
      mu,
      rsig,
      &context_);
  math::InvStd<float>(outer_size, epsilon_, rsig, rsig, &context_);
}

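// NCHW path. In both output modes the computation has the fused affine form
//   Y = X * scale + bias,  scale = gamma * rsig,  bias = beta - scale * mu,
// evaluated per (sample, channel). dequantize_output_ selects between the
// fp32 pipeline (Dequantized* helpers) and the pure integer pipeline
// (Quantized* helpers), which requantizes the int32 result into out_qparams_.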
template <typename T>
bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
  const auto& X = InputTensorCPU_(INPUT);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int HxW = X.size() / (N * C);
  const int G = group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  const int K = C / G;
  auto* Y = OutputTensorCPU_(0);
  Y->ResizeLike(X);
  std::vector<T> X_temp;
  const T* X_data = dnnlowp::QuantizeInputIfNeeded<T>(
      this, INPUT, in_qparams_[INPUT], X_temp);

  if (dequantize_output_) {
    float* Y_data = Y->template mutable_data<float>();
    mu_dequantized_.resize(N * G);
    rsig_dequantized_.resize(N * G);
    float* mu_data = mu_dequantized_.data();
    float* rsig_data = rsig_dequantized_.data();
    DequantizedGroupMomentsNCHW(N, G, K, HxW, X_data, mu_data, rsig_data);
    scale_dequantized_.resize(N * C);
    bias_dequantized_.resize(N * C);
    float* scale_data = scale_dequantized_.data();
    float* bias_data = bias_dequantized_.data();
    ComputeDequantizedFusedParams(
        N,
        G,
        K,
        mu_data,
        rsig_data,
        gamma_dequantized_data_,
        beta_dequantized_data_,
        scale_data,
        bias_data);
    AffineBatchChannelDequantizedNCHW(
        N, C, HxW, X_dequantized_.data(), scale_data, bias_data, Y_data);
  } else {
    T* Y_data = GetQuantizedOutputData_();
    mu_quantized_.resize(N * G);
    rsig_quantized_.resize(N * G);
    int32_t* mu_data = mu_quantized_.data();
    int32_t* rsig_data = rsig_quantized_.data();
    QuantizedGroupMomentsNCHW(N, G, K, HxW, X_data, mu_data, rsig_data);
    scale_quantized_.resize(N * C);
    bias_quantized_.resize(N * C);
    int32_t* scale_data = scale_quantized_.data();
    int32_t* bias_data = bias_quantized_.data();
    ComputeQuantizedFusedParams(
        N,
        G,
        K,
        mu_data,
        rsig_data,
        gamma_quantized_data_,
        beta_quantized_data_,
        scale_data,
        bias_data);
    AffineBatchChannelQuantizedNCHW(
        N, C, HxW, X_data, scale_data, bias_data, Y_data);
    dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
  }
  MeasureQuantizationError_();
  return true;
}

template <typename T>
bool GroupNormDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
  const auto& X = InputTensorCPU_(INPUT);
  const int ndim = X.dim();
  const int N = X.dim32(0);
  const int C = X.dim32(ndim - 1);
  const int HxW = X.size() / (N * C);
  const int G = group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  const int K = C / G;
  auto* Y = OutputTensorCPU_(0);
  Y->ResizeLike(X);
  std::vector<T> X_temp;
  const T* X_data = dnnlowp::QuantizeInputIfNeeded<T>(
      this, INPUT, in_qparams_[INPUT], X_temp);

  if (dequantize_output_) {
    float* Y_data = Y->template mutable_data<float>();
    mu_dequantized_.resize(N * G);
    rsig_dequantized_.resize(N * G);
    float* mu_data = mu_dequantized_.data();
    float* rsig_data = rsig_dequantized_.data();
    DequantizedGroupMomentsNHWC(N, G, K, HxW, X_data, mu_data, rsig_data);
    scale_dequantized_.resize(N * C);
    bias_dequantized_.resize(N * C);
    float* scale_data = scale_dequantized_.data();
    float* bias_data = bias_dequantized_.data();
    ComputeDequantizedFusedParams(
        N,
        G,
        K,
        mu_data,
        rsig_data,
        gamma_dequantized_data_,
        beta_dequantized_data_,
        scale_data,
        bias_data);
    AffineBatchChannelDequantizedNHWC(
        N, C, HxW, X_dequantized_.data(), scale_data, bias_data, Y_data);
  } else {
    T* Y_data = GetQuantizedOutputData_();
    mu_quantized_.resize(N * G);
    rsig_quantized_.resize(N * G);
    int32_t* mu_data = mu_quantized_.data();
    int32_t* rsig_data = rsig_quantized_.data();
    QuantizedGroupMomentsNHWC(N, G, K, HxW, X_data, mu_data, rsig_data);
    scale_quantized_.resize(N * C);
    bias_quantized_.resize(N * C);
    int32_t* scale_data = scale_quantized_.data();
    int32_t* bias_data = bias_quantized_.data();
    ComputeQuantizedFusedParams(
        N,
        G,
        K,
        mu_data,
        rsig_data,
        gamma_quantized_data_,
        beta_quantized_data_,
        scale_data,
        bias_data);
    AffineBatchChannelQuantizedNHWC(
        N, C, HxW, X_data, scale_data, bias_data, Y_data);
    dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
  }
  MeasureQuantizationError_();
  return true;
}

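// Converts the per-group variance into rsig = 1 / sqrt(var + epsilon) in
// fp32, then chooses min/max quantization parameters for rsig (zero_point
// forced to 0) and quantizes it to 32 bits for the fused-parameter math.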
template <typename T>
void GroupNormDNNLowPOp<T>::ComputeQuantizedInvStd(
    const int N,
    const float* var,
    float* rsig,
    int32_t* rsig_quantized) {
  math::InvStd<float, CPUContext>(N, epsilon_, var, rsig, &context_);
  rsig_qparams_ = qfactory_->ChooseQuantizationParams(
      rsig,
      N,
      dnnlowp::QuantizationFactory::MIN_MAX_QUANTIZATION,
      qfactory_->GetWeightPrecision(),
      qfactory_->GetPreserveWeightSparsity());
  rsig_qparams_.zero_point = 0;
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < N; ++i) {
    rsig_quantized[i] = fbgemm::Quantize<int32_t>(
        rsig[i], rsig_qparams_.zero_point, rsig_qparams_.scale, 32);
  }
}

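// Integer version of the fused parameters. The int32 scale and bias carry the
// "internal" quantization scale
//   internal_scale = rsig_scale * gamma_scale * X_scale,
// so beta (stored at X_scale * gamma_scale) is first requantized by
// 1 / rsig_scale, and the per-sample bias becomes
//   bias = beta - scale * (mu + X_zero_point).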
template <typename T>
void GroupNormDNNLowPOp<T>::ComputeQuantizedFusedParams(
    const int N,
    const int G,
    const int K,
    const int32_t* mu,
    const int32_t* rsig,
    const int32_t* gamma,
    const int32_t* beta,
    int32_t* scale,
    int32_t* bias) {
  const int C = G * K;
  ConstEigenArrayMap<int32_t> gamma_arr(gamma, K, G);
  const auto& X_qparams = in_qparams_[INPUT];
  const auto& gamma_qparams = in_qparams_[GAMMA];
  internal_qparams_.scale =
      rsig_qparams_.scale * gamma_qparams.scale * X_qparams.scale;
  internal_qparams_.zero_point = 0;
  internal_qparams_.precision = 32;
  const float real_multiplier = 1.0f / rsig_qparams_.scale;
  const auto beta_requantization_params =
      qfactory_->ChooseRequantizationMultiplier(
          real_multiplier, internal_qparams_);
  for (int i = 0; i < C; ++i) {
    bias[i] = fbgemm::Requantize<int32_t>(
        beta[i],
        internal_qparams_.zero_point,
        beta_requantization_params.multiplier,
        beta_requantization_params.right_shift,
        internal_qparams_.precision,
        true);
  }

  if (GetCpuId().avx2()) {
    internal::ComputeQuantizedFusedParamsAVX2(
        N, G, K, X_qparams.zero_point, mu, rsig, gamma, scale, bias);
  } else {
    ConstEigenArrayMap<int32_t> beta_arr(bias, K, G);
    // Reverse order for-loop to avoid overriding bias data.
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (int i = N - 1; i >= 0; --i) {
      EigenArrayMap<int32_t> scale_arr(scale + i * C, K, G);
      scale_arr = gamma_arr.rowwise() *
          ConstEigenVectorArrayMap<int32_t>(rsig + i * G, G).transpose();
      EigenArrayMap<int32_t>(bias + i * C, K, G) = beta_arr -
          scale_arr.rowwise() *
              (ConstEigenVectorArrayMap<int32_t>(mu + i * G, G).transpose() +
               X_qparams.zero_point);
    }
  }
}

template <typename T>
void GroupNormDNNLowPOp<T>::ComputeDequantizedFusedParams(
    const int N,
    const int G,
    const int K,
    const float* mu,
    const float* rsig,
    const float* gamma,
    const float* beta,
    float* scale,
    float* bias) {
  const int C = G * K;
  ConstEigenArrayMap<float> gamma_arr(gamma, K, G);
  ConstEigenArrayMap<float> beta_arr(beta, K, G);
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < N; ++i) {
    EigenArrayMap<float> scale_arr(scale + i * C, K, G);
    scale_arr = gamma_arr.rowwise() *
        ConstEigenVectorArrayMap<float>(rsig + i * G, G).transpose();
    EigenArrayMap<float>(bias + i * C, K, G) = beta_arr -
        scale_arr.rowwise() *
            ConstEigenVectorArrayMap<float>(mu + i * G, G).transpose();
  }
}

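// Applies Y = X * scale + bias in int32 and requantizes the result into the
// output qparams. The requantization multiplier is chosen from
// real_multiplier = internal_scale / out_scale; an AVX2 kernel fuses the
// affine transform and the requantization when available.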
template <typename T>
void GroupNormDNNLowPOp<T>::AffineBatchChannelQuantizedNCHW(
    const int N,
    const int C,
    const int HxW,
    const T* X,
    const int32_t* scale,
    const int32_t* bias,
    T* Y) {
  const float real_multiplier = internal_qparams_.scale / out_qparams_.scale;
  const auto out_requantization_params =
      qfactory_->ChooseRequantizationMultiplier(real_multiplier, out_qparams_);
  if (GetCpuId().avx2()) {
    internal::AffineBatchChannelAndRequantizeNCHWAVX2<T>(
        N, C, HxW, out_requantization_params, X, scale, bias, Y);
  } else {
    const int size = N * C * HxW;
    Y_int32_.resize(size);
    int32_t* Y_int32_data = Y_int32_.data();
    EigenArrayMap<int32_t>(Y_int32_data, HxW, N * C) =
        (ConstEigenArrayMap<T>(X, HxW, N * C)
             .template cast<int32_t>()
             .rowwise() *
         ConstEigenVectorArrayMap<int32_t>(scale, N * C).transpose())
            .rowwise() +
        ConstEigenVectorArrayMap<int32_t>(bias, N * C).transpose();
    fbgemm::Requantize<T>(Y_int32_data, Y, size, out_requantization_params);
  }
}

template <typename T>
void GroupNormDNNLowPOp<T>::AffineBatchChannelQuantizedNHWC(
    const int N,
    const int C,
    const int HxW,
    const T* X,
    const int32_t* scale,
    const int32_t* bias,
    T* Y) {
  const int size = N * C * HxW;
  const int stride = HxW * C;
  const float real_multiplier = internal_qparams_.scale / out_qparams_.scale;
  const auto out_requantization_params =
      qfactory_->ChooseRequantizationMultiplier(real_multiplier, out_qparams_);
  if (GetCpuId().avx2()) {
    internal::AffineBatchChannelAndRequantizeNHWCAVX2<T>(
        N, C, HxW, out_requantization_params, X, scale, bias, Y);
  } else {
    Y_int32_.resize(size);
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (int i = 0; i < N; ++i) {
      EigenArrayMap<int32_t>(Y_int32_.data() + i * stride, C, HxW) =
          (ConstEigenArrayMap<T>(X + i * stride, C, HxW)
               .template cast<int32_t>()
               .colwise() *
           ConstEigenVectorArrayMap<int32_t>(scale + i * C, C))
              .colwise() +
          ConstEigenVectorArrayMap<int32_t>(bias + i * C, C);
    }
    fbgemm::Requantize<T>(Y_int32_.data(), Y, size, out_requantization_params);
  }
}

template <typename T>
void GroupNormDNNLowPOp<T>::AffineBatchChannelDequantizedNCHW(
    const int N,
    const int C,
    const int HxW,
    const float* X,
    const float* scale,
    const float* bias,
    float* Y) {
  EigenArrayMap<float>(Y, HxW, N * C) =
      (ConstEigenArrayMap<float>(X, HxW, N * C).rowwise() *
       ConstEigenVectorArrayMap<float>(scale, N * C).transpose())
          .rowwise() +
      ConstEigenVectorArrayMap<float>(bias, N * C).transpose();
}

template <typename T>
void GroupNormDNNLowPOp<T>::AffineBatchChannelDequantizedNHWC(
    const int N,
    const int C,
    const int HxW,
    const float* X,
    const float* scale,
    const float* bias,
    float* Y) {
  const int stride = HxW * C;
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < N; ++i) {
    EigenArrayMap<float>(Y + i * stride, C, HxW) =
        (ConstEigenArrayMap<float>(X + i * stride, C, HxW).colwise() *
         ConstEigenVectorArrayMap<float>(scale + i * C, C))
            .colwise() +
        ConstEigenVectorArrayMap<float>(bias + i * C, C);
  }
}

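// The quantized kernel is registered below under the DNNLOWP engine for both
// the fp32-named "GroupNorm" op and the explicitly quantized "Int8GroupNorm"
// op, so a net can select this implementation by setting the operator's
// engine to "DNNLOWP" (or by using Int8GroupNorm directly) while keeping the
// usual X, gamma, beta input signature.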
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    GroupNorm,
    DNNLOWP,
    GroupNormDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8GroupNorm,
    DNNLOWP,
    GroupNormDNNLowPOp<uint8_t>);

OPERATOR_SCHEMA(Int8GroupNorm).NumInputs(3).NumOutputs({1, 3});

} // namespace caffe2