Caffe2 - C++ API
A deep learning, cross-platform ML framework
channel_shuffle_dnnlowp_op.cc
1 #include "caffe2/quantization/server/channel_shuffle_dnnlowp_op.h"
2 
3 #include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
4 #include "caffe2/quantization/server/transpose.h"
5 #include "caffe2/utils/eigen_utils.h"
6 
7 namespace caffe2 {
8 
9 template <typename T>
10 ChannelShuffleDNNLowPOp<T>::ChannelShuffleDNNLowPOp(
11  const OperatorDef& operator_def,
12  Workspace* ws)
13  : BaseType(operator_def, ws),
14  order_(StringToStorageOrder(
15  this->template GetSingleArgument<std::string>("order", "NCHW"))),
16  OP_SINGLE_ARG(int, "group", group_, 1) {
17  CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN);
18 }
19 
20 template <typename T>
21 bool ChannelShuffleDNNLowPOp<T>::RunOnDevice() {
22  return order_ == StorageOrder::NCHW ? RunOnDeviceWithOrderNCHW()
23  : RunOnDeviceWithOrderNHWC();
24 }
25 
26 template <typename T>
27 bool ChannelShuffleDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
28  using namespace dnnlowp;
29 
30  this->ParseDNNLowPOperatorArguments_();
31 
32  // Choose quantization params
33  TensorQuantizationParams in_qparams =
34  GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
35 
36  const auto& X = InputTensorCPU_(0);
37  auto* Y = OutputTensorCPU_(0);
38  Y->ResizeLike(X);
39  const int N = X.dim32(0);
40  const int C = X.dim32(1);
41  const int G = group_;
42  CAFFE_ENFORCE_EQ(C % G, 0);
43  const int K = C / G;
44  const int HxW = X.numel() / (N * C);
45  const int stride = C * HxW;
46  const T* X_data = X.template data<T>();
47  T* Y_data = Y->template mutable_data<T>();
48 #ifdef _OPENMP
49 #pragma omp parallel for
50 #endif
51  for (int i = 0; i < N; ++i) {
52  ConstEigenMatrixMap<T> X_mat(X_data + i * stride, K * HxW, G);
53  for (int j = 0; j < K; ++j) {
54  EigenMatrixMap<T>(Y_data + i * stride + j * G * HxW, HxW, G) =
55  X_mat.block(j * HxW, 0, HxW, G);
56  }
57  }
58 
59  // Even if there is a pre-chosen quantization parameters for the output,
60  // it is ignored because channel shuffle output quantization should be same
61  // as the input.
62  PropagateOutputTensorQuantizationParams(this, 0, in_qparams);
63 
64  return true;
65 }
66 
67 template <typename T>
68 bool ChannelShuffleDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
69  using namespace dnnlowp;
70 
71  this->ParseDNNLowPOperatorArguments_();
72 
73  // Choose quantization params
74  TensorQuantizationParams in_qparams =
75  GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());
76 
77  const auto& X = InputTensorCPU_(0);
78  auto* Y = OutputTensorCPU_(0);
79  Y->ResizeLike(X);
80  const auto C = X.dim32(X.ndim() - 1);
81  const auto G = this->group_;
82  CAFFE_ENFORCE(C % G == 0, "");
83  const auto K = C / G;
84  std::array<int, 2> dims = {G, K};
85  std::array<int, 2> axes = {1, 0};
86  const T* X_data = X.template data<T>();
87  T* Y_data = Y->template mutable_data<T>();
88 
89  if (G == 4 && std::is_same<T, std::uint8_t>::value && GetCpuId().avx2()) {
90 #ifdef _OPENMP
91 #pragma omp parallel for
92 #endif
93  for (auto i = 0; i < X.numel(); i += C) {
94  // Transpose each C = GxK matrix
95  fbgemm::transpose_4rows(
96  K,
97  reinterpret_cast<const std::uint8_t*>(X_data + i),
98  reinterpret_cast<std::uint8_t*>(Y_data + i));
99  }
100  } else {
101 #ifdef _OPENMP
102 #pragma omp parallel for
103 #endif
104  for (auto i = 0; i < X.numel(); i += C) {
105  // Transpose each C = GxK matrix
106  math::Transpose(
107  2, dims.data(), axes.data(), X_data + i, Y_data + i, &context_);
108  }
109  }
110 
111  // Even if there is a pre-chosen quantization parameters for the output,
112  // it is ignored because channel shuffle output quantization should be same
113  // as the input.
114  PropagateOutputTensorQuantizationParams(this, 0, in_qparams);
115 
116  return true;
117 }
118 
119 REGISTER_CPU_OPERATOR_WITH_ENGINE(
121  DNNLOWP,
122  ChannelShuffleDNNLowPOp<uint8_t>);
123 
124 REGISTER_CPU_OPERATOR_WITH_ENGINE(
125  Int8ChannelShuffle,
126  DNNLOWP,
127  ChannelShuffleDNNLowPOp<uint8_t>);
128 
129 REGISTER_CPU_OPERATOR_WITH_ENGINE(
131  DNNLOWP_16,
132  ChannelShuffleDNNLowPOp<uint16_t>);
133 
134 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13
Definition: static.cpp:64