#include "caffe2/quantization/server/channel_shuffle_dnnlowp_op.h"

#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/transpose.h"
#include "caffe2/utils/eigen_utils.h"

namespace caffe2 {

template <typename T>
ChannelShuffleDNNLowPOp<T>::ChannelShuffleDNNLowPOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : BaseType(operator_def, ws),
      order_(StringToStorageOrder(
          this->template GetSingleArgument<std::string>("order", "NCHW"))),
      OP_SINGLE_ARG(int, "group", group_, 1) {
  CAFFE_ENFORCE_NE(order_, StorageOrder::UNKNOWN);
}
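// Channel shuffle operates directly on quantized values: it only permutes
// data, so the output reuses the input's quantization parameters
// (see PropagateOutputTensorQuantizationParams below).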
template <typename T>
bool ChannelShuffleDNNLowPOp<T>::RunOnDevice() {
  return order_ == StorageOrder::NCHW ? RunOnDeviceWithOrderNCHW()
                                      : RunOnDeviceWithOrderNHWC();
}
template <typename T>
bool ChannelShuffleDNNLowPOp<T>::RunOnDeviceWithOrderNCHW() {
  using namespace dnnlowp;

  this->ParseDNNLowPOperatorArguments_();

  // Choose quantization params
  TensorQuantizationParams in_qparams =
      GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

  const auto& X = InputTensorCPU_(0);
  auto* Y = OutputTensorCPU_(0);
  Y->ResizeLike(X);

  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int G = this->group_;
  CAFFE_ENFORCE_EQ(C % G, 0);
  const int K = C / G;
  const int HxW = X.numel() / (N * C);
  const int stride = C * HxW;

  const T* X_data = X.template data<T>();
  T* Y_data = Y->template mutable_data<T>();
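  // The C = G * K channels are grouped as [g][k]; channel shuffle writes
  // input channel g * K + k to output channel k * G + g. Each image is viewed
  // as a column-major (K * HxW) x G matrix whose column g holds group g, and
  // blocks of HxW rows (one channel from every group) are copied to their
  // interleaved positions in the output.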
#ifdef _OPENMP
#pragma omp parallel for
#endif
  for (int i = 0; i < N; ++i) {
    ConstEigenMatrixMap<T> X_mat(X_data + i * stride, K * HxW, G);
    for (int j = 0; j < K; ++j) {
      EigenMatrixMap<T>(Y_data + i * stride + j * G * HxW, HxW, G) =
          X_mat.block(j * HxW, 0, HxW, G);
    }
  }
  // Even if pre-chosen quantization parameters exist for the output, they are
  // ignored: channel shuffle output quantization should be the same as the
  // input's.
  PropagateOutputTensorQuantizationParams(this, 0, in_qparams);

  return true;
}
template <typename T>
bool ChannelShuffleDNNLowPOp<T>::RunOnDeviceWithOrderNHWC() {
  using namespace dnnlowp;

  this->ParseDNNLowPOperatorArguments_();

  // Choose quantization params
  TensorQuantizationParams in_qparams =
      GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get());

  const auto& X = InputTensorCPU_(0);
  auto* Y = OutputTensorCPU_(0);
  Y->ResizeLike(X);

  const auto C = X.dim32(X.ndim() - 1);
  const auto G = this->group_;
  CAFFE_ENFORCE(C % G == 0, "");
  const auto K = C / G;
  std::array<int, 2> dims = {G, K};
  std::array<int, 2> axes = {1, 0};

  const T* X_data = X.template data<T>();
  T* Y_data = Y->template mutable_data<T>();
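  // In NHWC, the C channels of each pixel are contiguous and can be viewed as
  // a row-major G x K matrix; channel shuffle is the transpose of that matrix
  // to K x G. E.g. with G = 2, K = 3:
  //   [a0 a1 a2 b0 b1 b2] -> [a0 b0 a1 b1 a2 b2]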
  if (G == 4 && std::is_same<T, std::uint8_t>::value && GetCpuId().avx2()) {
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (auto i = 0; i < X.numel(); i += C) {
      // Transpose each C = GxK matrix
      fbgemm::transpose_4rows(
          K,
          reinterpret_cast<const std::uint8_t*>(X_data + i),
          reinterpret_cast<std::uint8_t*>(Y_data + i));
    }
  } else {
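    // Generic fallback: handles any group count and any quantized type via
    // math::Transpose; the fast path above requires G == 4, uint8_t data, and
    // AVX2 support.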
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (auto i = 0; i < X.numel(); i += C) {
      // Transpose each C = GxK matrix
      math::Transpose(
          2, dims.data(), axes.data(), X_data + i, Y_data + i, &context_);
    }
  }
  // Even if pre-chosen quantization parameters exist for the output, they are
  // ignored: channel shuffle output quantization should be the same as the
  // input's.
  PropagateOutputTensorQuantizationParams(this, 0, in_qparams);

  return true;
}
// Register under both the fp32 operator name (with the DNNLOWP engines) and
// the Int8 operator name.
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    ChannelShuffle,
    DNNLOWP,
    ChannelShuffleDNNLowPOp<uint8_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8ChannelShuffle,
    DNNLOWP,
    ChannelShuffleDNNLowPOp<uint8_t>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    ChannelShuffle,
    DNNLOWP_16,
    ChannelShuffleDNNLowPOp<uint16_t>);

} // namespace caffe2