Caffe2 - C++ API
A deep learning, cross platform ML framework
concat_dnnlowp_op.cc
1 #include "concat_dnnlowp_op.h"
2 
3 #ifdef _OPENMP
4 #include <omp.h>
5 #endif
6 
7 #include "dnnlowp_partition.h"
8 
9 namespace caffe2 {
10 
11 using namespace std;
12 
13 template <typename T>
14 ConcatDNNLowPOp<T>::ConcatDNNLowPOp(
15  const OperatorDef& operator_def,
16  Workspace* ws)
17  : BaseType(operator_def, ws) {
18  if (HasArgument("axis")) {
19  axis_ = this->template GetSingleArgument<int>("axis", -1);
20  add_axis_ = this->template GetSingleArgument<int>("add_axis", 0);
21  } else {
22  axis_ = GetDimFromOrderString(
23  this->template GetSingleArgument<string>("order", "NCHW"));
24  add_axis_ = 0;
25  }
26  CAFFE_ENFORCE_GE(axis_, 0);
27  requantization_params_.resize(InputSize());
28 }
29 
30 template <typename T>
31 bool ConcatDNNLowPOp<T>::RunOnDevice() {
32  GetQuantizationParameters_();
33 
34  auto* output = OutputTensorCPU_(0);
35  Tensor* split = nullptr;
36  int* axis_data = nullptr;
37  if (OutputSize() >= 2) {
38  split = this->template Output<Tensor>(1, CPU);
39  split->Resize(vector<int64_t>(1, InputSize()));
40  axis_data = split->template mutable_data<int>();
41  }
42  auto& input_zero = InputTensorCPU_(0);
43  CAFFE_ENFORCE_LT(
44  axis_,
45  input_zero.ndim() + (add_axis_ ? 1 : 0),
46  "Axis not in input ndim range.");
47  for (int i = 1; i < InputSize(); ++i) {
48  CAFFE_ENFORCE(
49  InputTensorCPU_(i).dtype() == input_zero.dtype(),
50  "All inputs must have the same type, expected: ",
51  input_zero.dtype().name(),
52  " but got: ",
53  InputTensorCPU_(i).dtype().name(),
54  " for input: ",
55  i);
56  }
57 
58  int before = 1, after = 1;
59  vector<int64_t> output_dims(input_zero.sizes().vec());
60  for (int i = 0; i < input_zero.ndim(); ++i) {
61  if (i == axis_ && !add_axis_) {
62  continue;
63  }
64  int dim = input_zero.dim32(i);
65  if (i < axis_) {
66  before *= dim;
67  } else { // i > axis_ || i == axis_ && add_axis_
68  after *= dim;
69  }
70  // check the input dims are compatible.
71  for (int j = 1; j < InputSize(); ++j) {
72  int dim_j = InputTensorCPU_(j).dim32(i);
73  CAFFE_ENFORCE(
74  dim == dim_j,
75  "Expect dimension = ",
76  dim,
77  " got ",
78  dim_j,
79  " at axis = ",
80  i,
81  " for input: ",
82  j,
83  ". The input tensors can only have different dimensions "
84  "when arg 'add_axis' = 0 and along the axis = ",
85  axis_,
86  " <",
87  InputTensorCPU_(0).sizes(),
88  "> vs <",
89  InputTensorCPU_(j).sizes(),
90  ">.");
91  }
92  }
93 
94  int output_channels = 0;
95  for (int i = 0; i < InputSize(); ++i) {
96  auto dim = add_axis_ ? 1 : InputTensorCPU_(i).dim32(axis_);
97  if (axis_data) {
98  axis_data[i] = dim;
99  }
100  output_channels += dim;
101  }
102  if (add_axis_) {
103  output_dims.insert(output_dims.begin() + axis_, output_channels);
104  } else {
105  output_dims[axis_] = output_channels;
106  }
107  output->Resize(output_dims);
108  size_t output_offset = 0;
109 
110  char* output_data = reinterpret_cast<char*>(GetQuantizedOutputData_());
111 
112  for (int i = 0; i < InputSize(); ++i) {
113  auto& input = InputTensorCPU_(i);
114  auto axis_dim = add_axis_ ? 1 : input.dim32(axis_);
115 
116  vector<T> input_temp(input.numel());
117 #ifdef _OPENMP
118 #pragma omp parallel
119 #endif
120  {
121  int nthreads = dnnlowp_get_num_threads();
122  int tid = dnnlowp_get_thread_num();
123  int before_begin, before_end;
124  int after_begin, after_end;
125 
127  before,
128  axis_dim * after,
129  nthreads,
130  tid,
131  &before_begin,
132  &before_end,
133  &after_begin,
134  &after_end);
135 
136  int j_begin = before_begin * axis_dim * after + after_begin;
137  int j_end = (before_end - 1) * axis_dim * after + after_end;
138 
139  if (InputTensorCPU_(i).template IsType<T>()) {
140  const T* input_data = input.template data<T>();
141  for (int j = j_begin; j < j_end; ++j) {
142  input_temp[j] = fbgemm::Requantize<T>(
143  input_data[j] - in_qparams_[i].zero_point,
144  requantization_params_[i]);
145  }
146  } else {
147  fbgemm::Quantize<T>(
148  input.template data<float>() + j_begin,
149  input_temp.data() + j_begin,
150  j_end - j_begin,
151  out_qparams_);
152  }
153 
154  math::CopyMatrix<CPUContext>(
155  sizeof(T),
156  before_end - before_begin,
157  after_end - after_begin,
158  input_temp.data() + before_begin * axis_dim * after + after_begin,
159  axis_dim * after,
160  output_data + output_offset + before_begin * output_channels * after +
161  after_begin * sizeof(T),
162  output_channels * after,
163  &context_,
164  input_zero.dtype().copy());
165  }
166 
167  output_offset += axis_dim * after * sizeof(T);
168  }
169 
170  RunOnDeviceEpilogue_();
171 
172  return true;
173 }
174 
175 template <typename T>
176 void ConcatDNNLowPOp<T>::GetQuantizationParameters_() {
177  using namespace dnnlowp;
178  for (int i = 0; i < InputSize(); ++i) {
179  in_qparams_[i] =
180  GetInputTensorQuantizationParamsOf(this, i, qfactory_.get());
181  }
182 
183  GetOutputQuantizationParams_();
184 
185  for (int i = 0; i < InputSize(); ++i) {
186  float real_multiplier = in_qparams_[i].scale / out_qparams_.scale;
187  requantization_params_[i] = qfactory_->ChooseRequantizationMultiplier(
188  real_multiplier, out_qparams_);
189  }
190 }
191 
// Register the quantized (uint8_t) concat implementation under the DNNLOWP
// engine, both for the fp32-named "Concat" op and the "Int8Concat" alias.
REGISTER_CPU_OPERATOR_WITH_ENGINE(Concat, DNNLOWP, ConcatDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8Concat,
    DNNLOWP,
    ConcatDNNLowPOp<uint8_t>);
197 
198 } // namespace caffe2
A global dictionary that holds information about what Caffe2 modules have been loaded in the current runtime.
Definition: blob.h:13
bool HasArgument(const string &name) const
Checks if the operator has an argument of the given name.
Definition: operator.h:70
void Get1DPartitionOf2D(int m, int n, int nthreads, int tid, int *m_begin, int *m_end, int *n_begin, int *n_end, int n_align)
1D-partition m x n 2D work.