Caffe2 - C++ API
A deep learning, cross-platform ML framework
elementwise_sum_dnnlowp_op.cc
#include "utility_dnnlowp_ops.h"

#include <algorithm>
#include <array>
#include <tuple>
#include <type_traits>
#include <vector>

// #define DNNLOWP_MEASURE_TIME_BREAKDOWN
#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
#include <chrono>
#endif

#include "dnnlowp_partition.h"
namespace caffe2 {

using namespace std;

template <typename T, bool ReluFused>
SumDNNLowPOp<T, ReluFused>::SumDNNLowPOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : BaseType(operator_def, ws) {}
template <typename T, bool ReluFused>
bool SumDNNLowPOp<T, ReluFused>::RunOnDevice() {
  if (!this->arguments_parsed_) {
    dnnlowp::ParseDNNLowPOperatorArguments(
        this, &dequantize_output_, &measure_quantization_error_, &followed_by_);

    if (ReluFused) {
      // Relu is fused into this op rather than running as a separate
      // following op, but set followed_by_ anyway so that quantization
      // error is measured correctly in this->MeasureQuantizationError_.
      followed_by_ = "Relu";
      dnnlowp::AdjustOutputTensorQuantizationParamsWithFollowedBy(
          this, followed_by_);
    }
    this->arguments_parsed_ = true;
  }

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  chrono::time_point<chrono::system_clock> t_begin, t_end;

  t_begin = chrono::system_clock::now();
#endif

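  // Compute per-input, intermediate, and output quantization parameters
  // (see GetQuantizationParameters_ below); bail out on failure.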
  if (!GetQuantizationParameters_()) {
    return false;
  }

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  t_end = chrono::system_clock::now();
  double dt = chrono::duration<double>(t_end - t_begin).count();
  LOG(INFO) << "this=" << this << " get_quant_params: " << dt * 1e3 << " ms";

  t_begin = chrono::system_clock::now();
#endif

  using namespace dnnlowp;
  int len = InputTensorCPU_(0).size();

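  // Each addend is first mapped onto a shared intermediate scale s_int with
  // zero point z_int, so a sum over InputSize() addends accumulates
  // InputSize() * z_int of offset; precompute that offset here so it can be
  // subtracted from each accumulated sum below.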
  // Element-wise sum
  int32_t intermediate_zero_point =
      intermediate_qparams_.zero_point * InputSize();

  auto* output = OutputTensorCPU_(0);
  output->ResizeLike(InputTensorCPU_(0));

  T* output_data = GetQuantizedOutputData_();

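  // Inputs may already be quantized as T, or still be fp32 (e.g., when this
  // is the first quantized op in the net); dispatch on the input type.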
  if (InputTensorCPU_(0).template IsType<T>()) {
    if (InputSize() == 2 && is_same<T, uint8_t>::value && GetCpuId().avx2() &&
        GetCpuId().fma()) {
      // fast path when we have 2 uint8_t inputs with AVX2 / FMA support
      array<const T*, 2> input_data;
      for (int i = 0; i < 2; ++i) {
        input_data[i] = InputTensorCPU_(i).template data<T>();
      }

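      // Split [0, len) across OpenMP threads; the VLEN argument keeps each
      // thread's partition boundary aligned for the 8-wide vector kernel.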
#ifdef _OPENMP
#pragma omp parallel
#endif
      {
        constexpr int VLEN = 8;
        int j_begin, j_end;
        tie(j_begin, j_end) = Get1DPartition(
            len, dnnlowp_get_num_threads(), dnnlowp_get_thread_num(), VLEN);

        internal::ElementWiseSumAVX2<T, ReluFused>(
            input_data[0] + j_begin,
            input_data[1] + j_begin,
            output_data + j_begin,
            j_end - j_begin,
            in_qparams_[0].scale,
            in_qparams_[0].zero_point,
            in_qparams_[1].scale,
            in_qparams_[1].zero_point,
            out_qparams_.scale,
            out_qparams_.zero_point);
      } // omp parallel
    } else {
      vector<RequantizationParams> in_requantization_params(InputSize());
      vector<const T*> input_data(InputSize());
      for (int i = 0; i < InputSize(); ++i) {
        // real_multiplier rescales integer values from the i-th input's
        // scale onto the shared intermediate scale.
        float real_multiplier =
            in_qparams_[i].scale / intermediate_qparams_.scale;
        in_requantization_params[i] = qfactory_->ChooseRequantizationMultiplier(
            real_multiplier, intermediate_qparams_);
        input_data[i] = InputTensorCPU_(i).template data<T>();
      }

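      // General path: for each element j, requantize every input into the
      // intermediate scale, accumulate in 32 bits, subtract the accumulated
      // zero points, then requantize into the output's parameters. In real
      // terms (s = scale, z = zero_point):
      //   acc    ~= sum_i (x_i[j] - z_i) * s_i / s_int + InputSize() * z_int
      //   out[j] ~= (acc - InputSize() * z_int) * s_int / s_out + z_out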
#ifdef _OPENMP
#pragma omp parallel
#endif
      {
        int j_begin, j_end;
        tie(j_begin, j_end) = Get1DPartition(
            len, dnnlowp_get_num_threads(), dnnlowp_get_thread_num());

        for (int j = j_begin; j < j_end; ++j) {
          int32_t acc = 0;
          for (int i = 0; i < InputSize(); ++i) {
            acc += fbgemm::Requantize<int32_t>(
                input_data[i][j] - in_qparams_[i].zero_point,
                in_requantization_params[i]);
          }
          int32_t raw = acc - intermediate_zero_point;
          if (ReluFused) {
            raw = std::max(0, raw);
          }
          output_data[j] =
              fbgemm::Requantize<T>(raw, out_requantization_params_);
        }
      }
    }
  } // InputTensorCPU_(0).template IsType<T>()
  else {
    vector<const float*> input_data(InputSize());
    for (int i = 0; i < InputSize(); ++i) {
      input_data[i] = InputTensorCPU_(i).template data<float>();
    }

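    // Float inputs: quantize each element on the fly into the intermediate
    // precision, then accumulate and requantize exactly as in the quantized
    // path above.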
#ifdef _OPENMP
#pragma omp parallel
#endif
    {
      int j_begin, j_end;
      tie(j_begin, j_end) = Get1DPartition(
          len, dnnlowp_get_num_threads(), dnnlowp_get_thread_num());

      for (int j = j_begin; j < j_end; ++j) {
        int32_t acc = 0;
        for (int i = 0; i < InputSize(); ++i) {
          acc += fbgemm::Quantize<int32_t>(
              input_data[i][j],
              intermediate_qparams_.zero_point,
              intermediate_qparams_.scale,
              qfactory_->GetEltwiseQuantizePrecision());
        }
        int32_t raw = acc - intermediate_zero_point;
        if (ReluFused) {
          raw = std::max(0, raw);
        }
        output_data[j] =
            fbgemm::Requantize<T>(raw, out_requantization_params_);
      }
    }
  } // !InputTensorCPU_(0).template IsType<T>()

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  t_end = chrono::system_clock::now();
  dt = chrono::duration<double>(t_end - t_begin).count();
  LOG(INFO) << "this=" << this << " requantize inputs: " << dt * 1e3 << " ms";

  t_begin = chrono::system_clock::now();
#endif

  // Dequantize the output if requested and measure quantization error.
  RunOnDeviceEpilogue_();

#ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN
  t_end = chrono::system_clock::now();
  dt = chrono::duration<double>(t_end - t_begin).count();
  LOG(INFO) << "this=" << this << " epilogue: " << dt * 1e3 << " ms";
#endif

  return true;
} // RunOnDevice

template <typename T, bool ReluFused>
bool SumDNNLowPOp<T, ReluFused>::GetQuantizationParameters_() {
  using namespace dnnlowp;

  // Find global min and max of all inputs
  float global_min = numeric_limits<float>::max(),
        global_max = numeric_limits<float>::lowest();

  for (int i = 0; i < InputSize(); ++i) {
    in_qparams_[i] =
        GetInputTensorQuantizationParamsOf(this, i, qfactory_.get());

    global_min = std::min(global_min, in_qparams_[i].Min());
    global_max = std::max(global_max, in_qparams_[i].Max());
  }

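  // Choose one set of intermediate quantization parameters covering the
  // union [global_min, global_max] of all input ranges, so every input can
  // be requantized onto the same scale and summed as plain integers. The
  // intermediate precision comes from GetEltwiseQuantizePrecision(), which
  // is typically wider than the 8-bit input/output precision to limit
  // accumulation error.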
  intermediate_qparams_ = qfactory_->ChooseQuantizationParams(
      global_min,
      global_max,
      qfactory_->GetEltwiseQuantizePrecision(),
      qfactory_->GetPreserveActivationSparsity());

  GetOutputQuantizationParams_();

  // requantize from the intermediate precision to the final precision
  float real_multiplier = intermediate_qparams_.scale / out_qparams_.scale;
  out_requantization_params_ =
      qfactory_->ChooseRequantizationMultiplier(real_multiplier, out_qparams_);

  return true;
}

OPERATOR_SCHEMA(SumRelu)
    .NumInputs(1, INT_MAX)
    .NumOutputs(1)
    .AllowInplace({{0, 0}})
    .InputsCanCrossDevices()
    .IdenticalTypeAndShapeOfInput(0)
    .Input(0, "data_0", "First of the input tensors. Can be in-place.")
    .Output(0, "sum", "Output tensor. Same dimension as inputs.");

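// A hypothetical NetDef snippet selecting one of the registrations below:
// the operator type plus the DNNLOWP engine picks this implementation
// (tensor names "X0", "X1", and "Y" are placeholders).
//
//   op {
//     type: "Int8Sum"
//     input: "X0"
//     input: "X1"
//     output: "Y"
//     engine: "DNNLOWP"
//   }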
REGISTER_CPU_OPERATOR_WITH_ENGINE(Sum, DNNLOWP, SumDNNLowPOp<uint8_t, false>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    SumRelu,
    DNNLOWP,
    SumDNNLowPOp<uint8_t, true>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8Sum,
    DNNLOWP,
    SumDNNLowPOp<uint8_t, false>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8SumRelu,
    DNNLOWP,
    SumDNNLowPOp<uint8_t, true>);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Sum,
    DNNLOWP_16,
    SumDNNLowPOp<uint16_t, false>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
    SumRelu,
    DNNLOWP_16,
    SumDNNLowPOp<uint16_t, true>);

} // namespace caffe2