Caffe2 - C++ API
A deep learning, cross platform ML framework
caffe2_dnnlowp_utils.cc
#include "caffe2_dnnlowp_utils.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/quantization/server/sigmoid.h"
#include "caffe2/quantization/server/tanh.h"

#include <cmath>
#include <cstring>
#include <fstream>
#include <map>
#include <sstream>
#ifdef _OPENMP
#include <omp.h>
#endif

C10_DECLARE_int32(caffe2_dnnlowp_activation_quantization_precision);
C10_DECLARE_int32(caffe2_dnnlowp_weight_quantization_precision);
C10_DECLARE_int32(caffe2_dnnlowp_requantization_multiplier_precision);
C10_DECLARE_int32(caffe2_dnnlowp_eltwise_quantization_precision);
C10_DECLARE_bool(caffe2_dnnlowp_force_scale_power_of_two);
C10_DECLARE_bool(caffe2_dnnlowp_preserve_activation_sparsity);
C10_DECLARE_bool(caffe2_dnnlowp_preserve_weight_sparsity);
C10_DECLARE_string(caffe2_dnnlowp_activation_quantization_kind);
C10_DECLARE_string(caffe2_dnnlowp_weight_quantization_kind);

namespace dnnlowp {

using namespace std;
using namespace caffe2;
static bool HasDNNLowPEngine_(const OperatorDef& op_def) {
  const string ENGINE_PREFIX = "DNNLOWP";
  return strncmp(
             op_def.engine().c_str(),
             ENGINE_PREFIX.c_str(),
             ENGINE_PREFIX.size()) == 0;
}

static bool HasDNNLowPEngine_(const OperatorBase& op) {
  return HasDNNLowPEngine_(op.debug_def());
}
void PropagateOutputTensorQuantizationParams(
    OperatorBase* op,
    int idx,
    const TensorQuantizationParams& qparams) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
  Int8TensorCPU* output =
      op->Outputs()[idx]->template GetMutable<Int8TensorCPU>();
  output->scale = qparams.scale;
  output->zero_point = qparams.zero_point;
}
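// Usage sketch (hypothetical call site, not part of the original file): an
// operator that has just chosen quantization parameters for an output stamps
// them onto the output Int8TensorCPU so downstream DNNLOWP operators can
// read them back via GetInputTensorQuantizationParamsOf below:
//
//   TensorQuantizationParams out_qparams =
//       qfactory->ChooseQuantizationParams(min, max);
//   PropagateOutputTensorQuantizationParams(this, 0, out_qparams);
//
// `qfactory`, `min`, and `max` are assumed to be provided by the caller.
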
TensorQuantizationParams GetInputTensorQuantizationParamsOf(
    OperatorBase* op,
    int idx,
    const QuantizationFactory* qfactory,
    bool is_weight /*=false*/) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));

  if (op->InputIsType<Int8TensorCPU>(idx)) {
    const Int8TensorCPU& int8_tensor = op->Input<Int8TensorCPU>(idx);
    TensorQuantizationParams qparams;
    qparams.scale = int8_tensor.scale;
    qparams.zero_point = int8_tensor.zero_point;
    qparams.precision = qfactory->GetActivationPrecision();
    return qparams;
  } else {
    const TensorCPU* tensor = &op->template Input<Tensor>(idx, CPU);
    CAFFE_ENFORCE(tensor->template IsType<float>());
    CAFFE_ENFORCE(tensor->numel() == 0 || tensor->template data<float>());

    float min, max;
    fbgemm::FindMinMax(
        tensor->template data<float>(), &min, &max, tensor->numel());

    return qfactory->ChooseQuantizationParams(min, max, is_weight);
  }
}
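// If the input is already an Int8TensorCPU, its stored scale/zero_point are
// returned directly; otherwise the parameters are chosen dynamically from
// the observed min/max. For the common asymmetric affine scheme at
// precision p, that choice amounts to (a sketch; the exact rule depends on
// how the QuantizationFactory is configured):
//
//   scale      = (max - min) / (2^p - 1)
//   zero_point = round(-min / scale), clamped to [0, 2^p - 1]
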
static string OutputArgumentIdxString_(int idx) {
  return idx == 0 ? "" : to_string(idx + 1);
}

static string OutputScaleArgumentName(int idx) {
  return "Y" + OutputArgumentIdxString_(idx) + "_scale";
}

static string OutputZeroPointArgumentName(int idx) {
  return "Y" + OutputArgumentIdxString_(idx) + "_zero_point";
}

static void SetStaticQuantizationParams_(
    OperatorDef* op_def,
    int output_index,
    const TensorQuantizationParams& qparams) {
  AddArgument<float>(
      OutputScaleArgumentName(output_index), qparams.scale, op_def);
  AddArgument<int32_t>(
      OutputZeroPointArgumentName(output_index), qparams.zero_point, op_def);
}
void SetStaticQuantizationParams(
    OperatorBase* op,
    int output_index,
    const TensorQuantizationParams& qparams) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
  auto op_def = make_shared<OperatorDef>();
  *op_def = op->debug_def();
  SetStaticQuantizationParams_(op_def.get(), output_index, qparams);
  op->set_debug_def(op_def);
}
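// The argument names follow the convention encoded above: output 0 uses
// "Y_scale" / "Y_zero_point", and output i > 0 uses "Y<i+1>_scale" /
// "Y<i+1>_zero_point" (so output 1 maps to "Y2_scale"). For example, a
// hypothetical call
//
//   SetStaticQuantizationParams(op, 0, qparams);
//
// rewrites the operator's debug_def() with Y_scale and Y_zero_point
// arguments holding qparams.scale and qparams.zero_point.
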
bool HasStaticQuantization(
    const caffe2::OperatorBase* op,
    int output_index /*=0*/) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
  return op->HasSingleArgumentOfType<float>(
      OutputScaleArgumentName(output_index));
}
TensorQuantizationParams GetStaticQuantizationParamsOf(
    const caffe2::OperatorBase* op,
    int idx) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
  unique_ptr<QuantizationFactory> qfactory = GetQuantizationFactoryOf(op);

  TensorQuantizationParams qparams;
  qparams.scale = op->GetSingleArgument<float>(OutputScaleArgumentName(idx), 0);
  qparams.zero_point =
      op->GetSingleArgument<int32_t>(OutputZeroPointArgumentName(idx), 0);
  qparams.precision = qfactory->GetActivationPrecision();

  return qparams;
}
template <typename T>
const T* QuantizeInputIfNeeded(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<T>& temp) {
  if (op->InputIsType<int8::Int8TensorCPU>(input_index)) {
    // Already quantized
    return op->Input<int8::Int8TensorCPU>(input_index).t.data<T>();
  } else {
    // Need to quantize
    const TensorCPU& tensor = op->Input<Tensor>(input_index, CPU);
    temp.resize(tensor.numel());
    fbgemm::Quantize<T>(
        tensor.data<float>(), temp.data(), temp.size(), qparams);
    return temp.data();
  }
}
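// Note that when the input is fp32, the returned pointer aliases `temp`, so
// `temp` must outlive any use of the pointer. A typical call site
// (hypothetical sketch) keeps the buffer alongside the pointer:
//
//   vector<uint8_t> X_temp;
//   const uint8_t* Xdata =
//       QuantizeInputIfNeeded<uint8_t>(this, 0, in_qparams, X_temp);
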
template <typename T>
const T* RowWiseQuantizeInputIfNeeded(
    OperatorBase* op,
    int input_index,
    const std::vector<TensorQuantizationParams>& qparams,
    vector<T>& temp) {
  if (op->InputIsType<int8::Int8TensorCPU>(input_index)) {
    // Already quantized
    return op->Input<int8::Int8TensorCPU>(input_index).t.data<T>();
  } else {
    // Need to quantize
    const TensorCPU& tensor = op->Input<Tensor>(input_index, CPU);
    temp.resize(tensor.numel());
    // number of rows
    int N = qparams.size();
    int rowwidth = temp.size() / N;
    // quantize each row with its own quantization parameters
    for (int i = 0; i < N; ++i) {
      fbgemm::Quantize<T>(
          tensor.data<float>() + rowwidth * i,
          temp.data() + rowwidth * i,
          rowwidth,
          qparams[i]);
    }
    return temp.data();
  }
}
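// Row-wise quantization treats the input as an N x rowwidth matrix (N =
// qparams.size()) and quantizes row r with its own scale_r / zero_point_r:
//
//   x_q[r][k] = clamp(round(x[r][k] / scale_r) + zero_point_r, 0, 2^p - 1)
//
// This reduces quantization error when per-row ranges differ widely, as is
// common for weight matrices quantized per output channel.
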
template const uint8_t* QuantizeInputIfNeeded<uint8_t>(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<uint8_t>& temp);

template const int8_t* QuantizeInputIfNeeded<int8_t>(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<int8_t>& temp);

template const uint16_t* QuantizeInputIfNeeded<uint16_t>(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<uint16_t>& temp);

template const int16_t* QuantizeInputIfNeeded<int16_t>(
    OperatorBase* op,
    int input_index,
    const TensorQuantizationParams& qparams,
    vector<int16_t>& temp);

template const uint8_t* RowWiseQuantizeInputIfNeeded<uint8_t>(
    OperatorBase* op,
    int input_index,
    const std::vector<TensorQuantizationParams>& qparams,
    vector<uint8_t>& temp);

template const uint16_t* RowWiseQuantizeInputIfNeeded<uint16_t>(
    OperatorBase* op,
    int input_index,
    const std::vector<TensorQuantizationParams>& qparams,
    vector<uint16_t>& temp);
void MeasureQuantizationError(
    const float* actual,
    const float* ref,
    size_t len,
    QuantizationErrorStats* stat) {
  for (size_t i = 0; i < len; ++i) {
    stat->sum_sq += ref[i] * ref[i];
    float err = actual[i] - ref[i];
    stat->sum_err_sq += err * err;

    if (fabs(err) > stat->max_abs_err) {
      stat->max_abs_err = fabs(err);
      stat->max_err_actual = actual[i];
      stat->max_err_ref = ref[i];
    }
  }
  ++stat->measure_cnt;
}
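// The stats accumulated here feed ReportQuantizationError below, which logs
// the relative L2 error
//
//   sqrt(sum_i (actual_i - ref_i)^2) / sqrt(sum_i ref_i^2)
//
// together with the max absolute error and the (reference, actual) pair at
// which it occurred.
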
void ReportQuantizationError(
    const OperatorBase* op,
    const QuantizationErrorStats& stat) {
  if (stat.sum_sq == 0) {
    LOG(INFO) << " output " << op->debug_def().output(0) << " of operator "
              << op << " with type " << op->debug_def().type()
              << " has l2 relative error nan (stat.sum_err_sq "
              << stat.sum_err_sq << " stat.sum_sq 0)"
              << " and max abs error " << stat.max_abs_err << " (reference is "
              << stat.max_err_ref << " and actual is " << stat.max_err_actual
              << ")"
              << " sum_err_sq " << stat.sum_err_sq << " sum_sq " << stat.sum_sq
              << " cnt " << stat.measure_cnt;
  } else {
    LOG(INFO) << " output " << op->debug_def().output(0) << " of operator "
              << op << " with type " << op->debug_def().type()
              << " has l2 relative error "
              << std::sqrt(stat.sum_err_sq) / std::sqrt(stat.sum_sq)
              << " and max abs error " << stat.max_abs_err << " (reference is "
              << stat.max_err_ref << " and actual is " << stat.max_err_actual
              << ")"
              << " sum_err_sq " << stat.sum_err_sq << " sum_sq " << stat.sum_sq
              << " cnt " << stat.measure_cnt;
  }
}
static unique_ptr<QuantizationFactory> GetQuantizationFactoryOf_(
    const OperatorDef& op_def) {
  int activation_precision =
      ArgumentHelper::GetSingleArgument<OperatorDef, int>(
          op_def,
          "activation_precision",
          FLAGS_caffe2_dnnlowp_activation_quantization_precision);
  int weight_precision = ArgumentHelper::GetSingleArgument<OperatorDef, int>(
      op_def,
      "weight_precision",
      FLAGS_caffe2_dnnlowp_weight_quantization_precision);
  int requantization_multiplier_precision =
      ArgumentHelper::GetSingleArgument<OperatorDef, int>(
          op_def,
          "requantization_multiplier_precision",
          FLAGS_caffe2_dnnlowp_requantization_multiplier_precision);
  int eltwise_quantization_precision =
      ArgumentHelper::GetSingleArgument<OperatorDef, int>(
          op_def,
          "eltwise_quantization_precision",
          FLAGS_caffe2_dnnlowp_eltwise_quantization_precision);
  bool preserve_activation_sparsity =
      ArgumentHelper::GetSingleArgument<OperatorDef, bool>(
          op_def,
          "preserve_activation_sparsity",
          FLAGS_caffe2_dnnlowp_preserve_activation_sparsity);
  bool preserve_weight_sparsity =
      ArgumentHelper::GetSingleArgument<OperatorDef, bool>(
          op_def,
          "preserve_weight_sparsity",
          FLAGS_caffe2_dnnlowp_preserve_weight_sparsity);
  bool force_scale_power_of_two =
      ArgumentHelper::GetSingleArgument<OperatorDef, bool>(
          op_def,
          "force_scale_power_of_two",
          FLAGS_caffe2_dnnlowp_force_scale_power_of_two);
  string activation_quantization_kind =
      ArgumentHelper::GetSingleArgument<OperatorDef, string>(
          op_def,
          "activation_quantization_kind",
          FLAGS_caffe2_dnnlowp_activation_quantization_kind);
  string weight_quantization_kind =
      ArgumentHelper::GetSingleArgument<OperatorDef, string>(
          op_def,
          "weight_quantization_kind",
          FLAGS_caffe2_dnnlowp_weight_quantization_kind);

  VLOG(2) << "Quantization method for op with output " << op_def.output(0)
          << " activation_precision " << activation_precision
          << " weight_precision " << weight_precision
          << " requantization_multiplier_precision "
          << requantization_multiplier_precision
          << " eltwise_quantization_precision "
          << eltwise_quantization_precision << " preserve_activation_sparsity "
          << preserve_activation_sparsity << " preserve_weight_sparsity "
          << preserve_weight_sparsity << " force_scale_power_of_two "
          << force_scale_power_of_two << " activation_quantization_kind "
          << activation_quantization_kind << " weight_quantization_kind "
          << weight_quantization_kind;

  return unique_ptr<QuantizationFactory>(new QuantizationFactory(
      activation_precision,
      weight_precision,
      requantization_multiplier_precision,
      eltwise_quantization_precision,
      preserve_activation_sparsity,
      preserve_weight_sparsity,
      force_scale_power_of_two,
      StringToKind(activation_quantization_kind),
      StringToKind(weight_quantization_kind)));
}
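// Every knob above can be set per operator and falls back to the matching
// global gflag. For example, a hypothetical NetDef fragment attached to one
// operator,
//
//   arg { name: "weight_precision" i: 4 }
//
// gives that operator 4-bit weights while the rest of the net keeps the
// value of --caffe2_dnnlowp_weight_quantization_precision.
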
unique_ptr<QuantizationFactory> GetQuantizationFactoryOf(
    const OperatorBase* op) {
  return GetQuantizationFactoryOf_(op->debug_def());
}
void AdjustOutputTensorQuantizationParamsWithFollowedBy(
    OperatorBase* op,
    const string& followed_by) {
  LOG_IF(WARNING, !HasDNNLowPEngine_(*op));

  auto op_def = make_shared<OperatorDef>();
  *op_def = op->debug_def();
  AddArgument<string>("followed_by", followed_by, op_def.get());
  op->set_debug_def(op_def);

  if (followed_by == "Sigmoid") {
    SetStaticQuantizationParams(
        op, 0, Sigmoid<uint8_t>().GetInputQuantizationParams());
  } else if (followed_by == "Tanh") {
    SetStaticQuantizationParams(
        op, 0, Tanh<uint8_t>().GetInputQuantizationParams());
  } else if (followed_by == "Relu") {
    if (HasStaticQuantization(op)) {
      unique_ptr<QuantizationFactory> qfactory = GetQuantizationFactoryOf(op);
      TensorQuantizationParams qparams = GetStaticQuantizationParamsOf(op, 0);
      qparams = qfactory->ChooseQuantizationParams(0, qparams.Max());
      SetStaticQuantizationParams(op, 0, qparams);
    }
  } else {
    LOG(WARNING) << "Unknown followed_by " << followed_by;
  }
}
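// Rationale: the quantized Sigmoid and Tanh implementations expect a fixed
// input quantization, so the producer's output is pinned to exactly those
// parameters; Relu output is non-negative, so an existing static range is
// tightened to [0, max] for better resolution over the values that survive.
// A hypothetical call for a producer feeding a ReLU:
//
//   AdjustOutputTensorQuantizationParamsWithFollowedBy(producer_op, "Relu");
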
void ParseDNNLowPOperatorArguments(
    OperatorBase* op,
    bool* dequantize_output,
    bool* measure_quantization_error,
    string* followed_by) {
  // When we're exiting a quantized region, or when we're only doing per-op
  // quantization, dequantize the outputs back to floats.
  if (dequantize_output) {
    *dequantize_output =
        op->GetSingleArgument<bool>("dequantize_output", false);
    if (*dequantize_output) {
      VLOG(2) << "Dequantize output " << op->debug_def().output(0)
              << " of operator type " << op->debug_def().type();
    }
  }

  // Measure quantization error by comparing against reference fp32 operators.
  if (measure_quantization_error) {
    *measure_quantization_error =
        op->GetSingleArgument<bool>("measure_quantization_error", false);
  }

  // The output scale and zero_point can be specified up front (recommended
  // for performance, since it avoids on-the-fly quantization parameter
  // selection) using activation distributions collected from profiling.
  if (HasStaticQuantization(op)) {
    TensorQuantizationParams qparams = GetStaticQuantizationParamsOf(op, 0);
    unique_ptr<QuantizationFactory> qfactory = GetQuantizationFactoryOf(op);
    if (qparams.zero_point != (1 << (qfactory->GetActivationPrecision() - 1)) &&
        qparams.zero_point != 0 && qfactory->GetPreserveActivationSparsity()) {
      LOG(WARNING) << "Symmetric quantization is used for activation but "
                      "Y_zero_point is "
                   << qparams.zero_point << " for " << op->debug_def().output(0)
                   << " output activation of an operator with type "
                   << op->debug_def().type();
    }
  } else {
    if (op->HasSingleArgumentOfType<int>("Y_zero_point")) {
      LOG(WARNING) << "Y_zero_point without Y_scale for "
                   << op->debug_def().output(0)
                   << " (an output of operator type " << op->debug_def().type()
                   << ") doesn't make sense";
    }
  }

  // When an operator has only one consumer and that consumer only cares
  // about a limited range of values, we can quantize more precisely.
  if (op->HasSingleArgumentOfType<string>("followed_by")) {
    string followed_by_ = op->GetSingleArgument<string>("followed_by", "");
    VLOG(2) << "Operator with type " << op->debug_def().type() << " and output "
            << op->debug_def().output(0) << " is followed by " << followed_by_;

    AdjustOutputTensorQuantizationParamsWithFollowedBy(op, followed_by_);
    if (followed_by) {
      *followed_by = followed_by_;
    }
  }
}
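// Summary of the operator arguments recognized above:
//   dequantize_output          (bool)      emit fp32 instead of quantized output
//   measure_quantization_error (bool)      compare against the fp32 reference
//   Y_scale / Y_zero_point     (float/int) static output quantization params
//   followed_by                (string)    "Sigmoid", "Tanh", or "Relu"
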
NetDef AddScaleZeroOffsetArgumentsWithHistogram(
    NetDef net_def,
    const string& histogram_file_name) {
  ifstream f(histogram_file_name);

  // Check the format by looking at the first line.
  string first_line, word;
  getline(f, first_line);
  f.seekg(0, f.beg);
  istringstream ist(first_line);
  int nwords_first_line = 0;
  while (ist >> word) {
    ++nwords_first_line;
  }

  ist.str(first_line);
  ist.clear();

  bool new_format = true;
  int op_index, i, nbins;
  string op_type, tensor_name;
  float min, max;
  ist >> op_index >> op_type >> i >> tensor_name >> min >> max >> nbins;
  if (nwords_first_line != nbins + 7) {
    ist.str(first_line);
    ist.clear();
    ist >> op_index >> i >> tensor_name >> min >> max >> nbins;
    if (nwords_first_line == nbins + 6) {
      new_format = false;
    } else {
      LOG(WARNING) << "histogram file " << histogram_file_name
                   << " has an invalid format";
      return net_def;
    }
  }

  // Parse the input file.
  op_index = 0;
  for (auto& op_def : *net_def.mutable_op()) {
    ArgumentHelper arg_helper(op_def);

    for (i = 0; i < op_def.output().size(); ++i) {
      int op_index2, i2;

      if (new_format) {
        f >> op_index2 >> op_type >> i2 >> tensor_name >> min >> max >> nbins;
      } else {
        f >> op_index2 >> i2 >> tensor_name >> min >> max >> nbins;
      }
      LOG_IF(WARNING, op_index2 != op_index)
          << "op index " << op_index2 << " doesn't match with " << op_index;
      LOG_IF(WARNING, tensor_name != op_def.output(i))
          << tensor_name << " in histogram file line " << op_index
          << " doesn't match with operation def " << op_def.output(i);
      LOG_IF(WARNING, i2 != i)
          << "output tensor index " << i2 << " doesn't match with " << i;
      if (new_format) {
        LOG_IF(WARNING, op_type != op_def.type())
            << "operator type " << op_type << " in histogram file line "
            << op_index << " doesn't match with operation def "
            << op_def.type();
      }

      vector<uint64_t> bins;
      for (int j = 0; j < nbins; ++j) {
        uint64_t cnt;
        f >> cnt;
        bins.push_back(cnt);
      }

      if (!HasDNNLowPEngine_(op_def) ||
          arg_helper.GetSingleArgument<int>("dequantize_output", 0) != 0 ||
          i > 0) {
        LOG(INFO) << "Skip " << op_def.type() << " " << op_def.output(0);
        continue;
      }

      Histogram hist = Histogram(min, max, bins);

      unique_ptr<QuantizationFactory> qfactory =
          GetQuantizationFactoryOf_(op_def);
      TensorQuantizationParams qparams =
          qfactory->ChooseQuantizationParams(hist);

      SetStaticQuantizationParams_(&op_def, 0, qparams);
    }
    ++op_index;
  }

  return net_def;
}
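// Histogram file format, as parsed above: one whitespace-separated line per
// operator output. New format:
//
//   op_index op_type output_index tensor_name min max nbins bin_0 ... bin_{nbins-1}
//
// The old format omits op_type. A hypothetical new-format line for a conv
// output with a 4-bin histogram over [0, 6):
//
//   0 Conv 0 conv1_Y 0.0 6.0 4 120 35 10 3
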
} // namespace dnnlowp