#include "caffe2_dnnlowp_utils.h"

#include <cstring>

#include "caffe2/core/tensor_int8.h"
#include "caffe2/quantization/server/sigmoid.h"
#include "caffe2/quantization/server/tanh.h"

C10_DECLARE_int32(caffe2_dnnlowp_activation_quantization_precision);
C10_DECLARE_int32(caffe2_dnnlowp_weight_quantization_precision);
C10_DECLARE_int32(caffe2_dnnlowp_requantization_multiplier_precision);
C10_DECLARE_int32(caffe2_dnnlowp_eltwise_quantization_precision);
C10_DECLARE_bool(caffe2_dnnlowp_force_scale_power_of_two);
C10_DECLARE_bool(caffe2_dnnlowp_preserve_activation_sparsity);
C10_DECLARE_bool(caffe2_dnnlowp_preserve_weight_sparsity);
C10_DECLARE_string(caffe2_dnnlowp_activation_quantization_kind);
C10_DECLARE_string(caffe2_dnnlowp_weight_quantization_kind);
27 static bool HasDNNLowPEngine_(
const OperatorDef& op_def) {
28 const string ENGINE_PREFIX =
"DNNLOWP";
30 op_def.engine().c_str(),
31 ENGINE_PREFIX.c_str(),
32 ENGINE_PREFIX.size()) == 0;
36 return HasDNNLowPEngine_(op.debug_def());
39 void PropagateOutputTensorQuantizationParams(
42 const TensorQuantizationParams& qparams) {
43 LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
44 Int8TensorCPU* output =
45 op->Outputs()[idx]->template GetMutable<Int8TensorCPU>();
46 output->scale = qparams.scale;
47 output->zero_point = qparams.zero_point;
50 TensorQuantizationParams GetInputTensorQuantizationParamsOf(
53 const QuantizationFactory* qfactory,
55 LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
57 if (op->InputIsType<Int8TensorCPU>(idx)) {
58 const Int8TensorCPU& int8_tensor = op->Input<Int8TensorCPU>(idx);
59 TensorQuantizationParams qparams;
60 qparams.scale = int8_tensor.scale;
61 qparams.zero_point = int8_tensor.zero_point;
62 qparams.precision = qfactory->GetActivationPrecision();
65 const TensorCPU* tensor = &op->template Input<Tensor>(idx, CPU);
66 CAFFE_ENFORCE(tensor->template IsType<float>());
67 CAFFE_ENFORCE(tensor->numel() == 0 || tensor->template data<float>());
71 tensor->template data<float>(), &min, &max, tensor->numel());
73 return qfactory->ChooseQuantizationParams(min, max, is_weight);
// Suffix used to disambiguate per-output arguments: the first output has no
// suffix ("Y_scale"), later outputs are 1-based ("Y2_scale", "Y3_scale", ...).
static std::string OutputArgumentIdxString_(int idx) {
  return idx == 0 ? "" : std::to_string(idx + 1);
}
81 static string OutputScaleArgumentName(
int idx) {
82 return "Y" + OutputArgumentIdxString_(idx) +
"_scale";
85 static string OutputZeroPointArgumentName(
int idx) {
86 return "Y" + OutputArgumentIdxString_(idx) +
"_zero_point";
89 static void SetStaticQuantizationParams_(
92 const TensorQuantizationParams& qparams) {
94 OutputScaleArgumentName(output_index), qparams.scale, op_def);
96 OutputZeroPointArgumentName(output_index), qparams.zero_point, op_def);
99 void SetStaticQuantizationParams(
102 const TensorQuantizationParams& qparams) {
103 LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
104 auto op_def = make_shared<OperatorDef>();
105 *op_def = op->debug_def();
106 SetStaticQuantizationParams_(op_def.get(), output_index, qparams);
107 op->set_debug_def(op_def);
110 bool HasStaticQuantization(
113 LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
114 return op->HasSingleArgumentOfType<
float>(
115 OutputScaleArgumentName(output_index));
118 TensorQuantizationParams GetStaticQuantizationParamsOf(
121 LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
122 unique_ptr<QuantizationFactory> qfactory = GetQuantizationFactoryOf(op);
124 TensorQuantizationParams qparams;
125 qparams.scale = op->GetSingleArgument<
float>(OutputScaleArgumentName(idx), 0);
127 op->GetSingleArgument<int32_t>(OutputZeroPointArgumentName(idx), 0);
128 qparams.precision = qfactory->GetActivationPrecision();
133 template <
typename T>
134 const T* QuantizeInputIfNeeded(
137 const TensorQuantizationParams& qparams,
145 temp.resize(tensor.numel());
147 tensor.data<
float>(), temp.data(), temp.size(), qparams);
152 template <
typename T>
153 const T* RowWiseQuantizeInputIfNeeded(
156 const std::vector<TensorQuantizationParams>& qparams,
164 temp.resize(tensor.numel());
166 int N = qparams.size();
167 int rowwidth = temp.size() / N;
169 for (
int i = 0; i < N; i++) {
171 tensor.data<
float>() + rowwidth * i,
172 temp.data() + rowwidth * i,
180 template const uint8_t* QuantizeInputIfNeeded<uint8_t>(
183 const TensorQuantizationParams& qparams,
184 vector<uint8_t>& temp);
186 template const int8_t* QuantizeInputIfNeeded<int8_t>(
189 const TensorQuantizationParams& qparams,
190 vector<int8_t>& temp);
192 template const uint16_t* QuantizeInputIfNeeded<uint16_t>(
195 const TensorQuantizationParams& qparams,
196 vector<uint16_t>& temp);
198 template const int16_t* QuantizeInputIfNeeded<int16_t>(
201 const TensorQuantizationParams& qparams,
202 vector<int16_t>& temp);
204 template const uint8_t* RowWiseQuantizeInputIfNeeded<uint8_t>(
207 const std::vector<TensorQuantizationParams>& qparams,
208 vector<uint8_t>& temp);
210 template const uint16_t* RowWiseQuantizeInputIfNeeded<uint16_t>(
213 const std::vector<TensorQuantizationParams>& qparams,
214 vector<uint16_t>& temp);
216 void MeasureQuantizationError(
220 QuantizationErrorStats* stat) {
221 for (
int i = 0; i < len; ++i) {
222 stat->sum_sq += ref[i] * ref[i];
223 float err = actual[i] - ref[i];
224 stat->sum_err_sq += err * err;
226 if (fabs(err) > stat->max_abs_err) {
227 stat->max_abs_err = fabs(err);
228 stat->max_err_actual = actual[i];
229 stat->max_err_ref = ref[i];
// Logs a human-readable summary of accumulated quantization-error stats for
// one operator: l2 relative error (nan when the reference energy sum_sq is
// zero, handled by the first branch), the worst absolute error with the
// reference/actual values that produced it, raw sums, and the sample count.
// NOTE(review): this block is garbled in extraction — the `op` parameter
// line, the else keyword between the two LOG(INFO) branches, and the closing
// braces are missing from view; code left byte-identical.
235 void ReportQuantizationError(
237 const QuantizationErrorStats& stat) {
// sum_sq == 0 means the reference is all zeros, so relative error is nan.
238 if (stat.sum_sq == 0) {
239 LOG(INFO) <<
" output " << op->debug_def().output(0) <<
" of operator " 240 << op <<
" with type " << op->debug_def().type()
241 <<
" has l2 relative error nan (stat.sum_err_sq " 242 << stat.sum_err_sq <<
" stat.sum_sq 0)" 243 <<
" and max abs error " << stat.max_abs_err <<
" (reference is " 244 << stat.max_err_ref <<
" and actual is " << stat.max_err_actual
246 <<
" sum_err_sq " << stat.sum_err_sq <<
" sum_sq_ " << stat.sum_sq
247 <<
" cnt " << stat.measure_cnt;
// Normal case: l2 relative error = sqrt(sum_err_sq) / sqrt(sum_sq).
249 LOG(INFO) <<
" output " << op->debug_def().output(0) <<
" of operator " 250 << op <<
" with type " << op->debug_def().type()
251 <<
" has l2 relative error " 252 << std::sqrt(stat.sum_err_sq) / std::sqrt(stat.sum_sq)
253 <<
" and max abs error " << stat.max_abs_err <<
" (reference is " 254 << stat.max_err_ref <<
" and actual is " << stat.max_err_actual
256 <<
" sum_err_sq " << stat.sum_err_sq <<
" sum_sq_ " << stat.sum_sq
257 <<
" cnt " << stat.measure_cnt;
261 static unique_ptr<QuantizationFactory> GetQuantizationFactoryOf_(
262 const OperatorDef& op_def) {
263 int activation_precision =
264 ArgumentHelper::GetSingleArgument<OperatorDef, int>(
266 "activation_precision",
267 FLAGS_caffe2_dnnlowp_activation_quantization_precision);
268 int weight_precision = ArgumentHelper::GetSingleArgument<OperatorDef, int>(
271 FLAGS_caffe2_dnnlowp_weight_quantization_precision);
272 int requantization_multiplier_precision =
273 ArgumentHelper::GetSingleArgument<OperatorDef, int>(
275 "requantization_multiplier_precision",
276 FLAGS_caffe2_dnnlowp_requantization_multiplier_precision);
277 int eltwise_quantization_precision =
278 ArgumentHelper::GetSingleArgument<OperatorDef, int>(
280 "eltwise_quantization_precision",
281 FLAGS_caffe2_dnnlowp_eltwise_quantization_precision);
282 bool preserve_activation_sparsity =
283 ArgumentHelper::GetSingleArgument<OperatorDef, bool>(
285 "preserve_activation_sparsity",
286 FLAGS_caffe2_dnnlowp_preserve_activation_sparsity);
287 bool preserve_weight_sparsity =
288 ArgumentHelper::GetSingleArgument<OperatorDef, bool>(
290 "preserve_weight_sparsity",
291 FLAGS_caffe2_dnnlowp_preserve_weight_sparsity);
292 bool force_scale_power_of_two =
293 ArgumentHelper::GetSingleArgument<OperatorDef, bool>(
295 "force_scale_power_of_two",
296 FLAGS_caffe2_dnnlowp_force_scale_power_of_two);
297 string activation_quantization_kind =
298 ArgumentHelper::GetSingleArgument<OperatorDef, string>(
300 "activation_quantization_kind",
301 FLAGS_caffe2_dnnlowp_activation_quantization_kind);
302 string weight_quantization_kind =
303 ArgumentHelper::GetSingleArgument<OperatorDef, string>(
305 "weight_quantization_kind",
306 FLAGS_caffe2_dnnlowp_weight_quantization_kind);
308 VLOG(2) <<
"Quantization method for op with output " << op_def.output(0)
309 <<
" activation_precision " << activation_precision
310 <<
" weight_precision " << weight_precision
311 <<
" requantization_multiplier_precision " 312 << requantization_multiplier_precision
313 <<
" eltwise_quantization_precision " 314 << eltwise_quantization_precision <<
" preserve_activation_sparsity " 315 << preserve_activation_sparsity <<
" preserve_weight_sparsity " 316 << preserve_weight_sparsity <<
" force_scale_power_of_two " 317 << force_scale_power_of_two <<
" activation_quantization_kind " 318 << activation_quantization_kind <<
" weight_quantization_kind " 319 << weight_quantization_kind;
321 return unique_ptr<QuantizationFactory>(
new QuantizationFactory(
322 activation_precision,
324 requantization_multiplier_precision,
325 eltwise_quantization_precision,
326 preserve_activation_sparsity,
327 preserve_weight_sparsity,
328 force_scale_power_of_two,
329 StringToKind(activation_quantization_kind),
330 StringToKind(weight_quantization_kind)));
333 unique_ptr<QuantizationFactory> GetQuantizationFactoryOf(
335 return GetQuantizationFactoryOf_(op->debug_def());
338 void AdjustOutputTensorQuantizationParamsWithFollowedBy(
340 const string& followed_by) {
341 LOG_IF(WARNING, !HasDNNLowPEngine_(*op));
343 auto op_def = make_shared<OperatorDef>();
344 *op_def = op->debug_def();
345 AddArgument<string>(
"followed_by", followed_by, op_def.get());
346 op->set_debug_def(op_def);
348 if (followed_by ==
"Sigmoid") {
349 SetStaticQuantizationParams(
350 op, 0, Sigmoid<uint8_t>().GetInputQuantizationParams());
351 }
else if (followed_by ==
"Tanh") {
352 SetStaticQuantizationParams(
353 op, 0, Tanh<uint8_t>().GetInputQuantizationParams());
354 }
else if (followed_by ==
"Relu") {
355 if (HasStaticQuantization(op)) {
356 unique_ptr<QuantizationFactory> qfactory = GetQuantizationFactoryOf(op);
357 TensorQuantizationParams qparams = GetStaticQuantizationParamsOf(op, 0);
358 qparams = qfactory->ChooseQuantizationParams(0, qparams.Max());
359 SetStaticQuantizationParams(op, 0, qparams);
362 LOG(WARNING) <<
"Unknown followed_by " << followed_by;
366 void ParseDNNLowPOperatorArguments(
368 bool* dequantize_output,
369 bool* measure_quantization_error,
370 string* followed_by) {
373 if (dequantize_output) {
375 op->GetSingleArgument<
bool>(
"dequantize_output",
false);
376 if (*dequantize_output) {
377 VLOG(2) <<
"Dequantize output " << op->debug_def().output(0)
378 <<
" of operator type " << op->debug_def().type();
383 if (measure_quantization_error) {
384 *measure_quantization_error =
385 op->GetSingleArgument<
bool>(
"measure_quantization_error",
false);
391 if (HasStaticQuantization(op)) {
392 TensorQuantizationParams qparams = GetStaticQuantizationParamsOf(op, 0);
393 unique_ptr<QuantizationFactory> qfactory = GetQuantizationFactoryOf(op);
394 if (qparams.zero_point != (1 << (qfactory->GetActivationPrecision() - 1)) &&
395 qparams.zero_point != 0 && qfactory->GetPreserveActivationSparsity()) {
396 LOG(WARNING) <<
"Symmetric quantization is used for activation but " 398 << qparams.zero_point <<
" for " << op->debug_def().output(0)
399 <<
" output activation of an operator with type " 400 << op->debug_def().type();
403 if (op->HasSingleArgumentOfType<
int>(
"Y_zero_point")) {
404 LOG(WARNING) <<
"Y_zero_point without Y_scale for " 405 << op->debug_def().output(0)
406 <<
" (an output of operator type " << op->debug_def().type()
407 <<
") doesn't make sense";
413 if (op->HasSingleArgumentOfType<
string>(
"followed_by")) {
414 string followed_by_ = op->GetSingleArgument<
string>(
"followed_by",
"");
415 VLOG(2) <<
"Operator with type " << op->debug_def().type() <<
" and output " 416 << op->debug_def().output(0) <<
" is followed by " << followed_by_;
418 AdjustOutputTensorQuantizationParamsWithFollowedBy(op, followed_by_);
420 *followed_by = followed_by_;
// Reads a per-tensor value-histogram file produced by profiling and, for
// every output of every DNNLOWP operator in net_def, chooses quantization
// parameters from that histogram and writes them into the operator as
// static "Y*_scale"/"Y*_zero_point" arguments. Returns the annotated net.
//
// Two histogram file formats are supported, distinguished by word count of
// the first line:
//   new: op_index op_type output_index tensor_name min max nbins bin...
//   old: op_index         output_index tensor_name min max nbins bin...
//
// NOTE(review): this block is garbled in extraction — loop bodies, stream
// re-seeks, the bin-reading loop, and several closing braces are missing
// from view; code left byte-identical.
425 NetDef AddScaleZeroOffsetArgumentsWithHistogram(
427 const string& histogram_file_name) {
428 ifstream f(histogram_file_name);
// Sniff the format from the first line's word count.
431 string first_line, word;
432 getline(f, first_line);
434 istringstream ist(first_line);
435 int nwords_first_line = 0;
436 while (ist >> word) {
443 bool new_format =
true;
444 int op_index, i, nbins;
445 string op_type, tensor_name;
// Try the new format first; nbins + 7 fields means it parsed consistently.
447 ist >> op_index >> op_type >> i >> tensor_name >> min >> max >> nbins;
448 if (nwords_first_line != nbins + 7) {
// Fall back to the old format (no op_type column, nbins + 6 fields).
451 ist >> op_index >> i >> tensor_name >> min >> max >> nbins;
452 if (nwords_first_line == nbins + 6) {
455 LOG(WARNING) <<
"histogram file " << histogram_file_name
456 <<
" has an invalid format";
// One histogram line is consumed per operator output, in net order; the
// warnings below flag any mismatch between the file and the net.
463 for (
auto& op_def : *net_def.mutable_op()) {
466 for (i = 0; i < op_def.output().size(); ++i) {
470 f >> op_index2 >> op_type >> i2 >> tensor_name >> min >> max >> nbins;
472 f >> op_index2 >> i2 >> tensor_name >> min >> max >> nbins;
474 LOG_IF(WARNING, op_index2 != op_index)
475 <<
"op index " << op_index2 <<
" doesn't match with " << op_index;
476 LOG_IF(WARNING, tensor_name != op_def.output(i))
477 << tensor_name <<
" in histogram file line " << op_index
478 <<
" doesn't match with operation def " << op_def.output(i);
479 LOG_IF(WARNING, i2 != i)
480 <<
"output tensor index " << i2 <<
" doesn't match with " << i;
482 LOG_IF(WARNING, op_type != op_def.type())
483 <<
"operator type " << op_type <<
" in histogram file line " 484 << op_index <<
" doesn't match with operation def " 488 vector<uint64_t> bins;
489 for (
int j = 0; j < nbins; ++j) {
// Skip non-DNNLOWP ops and ops that dequantize their output — they do not
// need static output quantization parameters.
495 if (!HasDNNLowPEngine_(op_def) ||
496 arg_helper.GetSingleArgument<
int>(
"dequantize_output", 0) != 0 ||
498 LOG(INFO) <<
"Skip " << op_def.type() <<
" " << op_def.output(0);
// Choose quantization parameters from the histogram and write them onto
// this output of the operator definition.
502 Histogram hist = Histogram(min, max, bins);
504 unique_ptr<QuantizationFactory> qfactory =
505 GetQuantizationFactoryOf_(op_def);
506 TensorQuantizationParams qparams =
507 qfactory->ChooseQuantizationParams(hist);
509 SetStaticQuantizationParams_(&op_def, 0, qparams);
Tensor class holds a shared pointer to the implementation TensorImpl, redirects API calls to TensorIm...
A helper class to index into arguments.
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...