Caffe2 - C++ API
A deep learning, cross-platform ML framework
dnnlowp.h
#pragma once

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#include <string>

#include <x86intrin.h>

#include <fbgemm/QuantUtils.h>

#include "caffe2/quantization/server/dynamic_histogram.h"
#include "caffe2/utils/cpuid.h"
namespace dnnlowp {

using fbgemm::RequantizationParams;
using fbgemm::TensorQuantizationParams;

// Represents a quantization scheme that provides quantization parameters
// based on the distribution of the data to be quantized.
class QuantizationFactory {
 public:
  enum QuantizationKind {
    // A simple quantization scheme that determines quantization parameters by
    // looking only at the min/max of the data.
    MIN_MAX_QUANTIZATION,
    // Minimizes the L2 norm of the quantization error.
    L2_MIN_QUANTIZATION,
    // Fast search that removes histogram outliers to approximate the
    // L2-minimizing quantization.
    L2_MIN_QUANTIZATION_APPROX,
    // Minimizes the Kullback-Leibler divergence.
    KL_MIN_QUANTIZATION,
    // Takes the 99th percentile (only works with sparsity-preserving
    // quantization).
    P99_QUANTIZATION,
    // Minimizes the L1 norm of the quantization error.
    L1_MIN_QUANTIZATION,
  };
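
  // The quantization kind is passed explicitly to the value- and
  // histogram-based ChooseQuantizationParams() overloads below; the default
  // kinds for activations and weights are configured via the constructor.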

  // Get the default factory whose policy is determined by gflags.
  static QuantizationFactory* GetDefaultInstance();

  // Choose quantization scale and zero_point that map the floating-point
  // range [min, max] to the integer range of the given precision.
  TensorQuantizationParams ChooseQuantizationParams(
      float min,
      float max,
      int precision,
      bool preserve_sparsity,
      bool is_signed = false) const {
    TensorQuantizationParams qparams = fbgemm::ChooseQuantizationParams(
        min,
        max,
        is_signed ? -(1 << (precision - 1)) : 0,
        is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1,
        preserve_sparsity,
        force_scale_power_of_two_);
    qparams.precision = precision;
    return qparams;
  }
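
  // For example, with precision = 8 the target integer range above is
  // [0, 255] when is_signed is false and [-128, 127] when is_signed is true.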

  // Choose quantization scale and zero_point that map the floating-point
  // range [min, max] to the default integer range of this quantization
  // factory.
  TensorQuantizationParams
  ChooseQuantizationParams(float min, float max, bool is_weight = false) const {
    return ChooseQuantizationParams(
        min,
        max,
        is_weight ? GetWeightPrecision() : GetActivationPrecision(),
        is_weight ? GetPreserveWeightSparsity()
                  : GetPreserveActivationSparsity());
  }

  // Choose quantization parameters from the distribution of an array of
  // values, using an explicitly specified quantization kind, precision, and
  // sparsity-preservation setting.
  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const float* values,
      int len,
      bool is_weight = false) const;

  // Choose quantization parameters from a Histogram of the values to be
  // quantized (see dynamic_histogram.h).
  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      QuantizationKind kind,
      int precision,
      bool preserve_sparsity) const;

  TensorQuantizationParams ChooseQuantizationParams(
      const Histogram& hist,
      bool is_weight = false) const;

  // Given a real_multiplier, produces a pair (quantized_multiplier,
  // right_shift) where quantized_multiplier is an int32 representing a
  // fixed-point value (in practice we only produce positive values) and
  // right_shift is an amount to shift right by, so that the floating-point
  // multiplication of some int32 input value by real_multiplier,
  //
  //   return static_cast<int32>(int32_value * real_multiplier);
  //
  // is best approximated by the integer-arithmetic-only code
  //
  //   return RoundingRightShift(
  //       Multiplication(int32_value, quantized_multiplier),
  //       right_shift);
  //
  // Note: all this code only needs to run offline to generate the quantized
  // neural network workload, not at runtime on the device on which quantized
  // neural networks need to run. So it's not performance-critical at all.
  RequantizationParams ChooseRequantizationMultiplier(
      float real_multiplier,
      TensorQuantizationParams target_qparams) const;
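
  // Illustrative example (not part of this header; the values actually
  // returned depend on requantization_multiplier_precision and the
  // implementation): a real_multiplier of 0.375 can be approximated by
  // quantized_multiplier = 805306368 (= 0.375 * 2^31) with right_shift = 31,
  // since (int32_value * 805306368) >> 31 ~= int32_value * 0.375.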

  int GetActivationPrecision() const {
    return activation_precision_;
  }

  int GetWeightPrecision() const {
    return weight_precision_;
  }

  int GetEltwiseQuantizePrecision() const {
    return eltwise_quantize_precision_;
  }

  bool GetPreserveActivationSparsity() const {
    return preserve_activation_sparsity_;
  }

  bool GetPreserveWeightSparsity() const {
    return preserve_weight_sparsity_;
  }

  QuantizationKind GetActivationKind() const {
    return activation_kind_;
  }
  QuantizationKind GetWeightKind() const {
    return weight_kind_;
  }

  explicit QuantizationFactory(
      // precision used for activations in main operations like matmul
      int activation_precision = 8,
      // precision used for weights
      int weight_precision = 8,
      // precision used for the requantization multiplier
      int requantization_multiplier_precision = 32,
      // precision used for element-wise addition
      int eltwise_quantize_precision = 16,
      // preserve zeros in quantization
      bool preserve_activation_sparsity = false,
      // preserve zeros in quantization
      bool preserve_weight_sparsity = false,
      // restrict scaling to a power of two
      bool force_scale_power_of_two = false,
      QuantizationKind activation_kind = MIN_MAX_QUANTIZATION,
      QuantizationKind weight_kind = MIN_MAX_QUANTIZATION);
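
  // Illustrative example (not part of this header): a factory that keeps all
  // the defaults above but uses L2-norm-minimizing quantization for weights
  // could be constructed as
  //
  //   QuantizationFactory qfactory(
  //       8, 8, 32, 16, false, false, false,
  //       QuantizationFactory::MIN_MAX_QUANTIZATION,
  //       QuantizationFactory::L2_MIN_QUANTIZATION);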

 private:
  int activation_precision_;
  int weight_precision_;
  int requantization_multiplier_precision_;
  int eltwise_quantize_precision_;
  bool preserve_activation_sparsity_;
  bool preserve_weight_sparsity_;
  bool force_scale_power_of_two_;
  QuantizationKind activation_kind_, weight_kind_;
}; // class QuantizationFactory

// Converts the string name of a quantization scheme to the corresponding
// QuantizationFactory::QuantizationKind enum value.
QuantizationFactory::QuantizationKind StringToKind(const std::string& s);

} // namespace dnnlowp
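
A minimal usage sketch (illustrative only, not part of dnnlowp.h; it assumes
dnnlowp.h is included from the same directory as dynamic_histogram.h, and the
range [-1.5, 3.0] and the 0.0123f rescaling factor are arbitrary example
values):

#include "caffe2/quantization/server/dnnlowp.h"

int main() {
  // Get the default factory, whose policy is determined by gflags.
  dnnlowp::QuantizationFactory* qfactory =
      dnnlowp::QuantizationFactory::GetDefaultInstance();

  // Choose activation quantization parameters for values spanning [-1.5, 3.0].
  fbgemm::TensorQuantizationParams act_qparams =
      qfactory->ChooseQuantizationParams(-1.5f, 3.0f, /*is_weight=*/false);

  // Convert a floating-point rescaling factor into integer requantization
  // parameters targeting those quantization parameters.
  fbgemm::RequantizationParams requant_params =
      qfactory->ChooseRequantizationMultiplier(0.0123f, act_qparams);

  (void)requant_params;
  return 0;
}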