Caffe2 - C++ API
A deep learning, cross-platform ML framework
dnnlowp.cc
#include "dnnlowp.h"
#include "caffe2/core/logging.h"
#include "dnnlowp_op.h"
#include "kl_minimization.h"
#include "l2_minimization.h"

#include <algorithm> // for std::transform
#include <cassert>
#include <cctype>
#include <string>
#ifdef _OPENMP
#include <omp.h>
#endif

C10_DEFINE_int32(
    caffe2_dnnlowp_activation_quantization_precision,
    8,
    "Precision used for activation tensors");
C10_DEFINE_int32(
    caffe2_dnnlowp_weight_quantization_precision,
    8,
    "Precision used for weight tensors");
C10_DEFINE_int32(
    caffe2_dnnlowp_requantization_multiplier_precision,
    32,
    "Precision of integer multipliers used for rescaling quantized numbers");
C10_DEFINE_int32(
    caffe2_dnnlowp_eltwise_quantization_precision,
    16,
    "Precision used for intermediate numbers during elementwise operations");
C10_DEFINE_bool(
    caffe2_dnnlowp_force_scale_power_of_two,
    false,
    "When true, force quantization scales to a power of two");
C10_DEFINE_bool(
    caffe2_dnnlowp_preserve_activation_sparsity,
    false,
    "When true, 0 is mapped to 0 after quantization: "
    "i.e., symmetric quantization");
C10_DEFINE_bool(
    caffe2_dnnlowp_preserve_weight_sparsity,
    false,
    "When true, 0 is mapped to 0 after quantization: "
    "i.e., symmetric quantization");
C10_DEFINE_string(
    caffe2_dnnlowp_activation_quantization_kind,
    "min_max",
    "Quantization method for activation tensors. "
    "Allowed values: min_max, l2, l2_approx, kl, l1, p99");
C10_DEFINE_string(
    caffe2_dnnlowp_weight_quantization_kind,
    "min_max",
    "Quantization method for weight tensors. "
    "Allowed values: min_max, l2, l2_approx, kl, l1, p99");
C10_DEFINE_int32(
    caffe2_dnnlowp_nbits_in_non_outlier,
    8,
    "When outlier-aware quantization is used, a quantized number that can be "
    "represented by this many bits is not considered an outlier and is "
    "handled with 16-bit accumulation");
C10_DEFINE_int32(
    caffe2_dnnlowp_copy_to_32bit_frequency,
    32,
    "When outlier-aware quantization is used, this option specifies how often "
    "we spill 16-bit accumulated numbers to 32-bit during the first pass");
C10_DEFINE_bool(
    caffe2_dnnlowp_force_slow_path,
    false,
    "When true, use the slow path in quantization");

namespace dnnlowp {

using namespace std;

QuantizationFactory::QuantizationKind StringToKind(const string& s) {
  string s_lower(s);
  transform(s_lower.begin(), s_lower.end(), s_lower.begin(), ::tolower);

  if (s_lower == "min_max") {
    return QuantizationFactory::MIN_MAX_QUANTIZATION;
  } else if (s_lower == "l1") {
    return QuantizationFactory::L1_MIN_QUANTIZATION;
  } else if (s_lower == "l2") {
    return QuantizationFactory::L2_MIN_QUANTIZATION;
  } else if (s_lower == "l2_approx") {
    if (FLAGS_caffe2_dnnlowp_preserve_weight_sparsity ||
        FLAGS_caffe2_dnnlowp_preserve_activation_sparsity) {
      return QuantizationFactory::L2_MIN_QUANTIZATION;
    } else {
      return QuantizationFactory::L2_MIN_QUANTIZATION_APPROX;
    }
  } else if (s_lower == "kl") {
    return QuantizationFactory::KL_MIN_QUANTIZATION;
  } else if (s_lower == "p99") {
    return QuantizationFactory::P99_QUANTIZATION;
  } else {
    assert(false);
    return QuantizationFactory::MIN_MAX_QUANTIZATION;
  }
}
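
// Note: matching is case-insensitive (e.g. "L2_Approx" also works), and
// "l2_approx" silently falls back to exact L2 minimization whenever either
// sparsity-preservation flag is set.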

QuantizationFactory* QuantizationFactory::GetDefaultInstance() {
  static QuantizationFactory singleton(
      FLAGS_caffe2_dnnlowp_activation_quantization_precision,
      FLAGS_caffe2_dnnlowp_weight_quantization_precision,
      FLAGS_caffe2_dnnlowp_requantization_multiplier_precision,
      FLAGS_caffe2_dnnlowp_eltwise_quantization_precision,
      FLAGS_caffe2_dnnlowp_preserve_activation_sparsity,
      FLAGS_caffe2_dnnlowp_preserve_weight_sparsity,
      FLAGS_caffe2_dnnlowp_force_scale_power_of_two,
      StringToKind(FLAGS_caffe2_dnnlowp_activation_quantization_kind),
      StringToKind(FLAGS_caffe2_dnnlowp_weight_quantization_kind));

  static bool log_printed = false;
  if (!log_printed) {
    LOG(INFO) << "activation_precision "
              << FLAGS_caffe2_dnnlowp_activation_quantization_precision;
    LOG(INFO) << "weight_precision "
              << FLAGS_caffe2_dnnlowp_weight_quantization_precision;
    LOG(INFO) << "requantization_multiplier_precision "
              << FLAGS_caffe2_dnnlowp_requantization_multiplier_precision;
    LOG(INFO) << "eltwise_quantize_precision "
              << FLAGS_caffe2_dnnlowp_eltwise_quantization_precision;
    LOG(INFO) << "preserve_activation_sparsity "
              << FLAGS_caffe2_dnnlowp_preserve_activation_sparsity;
    LOG(INFO) << "preserve_weight_sparsity "
              << FLAGS_caffe2_dnnlowp_preserve_weight_sparsity;
    LOG(INFO) << "force_scale_power_of_two "
              << FLAGS_caffe2_dnnlowp_force_scale_power_of_two;
    LOG(INFO) << "activation_quantization_kind "
              << FLAGS_caffe2_dnnlowp_activation_quantization_kind;
    LOG(INFO) << "weight_quantization_kind "
              << FLAGS_caffe2_dnnlowp_weight_quantization_kind;
    LOG(INFO) << "nbits_in_non_outlier "
              << FLAGS_caffe2_dnnlowp_nbits_in_non_outlier;
    LOG(INFO) << "copy_to_32bit_frequency "
              << FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency;
    LOG(INFO) << "omp_get_max_threads() " << caffe2::dnnlowp_get_max_threads();

    log_printed = true;
  }

  return &singleton;
}
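
// Usage sketch: the factory is a process-wide singleton configured from the
// gflags above (variable names below are illustrative, not from this file):
//
//   auto* qfactory = dnnlowp::QuantizationFactory::GetDefaultInstance();
//   dnnlowp::TensorQuantizationParams qparams =
//       qfactory->ChooseQuantizationParams(data, len, /*is_weight=*/false);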

QuantizationFactory::QuantizationFactory(
    int activation_precision,
    int weight_precision,
    int requantization_multiplier_precision,
    int eltwise_quantize_precision,
    bool preserve_activation_sparsity,
    bool preserve_weight_sparsity,
    bool force_scale_power_of_two,
    QuantizationKind activation_kind,
    QuantizationKind weight_kind)
    : activation_precision_(activation_precision),
      weight_precision_(weight_precision),
      requantization_multiplier_precision_(requantization_multiplier_precision),
      eltwise_quantize_precision_(eltwise_quantize_precision),
      preserve_activation_sparsity_(preserve_activation_sparsity),
      preserve_weight_sparsity_(preserve_weight_sparsity),
      force_scale_power_of_two_(force_scale_power_of_two),
      activation_kind_(activation_kind),
      weight_kind_(weight_kind) {}

TensorQuantizationParams QuantizationFactory::ChooseQuantizationParams(
    const Histogram& hist,
    QuantizationKind kind,
    int precision,
    bool preserve_sparsity) const {
  switch (kind) {
    case L2_MIN_QUANTIZATION:
      return L2ErrorMinimization().ChooseQuantizationParams(
          hist, preserve_sparsity, precision);
    case L2_MIN_QUANTIZATION_APPROX:
      // Faster approximate search.
      return L2ErrorMinimization().NonlinearQuantizationParamsSearch(
          hist, preserve_sparsity, precision);
    case L1_MIN_QUANTIZATION:
      return L1ErrorMinimization().ChooseQuantizationParams(
          hist, preserve_sparsity, precision);
    case KL_MIN_QUANTIZATION:
      return KLDivergenceMinimization().ChooseQuantizationParams(
          hist, preserve_sparsity, precision);
    case P99_QUANTIZATION:
      assert(preserve_sparsity);
      return P99().ChooseQuantizationParams(hist, preserve_sparsity, precision);
    case MIN_MAX_QUANTIZATION:
    default:
      return ChooseQuantizationParams(
          hist.Min(), hist.Max(), precision, preserve_sparsity);
  }
}
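
// For MIN_MAX_QUANTIZATION the parameters come from the affine mapping of
// [min, max] onto the integer range (a sketch of the standard scheme; the
// exact rounding and clamping live in ChooseQuantizationParams in dnnlowp.h):
//
//   scale      = (max - min) / (2^precision - 1)
//   zero_point = round(-min / scale), clamped to [0, 2^precision - 1]
//
// so that real_value ~= scale * (quantized_value - zero_point).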

TensorQuantizationParams QuantizationFactory::ChooseQuantizationParams(
    const Histogram& hist,
    bool is_weight) const {
  if (is_weight) {
    return ChooseQuantizationParams(
        hist,
        GetWeightKind(),
        GetWeightPrecision(),
        GetPreserveWeightSparsity());
  } else {
    return ChooseQuantizationParams(
        hist,
        GetActivationKind(),
        GetActivationPrecision(),
        GetPreserveActivationSparsity());
  }
}

TensorQuantizationParams QuantizationFactory::ChooseQuantizationParams(
    const float* values,
    int len,
    QuantizationKind kind,
    int precision,
    bool preserve_sparsity) const {
  float min = 0, max = 0;
  fbgemm::FindMinMax(values, &min, &max, len);

  if (MIN_MAX_QUANTIZATION == kind) {
    return ChooseQuantizationParams(min, max, precision, preserve_sparsity);
  } else {
    if (0 == len) {
      return ChooseQuantizationParams(min, max, precision, preserve_sparsity);
    }

    // Match the histogram granularity to the quantization precision:
    // bin_width = (max - min) / nbins; the ith bin (zero-based) covers
    // [i * bin_width, (i + 1) * bin_width), except that the last bin also
    // includes its upper endpoint.
    Histogram hist(1 << (precision + 3), min, max);
    for (int i = 0; i < len; ++i) {
      hist.Add(values[i]);
    }

    return ChooseQuantizationParams(hist, kind, precision, preserve_sparsity);
  }
}
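
// With the default 8-bit precision this builds a histogram with
// 1 << (8 + 3) = 2048 bins over [min, max] before handing off to the
// histogram-based parameter search above.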

TensorQuantizationParams QuantizationFactory::ChooseQuantizationParams(
    const float* values,
    int len,
    bool is_weight) const {
  if (is_weight) {
    return ChooseQuantizationParams(
        values,
        len,
        GetWeightKind(),
        GetWeightPrecision(),
        GetPreserveWeightSparsity());
  } else {
    return ChooseQuantizationParams(
        values,
        len,
        GetActivationKind(),
        GetActivationPrecision(),
        GetPreserveActivationSparsity());
  }
}

RequantizationParams QuantizationFactory::ChooseRequantizationMultiplier(
    float real_multiplier,
    TensorQuantizationParams target_qparams) const {
  RequantizationParams params;
  params.target_qparams = target_qparams;
  params.real_multiplier = real_multiplier;

  fbgemm::ChooseRequantizationMultiplier(
      real_multiplier,
      &params.multiplier,
      &params.right_shift,
      requantization_multiplier_precision_);

  return params;
}
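
// Requantization rescales 32-bit accumulators into the target integer range
// without floating point: fbgemm approximates the real-valued multiplier as
//
//   real_multiplier ~= multiplier / 2^right_shift,
//
// where multiplier is an integer of at most
// requantization_multiplier_precision_ bits. For example (a sketch),
// real_multiplier = 0.5 can be represented exactly as multiplier = 2^31
// with right_shift = 32.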

} // namespace dnnlowp