2 #include "caffe2/core/logging.h" 3 #include "dnnlowp_op.h" 4 #include "kl_minimization.h" 5 #include "l2_minimization.h" 14 caffe2_dnnlowp_activation_quantization_precision,
16 "Precision used for activation tensors");
18 caffe2_dnnlowp_weight_quantization_precision,
20 "Precision used for weight tensors");
22 caffe2_dnnlowp_requantization_multiplier_precision,
24 "Precision of integer multipliers used for rescaling quantized numbers");
26 caffe2_dnnlowp_eltwise_quantization_precision,
28 "Precision used for intermediate numbers during elementwise operations");
30 caffe2_dnnlowp_force_scale_power_of_two,
32 "When true, force quantization scales to a power of two");
34 caffe2_dnnlowp_preserve_activation_sparsity,
36 "When true, 0 is mapped to 0 after quantization: " 37 "i.e., symmetric quantization");
39 caffe2_dnnlowp_preserve_weight_sparsity,
41 "When true, 0 is mapped to 0 after quantization: " 42 "i.e., symmetric quantization");
44 caffe2_dnnlowp_activation_quantization_kind,
46 "Quantization method for activation tensors. " 47 "Allowed values: min_max, l2, l2_approx, kl, l1, p99");
49 caffe2_dnnlowp_weight_quantization_kind,
51 "Quantization method for weight tensors. " 52 "Allowed values: min_max, l2, l2_approx, kl, l1, p99");
54 caffe2_dnnlowp_nbits_in_non_outlier,
56 "When outlier-aware quantization is used, if a quantized number can be " 57 "represented by this number of bits, it is considered not an outlier so " 58 "handled with 16-bit accumulation");
60 caffe2_dnnlowp_copy_to_32bit_frequency,
62 "When outlier-aware quantization is used, this option specifies how often " 63 "we spill 16-bit accumulated numbers to 32-bit during the first pass");
65 caffe2_dnnlowp_force_slow_path,
67 "When true, use slow path in quantization");
73 QuantizationFactory::QuantizationKind StringToKind(
const string& s) {
75 transform(s_lower.begin(), s_lower.end(), s_lower.begin(), ::tolower);
77 if (s_lower ==
"min_max") {
78 return QuantizationFactory::MIN_MAX_QUANTIZATION;
79 }
else if (s_lower ==
"l1") {
80 return QuantizationFactory::L1_MIN_QUANTIZATION;
81 }
else if (s_lower ==
"l2") {
82 return QuantizationFactory::L2_MIN_QUANTIZATION;
83 }
else if (s_lower ==
"l2_approx") {
84 if (FLAGS_caffe2_dnnlowp_preserve_weight_sparsity ||
85 FLAGS_caffe2_dnnlowp_preserve_activation_sparsity) {
86 return QuantizationFactory::L2_MIN_QUANTIZATION;
88 return QuantizationFactory::L2_MIN_QUANTIZATION_APPROX;
90 }
else if (s_lower ==
"kl") {
91 return QuantizationFactory::KL_MIN_QUANTIZATION;
92 }
else if (s_lower ==
"p99") {
93 return QuantizationFactory::P99_QUANTIZATION;
96 return QuantizationFactory::MIN_MAX_QUANTIZATION;
102 FLAGS_caffe2_dnnlowp_activation_quantization_precision,
103 FLAGS_caffe2_dnnlowp_weight_quantization_precision,
104 FLAGS_caffe2_dnnlowp_requantization_multiplier_precision,
105 FLAGS_caffe2_dnnlowp_eltwise_quantization_precision,
106 FLAGS_caffe2_dnnlowp_preserve_activation_sparsity,
107 FLAGS_caffe2_dnnlowp_preserve_weight_sparsity,
108 FLAGS_caffe2_dnnlowp_force_scale_power_of_two,
109 StringToKind(FLAGS_caffe2_dnnlowp_activation_quantization_kind),
110 StringToKind(FLAGS_caffe2_dnnlowp_weight_quantization_kind));
112 static bool log_printed =
false;
114 LOG(INFO) <<
"activation_precision " 115 << FLAGS_caffe2_dnnlowp_activation_quantization_precision;
116 LOG(INFO) <<
"weight_precision " 117 << FLAGS_caffe2_dnnlowp_weight_quantization_precision;
118 LOG(INFO) <<
"requantization_multiplier_precision " 119 << FLAGS_caffe2_dnnlowp_requantization_multiplier_precision;
120 LOG(INFO) <<
"eltwise_quantize_precision " 121 << FLAGS_caffe2_dnnlowp_eltwise_quantization_precision;
122 LOG(INFO) <<
"preserve_activation_sparsity " 123 << FLAGS_caffe2_dnnlowp_preserve_activation_sparsity;
124 LOG(INFO) <<
"preserve_weight_sparsity " 125 << FLAGS_caffe2_dnnlowp_preserve_weight_sparsity;
126 LOG(INFO) <<
"force_scale_power_of_two " 127 << FLAGS_caffe2_dnnlowp_force_scale_power_of_two;
128 LOG(INFO) <<
"activation_quantization_kind " 129 << FLAGS_caffe2_dnnlowp_activation_quantization_kind;
130 LOG(INFO) <<
"weight_quantization_kind " 131 << FLAGS_caffe2_dnnlowp_weight_quantization_kind;
132 LOG(INFO) <<
"nbits_in_non_outlier " 133 << FLAGS_caffe2_dnnlowp_nbits_in_non_outlier;
134 LOG(INFO) <<
"copy_to_32bit_frequency " 135 << FLAGS_caffe2_dnnlowp_copy_to_32bit_frequency;
136 LOG(INFO) <<
"omp_get_max_threads() " << caffe2::dnnlowp_get_max_threads();
144 QuantizationFactory::QuantizationFactory(
145 int activation_precision,
146 int weight_precision,
147 int requantization_multiplier_precision,
148 int eltwise_quantize_precision,
149 bool preserve_activation_sparsity,
150 bool preserve_weight_sparsity,
151 bool force_scale_power_of_two,
152 QuantizationKind activation_kind,
153 QuantizationKind weight_kind)
154 : activation_precision_(activation_precision),
155 weight_precision_(weight_precision),
156 requantization_multiplier_precision_(requantization_multiplier_precision),
157 eltwise_quantize_precision_(eltwise_quantize_precision),
158 preserve_activation_sparsity_(preserve_activation_sparsity),
159 preserve_weight_sparsity_(preserve_weight_sparsity),
160 force_scale_power_of_two_(force_scale_power_of_two),
161 activation_kind_(activation_kind),
162 weight_kind_(weight_kind) {}
166 QuantizationKind kind,
168 bool preserve_sparsity)
const {
170 case L2_MIN_QUANTIZATION:
172 hist, preserve_sparsity, precision);
173 case L2_MIN_QUANTIZATION_APPROX:
175 hist, preserve_sparsity, precision);
176 case L1_MIN_QUANTIZATION:
178 hist, preserve_sparsity, precision);
179 case KL_MIN_QUANTIZATION:
181 hist, preserve_sparsity, precision);
182 case P99_QUANTIZATION:
183 assert(preserve_sparsity);
184 return P99().ChooseQuantizationParams(hist, preserve_sparsity, precision);
185 case MIN_MAX_QUANTIZATION:
187 return ChooseQuantizationParams(
188 hist.Min(), hist.Max(), precision, preserve_sparsity);
194 bool is_weight)
const {
196 return ChooseQuantizationParams(
199 GetWeightPrecision(),
200 GetPreserveWeightSparsity());
202 return ChooseQuantizationParams(
205 GetActivationPrecision(),
206 GetPreserveActivationSparsity());
213 QuantizationKind kind,
215 bool preserve_sparsity)
const {
216 float min = 0, max = 0;
217 fbgemm::FindMinMax(values, &min, &max, len);
219 if (MIN_MAX_QUANTIZATION == kind) {
220 return ChooseQuantizationParams(min, max, precision, preserve_sparsity);
223 return ChooseQuantizationParams(min, max, precision, preserve_sparsity);
230 Histogram hist(1 << (precision + 3), min, max);
231 for (
int i = 0; i < len; ++i) {
235 return ChooseQuantizationParams(hist, kind, precision, preserve_sparsity);
242 bool is_weight)
const {
244 return ChooseQuantizationParams(
248 GetWeightPrecision(),
249 GetPreserveWeightSparsity());
251 return ChooseQuantizationParams(
255 GetActivationPrecision(),
256 GetPreserveActivationSparsity());
260 RequantizationParams QuantizationFactory::ChooseRequantizationMultiplier(
261 float real_multiplier,
262 TensorQuantizationParams target_qparams)
const {
263 RequantizationParams params;
264 params.target_qparams = target_qparams;
265 params.real_multiplier = real_multiplier;
267 fbgemm::ChooseRequantizationMultiplier(
271 requantization_multiplier_precision_);
TensorQuantizationParams ChooseQuantizationParams(float min, float max, int precision, bool preserve_sparsity, bool is_signed=false) const
Choose quantization scale and zero_point that map the floating-point range [min, max] onto the integer range representable at the given precision.
A quantization scheme that minimizes L2 norm of quantization error.
TensorQuantizationParams NonlinearQuantizationParamsSearch(const Histogram &hist, bool preserve_sparsity=false, int precision=8)
Faster approximate search.
A quantization scheme that minimizes Kullback-Leibler divergence.
static QuantizationFactory * GetDefaultInstance()
Get the default factory whose policy is determined by gflags.
bin_width = (max - min) / nbins. The i-th bin (zero-based) covers [i*bin_width, (i+1)*bin_width), except that the last bin, (nbins-1), also includes its upper endpoint: [(nbins-1)*bin_width, nbins*bin_width].