Caffe2 - C++ API
A deep learning, cross-platform ML framework
norm_minimization.cc
#include "caffe2/core/logging.h"
#include "caffe2/utils/cpuid.h"
#include "l2_minimization.h"

#include <cassert>
#include <cmath>
#include <limits>

#include <x86intrin.h>

using namespace std;

namespace dnnlowp {

#undef NDEBUG

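// Helper for both searches below: [begin, end] is a slice of a source
// histogram bin expressed relative to the center of the destination
// (quantized) bin it falls into, and density is the histogram count per unit
// width within that slice. Returns the error accumulated when every value in
// the slice is rounded to the destination bin center:
// density * integral of x^2 (L2) or |x| (L1) over [begin, end].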
static float
GetNorm(float begin, float end, float density, NormMinimization::Kind kind) {
  float norm = 0;

  // assume values are uniformly distributed within each histogram bin
  if (NormMinimization::L2 == kind) {
    // err = density * (integral_{begin, end} x^2)
    //     = density * (end^3 - begin^3) / 3
    norm = (end * end * end - begin * begin * begin) / 3;
    // for begin = -d/2 and end = d/2, this leads to d^3/12
  } else {
    // err = density * (integral_{begin, end} |x|), evaluated separately for
    // the negative and positive parts of [begin, end]
    float left_begin = std::min(0.0f, begin);
    float left_end = std::min(0.0f, end);
    assert(left_begin * left_begin >= left_end * left_end);
    norm += (left_begin * left_begin - left_end * left_end) / 2;

    float right_begin = std::max(0.0f, begin);
    float right_end = std::max(0.0f, end);
    assert(right_end * right_end >= right_begin * right_begin);
    norm += (right_end * right_end - right_begin * right_begin) / 2;
  }

  return density * norm;
}

namespace {

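// Copy the histogram counts into a float vector whose range is guaranteed to
// contain zero: if the original [min, max] lies entirely on one side of zero,
// empty bins are prepended (when min > 0) or appended (when max < 0) until
// zero is covered, so the chosen quantization range always contains 0.0f and
// can represent it exactly.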
vector<float>
adjust_hist_to_include_zero(const Histogram& hist, float* min, float* max) {
  const vector<uint64_t> bins = *hist.GetHistogram();
  *min = hist.Min();
  *max = hist.Max();
  int nbins = bins.size();
  float bin_width = (*max - *min) / nbins;

  // Pad histogram to include zero
  int additional_nbins = 0;
  int offset = 0;
  if (*min > 0) {
    // additional nbins to include 0
    additional_nbins = ceil(*min / bin_width);
    offset = additional_nbins;
    *min -= additional_nbins * bin_width;
    assert(*min <= 0);
  } else if (*max < 0) {
    additional_nbins = ceil((-*max) / bin_width);
    *max += additional_nbins * bin_width;
    assert(*max >= 0);
  }

  vector<float> bins_f(nbins + additional_nbins);
  for (int i = 0; i < nbins; ++i) {
    bins_f[i + offset] = bins[i];
  }
  return bins_f;
}

} // namespace

// Filter out outliers in input distributions
// Exploit the input distributions for the quick search
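// Greedy quantile search: keep a candidate bin range [start_bin, end_bin] and
// quantile bounds (alpha, beta). Each step advances the bound on the side
// whose tail spans more histogram bins for the same quantile increment (the
// longer, thinner tail), maps the surviving range onto 2^precision
// destination bins, and accumulates the expected error via GetNorm. The
// search stops once the error exceeds the best value seen so far (or the
// quantile bounds meet).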
TensorQuantizationParams NormMinimization::NonlinearQuantizationParamsSearch(
    const Histogram& hist,
    bool preserve_sparsity,
    int precision) {
  if (preserve_sparsity) {
    VLOG(2) << "l2_approx with symmetric quantization falls back to L2";
    return ChooseQuantizationParams(hist, preserve_sparsity, precision);
  }
  VLOG(2) << "Using the nonlinear quantile search";

  float min, max;
  vector<float> bins_f(adjust_hist_to_include_zero(hist, &min, &max));
  int nbins = bins_f.size();
  float bin_width = (max - min) / nbins;
  int dst_nbins = 1 << precision;

  // calculate the CDF
  uint64_t total = 0;
  for (uint64_t x : bins_f) {
    total += x;
  }
  vector<uint64_t> CDF;
  uint64_t sum = 0;
  for (uint64_t x : bins_f) {
    sum += x;
    CDF.push_back(sum);
  }

  double stepsize = 0.00001; // experiment on the granularity
  double alpha = 0.0f, beta = 1.0f; // lowerbound and upperbound
  int start_bin = 0;
  int end_bin = nbins - 1;
  double norm_min = numeric_limits<double>::max();

  while (alpha < beta) {
    // find the next step
    double next_alpha = alpha + stepsize;
    double next_beta = beta - stepsize;

    // find the left and right bins between the quantile bounds
    int i = start_bin, j = end_bin;
    while (i < end_bin && CDF[i] < next_alpha * total)
      i++;
    while (j > start_bin && CDF[j] > next_beta * total)
      j--;

    // decide the next move
    // cout << i << ", " << j << endl;
    int next_start_bin = start_bin, next_end_bin = end_bin;
    if ((i - start_bin) > (end_bin - j)) {
      // move the start_bin
      next_start_bin = i;
      alpha = next_alpha;
    } else {
      // move the end_bin
      next_end_bin = j;
      beta = next_beta;
    }

    if (next_start_bin == start_bin && next_end_bin == end_bin)
      continue;
    // calculate the norm
    double norm = 0;
    double dst_bin_width =
        bin_width * (next_end_bin - next_start_bin + 1) / dst_nbins;

    // go over each histogram bin and accumulate errors
    for (int src_bin = 0; src_bin < nbins; ++src_bin) {
      // distances from the beginning of first dst_bin to the beginning and
      // end of src_bin
      double src_bin_begin = (src_bin - next_start_bin) * bin_width;
      double src_bin_end = src_bin_begin + bin_width;

      // which dst_bins the beginning and end of src_bin belong to?
      int dst_bin_of_begin = std::min(
          (1 << precision) - 1.,
          std::max(0., floor(src_bin_begin / dst_bin_width)));
      int dst_bin_of_end = std::min(
          (1 << precision) - 1.,
          std::max(0., floor(src_bin_end / dst_bin_width)));

      double dst_bin_of_begin_center =
          dst_bin_of_begin * dst_bin_width + dst_bin_width / 2;
      double density = bins_f[src_bin] / bin_width;
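      // src_bin contributes error in up to three pieces: a partial overlap
      // with the dst_bin containing its left edge, any dst_bins it covers
      // completely (each contributing the full-bin error), and a partial
      // overlap with the dst_bin containing its right edge. The deltas below
      // are measured from the center of the corresponding dst_bin.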
      if (dst_bin_of_begin == dst_bin_of_end) {
        // if src_bin is entirely within 1 dst_bin
        double delta_begin = src_bin_begin - dst_bin_of_begin_center;
        double delta_end = src_bin_end - dst_bin_of_begin_center;
        norm += GetNorm(delta_begin, delta_end, density, kind_);
      } else {
        double delta_begin = src_bin_begin - dst_bin_of_begin_center;
        double delta_end = dst_bin_width / 2;
        norm += GetNorm(delta_begin, delta_end, density, kind_);

        norm += (dst_bin_of_end - dst_bin_of_begin - 1) *
            GetNorm(-dst_bin_width / 2, dst_bin_width / 2, density, kind_);

        double dst_bin_of_end_center =
            dst_bin_of_end * dst_bin_width + dst_bin_width / 2;
        delta_begin = -dst_bin_width / 2;
        delta_end = src_bin_end - dst_bin_of_end_center;
        norm += GetNorm(delta_begin, delta_end, density, kind_);
      }
    }
    if (norm > norm_min)
      break;
    norm_min = norm;
    start_bin = next_start_bin;
    end_bin = next_end_bin;
  }
  VLOG(2) << "best quantization range " << start_bin << "," << end_bin + 1
          << "," << norm_min;

  double selected_sum = 0;
  for (int i = start_bin; i < end_bin + 1; ++i) {
    selected_sum += bins_f[i];
  }
  VLOG(2) << "best quantization range covers "
          << (double)selected_sum / total * 100 << " %";

  max = min + bin_width * (end_bin + 1);
  min = min + bin_width * start_bin;

  QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
  return qfactory->ChooseQuantizationParams(
      min, max, precision, preserve_sparsity);
}

TensorQuantizationParams NormMinimization::ChooseQuantizationParams(
    const Histogram& hist,
    bool preserve_sparsity,
    int precision) {
  VLOG(2) << "Using the brute force search";
  float min, max;
  vector<float> bins_f(adjust_hist_to_include_zero(hist, &min, &max));
  int nbins = bins_f.size();
  float bin_width = (max - min) / nbins;

  int dst_nbins = 1 << precision;

  int zero_bin = round(-min / bin_width);

  vector<pair<int, float>> best_start_bins(nbins + 1);

  // Look at mapping [start_bin, start_bin + nbins_selected) to
  // [0, 1 << precision) for every (start_bin, nbins_selected) combination and
  // pick the one with smallest L2 quantization error
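  // The outer loop below is parallelized across nbins_selected with OpenMP
  // when available. For each candidate, the error is evaluated either with an
  // AVX2/FMA kernel (L2 only, when the CPU supports it) or with the scalar
  // GetNorm accumulation.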
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic)
#endif
  for (int nbins_selected = 1; nbins_selected <= nbins; ++nbins_selected) {
    float norm_min = numeric_limits<float>::max();
    int best_start_bin = 0;

    int start_bin_begin = 0, start_bin_end = nbins - nbins_selected + 1;
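    // Preserving sparsity requires exact zeros to remain exactly zero after
    // quantization, so the zero point is pinned: only a range starting at 0
    // (when min == 0) or a range centered on the zero bin is considered for
    // this nbins_selected.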
    if (preserve_sparsity) {
      // when preserving sparsity we only check the range
      // starting from 0 (when min is 0) or symmetric around 0.
      if (min == 0) {
        start_bin_begin = 0;
        start_bin_end = 1;
      } else {
        start_bin_begin = zero_bin - nbins_selected / 2;
        start_bin_end = start_bin_begin + 1;
      }
    }

    float dst_bin_width = bin_width * nbins_selected / dst_nbins;

    int start_bin;
    for (start_bin = start_bin_begin; start_bin < start_bin_end; ++start_bin) {
      float norm = 0;

      // go over each histogram bin and accumulate errors
      caffe2::CpuId cpuid = caffe2::GetCpuId();
      if (kind_ == NormMinimization::L2 && cpuid.avx2() && cpuid.fma()) {
        norm = internal::L2MinimizationKernelAVX2(
            precision,
            bins_f.data(),
            nbins,
            bin_width,
            dst_bin_width,
            start_bin);
      } else {
        for (int src_bin = 0; src_bin < nbins; ++src_bin) {
          // distances from the beginning of first dst_bin to the beginning
          // and end of src_bin
          float src_bin_begin = (src_bin - start_bin) * bin_width;
          float src_bin_end = src_bin_begin + bin_width;

          // which dst_bins the beginning and end of src_bin belong to?
          int dst_bin_of_begin = std::min(
              (1 << precision) - 1.0f,
              std::max(0.0f, floorf(src_bin_begin / dst_bin_width)));
          int dst_bin_of_end = std::min(
              (1 << precision) - 1.0f,
              std::max(0.0f, floorf(src_bin_end / dst_bin_width)));

          float dst_bin_of_begin_center =
              dst_bin_of_begin * dst_bin_width + dst_bin_width / 2;
          float density = bins_f[src_bin] / bin_width;
          float delta_begin = src_bin_begin - dst_bin_of_begin_center;
          if (dst_bin_of_begin == dst_bin_of_end) {
            // if src_bin is entirely within 1 dst_bin
            float delta_end = src_bin_end - dst_bin_of_begin_center;
            norm += GetNorm(delta_begin, delta_end, density, kind_);
          } else {
            float delta_end = dst_bin_width / 2;
            norm += GetNorm(delta_begin, delta_end, density, kind_);

            norm += (dst_bin_of_end - dst_bin_of_begin - 1) *
                GetNorm(-dst_bin_width / 2, dst_bin_width / 2, density, kind_);

            float dst_bin_of_end_center =
                dst_bin_of_end * dst_bin_width + dst_bin_width / 2;
            delta_begin = -dst_bin_width / 2;
            delta_end = src_bin_end - dst_bin_of_end_center;
            norm += GetNorm(delta_begin, delta_end, density, kind_);
          }
        }
      }

      if (norm < norm_min) {
        norm_min = norm;
        best_start_bin = start_bin;
      }
    } // for each start_bin

    best_start_bins[nbins_selected] = {best_start_bin, norm_min};
  } // for each nbins_selected

  float norm_min = numeric_limits<float>::max();
  int best_nbins_selected = 1, best_start_bin = 0;
  for (int nbins_selected = 1; nbins_selected <= nbins; ++nbins_selected) {
    float norm = best_start_bins[nbins_selected].second;
    if (norm < norm_min) {
      norm_min = norm;
      best_start_bin = best_start_bins[nbins_selected].first;
      best_nbins_selected = nbins_selected;
    }
  }

  float total_sum = 0;
  for (int i = 0; i < bins_f.size(); ++i) {
    total_sum += bins_f[i];
  }
  float selected_sum = 0;
  int i_begin = std::max(0, best_start_bin);
  int i_end = std::min(nbins, best_start_bin + best_nbins_selected);
  for (int i = i_begin; i < i_end; ++i) {
    selected_sum += bins_f[i];
  }
  VLOG(2) << "best quantization range covers "
          << selected_sum / total_sum * 100 << " %";

  max = min + bin_width * (best_start_bin + best_nbins_selected);
  min = min + bin_width * (best_start_bin);

  QuantizationFactory* qfactory = QuantizationFactory::GetDefaultInstance();
  return qfactory->ChooseQuantizationParams(
      min, max, precision, preserve_sparsity);
} // ChooseQuantizationParams

} // namespace dnnlowp
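
As a sanity check of the error model above, here is a minimal standalone sketch (not part of norm_minimization.cc) that re-implements the L2 branch of GetNorm in a local helper (L2Norm, introduced here for illustration) and verifies the d^3/12 identity noted in its comment: a slice spanning one destination bin of width d, with density 1/d, yields the classical expected squared rounding error of d^2/12.

#include <cassert>
#include <cmath>
#include <cstdio>

// Same integral as the L2 branch of GetNorm above, re-implemented locally so
// this check compiles on its own.
static double L2Norm(double begin, double end, double density) {
  return density * (end * end * end - begin * begin * begin) / 3;
}

int main() {
  const double d = 0.25; // width of one destination (quantized) bin
  // A slice spanning a full dst_bin, centered on the bin center, carrying
  // total probability 1 (density = 1/d).
  double err = L2Norm(-d / 2, d / 2, 1.0 / d);
  // Classical result: rounding a uniformly distributed value to the bin
  // center gives an expected squared error of d^2 / 12.
  assert(std::fabs(err - d * d / 12) < 1e-12);
  std::printf("expected squared error = %g (d^2/12 = %g)\n", err, d * d / 12);
  return 0;
}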