Caffe2 - C++ API
A deep learning, cross-platform ML framework
ulp.cc
#include "ulp.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "ulp_neon.h"

namespace caffe2 {

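// Quantize the float tensor X into k2b1bXBits (= 2) bit-planes along the
// innermost dimension, packing 8 channels per byte. Each value is bucketed
// into one of four levels via thresholds at offset,
// offset + inter_center_distance, and offset + 2 * inter_center_distance;
// bit-plane i holds bit i of the bucket index.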
void uniformQuantize2b1b(const TensorCPU& X,
                         const std::vector<std::unique_ptr<TensorCPU>>& XQ,
                         float offset,
                         float inter_center_distance) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const auto N = X.size_to_dim(X.ndim() - 1);
  auto C = X.size() / N;
  const auto QC = divRoundUp(C, 8);
  auto XQs = X.dims();
  XQs[X.ndim() - 1] = QC;
  CAFFE_ENFORCE_EQ(XQ.size(), k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(XQs);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  for (auto n = 0; n < N; ++n) {
    for (auto qc = 0; qc < QC; ++qc) {
      // compute the block in X.
      std::array<uint8_t, k2b1bXBits> p = {{0, 0}};
      for (auto b = 0; b < 8; ++b) {
        const auto c = qc * 8 + b;
        if (c < C) {
          float v = Xdata[qc * 8 + b + C * n];
          if (v < offset) {
            // zero'd already.
          } else if (v < offset + inter_center_distance) {
            p[0] |= 1 << b;
          } else if (v < offset + 2 * inter_center_distance) {
            p[1] |= 1 << b;
          } else {
            p[0] |= 1 << b;
            p[1] |= 1 << b;
          }
        }
      }
      for (auto i = 0; i < k2b1bXBits; ++i) {
        XQdata[i][qc + QC * n] = p[i];
      }
    }
  }
}

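// Reference (non-NEON) convolution over bit-packed NHWC tensors. Each byte
// holds 8 binary channel values; the binary dot product is computed as
// popcount(x ^ w), which counts mismatched bits. Out-of-bounds (padding)
// positions are treated as packed zero bytes.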
void qconv(const ConvArgs& args,
           const TensorCPU& X,
           const TensorCPU& W,
           const TensorCPU* b,
           TensorCPU* Y) {
  const auto N = X.dim32(0);
  const auto IH = X.dim32(1);
  const auto IW = X.dim32(2);
  const auto KH = W.dim32(1);
  const auto KW = W.dim32(2);
  const auto KC = W.dim32(3);
  Y->Resize(X.dim32(0),
            (X.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1,
            (X.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1,
            W.dim32(0));
  const auto OH = Y->dim32(1);
  const auto OW = Y->dim32(2);
  const auto OC = Y->dim32(3);

  CAFFE_ENFORCE_EQ(W.dim32(3), X.dim32(3));

  const auto* Xdata = X.data<uint8_t>();
  const auto* Wdata = W.data<uint8_t>();
  auto* Ydata = Y->mutable_data<float>();
  for (size_t n = 0; n < N; ++n) {
    for (size_t oh = 0; oh < OH; ++oh) {
      for (size_t ow = 0; ow < OW; ++ow) {
        for (size_t oc = 0; oc < OC; ++oc) {
          float acc = 0.0;
          for (size_t kh = 0; kh < KH; ++kh) {
            const int32_t ih = (int32_t)kh + (int32_t)args.stride_h * oh - (int32_t)args.pad_t;
            for (size_t kw = 0; kw < KW; ++kw) {
              const int32_t iw = (int32_t)kw + (int32_t)args.stride_w * ow - (int32_t)args.pad_l;
              for (size_t kc = 0; kc < KC; ++kc) {
                const uint8_t w = Wdata[kc + KC * kw + KC * KW * kh + KC * KW * KH * oc];
                // Use unsigned integer math to avoid multiple comparisons (>= H, < 0).
                if ((size_t)ih >= (size_t)IH || (size_t)iw >= (size_t)IW) {
                  // Out of bounds: the input is an implicit packed-zero byte.
                  acc += __builtin_popcount(0 ^ w);
                } else {
                  const uint8_t x =
                      Xdata[kc + KC * (size_t)iw + KC * IW * (size_t)ih + n * KC * IW * IH];
                  acc += __builtin_popcount(x ^ w);
                }
              }
            }
          }
          // matches - mismatches == total bits - 2 * mismatches.
          Ydata[oc + OC * ow + OC * OW * oh + n * OC * OW * OH] =
              KW * KH * KC * 8 - 2 * acc + (b ? b->data<float>()[oc] : 0.0);
        }
      }
    }
  }
}

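// Explicitly zero-pad a bit-packed NHWC tensor (stride-1 only), copying each
// input row into the interior of the zero-initialized padded output.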
void qpad_zero(const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
  CAFFE_ENFORCE_EQ(args.stride_h, 1);
  CAFFE_ENFORCE_EQ(args.stride_w, 1);
  const auto* Xdata = X.data<uint8_t>();
  Y->Resize(X.dim32(0),
            X.dim32(1) + args.pad_t + args.pad_b,
            X.dim32(2) + args.pad_l + args.pad_r,
            X.dim32(3));
  auto* Ydata = Y->mutable_data<uint8_t>();
  ::memset(Ydata, 0, Y->nbytes());
  const auto C = Y->dim32(3);
  const auto XrowSize = X.dim32(3) * X.dim32(2);
  const auto YrowSize = Y->dim32(3) * Y->dim32(2);
  math::CopyMatrix<CPUContext>(1,
                               X.dim32(1),
                               XrowSize,
                               Xdata,
                               XrowSize,
                               Ydata + C * args.pad_l + YrowSize * args.pad_t,
                               YrowSize,
                               nullptr);
}

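// Pack the sign bits of X along the innermost dimension: bit b of each output
// byte is 1 iff the corresponding input value is positive (8 channels per byte).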
void signQuantize(const TensorCPU& X, TensorCPU* XQ) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const auto N = X.size_to_dim(X.ndim() - 1);
  auto C = X.size() / N;
  const auto QC = divRoundUp(C, 8);
  auto XQs = X.dims();
  XQs[X.ndim() - 1] = QC;
  XQ->Resize(XQs);
  const float* Xdata = X.data<float>();
  uint8_t* XQdata = XQ->mutable_data<uint8_t>();
  for (auto n = 0; n < N; ++n) {
    for (auto qc = 0; qc < QC; ++qc) {
      // compute the block in X.
      uint8_t p = 0;
      for (auto b = 0; b < 8; ++b) {
        const auto c = qc * 8 + b;
        if (c < C) {
          p |= (Xdata[c + C * n] > 0) << b;
        }
      }
      XQdata[qc + QC * n] = p;
    }
  }
}

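// For each binarized filter, recover the sum of its {-1, +1} weights from the
// packed bits: with bitSum set bits out of WQbits total,
// sum = (+1) * bitSum + (-1) * (WQbits - bitSum) = 2 * bitSum - WQbits.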
void filterNormalization11(const TensorCPU& WQ, TensorCPU* WQN) {
  const auto F = WQ.dim32(0);
  // In our NEON kernel we read up to TileSize, so align allocation to TileSize elements.
  WQN->Resize(divRoundUp(F, kGEMMTileSize) * kGEMMTileSize);
  const auto WQs = WQ.size() / F;
  const auto WQbits = 8 * WQs;
  const auto* WQdata = WQ.data<uint8_t>();
  auto* WQNdata = WQN->mutable_data<float>();
  for (auto f = 0; f < F; ++f) {
    int32_t bitSum = 0;
    for (auto j = 0; j < WQs; ++j) {
      bitSum += __builtin_popcount(WQdata[f * WQs + j]);
    }
    DCHECK_LE(bitSum, WQbits);
    WQNdata[f] = 2 * bitSum - WQbits;
  }
}

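// Per-filter mean absolute value of the full-precision weights, kept in
// QConvState (presumably as a per-filter output scale for the NEON kernels;
// see the center-distance TODO in create2b1bConvState below).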
void filterNormalizationL1(const TensorCPU& W, TensorCPU* WL1) {
  const auto F = W.dim32(0);
  WL1->Resize(F);
  const auto Ws = W.size() / F;
  const auto* Wdata = W.data<float>();
  auto* WL1data = WL1->mutable_data<float>();
  for (auto f = 0; f < F; ++f) {
    double l1sum = 0.0;
    for (auto j = 0; j < Ws; ++j) {
      l1sum += std::abs(Wdata[f * Ws + j]);
    }
    WL1data[f] = l1sum / Ws;
  }
}

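// im2col over bit-packed NHWC input: materialize each (KH, KW, KC) input
// patch as a contiguous row of XQcol. The 1x1 / stride-1 / no-padding case
// aliases the input buffer instead of copying.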
void qim2col(const ConvArgs& args, const TensorCPU& XQ, const TensorCPU& WQ, TensorCPU* XQcol) {
  // TODO: pass pre-resized output?
  // TODO: handle strides?

  CAFFE_ENFORCE_EQ(XQ.dim32(3), WQ.dim32(3));
  const size_t N = XQ.dim32(0);
  const size_t IH = XQ.dim32(1);
  const size_t IW = XQ.dim32(2);
  const size_t KH = WQ.dim32(1);
  const size_t KW = WQ.dim32(2);
  const size_t KC = WQ.dim32(3);

  XQcol->Resize(XQ.dim32(0),
                (XQ.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1,
                (XQ.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1,
                KH * KW * KC);

  if (args.pad_l == 0 && args.pad_r == 0 && args.pad_b == 0 && args.pad_t == 0 &&
      args.stride_h == 1 && args.stride_w == 1 && KH == 1 && KW == 1) {
    CAFFE_ENFORCE_EQ(XQ.size(), XQcol->size());
    XQcol->ShareExternalPointer(const_cast<uint8_t*>(XQ.data<uint8_t>()), XQ.size());
    return;
  }
  const size_t OH = XQcol->dim32(1);
  const size_t OW = XQcol->dim32(2);

  const uint8_t* XQdata = XQ.data<uint8_t>();
  uint8_t* XQcoldata = XQcol->mutable_data<uint8_t>();
  for (size_t n = 0; n < N; ++n) {
    for (size_t oh = 0; oh < OH; ++oh) {
      int32_t h_pad = (int32_t)(args.stride_h * oh) - (int32_t)args.pad_t;
      for (size_t ow = 0; ow < OW; ++ow) {
        int32_t w_pad = (int32_t)(args.stride_w * ow) - (int32_t)args.pad_l;
        for (size_t kh = 0; kh < KH; ++kh) {
          int32_t ih = (int32_t)kh + h_pad;
          if ((size_t)ih < (size_t)IH && (size_t)w_pad < (size_t)IW &&
              (size_t)((int32_t)w_pad + (int32_t)KW) < (size_t)IW) {
            // The whole kernel row is in bounds: do one larger memcpy of size KW * KC.
            size_t off = kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW +
                         n * KH * KW * KC * OW * OH;
            std::memcpy(&XQcoldata[off],
                        &XQdata[((int32_t)w_pad) * KC + ih * IW * KC + n * IW * KC * IH],
                        KW * KC);
          } else {
            for (size_t kw = 0; kw < KW; ++kw) {
              int32_t iw = (int32_t)kw + w_pad;
              // Use unsigned integer math to avoid multiple comparisons (>= H, < 0).
              size_t off = kw * KC + kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW +
                           n * KH * KW * KC * OW * OH;
              if ((size_t)ih < (size_t)IH && (size_t)iw < (size_t)IW) {
                std::memcpy(
                    &XQcoldata[off], &XQdata[iw * KC + ih * IW * KC + n * KC * IW * IH], KC);
              } else {
                // This should be simply padded with zero.
                std::memset(&XQcoldata[off], 0, KC);
              }
            }
          }
        }
      }
    }
  }
}

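// Precompute everything that depends only on the weights: the sign-quantized
// filters WQ, their +/-1 sums WQN, their L1 norms, scratch tensors, and a
// parallelFor wrapper (thread pool on mobile, serial loop otherwise).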
std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
                                                const TensorCPU& W,
                                                const TensorCPU* b) {
  auto state = caffe2::make_unique<QConvState>();
  state->XQs.resize(k2b1bXBits);
  state->YQs.resize(k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    state->XQs[i] = caffe2::make_unique<TensorCPU>();
    state->YQs[i] = caffe2::make_unique<TensorCPU>();
  }
  state->WQ = caffe2::make_unique<TensorCPU>();
  state->WQN = caffe2::make_unique<TensorCPU>();
  state->WQL1Norm = caffe2::make_unique<TensorCPU>();
  state->scratch = caffe2::make_unique<TensorCPU>();
  state->scratchColBuffer = caffe2::make_unique<TensorCPU>();

  signQuantize(W, state->WQ.get());
  filterNormalization11(*(state->WQ), state->WQN.get());
  filterNormalizationL1(W, state->WQL1Norm.get());
  // TODO: incorporate center distance normalization.
  // Since inputs to convs are [0, 1, 2, 3], instead of [0, x, 2 * x, ...],
  // we can just uniformly rescale the outputs by x, i.e.,
  // for (auto i = 0; i < r->WQL1Norm.size(); ++i) {
  //   r->WQL1Norm.mutable_data<float>()[i] *= center_distance;
  // }
  state->parallelFor = [ws](size_t range, std::function<void(size_t)> f) {
#if CAFFE2_MOBILE
    ws->GetThreadPool()->run([&](int, size_t v) { f(v); }, range);
#else
    for (size_t v = 0; v < range; ++v) {
      f(v);
    }
#endif
  };
  if (b) {
    state->bias = caffe2::make_unique<TensorCPU>(*b);
  }
  return state;
}

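// Dispatch a 2b1b convolution: prefer the NEON kernel when available, else
// quantize X into two bit-planes, run the reference qconv on each, and unify
// the two partial outputs into Y.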
void run2b1bConvGeneric(QConvState* state, const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
#ifdef __ARM_NEON__
  if (run2b1bConvNeon(state, args, X, Y)) {
    return;
  }
#endif
  uniformQuantize2b1b(X, state->XQs, 0.5, 1.0);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    qconv(args, *(state->XQs[i]), *(state->WQ), nullptr, state->YQs[i].get());
  }
  Y->ResizeLike(*(state->YQs[0]));
  const auto F = state->WQ->dim(0);
  const auto N = Y->size() / F;
  run2b1bUnification(state,
                     N,
                     F,
                     state->WQN->data<float>(),
                     state->YQs[0]->data<float>(),
                     state->YQs[1]->data<float>(),
                     F,
                     Y->mutable_data<float>(),
                     F,
                     state->bias ? state->bias->data<float>() : nullptr);
}

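// Combine the per-bit-plane conv outputs into the final result. Each qconv
// output is YQb = sum_j (2*b_j - 1) * (2*w_j - 1) and WQN = sum_j (2*w_j - 1),
// so sum_j b_j * (2*w_j - 1) = (YQb + WQN) / 2. Reconstructing the input
// level x = bit0 + 2 * bit1 in {0, 1, 2, 3} then gives
//   Y = 0.5 * YQ0 + 1.0 * YQ1 + (2^k2b1bXBits - 1) / 2 * WQN [+ bias].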
void run2b1bUnification(QConvState* state,
                        size_t N,
                        size_t C,
                        const float* WQNVdata,
                        const float* YQs0Vdata,
                        const float* YQs1Vdata,
                        size_t YQstride,
                        float* Ydata,
                        size_t Ystride,
                        const float* bias) {
  ConstEigenVectorArrayMap<float> WQNV(WQNVdata, C);

  for (size_t j = 0; j < N; ++j) {
    ConstEigenVectorArrayMap<float> YQs0V(YQs0Vdata + YQstride * j, C);
    ConstEigenVectorArrayMap<float> YQs1V(YQs1Vdata + YQstride * j, C);
    EigenVectorArrayMap<float> YNV(Ydata + Ystride * j, C);
    if (bias) {
      ConstEigenVectorArrayMap<float> BV(bias, C);
      YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V +
          std::pow<float>(2, 0) * YQs1V + BV;
    } else {
      YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V +
          std::pow<float>(2, 0) * YQs1V;
    }
  }
}

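// NHWC-only operator wrapper: lazily builds the quantized state from the
// filter (and optional bias) on first run, then forwards to
// run2b1bConvGeneric.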
class QConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  QConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws), ws_(ws) {
    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NHWC, "QConvOp only supports NHWC order");
    OPERATOR_NEEDS_FEATURE(this->dilation_h() == 1, "");
    OPERATOR_NEEDS_FEATURE(this->dilation_w() == 1, "");
    OPERATOR_NEEDS_FEATURE(this->group_ == 1, "");
  }

  bool RunOnDeviceWithOrderNHWC() override {
    auto& X = Input(0);
    auto& filter = Input(1);
    const auto* bias = InputSize() == 3 ? &Input(2) : nullptr;
    auto* Y = Output(0);

    // TODO: Support multiple quantization methods instead of assuming 2b1b.
    if (!state_) {
      state_ = create2b1bConvState(ws_, filter, bias);
    }
    ConvArgs args;
    args.pad_l = this->pad_l();
    args.pad_t = this->pad_t();
    args.pad_b = this->pad_b();
    args.pad_r = this->pad_r();
    args.stride_h = this->stride_h();
    args.stride_w = this->stride_w();
    run2b1bConvGeneric(state_.get(), args, X, Y);
    return true;
  }

 private:
  std::unique_ptr<QConvState> state_;
  Workspace* ws_;
};

REGISTER_CPU_OPERATOR(QConv, QConvOp);

} // namespace caffe2