Caffe2 - C++ API
A deep learning, cross-platform ML framework
ulp.cc
#include "ulp.h"

#include <cstring>
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/utils/eigen_utils.h"
#include "ulp_neon.h"

namespace caffe2 {

// Quantizes each float in X to one of four levels and stores the 2-bit level
// index as k2b1bXBits (= 2) packed bit-planes in XQ; each output byte packs
// eight consecutive channels.
void uniformQuantize2b1b(const TensorCPU& X,
                         const std::vector<std::unique_ptr<TensorCPU>>& XQ,
                         float offset,
                         float inter_center_distance) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const auto N = X.size_to_dim(X.ndim() - 1);
  auto C = X.size() / N;
  const auto QC = divRoundUp(C, 8);
  auto XQs = X.sizes().vec();
  XQs[X.ndim() - 1] = QC;
  CAFFE_ENFORCE_EQ(XQ.size(), k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(XQs);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  for (auto n = 0; n < N; ++n) {
    for (auto qc = 0; qc < QC; ++qc) {
      // Compute the block in X.
      std::array<uint8_t, k2b1bXBits> p = {{0, 0}};
      for (auto b = 0; b < 8; ++b) {
        const auto c = qc * 8 + b;
        if (c < C) {
          float v = Xdata[qc * 8 + b + C * n];
          if (v < offset) {
            // zero'd already.
          } else if (v < offset + inter_center_distance) {
            p[0] |= 1 << b;
          } else if (v < offset + 2 * inter_center_distance) {
            p[1] |= 1 << b;
          } else {
            p[0] |= 1 << b;
            p[1] |= 1 << b;
          }
        }
      }
      for (auto i = 0; i < k2b1bXBits; ++i) {
        XQdata[i][qc + QC * n] = p[i];
      }
    }
  }
}

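The quantizer above maps each float to one of four levels and spreads the 2-bit level index across two packed bit-planes (bit 0 of the level in XQ[0], bit 1 in XQ[1]). The following standalone sketch (hypothetical helper name, not part of ulp.h or ulp.cc) reproduces the same thresholding for a single value, assuming the default offset = 0.5 and inter_center_distance = 1.0 that run2b1bConvGeneric passes in.

#include <cstdint>
#include <iostream>

// Illustration only: returns the 2-bit level index (0..3) whose bits
// uniformQuantize2b1b spreads across the two bit-planes for one input value.
static uint8_t quantizeLevel2b1b(float v, float offset, float inter_center_distance) {
  if (v < offset) return 0;                              // both planes stay 0
  if (v < offset + inter_center_distance) return 1;      // plane 0 bit set
  if (v < offset + 2 * inter_center_distance) return 2;  // plane 1 bit set
  return 3;                                              // both plane bits set
}

int main() {
  const float offset = 0.5f, icd = 1.0f;  // defaults used by run2b1bConvGeneric
  for (float v : {0.2f, 0.7f, 1.9f, 3.5f}) {
    const uint8_t level = quantizeLevel2b1b(v, offset, icd);
    std::cout << v << " -> level " << int(level) << " (plane0=" << (level & 1)
              << ", plane1=" << ((level >> 1) & 1) << ")\n";
  }
  return 0;
}
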
// Reference (non-NEON) binary convolution over bit-packed NHWC tensors: every
// byte of X and W holds eight {0, 1} bits standing for {-1, +1} values, and
// each output is reconstructed from XNOR/popcount accumulations.
void qconv(const ConvArgs& args,
           const TensorCPU& X,
           const TensorCPU& W,
           const TensorCPU* b,
           TensorCPU* Y) {
  const auto N = X.dim32(0);
  const auto IH = X.dim32(1);
  const auto IW = X.dim32(2);
  const auto KH = W.dim32(1);
  const auto KW = W.dim32(2);
  const auto KC = W.dim32(3);
  Y->Resize(X.dim32(0),
            (X.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1,
            (X.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1,
            W.dim32(0));
  const auto OH = Y->dim32(1);
  const auto OW = Y->dim32(2);
  const auto OC = Y->dim32(3);

  CAFFE_ENFORCE_EQ(W.dim32(3), X.dim32(3));

  const auto* Xdata = X.data<uint8_t>();
  const auto* Wdata = W.data<uint8_t>();
  auto* Ydata = Y->mutable_data<float>();
  for (size_t n = 0; n < N; ++n) {
    for (size_t oh = 0; oh < OH; ++oh) {
      for (size_t ow = 0; ow < OW; ++ow) {
        for (size_t oc = 0; oc < OC; ++oc) {
          float acc = 0.0;
          for (size_t kh = 0; kh < KH; ++kh) {
            const int32_t ih = (int32_t)kh + (int32_t)args.stride_h * oh - (int32_t)args.pad_t;
            for (size_t kw = 0; kw < KW; ++kw) {
              const int32_t iw = (int32_t)kw + (int32_t)args.stride_w * ow - (int32_t)args.pad_l;
              for (size_t kc = 0; kc < KC; ++kc) {
                const uint8_t w = Wdata[kc + KC * kw + KC * KW * kh + KC * KW * KH * oc];
                // Use unsigned integer math to avoid multiple comparisons (>= H, < 0).
                if ((size_t)ih >= (size_t)IH || (size_t)iw >= (size_t)IW) {
                  // Out-of-bounds taps read as zero-padded input bytes.
                  acc += __builtin_popcount(0 ^ w);
                } else {
                  const uint8_t x =
                      Xdata[kc + KC * (size_t)iw + KC * IW * (size_t)ih + n * KC * IW * IH];
                  acc += __builtin_popcount(x ^ w);
                }
              }
            }
          }
          Ydata[oc + OC * ow + OC * OW * oh + n * OC * OW * OH] =
              KW * KH * KC * 8 - 2 * acc + (b ? b->data<float>()[oc] : 0.0);
        }
      }
    }
  }
}

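The reference kernel above relies on the standard XNOR-net identity: when bits encode {-1, +1} values (0 maps to -1, 1 maps to +1), the dot product over B bits equals B - 2 * popcount(x XOR w), which is exactly the KW * KH * KC * 8 - 2 * acc expression in qconv. A small self-contained check (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t x = 0b10110100;
  const uint8_t w = 0b11100001;
  // Naive dot product with each bit mapped to {-1, +1}.
  int naive = 0;
  for (int b = 0; b < 8; ++b) {
    const int xs = ((x >> b) & 1) ? 1 : -1;
    const int ws = ((w >> b) & 1) ? 1 : -1;
    naive += xs * ws;
  }
  // Popcount form used by qconv: B - 2 * popcount(x ^ w), with B = 8 bits here.
  const int fast = 8 - 2 * __builtin_popcount(x ^ w);
  assert(naive == fast);
  return 0;
}
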
// Zero-pads X spatially (NHWC); only stride 1 is supported here.
void qpad_zero(const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
  CAFFE_ENFORCE_EQ(args.stride_h, 1);
  CAFFE_ENFORCE_EQ(args.stride_w, 1);
  const auto* Xdata = X.data<uint8_t>();
  Y->Resize(X.dim32(0),
            X.dim32(1) + args.pad_t + args.pad_b,
            X.dim32(2) + args.pad_l + args.pad_r,
            X.dim32(3));
  auto* Ydata = Y->mutable_data<uint8_t>();
  ::memset(Ydata, 0, Y->nbytes());
  const auto C = Y->dim32(3);
  const auto XrowSize = X.dim32(3) * X.dim32(2);
  const auto YrowSize = Y->dim32(3) * Y->dim32(2);
  math::CopyMatrix<CPUContext>(1,
                               X.dim32(1),
                               XrowSize,
                               Xdata,
                               XrowSize,
                               Ydata + C * args.pad_l + YrowSize * args.pad_t,
                               YrowSize,
                               nullptr);
}

// Packs sign(X > 0) into bits, eight channels per output byte.
void signQuantize(const TensorCPU& X, TensorCPU* XQ) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const auto N = X.size_to_dim(X.ndim() - 1);
  auto C = X.size() / N;
  const auto QC = divRoundUp(C, 8);
  auto XQs = X.sizes().vec();
  XQs[X.ndim() - 1] = QC;
  XQ->Resize(XQs);
  const float* Xdata = X.data<float>();
  uint8_t* XQdata = XQ->mutable_data<uint8_t>();
  for (auto n = 0; n < N; ++n) {
    for (auto qc = 0; qc < QC; ++qc) {
      // Compute the block in X.
      uint8_t p = 0;
      for (auto b = 0; b < 8; ++b) {
        const auto c = qc * 8 + b;
        if (c < C) {
          p |= (Xdata[c + C * n] > 0) << b;
        }
      }
      XQdata[qc + QC * n] = p;
    }
  }
}

// Computes WQN[f], the sum of filter f's ±1 weights, from popcounts over the
// bit-packed rows of WQ.
void filterNormalization11(const TensorCPU& WQ, TensorCPU* WQN) {
  const auto F = WQ.dim32(0);
  // In our NEON kernel we read up to TileSize, so align allocation to TileSize elements.
  WQN->Resize(divRoundUp(F, kGEMMTileSize) * kGEMMTileSize);
  const auto WQs = WQ.size() / F;
  const auto WQbits = 8 * WQs;
  const auto* WQdata = WQ.data<uint8_t>();
  auto* WQNdata = WQN->mutable_data<float>();
  for (auto f = 0; f < F; ++f) {
    int32_t bitSum = 0;
    for (auto j = 0; j < WQs; ++j) {
      bitSum += __builtin_popcount(WQdata[f * WQs + j]);
    }
    DCHECK_LE(bitSum, WQbits);
    WQNdata[f] = 2 * bitSum - WQbits;
  }
}

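filterNormalization11 recovers the sum of a filter's ±1 weights purely from popcounts: with WQbits total bits of which bitSum are set, that sum is (+1) * bitSum + (-1) * (WQbits - bitSum) = 2 * bitSum - WQbits. As a worked example (numbers are hypothetical), a filter row of WQs = 9 packed bytes has WQbits = 72; if bitSum = 40 of those bits are set, WQN[f] = 2 * 40 - 72 = 8.
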
// Computes WL1[f], the mean absolute value of filter f's real-valued weights.
void filterNormalizationL1(const TensorCPU& W, TensorCPU* WL1) {
  const auto F = W.dim32(0);
  WL1->Resize(F);
  const auto Ws = W.size() / F;
  const auto* Wdata = W.data<float>();
  auto* WL1data = WL1->mutable_data<float>();
  for (auto f = 0; f < F; ++f) {
    double l1sum = 0.0;
    for (auto j = 0; j < Ws; ++j) {
      l1sum += std::abs(Wdata[f * Ws + j]);
    }
    WL1data[f] = l1sum / Ws;
  }
}

// im2col for bit-packed NHWC input: lays out each output position's receptive
// field contiguously so the GEMM-style kernels can consume it directly.
void qim2col(const ConvArgs& args, const TensorCPU& XQ, const TensorCPU& WQ, TensorCPU* XQcol) {
  // TODO: pass pre-resized output?
  // TODO: handle strides?

  CAFFE_ENFORCE_EQ(XQ.dim32(3), WQ.dim32(3));
  const size_t N = XQ.dim32(0);
  const size_t IH = XQ.dim32(1);
  const size_t IW = XQ.dim32(2);
  const size_t KH = WQ.dim32(1);
  const size_t KW = WQ.dim32(2);
  const size_t KC = WQ.dim32(3);

  XQcol->Resize(XQ.dim32(0),
                (XQ.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1,
                (XQ.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1,
                KH * KW * KC);

  if (args.pad_l == 0 && args.pad_r == 0 && args.pad_b == 0 && args.pad_t == 0 &&
      args.stride_h == 1 && args.stride_w == 1 && KH == 1 && KW == 1) {
    CAFFE_ENFORCE_EQ(XQ.size(), XQcol->size());
    XQcol->ShareExternalPointer(const_cast<uint8_t*>(XQ.data<uint8_t>()), XQ.size());
    return;
  }
  const size_t OH = XQcol->dim32(1);
  const size_t OW = XQcol->dim32(2);

  const uint8_t* XQdata = XQ.data<uint8_t>();
  uint8_t* XQcoldata = XQcol->mutable_data<uint8_t>();
  for (size_t n = 0; n < N; ++n) {
    for (size_t oh = 0; oh < OH; ++oh) {
      int32_t h_pad = (int32_t)(args.stride_h * oh) - (int32_t)args.pad_t;
      for (size_t ow = 0; ow < OW; ++ow) {
        int32_t w_pad = (int32_t)(args.stride_w * ow) - (int32_t)args.pad_l;
        for (size_t kh = 0; kh < KH; ++kh) {
          int32_t ih = (int32_t)kh + h_pad;
          if ((size_t)ih < (size_t)IH && (size_t)w_pad < (size_t)IW &&
              (size_t)((int32_t)w_pad + (int32_t)KW) < (size_t)IW) {
            // We can do a larger memcpy, of size KW * KC.
            size_t off = kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW +
                         n * KH * KW * KC * OW * OH;
            std::memcpy(&XQcoldata[off],
                        &XQdata[((int32_t)w_pad) * KC + ih * IW * KC + n * IW * KC * IH],
                        KW * KC);
          } else {
            for (size_t kw = 0; kw < KW; ++kw) {
              int32_t iw = (int32_t)kw + w_pad;
              // Use unsigned integer math to avoid multiple comparisons (>= H, < 0).
              size_t off = kw * KC + kh * KW * KC + ow * KH * KW * KC + oh * KH * KW * KC * OW +
                           n * KH * KW * KC * OW * OH;
              if ((size_t)ih < (size_t)IH && (size_t)iw < (size_t)IW) {
                std::memcpy(
                    &XQcoldata[off], &XQdata[iw * KC + ih * IW * KC + n * KC * IW * IH], KC);
              } else {
                // This should be simply padded with zero.
                std::memset(&XQcoldata[off], 0, KC);
              }
            }
          }
        }
      }
    }
  }
}

// Builds the reusable per-layer state for the 2b1b path: sign-quantized and
// normalized filters, scratch buffers, an optional bias copy, and a
// parallel-for hook.
std::unique_ptr<QConvState> create2b1bConvState(Workspace* ws,
                                                const TensorCPU& W,
                                                const TensorCPU* b) {
  auto state = caffe2::make_unique<QConvState>();
  state->XQs.resize(k2b1bXBits);
  state->YQs.resize(k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    state->XQs[i] = caffe2::make_unique<Tensor>(CPU);
    state->YQs[i] = caffe2::make_unique<Tensor>(CPU);
  }
  state->WQ = caffe2::make_unique<Tensor>(CPU);
  state->WQN = caffe2::make_unique<Tensor>(CPU);
  state->WQL1Norm = caffe2::make_unique<Tensor>(CPU);
  state->scratch = caffe2::make_unique<Tensor>(CPU);
  state->scratchColBuffer = caffe2::make_unique<Tensor>(CPU);

  signQuantize(W, state->WQ.get());
  filterNormalization11(*(state->WQ), state->WQN.get());
  filterNormalizationL1(W, state->WQL1Norm.get());
  // TODO: incorporate center distance normalization.
  // Since inputs to convs are [0, 1, 2, 3], instead of [0, x, 2 * x, ...],
  // we can just uniformly rescale the outputs by x, i.e.,
  // for (auto i = 0; i < r->WQL1Norm.size(); ++i) {
  //   r->WQL1Norm.mutable_data<float>()[i] *= center_distance;
  // }
  state->parallelFor = [ws](size_t range, std::function<void(size_t)> f) {
#ifdef C10_MOBILE
    ws->GetThreadPool()->run([&](int, size_t v) { f(v); }, range);
#else
    for (size_t v = 0; v < range; ++v) {
      f(v);
    }
#endif
  };
  if (b) {
    state->bias = caffe2::make_unique<Tensor>(*b, CPU);
  }
  return state;
}

// Entry point for one 2b1b convolution: tries the NEON kernel first, otherwise
// quantizes the input, runs one binary convolution per bit-plane, and unifies
// the partial results into Y.
void run2b1bConvGeneric(QConvState* state, const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  if (run2b1bConvNeon(state, args, X, Y)) {
    return;
  }
#endif
  uniformQuantize2b1b(X, state->XQs, 0.5, 1.0);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    qconv(args, *(state->XQs[i]), *(state->WQ), nullptr, state->YQs[i].get());
  }
  Y->ResizeLike(*(state->YQs[0]));
  const auto F = state->WQ->dim(0);
  const auto N = Y->size() / F;
  run2b1bUnification(state,
                     N,
                     F,
                     state->WQN->data<float>(),
                     state->YQs[0]->data<float>(),
                     state->YQs[1]->data<float>(),
                     F,
                     Y->mutable_data<float>(),
                     F,
                     state->bias ? state->bias->data<float>() : nullptr);
}

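For orientation, here is a minimal, hypothetical driver showing how these helpers might be called directly, outside the QConvOp wrapper defined further down. The shapes and fill values are arbitrary, and it assumes "ulp.h" pulls in the Caffe2 core tensor and workspace headers; it is a sketch, not part of ulp.cc.

#include <algorithm>
#include "ulp.h"

int main() {
  using namespace caffe2;
  Workspace ws;

  // 16 filters of shape 3x3x32, NHWC layout (F x KH x KW x C).
  Tensor W(CPU);
  W.Resize(16, 3, 3, 32);
  std::fill(W.mutable_data<float>(), W.mutable_data<float>() + W.size(), 0.1f);

  // Quantize and normalize the filters once; reuse the state across passes.
  auto state = create2b1bConvState(&ws, W, /*b=*/nullptr);

  // 1x8x8x32 input (N x H x W x C).
  Tensor X(CPU);
  X.Resize(1, 8, 8, 32);
  std::fill(X.mutable_data<float>(), X.mutable_data<float>() + X.size(), 1.0f);

  ConvArgs args;
  args.stride_w = args.stride_h = 1;
  args.pad_l = args.pad_r = args.pad_t = args.pad_b = 1;

  Tensor Y(CPU);
  run2b1bConvGeneric(state.get(), args, X, &Y);  // Y becomes 1 x 8 x 8 x 16 here
  return 0;
}
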
// Combines the per-bit-plane convolution results and the filter normalization
// term into the final floating-point output (see the derivation below).
void run2b1bUnification(QConvState* state,
                        size_t N,
                        size_t C,
                        const float* WQNVdata,
                        const float* YQs0Vdata,
                        const float* YQs1Vdata,
                        size_t YQstride,
                        float* Ydata,
                        size_t Ystride,
                        const float* bias) {
  ConstEigenVectorArrayMap<float> WQNV(WQNVdata, C);

  for (size_t j = 0; j < N; ++j) {
    ConstEigenVectorArrayMap<float> YQs0V(YQs0Vdata + YQstride * j, C);
    ConstEigenVectorArrayMap<float> YQs1V(YQs1Vdata + YQstride * j, C);
    EigenVectorArrayMap<float> YNV(Ydata + Ystride * j, C);
    if (bias) {
      ConstEigenVectorArrayMap<float> BV(bias, C);
      YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V +
            std::pow<float>(2, 0) * YQs1V + BV;
    } else {
      YNV = (std::pow<float>(2, k2b1bXBits) - 1) / 2 * WQNV + std::pow<float>(2, -1) * YQs0V +
            std::pow<float>(2, 0) * YQs1V;
    }
  }
}

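The unification step reconstructs the real-valued convolution from the bit-plane results. Write each quantized input as its level index x in {0, 1, 2, 3} with bit-planes b0, b1 (so x = b0 + 2*b1), and note that a {0, 1} bit b corresponds to the ±1 value s = 2*b - 1 that qconv effectively multiplies, i.e. b = (s + 1)/2. Then x = 0.5*s0 + 1.0*s1 + 1.5. Summing x*w over a filter's receptive field with sign-quantized weights w in {-1, +1} gives

    Y[n][f] = 0.5 * YQs[0][n][f] + 1.0 * YQs[1][n][f] + 1.5 * WQN[f] (+ bias[f]),

where YQs[i] are the ±1/±1 convolutions produced by qconv on each bit-plane and WQN[f] is the sum of filter f's ±1 weights from filterNormalization11. This is exactly the expression in the code: (2^k2b1bXBits - 1)/2 = 1.5 multiplies WQN, 2^-1 = 0.5 multiplies YQs[0], and 2^0 = 1.0 multiplies YQs[1].
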
// Operator wrapper: builds the 2b1b state lazily from the filter (and optional
// bias) inputs and dispatches to run2b1bConvGeneric on each forward pass.
class QConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  QConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws), ws_(ws) {
    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NHWC, "QConvOp only supports NHWC order");
    OPERATOR_NEEDS_FEATURE(this->dilation_h() == 1, "");
    OPERATOR_NEEDS_FEATURE(this->dilation_w() == 1, "");
    OPERATOR_NEEDS_FEATURE(this->group_ == 1, "");
  }

  bool RunOnDeviceWithOrderNHWC() override {
    auto& X = Input(0);
    auto& filter = Input(1);
    const auto* bias = InputSize() == 3 ? &Input(2) : nullptr;
    auto* Y = Output(0);

    // TODO: Support multiple quantization methods instead of assuming 2b1b.
    if (!state_) {
      state_ = create2b1bConvState(ws_, filter, bias);
    }
    ConvArgs args;
    args.pad_l = this->pad_l();
    args.pad_t = this->pad_t();
    args.pad_b = this->pad_b();
    args.pad_r = this->pad_r();
    args.stride_h = this->stride_h();
    args.stride_w = this->stride_w();
    run2b1bConvGeneric(state_.get(), args, X, Y);
    return true;
  }

 private:
  std::unique_ptr<QConvState> state_;
  Workspace* ws_;
};

REGISTER_CPU_OPERATOR(QConv, QConvOp);

} // namespace caffe2