Caffe2 - C++ API
A deep learning, cross-platform ML framework
ulp_neon.cc
#include "ulp_neon.h"
#include "caffe2/core/timer.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

// TODO: tune this with cache size detection code. Changing to 32 helps on some
// devices (Snapdragon 820).
constexpr size_t kL1CacheSizeBytes = 16 * 1024;

#if defined(__ARM_NEON__) || defined(__ARM_NEON)

// Applies 2-bit uniform quantization to the floating point data at Xdata,
// storing QC bytes into XQdata (i.e. reading 8 * QC floats from Xdata).
// Requires QC to be a multiple of 8.
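// The two bit planes XQdata[0] and XQdata[1] together encode, per value v,
// which of the four uniform quantization levels it falls into:
//   v < offset                      -> (bit0, bit1) = (0, 0)
//   offset <= v < offset + d        -> (1, 0)
//   offset + d <= v < offset + 2d   -> (0, 1)
//   v >= offset + 2d                -> (1, 1)
// where d = inter_center_distance. (This matches the scalar tail loop in
// uniformQuantize2b1bNeon below.)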
inline void quantize2bNeon(size_t QC,
                           const float* __restrict__ Xdata,
                           float offset,
                           float inter_center_distance,
                           std::array<uint8_t*, k2b1bXBits> XQdata) {
  DCHECK_EQ(QC % 8, 0);
  const auto offset_plus_2_inter_center_distance = vdupq_n_f32(offset + 2 * inter_center_distance);
  const auto offset_plus_inter_center_distance = vdupq_n_f32(offset + inter_center_distance);
  const auto offset_ = vdupq_n_f32(offset);
  const uint8x8_t shifts = {1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7};

  for (size_t qc = 0; qc < QC; qc += 8) {
    std::array<std::array<uint8x8_t, 8>, k2b1bXBits> ps;
    for (auto i = 0; i < k2b1bXBits; ++i) {
      for (auto j = 0; j < 8; ++j) {
        ps[i][j] = vdup_n_u8(0);
      }
    }

    for (auto j = 0; j < 8; ++j) {
      const auto x0 = vld1q_f32(&Xdata[qc * 8 + j * 8 + 0]);
      const auto x1 = vld1q_f32(&Xdata[qc * 8 + j * 8 + 4]);

      // Per-element logic:
      // if (v >= offset + inter_center_distance) {
      //   p[1] |= 1 << b;
      // } else {
      //   p[1] |= 0 << b;
      // }

      // if ((v >= offset && v < offset + inter_center_distance) ||
      //     (v >= offset + 2 * inter_center_distance)) {
      //   p[0] |= 1 << b;
      // } else {
      //   p[0] |= 0 << b;
      // }

      auto join = [](uint32x4_t a, uint32x4_t b) -> uint8x8_t {
        return vmovn_u16(vcombine_u16(vmovn_u32(a), vmovn_u32(b)));
      };

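      // Note: the comparisons below reinterpret the float lanes as signed
      // 32-bit integers. This is safe here because the thresholds are all
      // positive (offset and inter_center_distance are enforced > 0 by the
      // callers): for non-negative floats the IEEE-754 bit pattern is
      // monotonic in the value, and any negative input compares as a negative
      // integer, i.e. below every positive threshold, which is also correct.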
      const auto x_geq_offset_plus_2_inter_center_distance =
          join(vcgeq_s32(vreinterpretq_s32_f32(x0),
                         vreinterpretq_s32_f32(offset_plus_2_inter_center_distance)),
               vcgeq_s32(vreinterpretq_s32_f32(x1),
                         vreinterpretq_s32_f32(offset_plus_2_inter_center_distance)));
      const auto x_ge_offset =
          join(vcgeq_s32(vreinterpretq_s32_f32(x0), vreinterpretq_s32_f32(offset_)),
               vcgeq_s32(vreinterpretq_s32_f32(x1), vreinterpretq_s32_f32(offset_)));

      const auto x_lt_offset_plus_inter_center_distance =
          join(vcltq_s32(vreinterpretq_s32_f32(x0),
                         vreinterpretq_s32_f32(offset_plus_inter_center_distance)),
               vcltq_s32(vreinterpretq_s32_f32(x1),
                         vreinterpretq_s32_f32(offset_plus_inter_center_distance)));

      const auto p1_mask = vmvn_u8(x_lt_offset_plus_inter_center_distance);
      const auto p0_mask = vorr_u8(vand_u8(x_ge_offset, x_lt_offset_plus_inter_center_distance),
                                   x_geq_offset_plus_2_inter_center_distance);
      ps[0][j] = vand_u8(shifts, p0_mask);
      ps[1][j] = vand_u8(shifts, p1_mask);
    }

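    // Each ps[i][j] holds one output bit per lane: lane l is (1 << l) if the
    // corresponding input element crossed the threshold, else 0. The three
    // levels of pairwise adds below sum the 8 lanes of each ps[i][j] (the bits
    // are disjoint, so the sum equals the bitwise OR) into a single packed
    // byte, producing the 8 output bytes XQdata[i][qc .. qc + 7] in one store.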
    for (auto i = 0; i < 2; ++i) {
      const auto p01 = vpadd_u8(ps[i][0], ps[i][1]);
      const auto p23 = vpadd_u8(ps[i][2], ps[i][3]);
      const auto p45 = vpadd_u8(ps[i][4], ps[i][5]);
      const auto p67 = vpadd_u8(ps[i][6], ps[i][7]);
      const auto p0123 = vpadd_u8(p01, p23);
      const auto p4567 = vpadd_u8(p45, p67);
      vst1_u8(XQdata[i] + qc, vpadd_u8(p0123, p4567));
    }
  }
}

void uniformQuantize2b1bNeon(QConvState* state,
                             const TensorCPU& X,
                             const std::vector<std::unique_ptr<TensorCPU>>& XQ,
                             float offset,
                             float inter_center_distance) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const size_t C = X.dim32(X.ndim() - 1);
  const size_t N = X.size() / C;
  const size_t QC = divRoundUp(C, 8);
  auto XQs = X.sizes().vec();
  XQs[X.ndim() - 1] = QC;
  CAFFE_ENFORCE_EQ(XQ.size(), k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(XQs);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (size_t i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  CAFFE_ENFORCE_GT(offset, 0);
  CAFFE_ENFORCE_GT(inter_center_distance, 0);
  size_t QCUnroll = ((C / 8) / 8) * 8;
  // Each worker loads an L1-cache-sized block.
  // For B rows of K floats each, we read/write B * K * 4 + 2 * B * (K / 8)
  // bytes, so to fit inside a cache of size C we need B = 4 * C / (17 * K).
  // QCUnroll = 0;
  const size_t rowsPerBlock =
      std::max<size_t>(std::floor<size_t>(double(4 * kL1CacheSizeBytes) / double(17 * C)), 1);
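  // For example, with the 16 KB L1 assumed above and C = 256 channels,
  // rowsPerBlock = floor(4 * 16384 / (17 * 256)) = 15.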
  state->parallelFor(divRoundUp(N, rowsPerBlock), [&](size_t nb) {
    for (size_t n = nb * rowsPerBlock; n < std::min<size_t>(nb * rowsPerBlock + rowsPerBlock, N);
         ++n) {
      std::array<uint8_t*, k2b1bXBits> XQoff = {{
          XQdata[0] + 0 + QC * n, XQdata[1] + 0 + QC * n,
      }};
      quantize2bNeon(QCUnroll, &Xdata[0 + C * n], offset, inter_center_distance, XQoff);
      for (size_t qc = QCUnroll; qc < QC; ++qc) {
        // compute the block in X.
        std::array<uint8_t, k2b1bXBits> p = {{0, 0}};
        for (size_t b = 0; b < 8; ++b) {
          const size_t c = qc * 8 + b;
          if (c < C) {
            float v = Xdata[c + C * n];
            if (v < offset) {
              // zero'd already.
            } else if (v < offset + inter_center_distance) {
              p[0] |= 1 << b;
            } else if (v < offset + 2 * inter_center_distance) {
              p[1] |= 1 << b;
            } else {
              p[0] |= 1 << b;
              p[1] |= 1 << b;
            }
          }
        }
        for (auto i = 0; i < k2b1bXBits; ++i) {
          XQdata[i][qc + QC * n] = p[i];
        }
      }
    }
  });
}

template <size_t TileSize, size_t TileDepthBytes>
void uniformQuantize2b1bNeonPacked(QConvState* state,
                                   const TensorCPU& X,
                                   const std::vector<std::unique_ptr<TensorCPU>>& XQ,
                                   float offset,
                                   float inter_center_distance) {
  const size_t M = X.size_to_dim(3);
  const size_t K = X.size() / M;
  const size_t QK = divRoundUp(K, 8);
  const size_t numTiles = divRoundUp(M, TileSize);
  const size_t numTilesDepth = divRoundUp(QK, TileDepthBytes);
  for (size_t i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(numTiles, numTilesDepth, TileSize, TileDepthBytes);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  CAFFE_ENFORCE_GT(offset, 0);
  CAFFE_ENFORCE_GT(inter_center_distance, 0);
  // Each worker loads an L1-cache-sized block.
  // For B blocks of TileSize rows, we read/write B * K * TileSize * 4 +
  // 2 * B * TileSize * (K / 8) bytes, so to fit inside a cache of size C we
  // need B = 4 * C / (17 * K * TileSize).
  const size_t tilesPerBlock = std::max<size_t>(
      std::floor<size_t>(double(4 * kL1CacheSizeBytes) / double(17 * K * TileSize)), 1);
  state->parallelFor(divRoundUp(numTiles, tilesPerBlock), [&](size_t nb) {
    for (size_t i = nb * tilesPerBlock;
         i < std::min<size_t>(nb * tilesPerBlock + tilesPerBlock, numTiles);
         ++i) {
      for (size_t j = 0; j < numTilesDepth; ++j) {
        if (i != numTiles - 1 && j != numTilesDepth - 1) {
          // We have a full tile; quantize the full stripe directly.
          for (auto ii = 0; ii < TileSize; ++ii) {
            size_t m = i * TileSize + ii;
            size_t k = j * TileDepthBytes * 8;
            std::array<uint8_t*, k2b1bXBits> XQoff = {
                {XQdata[0] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i,
                 XQdata[1] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i}};
            quantize2bNeon(TileDepthBytes, &Xdata[m * K + k], offset, inter_center_distance, XQoff);
          }
        } else {
          for (size_t ii = 0; ii < TileSize; ++ii) {
            size_t m = i * TileSize + ii;
            size_t k = j * TileDepthBytes * 8;
            std::array<uint8_t*, k2b1bXBits> XQoff = {
                {XQdata[0] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i,
                 XQdata[1] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i}};
            if (m < M && k + TileDepthBytes * 8 <= K) {
              // We can just read the stripe directly.
              quantize2bNeon(
                  TileDepthBytes, &Xdata[m * K + k], offset, inter_center_distance, XQoff);
            } else {
              // We need to pad the stripe to the full amount read by
              // quantize2bNeon.
              std::array<float, 8 * TileDepthBytes> Xpad = {{0}};
              if (m < M) {
                std::copy(&Xdata[m * K + k], &Xdata[m * K + K], Xpad.begin());
              }
              quantize2bNeon(TileDepthBytes, Xpad.data(), offset, inter_center_distance, XQoff);
            }
          }
        }
      }
    }
  });
}

// Packs a matrix (of size MxK) into a tiled array of size
// (M/TileSize)x(K/TileDepthBytes)xTileSizexTileDepthBytes.
template <size_t TileSize, size_t TileDepthBytes>
void qpack_tiles(QConvState* state, const TensorCPU& X, size_t axis, TensorCPU* XP) {
  const size_t M = X.size_to_dim(axis);
  const size_t QK = X.size() / M;
  const size_t numTiles = divRoundUp(M, TileSize);
  const size_t numTilesDepth = divRoundUp(QK, TileDepthBytes);
  XP->Resize(numTiles, numTilesDepth, TileSize, TileDepthBytes);

  const auto* __restrict__ Xdata = X.data<uint8_t>();
  auto* __restrict__ XPdata = XP->mutable_data<uint8_t>();
  // Load L1-sized tiles per thread.
  // We read/write 2 * B * QK * TileSize bytes, so
  // B = C / (2 * QK * TileSize).
  const size_t tilesPerBlock = std::max<size_t>(
      std::floor<size_t>(double(kL1CacheSizeBytes) / double(2 * TileSize * QK)), 1);
  state->parallelFor(divRoundUp(numTiles, tilesPerBlock), [&](size_t nb) {
    for (size_t i = nb * tilesPerBlock;
         i < std::min<size_t>(nb * tilesPerBlock + tilesPerBlock, numTiles);
         ++i) {
      for (size_t j = 0; j < numTilesDepth; ++j) {
        if (i != numTiles - 1 && j != numTilesDepth - 1) {
          // We have a full tile. Just memcpy.
          for (auto ii = 0; ii < TileSize; ++ii) {
            auto m = i * TileSize + ii;
            auto qk = j * TileDepthBytes;
            std::memcpy(&XPdata[TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                                TileSize * TileDepthBytes * numTilesDepth * i],
                        &Xdata[m * QK + qk],
                        TileDepthBytes);
          }
        } else {
          for (size_t ii = 0; ii < TileSize; ++ii) {
            for (size_t jj = 0; jj < TileDepthBytes; ++jj) {
              size_t m = i * TileSize + ii;
              size_t qk = j * TileDepthBytes + jj;
              uint8_t pval = 0;
              if (m < M && qk < QK) {
                // get value from X
                pval = Xdata[m * QK + qk];
              }
              XPdata[jj + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i] = pval;
            }
          }
        }
      }
    }
  });
}

// Computes the kUnrollM x kUnrollN tile of a GEMM by multiplying two packed
// slices of size kUnrollM x K and kUnrollN x K. These tiles are constructed by
// the qpack_tiles function, which packs an input array of size [M][K] into an
// [M/TileSize][K/TileDepthBytes][TileSize][TileDepthBytes] array, which ensures
// all the array accesses in this function are contiguous.
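// The GEMM itself is a binary (XNOR-net style) inner product: each bit encodes
// a +1/-1 value, so for rows a and b of K bits each,
//   dot(a, b) = (#matching bits) - (#mismatching bits) = K - 2 * popcount(a ^ b).
// The inner loop accumulates popcount(a ^ b) into acc, and the epilogue applies
// K - 2 * acc (via vmlsl_s16) before handing the float result to the functor f.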
template <size_t kUnrollM, size_t kUnrollN, size_t TileDepthBytes, typename F>
void qgess_packed(const uint8_t* __restrict__ Ablock,
                  const uint8_t* __restrict__ Bblock,
                  float* __restrict__ Cblock,
                  const size_t Cstride,
                  const size_t QK,
                  const size_t Nstart,
                  F&& f) {
  static_assert(kUnrollN % 8 == 0, "");
  static_assert(TileDepthBytes == 16, "");
  DCHECK_EQ(QK % 16, 0);
  uint16x8_t acc[kUnrollM][kUnrollN / 8];
  for (size_t mm = 0; mm < kUnrollM; ++mm) {
    for (size_t nn = 0; nn < kUnrollN / 8; ++nn) {
      acc[mm][nn] = vdupq_n_u16(0);
    }
  }
  size_t qk = 0;
  const size_t QK16Unroll = (QK / 16) * 16;
  for (; qk < QK16Unroll; qk += 16) {
    uint8x16_t Areg[kUnrollM];
    for (size_t mm = 0; mm < kUnrollM; ++mm) {
      Areg[mm] = vld1q_u8(Ablock);
      Ablock += 16;
    }

    for (size_t nn = 0; nn < kUnrollN / 8; ++nn) {
      uint8x16_t Breg[8];
      for (size_t nnn = 0; nnn < 8; ++nnn) {
        Breg[nnn] = vld1q_u8(Bblock);
        Bblock += 16;
      }
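      // cnts holds per-byte popcounts of A ^ B for each of the 8 columns in
      // this group; the vadd/vpadd tree below collapses the 16 byte-counts of
      // each column into two bytes, and vpadalq_u8 folds those into one
      // running uint16 count per column in acc[mm][nn].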
      for (size_t mm = 0; mm < kUnrollM; ++mm) {
        uint8x16_t cnts[8];
        for (size_t nnn = 0; nnn < 8; ++nnn) {
          cnts[nnn] = vcntq_u8(veorq_u8(Breg[nnn], Areg[mm]));
        }
        uint8x8_t ps[8];
        for (size_t nnn = 0; nnn < 8; ++nnn) {
          ps[nnn] = vadd_u8(vget_low_u8(cnts[nnn]), vget_high_u8(cnts[nnn]));
        }
        uint8x8_t pss[4];
        for (size_t nnn = 0; nnn < 4; ++nnn) {
          pss[nnn] = vpadd_u8(ps[2 * nnn], ps[2 * nnn + 1]);
        }
        uint8x8_t psss[2];
        for (size_t nnn = 0; nnn < 2; ++nnn) {
          psss[nnn] = vpadd_u8(pss[2 * nnn], pss[2 * nnn + 1]);
        }
        uint8x16_t out = vcombine_u8(psss[0], psss[1]);
        acc[mm][nn] = vpadalq_u8(acc[mm][nn], out);
      }
    }
  }

  for (size_t mm = 0; mm < kUnrollM; ++mm) {
    auto* Crow = Cblock + mm * Cstride;
    for (size_t nn = 0; nn < kUnrollN / 8; ++nn) {
      const int32x4_t K_ = vdupq_n_s32(QK * 8);
      const int16x4_t two = vdup_n_s16(2);
      const int16x4_t acc0123_l = vreinterpret_s16_u16(vget_low_u16(acc[mm][nn]));
      const int16x4_t acc0123_h = vreinterpret_s16_u16(vget_high_u16(acc[mm][nn]));
      const int32x4_t K_minus_2_acc0123_l = vmlsl_s16(K_, two, acc0123_l);
      const int32x4_t K_minus_2_acc0123_h = vmlsl_s16(K_, two, acc0123_h);
      f(Crow + nn * 8 + 0, vcvtq_f32_s32(K_minus_2_acc0123_l), Nstart + nn * 8 + 0);
      f(Crow + nn * 8 + 4, vcvtq_f32_s32(K_minus_2_acc0123_h), Nstart + nn * 8 + 4);
    }
  }
}

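// Editorial note: the following scalar reference is an illustrative sketch
// added for exposition (it is not part of the original ulp_neon.cc). It shows
// what one kUnrollM x kUnrollN tile computes, assuming plain row-major
// uint8_t slices instead of the packed tile layout, and omitting the epilogue
// functor f.
template <size_t kUnrollM, size_t kUnrollN>
void qgess_scalar_reference_sketch(const uint8_t* A, // kUnrollM x QK bytes, row-major
                                   const uint8_t* B, // kUnrollN x QK bytes, row-major
                                   float* C,
                                   const size_t Cstride,
                                   const size_t QK) {
  for (size_t m = 0; m < kUnrollM; ++m) {
    for (size_t n = 0; n < kUnrollN; ++n) {
      size_t mismatches = 0;
      for (size_t qk = 0; qk < QK; ++qk) {
        // The popcount of the XOR counts the bit positions where the +1/-1
        // operands differ.
        mismatches += __builtin_popcount(static_cast<unsigned>(A[m * QK + qk] ^ B[n * QK + qk]));
      }
      // +1/-1 dot product over K = 8 * QK bits: matches - mismatches = K - 2 * mismatches.
      C[m * Cstride + n] = float(8 * QK) - 2.0f * float(mismatches);
    }
  }
}
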
// Computes the (normal + transpose) matrix-matrix product of two -1/1 binary
// matrices, laid out in the standard format.
template <size_t TileSize, size_t TileDepthBytes, typename F>
inline void qgemm_nt_packed(
    QConvState* state, const TensorCPU& A, const TensorCPU& B, TensorCPU* C, F&& f = F()) {
  CAFFE_ENFORCE_EQ(A.ndim(), 4);
  CAFFE_ENFORCE_EQ(B.ndim(), 4);
  CAFFE_ENFORCE_EQ(A.dim(2), TileSize);
  CAFFE_ENFORCE_EQ(B.dim(2), TileSize);
  CAFFE_ENFORCE_EQ(A.dim(3), TileDepthBytes);
  CAFFE_ENFORCE_EQ(B.dim(3), TileDepthBytes);
  const size_t MT = A.dim(0);
  const size_t NT = B.dim(0);
  const size_t M = MT * TileSize;
  const size_t N = NT * TileSize;

  const size_t QKT = A.dim(1);
  const size_t K = QKT * 8 * TileDepthBytes;
  const size_t QK = K / 8;
  CAFFE_ENFORCE_EQ(A.dim(1), B.dim(1));
  C->Resize(M, N);
  const auto* Adata = A.data<uint8_t>();
  const auto* Bdata = B.data<uint8_t>();
  auto* Cdata = C->mutable_data<float>();

  // Assume a TxT tile. Each input slice is of size T x (K / 8) bytes, and the
  // output is a tile of size T x T x sizeof(float) bytes. We want the sum of
  // these to fit in the L1 cache. This means that for a block of B tiles, we
  // load B * T * K / 8 + B * T * K / 8 + B * B * T * T * sizeof(float) bytes.

  // If the cache size is C, we get
  //   B = 1/(32 * T) * (sqrt(256 * C + K^2) - K);
  // taking the floor gives the result.
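  // (Derivation sketch: requiring 2 * B * T * (K / 8) + 4 * B^2 * T^2 <= C and
  // solving the quadratic 4 * T^2 * B^2 + (T * K / 4) * B - C <= 0 for B gives
  // B <= (sqrt(K^2 + 256 * C) - K) / (32 * T).)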

  // Assume 16KB L1 cache.
  size_t tilesPerBlock =
      std::floor((std::sqrt(256 * kL1CacheSizeBytes + K * K) - K) / (32 * TileSize));
  if (tilesPerBlock < 1) {
    tilesPerBlock = 1;
  }
  CAFFE_ENFORCE_LT(K, std::pow(2, 16));
  CAFFE_ENFORCE_EQ(M % TileSize, 0);
  CAFFE_ENFORCE_EQ(N % TileSize, 0);
  const size_t MNumTiles = M / TileSize;
  const size_t NNumTiles = N / TileSize;
  const size_t MNumBlocks = divRoundUp(MNumTiles, tilesPerBlock);
  const size_t NNumBlocks = divRoundUp(NNumTiles, tilesPerBlock);

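  // The output is decomposed into MNumBlocks * NNumBlocks independent blocks
  // of tiles; each parallelFor item handles one (mBlock, nBlock) pair and
  // walks the tiles it owns below.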
  state->parallelFor(MNumBlocks * NNumBlocks, [&](size_t mn) {
    const size_t mBlockIdx = mn / NNumBlocks;
    const size_t nBlockIdx = mn % NNumBlocks;
    const size_t mTileStart = mBlockIdx * tilesPerBlock;
    const size_t nTileStart = nBlockIdx * tilesPerBlock;
    for (size_t mBlockTileIdx = 0;
         mBlockTileIdx < tilesPerBlock && mBlockTileIdx + mTileStart < MNumTiles;
         ++mBlockTileIdx) {
      const size_t mTileIdx = mBlockTileIdx + mTileStart;
      for (size_t nBlockTileIdx = 0;
           nBlockTileIdx < tilesPerBlock && nBlockTileIdx + nTileStart < NNumTiles;
           ++nBlockTileIdx) {
        const size_t nTileIdx = nBlockTileIdx + nTileStart;
        // A layout: [M/TileSize][QK / TileDepth][TileSize][TileDepth]
        // C layout: [M/TileSize][TileSize][N/TileSize][TileSize]
        const auto* Ablock = &Adata[mTileIdx * QK * TileSize];
        const auto* Bblock = &Bdata[nTileIdx * QK * TileSize];
        auto* Cblock = &Cdata[mTileIdx * TileSize * N + nTileIdx * TileSize];
        const size_t Cstride = N;
        qgess_packed<TileSize, TileSize, TileDepthBytes, F>(
            Ablock, Bblock, Cblock, Cstride, QK, nTileIdx * TileSize, std::forward<F>(f));
      }
    }
  });
}

void run2b1bConvIm2ColGEMM(QConvState* state,
                           const ConvArgs& args,
                           const TensorCPU& X,
                           TensorCPU* Y) {
  // TODO: packing + quantization in same block.
  const size_t KH = state->WQ->dim32(1);
  const size_t KW = state->WQ->dim32(2);
  const size_t OH = (X.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1;
  const size_t OW = (X.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1;
  const size_t OC = state->WQ->dim32(0);
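  // Each im2col row covers KH * KW kernel positions, with the input channels
  // packed one bit per channel (8 per byte), hence QK bytes per row.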
  const size_t QK = KH * KW * divRoundUp(X.dim32(3), 8);
  Y->Resize(X.dim32(0), OH, OW, OC);
  if (!state->WQPacked) {
    state->WQPacked = caffe2::make_unique<Tensor>(CPU);
    qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(state, *(state->WQ), 1, state->WQPacked.get());
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(0), divRoundUp(OC, kGEMMTileSize));
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(1), divRoundUp(QK, kGEMMTileDepthBytes));
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(2), kGEMMTileSize);
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(3), kGEMMTileDepthBytes);

    // We can fuse the bias addition into the filter normalization: the GEMM
    // epilogue computes 3/2 * WQN[c] + ..., so replacing WQN[c] with
    // WQN[c] + 2/3 * bias[c] (and then zeroing the bias) adds exactly bias[c]
    // to the output, since 3/2 * (WQN[c] + 2/3 * bias[c]) = 3/2 * WQN[c] + bias[c].
    if (state->bias) {
      for (size_t i = 0; i < state->bias->size(); ++i) {
        state->WQN->mutable_data<float>()[i] += 2.0f / 3 * state->bias->data<float>()[i];
      }
    }
    state->bias.reset();

    // If we have to pad when we pack our weight tiles, then we need to adjust
    // the normalization factor by the number of zeros that we added.
    const size_t QKPadding = divRoundUp(QK, kGEMMTileDepthBytes) * kGEMMTileDepthBytes - QK;
    if (QKPadding != 0) {
      for (size_t i = 0; i < state->WQN->size(); ++i) {
        state->WQN->mutable_data<float>()[i] -= QKPadding * 8;
      }
    }
  }
  CAFFE_ENFORCE(!state->bias.get());
  // Since 1x1s are so common, we fuse the quantization + packing steps.
  const bool is_1x1 = KH == 1 && KW == 1 && args.pad_l == 0 && args.pad_r == 0 && args.pad_b == 0 &&
      args.pad_t == 0 && args.stride_h == 1 && args.stride_w == 1;

  if (is_1x1) {
    CAFFE_ENFORCE_EQ(OH, X.dim32(1));
    CAFFE_ENFORCE_EQ(OW, X.dim32(2));
    uniformQuantize2b1bNeonPacked<kGEMMTileSize, kGEMMTileDepthBytes>(
        state, X, state->XQs, 0.5, 1.0);
  } else {
    uniformQuantize2b1bNeon(state, X, state->XQs, 0.5, 1.0);
  }
  TensorCPU* YQ0 = state->YQs[0].get();

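  // When the number of output channels is a multiple of kGEMMTileSize, the
  // packed GEMM writes rows of exactly OC floats, so its output can live
  // directly in Y; only the (possibly padded) row count has to be trimmed
  // afterwards via ShrinkTo/Reshape below.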
  if (state->WQ->dim32(0) % kGEMMTileSize == 0) {
    // We can run in place by operating on our Y vector, and then shrinking Y.
    YQ0 = Y;
  }

  for (size_t i = 0; i < k2b1bXBits; ++i) {
    const auto& XQ = *(state->XQs[i]);
    if (!is_1x1) {
      qim2col(args, XQ, *(state->WQ), state->scratchColBuffer.get());
      qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(
          state, *(state->scratchColBuffer), 3, state->scratch.get());
    }

    {
      const auto* __restrict__ WQNdata = state->WQN->data<float>();
      switch (i) {
        case 0:
          qgemm_nt_packed<kGEMMTileSize, kGEMMTileDepthBytes>(
              state,
              is_1x1 ? XQ : *(state->scratch),
              *(state->WQPacked),
              YQ0,
              [WQNdata](float* __restrict__ acc, float32x4_t value, size_t channel) {
                // acc[c] = 3/2 WQN[c] + 1/2 value[c];
                const float32x4_t _32 = vdupq_n_f32(3.0f / 2);
                const float32x4_t _12 = vdupq_n_f32(1.0f / 2);
                const float32x4_t WQNc_32 = vmulq_f32(_32, vld1q_f32(WQNdata + channel));
                const float32x4_t WQNc_32_value_12 = vmlaq_f32(WQNc_32, _12, value);
                vst1q_f32(acc, WQNc_32_value_12);
              });
          break;
        case 1:
          qgemm_nt_packed<kGEMMTileSize, kGEMMTileDepthBytes>(
              state,
              is_1x1 ? XQ : *(state->scratch),
              *(state->WQPacked),
              YQ0,
              [](float* __restrict__ acc, float32x4_t value, size_t channel) {
                const float32x4_t curr = vld1q_f32(acc);
                vst1q_f32(acc, vaddq_f32(curr, value));
              });
          break;
      }
    }
  }

  if (YQ0 != Y) {
    // In this case, the stride does not match, so we need to copy the output
    // data into the contiguous Y matrix.
    const size_t F = state->WQ->dim(0);
    const size_t N = Y->size() / F;
    const size_t NP = YQ0->dim32(0);
    const size_t FP = YQ0->dim32(1);
    math::CopyMatrix<CPUContext>(
        sizeof(float), N, F, YQ0->data<float>(), FP, Y->mutable_data<float>(), F, nullptr);
  } else {
    CAFFE_ENFORCE_EQ(Y->dim32(0), divRoundUp(X.dim32(0) * OH * OW, kGEMMTileSize) * kGEMMTileSize);
    CAFFE_ENFORCE_EQ(Y->dim32(1), OC);
    Y->ShrinkTo(X.dim32(0) * OH * OW);
    Y->Reshape(std::vector<int64_t>{{int64_t(X.dim(0)), int64_t(OH), int64_t(OW), int64_t(OC)}});
  }
}

bool run2b1bConvNeon(QConvState* state, const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
  // TODO: insert specialized cases (e.g. depthwise convolutions, the direct
  // convolution).
  CAFFE_ENFORCE_EQ(X.ndim(), 4);
  run2b1bConvIm2ColGEMM(state, args, X, Y);
  return true;
}

#endif

} // namespace caffe2