Caffe2 - C++ API
A deep learning, cross-platform ML framework
ulp_neon.cc
#include "ulp_neon.h"
#include "caffe2/core/timer.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

// TODO: tune this with cache size detection code. Changing to 32 helps on some
// devices (Snapdragon 820).
constexpr size_t kL1CacheSizeBytes = 16 * 1024;

#ifdef __ARM_NEON__

// Applies 2-bit uniform quantization to the floating point data at Xdata,
// storing QC bytes into XQdata (i.e. reading 8 * QC floats from Xdata).
// Requires QC to be a multiple of 8.
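//
// Each input value v is quantized to a 2-bit code using the thresholds
// offset, offset + inter_center_distance, and offset + 2 * inter_center_distance.
// The two code bits are stored in separate bit-planes (XQdata[0] holds the low
// bits, XQdata[1] the high bits), one byte per 8 consecutive inputs; the scalar
// tail loop in uniformQuantize2b1bNeon below spells out the same mapping.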
inline void quantize2bNeon(size_t QC,
                           const float* __restrict__ Xdata,
                           float offset,
                           float inter_center_distance,
                           std::array<uint8_t*, k2b1bXBits> XQdata) {
  DCHECK_EQ(QC % 8, 0);
  const auto offset_plus_2_inter_center_distance = vdupq_n_f32(offset + 2 * inter_center_distance);
  const auto offset_plus_inter_center_distance = vdupq_n_f32(offset + inter_center_distance);
  const auto offset_ = vdupq_n_f32(offset);
  const uint8x8_t shifts = {1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7};

  for (size_t qc = 0; qc < QC; qc += 8) {
    std::array<std::array<uint8x8_t, 8>, k2b1bXBits> ps;
    for (auto i = 0; i < k2b1bXBits; ++i) {
      for (auto j = 0; j < 8; ++j) {
        ps[i][j] = vdup_n_u8(0);
      }
    }

    for (auto j = 0; j < 8; ++j) {
      const auto x0 = vld1q_f32(&Xdata[qc * 8 + j * 8 + 0]);
      const auto x1 = vld1q_f32(&Xdata[qc * 8 + j * 8 + 4]);

      // Logic:
      // if (v >= offset + inter_center_distance) {
      //   p[1] |= 1 << b;
      // } else {
      //   p[1] |= 0 << b;
      // }
      //
      // if ((v >= offset && v < offset + inter_center_distance) ||
      //     (v >= offset + 2 * inter_center_distance)) {
      //   p[0] |= 1 << b;
      // } else {
      //   p[0] |= 0 << b;
      // }

      auto join = [](uint32x4_t a, uint32x4_t b) -> uint8x8_t {
        return vmovn_u16(vcombine_u16(vmovn_u32(a), vmovn_u32(b)));
      };

      // The comparisons operate on the signed-integer bit patterns of the
      // floats; this preserves ordering here because the thresholds are
      // positive (offset > 0 and inter_center_distance > 0 are enforced by the
      // callers).
      const auto x_geq_offset_plus_2_inter_center_distance =
          join(vcgeq_s32(vreinterpretq_s32_f32(x0),
                         vreinterpretq_s32_f32(offset_plus_2_inter_center_distance)),
               vcgeq_s32(vreinterpretq_s32_f32(x1),
                         vreinterpretq_s32_f32(offset_plus_2_inter_center_distance)));
      const auto x_ge_offset =
          join(vcgeq_s32(vreinterpretq_s32_f32(x0), vreinterpretq_s32_f32(offset_)),
               vcgeq_s32(vreinterpretq_s32_f32(x1), vreinterpretq_s32_f32(offset_)));

      const auto x_lt_offset_plus_inter_center_distance =
          join(vcltq_s32(vreinterpretq_s32_f32(x0),
                         vreinterpretq_s32_f32(offset_plus_inter_center_distance)),
               vcltq_s32(vreinterpretq_s32_f32(x1),
                         vreinterpretq_s32_f32(offset_plus_inter_center_distance)));

      const auto p1_mask = vmvn_u8(x_lt_offset_plus_inter_center_distance);
      const auto p0_mask = vorr_u8(vand_u8(x_ge_offset, x_lt_offset_plus_inter_center_distance),
                                   x_geq_offset_plus_2_inter_center_distance);
      ps[0][j] = vand_u8(shifts, p0_mask);
      ps[1][j] = vand_u8(shifts, p1_mask);
    }

    for (auto i = 0; i < 2; ++i) {
      // The pairwise adds sum the disjoint single-bit lanes of ps[i][j],
      // producing one packed byte per group of 8 inputs.
      const auto p01 = vpadd_u8(ps[i][0], ps[i][1]);
      const auto p23 = vpadd_u8(ps[i][2], ps[i][3]);
      const auto p45 = vpadd_u8(ps[i][4], ps[i][5]);
      const auto p67 = vpadd_u8(ps[i][6], ps[i][7]);
      const auto p0123 = vpadd_u8(p01, p23);
      const auto p4567 = vpadd_u8(p45, p67);
      vst1_u8(XQdata[i] + qc, vpadd_u8(p0123, p4567));
    }
  }
}

void uniformQuantize2b1bNeon(QConvState* state,
                             const TensorCPU& X,
                             const std::vector<std::unique_ptr<TensorCPU>>& XQ,
                             float offset,
                             float inter_center_distance) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const size_t C = X.dim32(X.ndim() - 1);
  const size_t N = X.size() / C;
  const size_t QC = divRoundUp(C, 8);
  auto XQs = X.dims();
  XQs[X.ndim() - 1] = QC;
  CAFFE_ENFORCE_EQ(XQ.size(), k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(XQs);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (size_t i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  CAFFE_ENFORCE_GT(offset, 0);
  CAFFE_ENFORCE_GT(inter_center_distance, 0);
  size_t QCUnroll = ((C / 8) / 8) * 8;
  // Each worker loads an L1 cache sized block.
  // We read/write B * K * 4 + 2 * B * (K / 8), so to fit inside C, we have
  // B = 4 * C / (17 * K).
  // QCUnroll = 0;
  const size_t rowsPerBlock =
      std::max<size_t>(std::floor<size_t>(double(4 * kL1CacheSizeBytes) / double(17 * C)), 1);
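  // (Per row we read C floats (4 * C bytes) and write two bit-planes of
  // roughly C / 8 bytes each, about 17 * C / 4 bytes in total, so B rows fit
  // in L1 when B * 17 * C / 4 <= kL1CacheSizeBytes, i.e.
  // B = 4 * kL1CacheSizeBytes / (17 * C), clamped to at least 1.)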
  state->parallelFor(divRoundUp(N, rowsPerBlock), [&](size_t nb) {
    for (size_t n = nb * rowsPerBlock; n < std::min<size_t>(nb * rowsPerBlock + rowsPerBlock, N);
         ++n) {
      std::array<uint8_t*, k2b1bXBits> XQoff = {{
          XQdata[0] + 0 + QC * n, XQdata[1] + 0 + QC * n,
      }};
      quantize2bNeon(QCUnroll, &Xdata[0 + C * n], offset, inter_center_distance, XQoff);
      for (size_t qc = QCUnroll; qc < QC; ++qc) {
        // Handle the remaining (non-unrolled) bytes of the row with scalar code.
        std::array<uint8_t, k2b1bXBits> p = {{0, 0}};
        for (size_t b = 0; b < 8; ++b) {
          const size_t c = qc * 8 + b;
          if (c < C) {
            float v = Xdata[c + C * n];
            if (v < offset) {
              // zero'd already.
            } else if (v < offset + inter_center_distance) {
              p[0] |= 1 << b;
            } else if (v < offset + 2 * inter_center_distance) {
              p[1] |= 1 << b;
            } else {
              p[0] |= 1 << b;
              p[1] |= 1 << b;
            }
          }
        }
        for (auto i = 0; i < k2b1bXBits; ++i) {
          XQdata[i][qc + QC * n] = p[i];
        }
      }
    }
  });
}

template <size_t TileSize, size_t TileDepthBytes>
void uniformQuantize2b1bNeonPacked(QConvState* state,
                                   const TensorCPU& X,
                                   const std::vector<std::unique_ptr<TensorCPU>>& XQ,
                                   float offset,
                                   float inter_center_distance) {
  const size_t M = X.size_to_dim(3);
  const size_t K = X.size() / M;
  const size_t QK = divRoundUp(K, 8);
  const size_t numTiles = divRoundUp(M, TileSize);
  const size_t numTilesDepth = divRoundUp(QK, TileDepthBytes);
  for (size_t i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(numTiles, numTilesDepth, TileSize, TileDepthBytes);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  CAFFE_ENFORCE_GT(offset, 0);
  CAFFE_ENFORCE_GT(inter_center_distance, 0);
  // Each worker loads an L1 cache sized block.
  // We read/write B * K * TileSize * 4 + 2 * B * TileSize * (K / 8), so to fit inside C, we have
  // B = 4 * C / (17 * K * TileSize).
  const size_t tilesPerBlock = std::max<size_t>(
      std::floor<size_t>(double(4 * kL1CacheSizeBytes) / double(17 * K * TileSize)), 1);
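  // (Same arithmetic as in uniformQuantize2b1bNeon, with each block covering
  // TileSize rows: roughly 17 * K * TileSize / 4 bytes per block of tiles, so
  // B = 4 * kL1CacheSizeBytes / (17 * K * TileSize).)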
  state->parallelFor(divRoundUp(numTiles, tilesPerBlock), [&](size_t nb) {
    for (size_t i = nb * tilesPerBlock;
         i < std::min<size_t>(nb * tilesPerBlock + tilesPerBlock, numTiles);
         ++i) {
      for (size_t j = 0; j < numTilesDepth; ++j) {
        if (i != numTiles - 1 && j != numTilesDepth - 1) {
          // We have a full tile; quantize each stripe directly.
          for (auto ii = 0; ii < TileSize; ++ii) {
            size_t m = i * TileSize + ii;
            size_t k = j * TileDepthBytes * 8;
            std::array<uint8_t*, k2b1bXBits> XQoff = {
                {XQdata[0] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i,
                 XQdata[1] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i}};
            quantize2bNeon(TileDepthBytes, &Xdata[m * K + k], offset, inter_center_distance, XQoff);
          }
        } else {
          for (size_t ii = 0; ii < TileSize; ++ii) {
            size_t m = i * TileSize + ii;
            size_t k = j * TileDepthBytes * 8;
            std::array<uint8_t*, k2b1bXBits> XQoff = {
                {XQdata[0] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i,
                 XQdata[1] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i}};
            if (m < M && k + TileDepthBytes * 8 <= K) {
              // We can just read the stripe directly.
              quantize2bNeon(
                  TileDepthBytes, &Xdata[m * K + k], offset, inter_center_distance, XQoff);
            } else {
              // We need to pad the stripe to the full amount read by
              // quantize2bNeon.
              std::array<float, 8 * TileDepthBytes> Xpad = {{0}};
              if (m < M) {
                std::copy(&Xdata[m * K + k], &Xdata[m * K + K], Xpad.begin());
              }
              quantize2bNeon(TileDepthBytes, Xpad.data(), offset, inter_center_distance, XQoff);
            }
          }
        }
      }
    }
  });
}

// Packs a matrix (of size MxK) into a tiled array of size
// (M/TileSize)x(K/TileDepthBytes)xTileSizexTileDepthBytes.
template <size_t TileSize, size_t TileDepthBytes>
void qpack_tiles(QConvState* state, const TensorCPU& X, size_t axis, TensorCPU* XP) {
  const size_t M = X.size_to_dim(axis);
  const size_t QK = X.size() / M;
  const size_t numTiles = divRoundUp(M, TileSize);
  const size_t numTilesDepth = divRoundUp(QK, TileDepthBytes);
  XP->Resize(numTiles, numTilesDepth, TileSize, TileDepthBytes);

  const auto* __restrict__ Xdata = X.data<uint8_t>();
  auto* __restrict__ XPdata = XP->mutable_data<uint8_t>();
  // Load L1 sized tiles per thread.
  // We read/write 2 * B * QK * TileSize bytes, so
  // B = C / (2 * QK * TileSize).
  const size_t tilesPerBlock = std::max<size_t>(
      std::floor<size_t>(double(kL1CacheSizeBytes) / double(2 * TileSize * QK)), 1);
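  // The packed destination index used below,
  //   jj + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
  //       TileSize * TileDepthBytes * numTilesDepth * i,
  // is simply the row-major offset
  //   ((i * numTilesDepth + j) * TileSize + ii) * TileDepthBytes + jj
  // into the [numTiles][numTilesDepth][TileSize][TileDepthBytes] output.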
  state->parallelFor(divRoundUp(numTiles, tilesPerBlock), [&](size_t nb) {
    for (size_t i = nb * tilesPerBlock;
         i < std::min<size_t>(nb * tilesPerBlock + tilesPerBlock, numTiles);
         ++i) {
      for (size_t j = 0; j < numTilesDepth; ++j) {
        if (i != numTiles - 1 && j != numTilesDepth - 1) {
          // We have a full tile. Just memcpy.
          for (auto ii = 0; ii < TileSize; ++ii) {
            auto m = i * TileSize + ii;
            auto qk = j * TileDepthBytes;
            std::memcpy(&XPdata[TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                                TileSize * TileDepthBytes * numTilesDepth * i],
                        &Xdata[m * QK + qk],
                        TileDepthBytes);
          }
        } else {
          for (size_t ii = 0; ii < TileSize; ++ii) {
            for (size_t jj = 0; jj < TileDepthBytes; ++jj) {
              size_t m = i * TileSize + ii;
              size_t qk = j * TileDepthBytes + jj;
              uint8_t pval = 0;
              if (m < M && qk < QK) {
                // get value from X
                pval = Xdata[m * QK + qk];
              }
              XPdata[jj + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i] = pval;
            }
          }
        }
      }
    }
  });
}

// Computes a kUnrollM x kUnrollN tile of the GEMM by multiplying two packed
// slices of size kUnrollM x K and kUnrollN x K. These tiles are constructed by
// the qpack_tiles function, which packs an input array of size [M][K] into an
// [M/TileSize][K/TileDepthBytes][TileSize][TileDepthBytes] array, which ensures
// all the array accesses in this function are contiguous.
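//
// For two ±1 vectors packed one element per bit, matching bits contribute +1
// to the dot product and differing bits contribute -1, so over K elements
//   dot(a, b) = (K - popcount(a ^ b)) - popcount(a ^ b) = K - 2 * popcount(a ^ b).
// The inner loop below accumulates popcount(a ^ b) with VEOR/VCNT and pairwise
// adds, and the epilogue applies K - 2 * acc before handing the result to f.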
template <size_t kUnrollM, size_t kUnrollN, size_t TileDepthBytes, typename F>
void qgess_packed(const uint8_t* __restrict__ Ablock,
                  const uint8_t* __restrict__ Bblock,
                  float* __restrict__ Cblock,
                  const size_t Cstride,
                  const size_t QK,
                  const size_t Nstart,
                  F&& f) {
  static_assert(kUnrollN % 8 == 0, "");
  static_assert(TileDepthBytes == 16, "");
  DCHECK_EQ(QK % 16, 0);
  uint16x8_t acc[kUnrollM][kUnrollN / 8];
  for (size_t mm = 0; mm < kUnrollM; ++mm) {
    for (size_t nn = 0; nn < kUnrollN / 8; ++nn) {
      acc[mm][nn] = vdupq_n_u16(0);
    }
  }
  size_t qk = 0;
  const size_t QK16Unroll = (QK / 16) * 16;
  for (; qk < QK16Unroll; qk += 16) {
    uint8x16_t Areg[kUnrollM];
    for (size_t mm = 0; mm < kUnrollM; ++mm) {
      Areg[mm] = vld1q_u8(Ablock);
      Ablock += 16;
    }

    for (size_t nn = 0; nn < kUnrollN / 8; ++nn) {
      uint8x16_t Breg[8];
      for (size_t nnn = 0; nnn < 8; ++nnn) {
        Breg[nnn] = vld1q_u8(Bblock);
        Bblock += 16;
      }
      for (size_t mm = 0; mm < kUnrollM; ++mm) {
        uint8x16_t cnts[8];
        for (size_t nnn = 0; nnn < 8; ++nnn) {
          cnts[nnn] = vcntq_u8(veorq_u8(Breg[nnn], Areg[mm]));
        }
        uint8x8_t ps[8];
        for (size_t nnn = 0; nnn < 8; ++nnn) {
          ps[nnn] = vadd_u8(vget_low_u8(cnts[nnn]), vget_high_u8(cnts[nnn]));
        }
        uint8x8_t pss[4];
        for (size_t nnn = 0; nnn < 4; ++nnn) {
          pss[nnn] = vpadd_u8(ps[2 * nnn], ps[2 * nnn + 1]);
        }
        uint8x8_t psss[2];
        for (size_t nnn = 0; nnn < 2; ++nnn) {
          psss[nnn] = vpadd_u8(pss[2 * nnn], pss[2 * nnn + 1]);
        }
        uint8x16_t out = vcombine_u8(psss[0], psss[1]);
        acc[mm][nn] = vpadalq_u8(acc[mm][nn], out);
      }
    }
  }

  for (size_t mm = 0; mm < kUnrollM; ++mm) {
    auto* Crow = Cblock + mm * Cstride;
    for (size_t nn = 0; nn < kUnrollN / 8; ++nn) {
      const int32x4_t K_ = vdupq_n_s32(QK * 8);
      const int16x4_t two = vdup_n_s16(2);
      const int16x4_t acc0123_l = vreinterpret_s16_u16(vget_low_u16(acc[mm][nn]));
      const int16x4_t acc0123_h = vreinterpret_s16_u16(vget_high_u16(acc[mm][nn]));
      const int32x4_t K_minus_2_acc0123_l = vmlsl_s16(K_, two, acc0123_l);
      const int32x4_t K_minus_2_acc0123_h = vmlsl_s16(K_, two, acc0123_h);
      f(Crow + nn * 8 + 0, vcvtq_f32_s32(K_minus_2_acc0123_l), Nstart + nn * 8 + 0);
      f(Crow + nn * 8 + 4, vcvtq_f32_s32(K_minus_2_acc0123_h), Nstart + nn * 8 + 4);
    }
  }
}

// Computes the (normal + transpose) matrix-matrix product of two -1/1 binary
// matrices, laid out in the standard format.
template <size_t TileSize, size_t TileDepthBytes, typename F>
inline void qgemm_nt_packed(
    QConvState* state, const TensorCPU& A, const TensorCPU& B, TensorCPU* C, F&& f = F()) {
  CAFFE_ENFORCE_EQ(A.ndim(), 4);
  CAFFE_ENFORCE_EQ(B.ndim(), 4);
  CAFFE_ENFORCE_EQ(A.dim(2), TileSize);
  CAFFE_ENFORCE_EQ(B.dim(2), TileSize);
  CAFFE_ENFORCE_EQ(A.dim(3), TileDepthBytes);
  CAFFE_ENFORCE_EQ(B.dim(3), TileDepthBytes);
  const size_t MT = A.dim(0);
  const size_t NT = B.dim(0);
  const size_t M = MT * TileSize;
  const size_t N = NT * TileSize;

  const size_t QKT = A.dim(1);
  const size_t K = QKT * 8 * TileDepthBytes;
  const size_t QK = K / 8;
  CAFFE_ENFORCE_EQ(A.dim(1), B.dim(1));
  C->Resize(M, N);
  const auto* Adata = A.data<uint8_t>();
  const auto* Bdata = B.data<uint8_t>();
  auto* Cdata = C->mutable_data<float>();

  // Assume a TxT tile. Each input slice is of size T x (K/8) bytes, and the
  // output is a tile of size T x T x sizeof(float) bytes. We want the sum of
  // this to fit in L1 cache. This means that for a block of B tiles, we load
  // B * T * K / 8 + B * T * K / 8 + B * B * T * T * sizeof(float) bytes.
  //
  // If cache size = C, we get
  //   B = 1/(32 * T) * (sqrt(256 * C + K^2) - K);
  // taking the floor (by integer division) gives the result.
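  //
  // Derivation (a sketch, with sizeof(float) = 4): the constraint is
  //   2 * B * T * K / 8 + 4 * B^2 * T^2 <= C, i.e.
  //   4 * T^2 * B^2 + (T * K / 4) * B - C <= 0,
  // whose positive root in B is
  //   B = (-T * K / 4 + sqrt(T^2 * K^2 / 16 + 16 * T^2 * C)) / (8 * T^2)
  //     = (sqrt(K^2 + 256 * C) - K) / (32 * T).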

  // Assume 16KB L1 cache.
  size_t tilesPerBlock =
      std::floor((std::sqrt(256 * kL1CacheSizeBytes + K * K) - K) / (32 * TileSize));
  if (tilesPerBlock < 1) {
    tilesPerBlock = 1;
  }
  CAFFE_ENFORCE_LT(K, std::pow(2, 16));
  CAFFE_ENFORCE_EQ(M % TileSize, 0);
  CAFFE_ENFORCE_EQ(N % TileSize, 0);
  const size_t MNumTiles = M / TileSize;
  const size_t NNumTiles = N / TileSize;
  const size_t MNumBlocks = divRoundUp(MNumTiles, tilesPerBlock);
  const size_t NNumBlocks = divRoundUp(NNumTiles, tilesPerBlock);

  state->parallelFor(MNumBlocks * NNumBlocks, [&](size_t mn) {
    const size_t mBlockIdx = mn / NNumBlocks;
    const size_t nBlockIdx = mn % NNumBlocks;
    const size_t mTileStart = mBlockIdx * tilesPerBlock;
    const size_t nTileStart = nBlockIdx * tilesPerBlock;
    for (size_t mBlockTileIdx = 0;
         mBlockTileIdx < tilesPerBlock && mBlockTileIdx + mTileStart < MNumTiles;
         ++mBlockTileIdx) {
      const size_t mTileIdx = mBlockTileIdx + mTileStart;
      for (size_t nBlockTileIdx = 0;
           nBlockTileIdx < tilesPerBlock && nBlockTileIdx + nTileStart < NNumTiles;
           ++nBlockTileIdx) {
        const size_t nTileIdx = nBlockTileIdx + nTileStart;
        // A layout: [M/TileSize][QK / TileDepth][TileSize][TileDepth]
        // C layout: [M/TileSize][TileSize][N/TileSize][TileSize]
        const auto* Ablock = &Adata[mTileIdx * QK * TileSize];
        const auto* Bblock = &Bdata[nTileIdx * QK * TileSize];
        auto* Cblock = &Cdata[mTileIdx * TileSize * N + nTileIdx * TileSize];
        const size_t Cstride = N;
        qgess_packed<TileSize, TileSize, TileDepthBytes, F>(
            Ablock, Bblock, Cblock, Cstride, QK, nTileIdx * TileSize, std::forward<F>(f));
      }
    }
  });
}

void run2b1bConvIm2ColGEMM(QConvState* state,
                           const ConvArgs& args,
                           const TensorCPU& X,
                           TensorCPU* Y) {
  // TODO: packing + quantization in same block.
  const size_t KH = state->WQ->dim32(1);
  const size_t KW = state->WQ->dim32(2);
  const size_t OH = (X.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1;
  const size_t OW = (X.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1;
  const size_t OC = state->WQ->dim32(0);
  const size_t QK = KH * KW * divRoundUp(X.dim32(3), 8);
  Y->Resize(X.dim32(0), OH, OW, OC);
  if (!state->WQPacked) {
    state->WQPacked = caffe2::make_unique<TensorCPU>();
    qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(state, *(state->WQ), 1, state->WQPacked.get());
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(0), divRoundUp(OC, kGEMMTileSize));
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(1), divRoundUp(QK, kGEMMTileDepthBytes));
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(2), kGEMMTileSize);
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(3), kGEMMTileDepthBytes);

    // We can fuse the bias addition into the filter normalization: the output
    // accumulates a 3/2 * WQN[c] term, so adding bias[c] is equivalent to
    // replacing WQN[c] with WQN[c] + 2/3 * bias[c] and setting the bias to zero.
    if (state->bias) {
      for (size_t i = 0; i < state->bias->size(); ++i) {
        state->WQN->mutable_data<float>()[i] += 2.0f / 3 * state->bias->data<float>()[i];
      }
    }
    state->bias.reset();

    // If we have to pad when we pack our weight tiles, then we need to adjust
    // the normalization factor by the number of zeros that we added.
    const size_t QKPadding = divRoundUp(QK, kGEMMTileDepthBytes) * kGEMMTileDepthBytes - QK;
    if (QKPadding != 0) {
      for (size_t i = 0; i < state->WQN->size(); ++i) {
        state->WQN->mutable_data<float>()[i] -= QKPadding * 8;
      }
    }
  }
  CAFFE_ENFORCE(!state->bias.get());
  // Since 1x1s are so common, we fuse the quantization + packing steps.
  const bool is_1x1 = KH == 1 && KW == 1 && args.pad_l == 0 && args.pad_r == 0 && args.pad_b == 0 &&
      args.pad_t == 0 && args.stride_h == 1 && args.stride_w == 1;

  if (is_1x1) {
    CAFFE_ENFORCE_EQ(OH, X.dim32(1));
    CAFFE_ENFORCE_EQ(OW, X.dim32(2));
    uniformQuantize2b1bNeonPacked<kGEMMTileSize, kGEMMTileDepthBytes>(
        state, X, state->XQs, 0.5, 1.0);
  } else {
    uniformQuantize2b1bNeon(state, X, state->XQs, 0.5, 1.0);
  }
  TensorCPU* YQ0 = state->YQs[0].get();

  if (state->WQ->dim32(0) % kGEMMTileSize == 0) {
    // We can run inplace by operating on our Y vector, and then shrinking Y.
    YQ0 = Y;
  }

  for (size_t i = 0; i < k2b1bXBits; ++i) {
    const auto& XQ = *(state->XQs[i]);
    if (!is_1x1) {
      qim2col(args, XQ, *(state->WQ), state->scratchColBuffer.get());
      qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(
          state, *(state->scratchColBuffer), 3, state->scratch.get());
    }

    {
      const auto* __restrict__ WQNdata = state->WQN->data<float>();
      switch (i) {
        case 0:
          qgemm_nt_packed<kGEMMTileSize, kGEMMTileDepthBytes>(
              state,
              is_1x1 ? XQ : *(state->scratch),
              *(state->WQPacked),
              YQ0,
              [WQNdata](float* __restrict__ acc, float32x4_t value, size_t channel) {
                // acc[c] = 3/2 * WQN[c] + 1/2 * value[c];
                const float32x4_t _32 = vdupq_n_f32(3.0f / 2);
                const float32x4_t _12 = vdupq_n_f32(1.0f / 2);
                const float32x4_t WQNc_32 = vmulq_f32(_32, vld1q_f32(WQNdata + channel));
                const float32x4_t WQNc_32_value_12 = vmlaq_f32(WQNc_32, _12, value);
                vst1q_f32(acc, WQNc_32_value_12);
              });
          break;
        case 1:
          qgemm_nt_packed<kGEMMTileSize, kGEMMTileDepthBytes>(
              state,
              is_1x1 ? XQ : *(state->scratch),
              *(state->WQPacked),
              YQ0,
              [](float* __restrict__ acc, float32x4_t value, size_t channel) {
                // acc[c] += value[c]; the second bit-plane is accumulated on top.
                const float32x4_t curr = vld1q_f32(acc);
                vst1q_f32(acc, vaddq_f32(curr, value));
              });
          break;
      }
    }
  }

  if (YQ0 != Y) {
    // In this case, the stride does not match, so we need to copy the output
    // data into the contiguous Y matrix.
    const size_t F = state->WQ->dim(0);
    const size_t N = Y->size() / F;
    const size_t NP = YQ0->dim32(0);
    const size_t FP = YQ0->dim32(1);
    math::CopyMatrix<CPUContext>(
        sizeof(float), N, F, YQ0->data<float>(), FP, Y->mutable_data<float>(), F, nullptr);
  } else {
    CAFFE_ENFORCE_EQ(Y->dim32(0), divRoundUp(X.dim32(0) * OH * OW, kGEMMTileSize) * kGEMMTileSize);
    CAFFE_ENFORCE_EQ(Y->dim32(1), OC);
    Y->Shrink(X.dim32(0) * OH * OW);
    Y->Reshape(std::vector<TIndex>{{TIndex(X.dim(0)), TIndex(OH), TIndex(OW), TIndex(OC)}});
  }
}

bool run2b1bConvNeon(QConvState* state, const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
  // TODO: insert specialized cases (e.g. depthwise convolutions, the direct
  // convolution).
  CAFFE_ENFORCE_EQ(X.ndim(), 4);
  run2b1bConvIm2ColGEMM(state, args, X, Y);
  return true;
}

#endif

} // namespace caffe2