#include "caffe2/core/timer.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"

constexpr size_t kL1CacheSizeBytes = 16 * 1024;
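
// NEON path for the 2-bit-activation, 1-bit-weight ("2b1b") convolution.
// quantize2bNeon below maps each input float onto one of four levels
// determined by `offset` and `inter_center_distance` and stores the two
// resulting bits as separate bit-packed planes (k2b1bXBits == 2) of uint8
// data, one bit per input element in each plane.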
#if defined(__ARM_NEON__) || defined(__ARM_NEON)

inline void quantize2bNeon(size_t QC,
                           const float* __restrict__ Xdata,
                           float offset,
                           float inter_center_distance,
                           std::array<uint8_t*, k2b1bXBits> XQdata) {
  const auto offset_plus_2_inter_center_distance = vdupq_n_f32(offset + 2 * inter_center_distance);
  const auto offset_plus_inter_center_distance = vdupq_n_f32(offset + inter_center_distance);
  const auto offset_ = vdupq_n_f32(offset);
  const uint8x8_t shifts = {1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7};
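
  // Per element v and bit position b, the vector code below computes the same
  // thing as the scalar tail in uniformQuantize2b1bNeon further down:
  //   p1 |= (v >= offset + inter_center_distance) << b;
  //   p0 |= ((v >= offset && v < offset + inter_center_distance) ||
  //          (v >= offset + 2 * inter_center_distance)) << b;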
  for (size_t qc = 0; qc < QC; qc += 8) {
    std::array<std::array<uint8x8_t, 8>, k2b1bXBits> ps;
    for (auto i = 0; i < k2b1bXBits; ++i) {
      for (auto j = 0; j < 8; ++j) {
        ps[i][j] = vdup_n_u8(0);
      }
    }
    for (auto j = 0; j < 8; ++j) {
      const auto x0 = vld1q_f32(&Xdata[qc * 8 + j * 8 + 0]);
      const auto x1 = vld1q_f32(&Xdata[qc * 8 + j * 8 + 4]);
      auto join = [](uint32x4_t a, uint32x4_t b) -> uint8x8_t {
        return vmovn_u16(vcombine_u16(vmovn_u32(a), vmovn_u32(b)));
      };
      const auto x_geq_offset_plus_2_inter_center_distance =
          join(vcgeq_s32(vreinterpretq_s32_f32(x0),
                         vreinterpretq_s32_f32(offset_plus_2_inter_center_distance)),
               vcgeq_s32(vreinterpretq_s32_f32(x1),
                         vreinterpretq_s32_f32(offset_plus_2_inter_center_distance)));
      const auto x_ge_offset =
          join(vcgeq_s32(vreinterpretq_s32_f32(x0), vreinterpretq_s32_f32(offset_)),
               vcgeq_s32(vreinterpretq_s32_f32(x1), vreinterpretq_s32_f32(offset_)));
      const auto x_lt_offset_plus_inter_center_distance =
          join(vcltq_s32(vreinterpretq_s32_f32(x0),
                         vreinterpretq_s32_f32(offset_plus_inter_center_distance)),
               vcltq_s32(vreinterpretq_s32_f32(x1),
                         vreinterpretq_s32_f32(offset_plus_inter_center_distance)));

      const auto p1_mask = vmvn_u8(x_lt_offset_plus_inter_center_distance);
      const auto p0_mask = vorr_u8(vand_u8(x_ge_offset, x_lt_offset_plus_inter_center_distance),
                                   x_geq_offset_plus_2_inter_center_distance);
      ps[0][j] = vand_u8(shifts, p0_mask);
      ps[1][j] = vand_u8(shifts, p1_mask);
    }
    for (auto i = 0; i < 2; ++i) {
      const auto p01 = vpadd_u8(ps[i][0], ps[i][1]);
      const auto p23 = vpadd_u8(ps[i][2], ps[i][3]);
      const auto p45 = vpadd_u8(ps[i][4], ps[i][5]);
      const auto p67 = vpadd_u8(ps[i][6], ps[i][7]);
      const auto p0123 = vpadd_u8(p01, p23);
      const auto p4567 = vpadd_u8(p45, p67);
      vst1_u8(XQdata[i] + qc, vpadd_u8(p0123, p4567));
    }
  }
}
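
// Quantizes a row-major float tensor X (innermost dimension C) into
// k2b1bXBits bit-plane tensors XQ, each with innermost dimension
// divRoundUp(C, 8) bytes. The bulk of the channels (in groups of 64) goes
// through the vectorized quantize2bNeon; any ragged tail is handled with
// scalar code.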
void uniformQuantize2b1bNeon(QConvState* state,
                             const TensorCPU& X,
                             const std::vector<std::unique_ptr<TensorCPU>>& XQ,
                             float offset,
                             float inter_center_distance) {
  CAFFE_ENFORCE_GT(X.ndim(), 1);
  const size_t C = X.dim32(X.ndim() - 1);
  const size_t N = X.size() / C;
  const size_t QC = divRoundUp(C, 8);
  auto XQs = X.sizes().vec();
  XQs[X.ndim() - 1] = QC;
  CAFFE_ENFORCE_EQ(XQ.size(), k2b1bXBits);
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(XQs);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (size_t i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  CAFFE_ENFORCE_GT(offset, 0);
  CAFFE_ENFORCE_GT(inter_center_distance, 0);
  size_t QCUnroll = ((C / 8) / 8) * 8;
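  // Size the per-worker block to roughly one L1 cache: a block of B rows
  // reads B * C * sizeof(float) = 4 * B * C bytes and writes
  // k2b1bXBits * B * divRoundUp(C, 8) ~= B * C / 4 bytes, so
  // 17 * B * C / 4 ~= kL1CacheSizeBytes gives B ~= 4 * kL1CacheSizeBytes / (17 * C).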
  const size_t rowsPerBlock =
      std::max<size_t>(std::floor<size_t>(double(4 * kL1CacheSizeBytes) / double(17 * C)), 1);
  state->parallelFor(divRoundUp(N, rowsPerBlock), [&](size_t nb) {
    for (size_t n = nb * rowsPerBlock; n < std::min<size_t>(nb * rowsPerBlock + rowsPerBlock, N);
         ++n) {
      std::array<uint8_t*, k2b1bXBits> XQoff = {{
          XQdata[0] + 0 + QC * n, XQdata[1] + 0 + QC * n,
      }};
      quantize2bNeon(QCUnroll, &Xdata[0 + C * n], offset, inter_center_distance, XQoff);
      for (size_t qc = QCUnroll; qc < QC; ++qc) {
        // Scalar handling of the trailing channels that don't fill a full
        // 64-element vector block.
        std::array<uint8_t, k2b1bXBits> p = {{0, 0}};
        for (size_t b = 0; b < 8; ++b) {
          const size_t c = qc * 8 + b;
          if (c < C) {
            float v = Xdata[c + C * n];
            if (v < offset) {
              // Both bits stay zero.
            } else if (v < offset + inter_center_distance) {
              p[0] |= 1 << b;
            } else if (v < offset + 2 * inter_center_distance) {
              p[1] |= 1 << b;
            } else {
              p[0] |= 1 << b;
              p[1] |= 1 << b;
            }
          }
        }
        for (auto i = 0; i < k2b1bXBits; ++i) {
          XQdata[i][qc + QC * n] = p[i];
        }
      }
    }
  });
}
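
// Same quantization as uniformQuantize2b1bNeon, but writes each bit-plane
// directly in the tiled layout
// [numTiles][numTilesDepth][TileSize][TileDepthBytes] that the packed GEMM
// below consumes, zero-padding partial tiles.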
template <size_t TileSize, size_t TileDepthBytes>
void uniformQuantize2b1bNeonPacked(QConvState* state,
                                   const TensorCPU& X,
                                   const std::vector<std::unique_ptr<TensorCPU>>& XQ,
                                   float offset,
                                   float inter_center_distance) {
  const size_t M = X.size_to_dim(3);
  const size_t K = X.size() / M;
  const size_t QK = divRoundUp(K, 8);
  const size_t numTiles = divRoundUp(M, TileSize);
  const size_t numTilesDepth = divRoundUp(QK, TileDepthBytes);
  for (size_t i = 0; i < k2b1bXBits; ++i) {
    XQ[i]->Resize(numTiles, numTilesDepth, TileSize, TileDepthBytes);
  }
  const float* Xdata = X.data<float>();
  std::array<uint8_t*, k2b1bXBits> XQdata;
  for (auto i = 0; i < k2b1bXBits; ++i) {
    XQdata[i] = XQ[i]->mutable_data<uint8_t>();
  }
  CAFFE_ENFORCE_GT(offset, 0);
  CAFFE_ENFORCE_GT(inter_center_distance, 0);
  // Same L1 blocking heuristic as in uniformQuantize2b1bNeon, applied per
  // block of TileSize rows.
  const size_t tilesPerBlock = std::max<size_t>(
      std::floor<size_t>(double(4 * kL1CacheSizeBytes) / double(17 * K * TileSize)), 1);
  state->parallelFor(divRoundUp(numTiles, tilesPerBlock), [&](size_t nb) {
    for (size_t i = nb * tilesPerBlock;
         i < std::min<size_t>(nb * tilesPerBlock + tilesPerBlock, numTiles);
         ++i) {
      for (size_t j = 0; j < numTilesDepth; ++j) {
        if (i != numTiles - 1 && j != numTilesDepth - 1) {
          // Full interior tile: quantize each row of the tile straight into
          // the packed layout.
          for (auto ii = 0; ii < TileSize; ++ii) {
            size_t m = i * TileSize + ii;
            size_t k = j * TileDepthBytes * 8;
            std::array<uint8_t*, k2b1bXBits> XQoff = {
                {XQdata[0] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i,
                 XQdata[1] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i}};
            quantize2bNeon(TileDepthBytes, &Xdata[m * K + k], offset, inter_center_distance, XQoff);
          }
        } else {
          for (size_t ii = 0; ii < TileSize; ++ii) {
            size_t m = i * TileSize + ii;
            size_t k = j * TileDepthBytes * 8;
            std::array<uint8_t*, k2b1bXBits> XQoff = {
                {XQdata[0] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i,
                 XQdata[1] + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i}};
            if (m < M && k + TileDepthBytes * 8 <= K) {
              // The whole stripe is in range; quantize it directly.
              quantize2bNeon(
                  TileDepthBytes, &Xdata[m * K + k], offset, inter_center_distance, XQoff);
            } else {
              // Boundary stripe: copy the valid floats into a zero-padded
              // buffer so quantize2bNeon can read a full TileDepthBytes * 8
              // element span.
              std::array<float, 8 * TileDepthBytes> Xpad = {{0}};
              if (m < M) {
                std::copy(&Xdata[m * K + k], &Xdata[m * K + K], Xpad.begin());
              }
              quantize2bNeon(TileDepthBytes, Xpad.data(), offset, inter_center_distance, XQoff);
            }
          }
        }
      }
    }
  });
}
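
// Packs a uint8 matrix of logical shape [M][QK] (M being the product of the
// first `axis` dimensions of X) into the tiled layout
// [numTiles][numTilesDepth][TileSize][TileDepthBytes], zero-filling
// out-of-range entries, so the GEMM kernel can stream contiguous
// TileDepthBytes stripes.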
template <size_t TileSize, size_t TileDepthBytes>
void qpack_tiles(QConvState* state, const TensorCPU& X, size_t axis, TensorCPU* XP) {
  const size_t M = X.size_to_dim(axis);
  const size_t QK = X.size() / M;
  const size_t numTiles = divRoundUp(M, TileSize);
  const size_t numTilesDepth = divRoundUp(QK, TileDepthBytes);
  XP->Resize(numTiles, numTilesDepth, TileSize, TileDepthBytes);
  const auto* __restrict__ Xdata = X.data<uint8_t>();
  auto* __restrict__ XPdata = XP->mutable_data<uint8_t>();
  // Block tiles so each worker moves roughly one L1 cache worth of data:
  // a block of B tiles reads and writes about 2 * B * TileSize * QK bytes.
  const size_t tilesPerBlock = std::max<size_t>(
      std::floor<size_t>(double(kL1CacheSizeBytes) / double(2 * TileSize * QK)), 1);
  state->parallelFor(divRoundUp(numTiles, tilesPerBlock), [&](size_t nb) {
    for (size_t i = nb * tilesPerBlock;
         i < std::min<size_t>(nb * tilesPerBlock + tilesPerBlock, numTiles);
         ++i) {
      for (size_t j = 0; j < numTilesDepth; ++j) {
        if (i != numTiles - 1 && j != numTilesDepth - 1) {
          // Full interior tile: copy each TileDepthBytes stripe with memcpy.
          for (auto ii = 0; ii < TileSize; ++ii) {
            auto m = i * TileSize + ii;
            auto qk = j * TileDepthBytes;
            std::memcpy(&XPdata[TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                                TileSize * TileDepthBytes * numTilesDepth * i],
                        &Xdata[m * QK + qk],
                        TileDepthBytes);
          }
        } else {
          for (size_t ii = 0; ii < TileSize; ++ii) {
            for (size_t jj = 0; jj < TileDepthBytes; ++jj) {
              size_t m = i * TileSize + ii;
              size_t qk = j * TileDepthBytes + jj;
              uint8_t pval = 0;
              if (m < M && qk < QK) {
                // In range: take the value from X; out-of-range bytes stay zero.
                pval = Xdata[m * QK + qk];
              }
              XPdata[jj + TileDepthBytes * ii + TileDepthBytes * TileSize * j +
                     TileSize * TileDepthBytes * numTilesDepth * i] = pval;
            }
          }
        }
      }
    }
  });
}
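
// Inner kernel of the binary GEMM: computes a kUnrollM x kUnrollN block of
// C = A * B^T on packed bit tiles, where each element is the +/-1 dot product
// of two bit rows evaluated as XOR + popcount and accumulated in 16-bit
// lanes. The functor F post-processes each group of four float outputs; it
// receives the output pointer, the converted values, and the absolute output
// column index.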
template <size_t kUnrollM, size_t kUnrollN, size_t TileDepthBytes, typename F>
void qgess_packed(const uint8_t* __restrict__ Ablock,
                  const uint8_t* __restrict__ Bblock,
                  float* __restrict__ Cblock,
                  const size_t Cstride,
                  const size_t QK,
                  const size_t Nstart,
                  F&& f) {
  static_assert(kUnrollN % 8 == 0, "kUnrollN must be a multiple of 8");
  static_assert(TileDepthBytes == 16, "kernel is specialized for 16-byte tile depth");
  DCHECK_EQ(QK % 16, 0);
  uint16x8_t acc[kUnrollM][kUnrollN / 8];
  for (size_t mm = 0; mm < kUnrollM; ++mm) {
    for (size_t nn = 0; nn < kUnrollN / 8; ++nn) {
      acc[mm][nn] = vdupq_n_u16(0);
    }
  }
  size_t qk = 0;
  const size_t QK16Unroll = (QK / 16) * 16;
  for (; qk < QK16Unroll; qk += 16) {
    uint8x16_t Areg[kUnrollM];
    for (size_t mm = 0; mm < kUnrollM; ++mm) {
      Areg[mm] = vld1q_u8(Ablock);
      Ablock += 16;
    }
    for (size_t nn = 0; nn < kUnrollN / 8; ++nn) {
      uint8x16_t Breg[8];
      for (size_t nnn = 0; nnn < 8; ++nnn) {
        Breg[nnn] = vld1q_u8(Bblock);
        Bblock += 16;
      }
      for (size_t mm = 0; mm < kUnrollM; ++mm) {
        // XOR the A row against the eight B rows and popcount each byte.
        uint8x16_t cnts[8];
        for (size_t nnn = 0; nnn < 8; ++nnn) {
          cnts[nnn] = vcntq_u8(veorq_u8(Breg[nnn], Areg[mm]));
        }
        // Pairwise-add tree reducing the eight 16-byte popcount vectors so
        // that vpadalq_u8 accumulates one 16-bit popcount per B row.
        uint8x8_t ps[8];
        for (size_t nnn = 0; nnn < 8; ++nnn) {
          ps[nnn] = vadd_u8(vget_low_u8(cnts[nnn]), vget_high_u8(cnts[nnn]));
        }
        uint8x8_t pss[4];
        for (size_t nnn = 0; nnn < 4; ++nnn) {
          pss[nnn] = vpadd_u8(ps[2 * nnn], ps[2 * nnn + 1]);
        }
        uint8x8_t psss[2];
        for (size_t nnn = 0; nnn < 2; ++nnn) {
          psss[nnn] = vpadd_u8(pss[2 * nnn], pss[2 * nnn + 1]);
        }
        uint8x16_t out = vcombine_u8(psss[0], psss[1]);
        acc[mm][nn] = vpadalq_u8(acc[mm][nn], out);
      }
    }
  }
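
  // Epilogue: each accumulator lane now holds popcount(a XOR b) over QK bytes
  // (QK * 8 bit positions). For +/-1-encoded bits the dot product is
  // (#matches - #mismatches) = QK * 8 - 2 * popcount, which the vmlsl_s16
  // below computes before converting to float and handing off to the functor.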
  for (size_t mm = 0; mm < kUnrollM; ++mm) {
    auto* Crow = Cblock + mm * Cstride;
    for (size_t nn = 0; nn < kUnrollN / 8; ++nn) {
      const int32x4_t K_ = vdupq_n_s32(QK * 8);
      const int16x4_t two = vdup_n_s16(2);
      const int16x4_t acc0123_l = vreinterpret_s16_u16(vget_low_u16(acc[mm][nn]));
      const int16x4_t acc0123_h = vreinterpret_s16_u16(vget_high_u16(acc[mm][nn]));
      const int32x4_t K_minus_2_acc0123_l = vmlsl_s16(K_, two, acc0123_l);
      const int32x4_t K_minus_2_acc0123_h = vmlsl_s16(K_, two, acc0123_h);
      f(Crow + nn * 8 + 0, vcvtq_f32_s32(K_minus_2_acc0123_l), Nstart + nn * 8 + 0);
      f(Crow + nn * 8 + 4, vcvtq_f32_s32(K_minus_2_acc0123_h), Nstart + nn * 8 + 4);
    }
  }
}
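
// Drives qgess_packed over two packed bit tensors, computing C = A * B^T with
// A of logical shape M x K bits and B of shape N x K bits, both stored as
// [numTiles][QK / TileDepthBytes][TileSize][TileDepthBytes]. Work is split
// into square blocks of tiles and distributed via state->parallelFor.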
template <size_t TileSize, size_t TileDepthBytes, typename F>
inline void qgemm_nt_packed(
    QConvState* state, const TensorCPU& A, const TensorCPU& B, TensorCPU* C, F&& f = F()) {
  CAFFE_ENFORCE_EQ(A.ndim(), 4);
  CAFFE_ENFORCE_EQ(B.ndim(), 4);
  CAFFE_ENFORCE_EQ(A.dim(2), TileSize);
  CAFFE_ENFORCE_EQ(B.dim(2), TileSize);
  CAFFE_ENFORCE_EQ(A.dim(3), TileDepthBytes);
  CAFFE_ENFORCE_EQ(B.dim(3), TileDepthBytes);
  const size_t MT = A.dim(0);
  const size_t NT = B.dim(0);
  const size_t M = MT * TileSize;
  const size_t N = NT * TileSize;
  const size_t QKT = A.dim(1);
  const size_t K = QKT * 8 * TileDepthBytes;
  const size_t QK = K / 8;
  CAFFE_ENFORCE_EQ(A.dim(1), B.dim(1));
  C->Resize(M, N);
  const auto* Adata = A.data<uint8_t>();
  const auto* Bdata = B.data<uint8_t>();
  auto* Cdata = C->mutable_data<float>();
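
  // Choose the number of tiles per block T so that one block's working set
  // stays near the L1 size: the A and B panels take
  // 2 * T * TileSize * QK = T * TileSize * K / 4 bytes and the C block takes
  // 4 * (T * TileSize)^2 bytes; solving
  // 4 * (T * TileSize)^2 + T * TileSize * K / 4 = kL1CacheSizeBytes
  // for T gives the closed form below.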
  size_t tilesPerBlock =
      std::floor((std::sqrt(256 * kL1CacheSizeBytes + K * K) - K) / (32 * TileSize));
  if (tilesPerBlock < 1) {
    tilesPerBlock = 1;
  }
  // The 16-bit accumulators in qgess_packed require K to fit in 16 bits.
  CAFFE_ENFORCE_LT(K, std::pow(2, 16));
  CAFFE_ENFORCE_EQ(M % TileSize, 0);
  CAFFE_ENFORCE_EQ(N % TileSize, 0);
  const size_t MNumTiles = M / TileSize;
  const size_t NNumTiles = N / TileSize;
  const size_t MNumBlocks = divRoundUp(MNumTiles, tilesPerBlock);
  const size_t NNumBlocks = divRoundUp(NNumTiles, tilesPerBlock);
  state->parallelFor(MNumBlocks * NNumBlocks, [&](size_t mn) {
    const size_t mBlockIdx = mn / NNumBlocks;
    const size_t nBlockIdx = mn % NNumBlocks;
    const size_t mTileStart = mBlockIdx * tilesPerBlock;
    const size_t nTileStart = nBlockIdx * tilesPerBlock;
    for (size_t mBlockTileIdx = 0;
         mBlockTileIdx < tilesPerBlock && mBlockTileIdx + mTileStart < MNumTiles;
         ++mBlockTileIdx) {
      const size_t mTileIdx = mBlockTileIdx + mTileStart;
      for (size_t nBlockTileIdx = 0;
           nBlockTileIdx < tilesPerBlock && nBlockTileIdx + nTileStart < NNumTiles;
           ++nBlockTileIdx) {
        const size_t nTileIdx = nBlockTileIdx + nTileStart;
        // Each tile occupies a contiguous QK * TileSize byte stripe in the
        // packed A and B layouts.
        const auto* Ablock = &Adata[mTileIdx * QK * TileSize];
        const auto* Bblock = &Bdata[nTileIdx * QK * TileSize];
        auto* Cblock = &Cdata[mTileIdx * TileSize * N + nTileIdx * TileSize];
        const size_t Cstride = N;
        qgess_packed<TileSize, TileSize, TileDepthBytes, F>(
            Ablock, Bblock, Cblock, Cstride, QK, nTileIdx * TileSize, std::forward<F>(f));
      }
    }
  });
}
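
// Runs the 2b1b convolution as im2col + packed bit GEMM: quantize X into two
// bit-planes, lower each plane to column form (or use it directly for
// 1x1/stride-1 convolutions), pack it into tiles, multiply against the packed
// weights WQPacked, and combine the per-plane GEMM results using the
// precomputed normalization term WQN.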
void run2b1bConvIm2ColGEMM(QConvState* state,
                           const ConvArgs& args,
                           const TensorCPU& X,
                           TensorCPU* Y) {
  const size_t KH = state->WQ->dim32(1);
  const size_t KW = state->WQ->dim32(2);
  const size_t OH = (X.dim32(1) - KH + args.pad_t + args.pad_b) / args.stride_h + 1;
  const size_t OW = (X.dim32(2) - KW + args.pad_l + args.pad_r) / args.stride_w + 1;
  const size_t OC = state->WQ->dim32(0);
  const size_t QK = KH * KW * divRoundUp(X.dim32(3), 8);
  Y->Resize(X.dim32(0), OH, OW, OC);
  if (!state->WQPacked) {
    state->WQPacked = caffe2::make_unique<Tensor>(CPU);
    qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(state, *(state->WQ), 1, state->WQPacked.get());
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(0), divRoundUp(OC, kGEMMTileSize));
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(1), divRoundUp(QK, kGEMMTileDepthBytes));
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(2), kGEMMTileSize);
    CAFFE_ENFORCE_EQ(state->WQPacked->dim32(3), kGEMMTileDepthBytes);
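
    // The GEMM epilogue computes 3/2 * WQN + 1/2 * acc per output channel, so
    // adding 2/3 * bias into WQN here is equivalent to adding the bias to the
    // convolution output; the bias rides along with the packed-weight
    // normalization term.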
    if (state->bias) {
      for (size_t i = 0; i < state->bias->size(); ++i) {
        state->WQN->mutable_data<float>()[i] += 2.0f / 3 * state->bias->data<float>()[i];
      }
    }

    // The packed depth is rounded up to a multiple of kGEMMTileDepthBytes, and
    // every padded zero byte adds 8 spurious bit matches to the XOR/popcount
    // GEMM; compensate for them in the normalization term.
    const size_t QKPadding = divRoundUp(QK, kGEMMTileDepthBytes) * kGEMMTileDepthBytes - QK;
    if (QKPadding != 0) {
      for (size_t i = 0; i < state->WQN->size(); ++i) {
        state->WQN->mutable_data<float>()[i] -= QKPadding * 8;
      }
    }
  }
  CAFFE_ENFORCE(!state->bias.get());

  const bool is_1x1 = KH == 1 && KW == 1 && args.pad_l == 0 && args.pad_r == 0 && args.pad_b == 0 &&
      args.pad_t == 0 && args.stride_h == 1 && args.stride_w == 1;
  if (is_1x1) {
    CAFFE_ENFORCE_EQ(OH, X.dim32(1));
    CAFFE_ENFORCE_EQ(OW, X.dim32(2));
    uniformQuantize2b1bNeonPacked<kGEMMTileSize, kGEMMTileDepthBytes>(
        state, X, state->XQs, 0.5, 1.0);
  } else {
    uniformQuantize2b1bNeon(state, X, state->XQs, 0.5, 1.0);
  }
  TensorCPU* YQ0 = state->YQs[0].get();
  // When the output-channel count is already a multiple of the tile size, the
  // GEMM result can be written straight into Y; otherwise it goes into the
  // scratch buffer YQ0 and the valid channels are copied into Y at the end.
  if (state->WQ->dim32(0) % kGEMMTileSize == 0) {
    YQ0 = Y;
  }
  for (size_t i = 0; i < k2b1bXBits; ++i) {
    const auto& XQ = *(state->XQs[i]);
    if (!is_1x1) {
      qim2col(args, XQ, *(state->WQ), state->scratchColBuffer.get());
      qpack_tiles<kGEMMTileSize, kGEMMTileDepthBytes>(
          state, *(state->scratchColBuffer), 3, state->scratch.get());
    }
    const auto* __restrict__ WQNdata = state->WQN->data<float>();
    if (i == 0) {
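      // Epilogue for the pass that initializes the output: `value` arrives as
      // the signed +/-1 dot product (QK * 8 - 2 * popcount) and is stored as
      // 3/2 * WQN[channel] + 1/2 * value; the second qgemm_nt_packed call
      // below then accumulates the other bit-plane on top of it with twice
      // the weight.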
      qgemm_nt_packed<kGEMMTileSize, kGEMMTileDepthBytes>(
          state,
          is_1x1 ? XQ : *(state->scratch),
          *(state->WQPacked),
          YQ0,
          [WQNdata](float* __restrict__ acc, float32x4_t value, size_t channel) {
            const float32x4_t _32 = vdupq_n_f32(3.0f / 2);
            const float32x4_t _12 = vdupq_n_f32(1.0f / 2);
            const float32x4_t WQNc_32 = vmulq_f32(_32, vld1q_f32(WQNdata + channel));
            const float32x4_t WQNc_32_value_12 = vmlaq_f32(WQNc_32, _12, value);
            vst1q_f32(acc, WQNc_32_value_12);
          });
    } else {
      qgemm_nt_packed<kGEMMTileSize, kGEMMTileDepthBytes>(
          state,
          is_1x1 ? XQ : *(state->scratch),
          *(state->WQPacked),
          YQ0,
          [](float* __restrict__ acc, float32x4_t value, size_t channel) {
            const float32x4_t curr = vld1q_f32(acc);
            vst1q_f32(acc, vaddq_f32(curr, value));
          });
    }
  }
  if (state->WQ->dim32(0) % kGEMMTileSize != 0) {
    // The GEMM wrote into the padded scratch buffer YQ0; copy the valid OC
    // channels of each output row into the NHWC-shaped Y.
    const size_t F = state->WQ->dim(0);
    const size_t N = Y->size() / F;
    const size_t NP = YQ0->dim32(0);
    const size_t FP = YQ0->dim32(1);
    math::CopyMatrix<CPUContext>(
        sizeof(float), N, F, YQ0->data<float>(), FP, Y->mutable_data<float>(), F, nullptr);
  } else {
    // The GEMM wrote directly into Y with its rows padded up to the tile
    // size; trim the padding and restore the NHWC shape.
    CAFFE_ENFORCE_EQ(Y->dim32(0), divRoundUp(X.dim32(0) * OH * OW, kGEMMTileSize) * kGEMMTileSize);
    CAFFE_ENFORCE_EQ(Y->dim32(1), OC);
    Y->ShrinkTo(X.dim32(0) * OH * OW);
    Y->Reshape(std::vector<int64_t>{{int64_t(X.dim(0)), int64_t(OH), int64_t(OW), int64_t(OC)}});
  }
}

bool run2b1bConvNeon(QConvState* state, const ConvArgs& args, const TensorCPU& X, TensorCPU* Y) {
  CAFFE_ENFORCE_EQ(X.ndim(), 4);
  run2b1bConvIm2ColGEMM(state, args, X, Y);
  return true;
}