#ifndef CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_

#include "caffe2/core/common.h"
#include "caffe2/core/logging.h"
#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_transpose_op_mobile.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/fixed_divisor.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/math/utils.h"

C10_DECLARE_bool(caffe2_force_shared_col_buffer);
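// runTileContiguous computes one tile (one input row) of the transposed
// convolution: a GEMM of the filter against that row, followed by a
// col2im-style scatter into a de-interleaved Y buffer in which output columns
// are grouped by their offset modulo strideW.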
template <typename T, typename Context>
void runTileContiguous(
    /* tile index, shape, stride/pad, and pointer arguments elided */) {
  // Each tile covers exactly one input row.
  int tileSize = W;

  auto kernelDataSize = C * kernelH * kernelW;
  auto currentTileStart = tileSize * tileId;

  // GEMM for this tile: multiply the filter against the input row starting at
  // currentTileStart, writing the result into the column buffer.
  math::GemmEx<T, Context>(
      /* transpose flags, dimensions, and filter pointer elided */
      Xdata + currentTileStart,
      /* remaining arguments elided */);
  // col2im for this tile. Columns are written de-interleaved: every output
  // column with the same (column mod strideW) goes into the same block, and
  // the strideW blocks for a row are packed next to each other.
  int colBlockSize = (W + kernelW / strideW);
  int numColBlocks = strideW;

  for (int c = 0; c < kernelDataSize; ++c) {
    int w_offset = c % kernelW;
    int h_offset = (c / kernelW) % kernelH;
    int c_im = c / kernelH / kernelW;

    // Output row this kernel element contributes to.
    int rowY = tileId * strideH - padT + h_offset;

    // Skip rows that fall outside the output.
    if (!math::utils::IsAGeZeroAndALtB(rowY, outputH)) {
      continue;
    }

    // Left padding is not handled here.
    constexpr int kPadL = 0;
    int colOffsetStart = -kPadL + w_offset;
    int colBlockY = colOffsetStart % strideW;
    int colWithinBlockOffsetY = colOffsetStart / strideW;

    // Where we begin writing in Y: each row holds strideW blocks of size
    // colBlockSize.
    int colY = colBlockY * colBlockSize + colWithinBlockOffsetY;
    int offsetY = rowY * colBlockSize * numColBlocks + colY;

    T* colBufferPointer = colBufferData + c * tileSize;
    T* yPointer =
        Ydata + c_im * outputH * (colBlockSize * numColBlocks) + offsetY;
    int b = 0;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    // Vectorized accumulation into Y, 16 floats (4 NEON registers) at a time.
    {
      constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 4;
      int limit = (tileSize / kUnroll) * kUnroll;

      for (; b < limit; b += kUnroll) {
        float32x4_t cb0 = vld1q_f32(colBufferPointer + 0);
        float32x4_t cb1 = vld1q_f32(colBufferPointer + 4);
        float32x4_t cb2 = vld1q_f32(colBufferPointer + 8);
        float32x4_t cb3 = vld1q_f32(colBufferPointer + 12);

        float32x4_t y0 = vld1q_f32(yPointer + 0);
        float32x4_t y1 = vld1q_f32(yPointer + 4);
        float32x4_t y2 = vld1q_f32(yPointer + 8);
        float32x4_t y3 = vld1q_f32(yPointer + 12);

        y0 = vaddq_f32(y0, cb0);
        y1 = vaddq_f32(y1, cb1);
        y2 = vaddq_f32(y2, cb2);
        y3 = vaddq_f32(y3, cb3);

        vst1q_f32(yPointer + 0, y0);
        vst1q_f32(yPointer + 4, y1);
        vst1q_f32(yPointer + 8, y2);
        vst1q_f32(yPointer + 12, y3);

        colBufferPointer += kUnroll;
        yPointer += kUnroll;
      }
    }
    // Remainder: 4 floats at a time.
    {
      constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
      int limit = (tileSize / kUnroll) * kUnroll;

      for (; b < limit; b += kUnroll) {
        float32x4_t cb0 = vld1q_f32(colBufferPointer);
        float32x4_t y0 = vld1q_f32(yPointer);

        y0 = vaddq_f32(y0, cb0);

        vst1q_f32(yPointer, y0);

        colBufferPointer += kUnroll;
        yPointer += kUnroll;
      }
    }
#endif

    // Scalar epilogue for whatever the vector loops did not cover.
    for (; b < tileSize; ++b) {
      *yPointer += *colBufferPointer;
      ++yPointer;
      ++colBufferPointer;
    }
  }
}
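// StoreInterleaved<float, N>::store writes N NEON registers (or N scalars in
// the non-NEON build) out in interleaved order; reinterleaveRows uses it to
// merge the strideW de-interleaved column blocks back into contiguous output.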
template <typename T, int N>
struct StoreInterleaved {};

template <>
struct StoreInterleaved<float, 1> {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  inline static void store(float* p, float32x4_t v[1]) {
    vst1q_f32(p, v[0]);
  }
#endif
  inline static void store(float* p, float v[1]) {
    p[0] = v[0];
  }
};

template <>
struct StoreInterleaved<float, 2> {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  inline static void store(float* p, float32x4_t v[2]) {
    float32x4x2_t x = {{v[0], v[1]}};
    vst2q_f32(p, x);
  }
#endif
  inline static void store(float* p, float v[2]) {
    p[0] = v[0];
    p[1] = v[1];
  }
};

template <>
struct StoreInterleaved<float, 3> {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  inline static void store(float* p, float32x4_t v[3]) {
    float32x4x3_t x = {{v[0], v[1], v[2]}};
    vst3q_f32(p, x);
  }
#endif
  inline static void store(float* p, float v[3]) {
    p[0] = v[0];
    p[1] = v[1];
    p[2] = v[2];
  }
};

template <>
struct StoreInterleaved<float, 4> {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  inline static void store(float* p, float32x4_t v[4]) {
    float32x4x4_t x = {{v[0], v[1], v[2], v[3]}};
    vst4q_f32(p, x);
  }
#endif
  inline static void store(float* p, float v[4]) {
    p[0] = v[0];
    p[1] = v[1];
    p[2] = v[2];
    p[3] = v[3];
  }
};
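// reinterleaveRows converts one (channel, row) slice of the de-interleaved
// intermediate buffer into the final NCHW output row, adding the per-channel
// bias; kStrideW is a compile-time stride so the interleaving can be unrolled.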
template <int kStrideW>
void reinterleaveRows(/* src, bias, c, h, dst, and shape arguments elided */) {
  // src rows are laid out as kStrideW consecutive blocks of colBlockSize
  // columns; dst is the final, fully interleaved output row.
  int colBlockSize = inputW + kernelW / kStrideW;
  int noAdjOutputW = (inputW - 1) * kStrideW + kernelW;

  int point = c * outputH + h;
  src += point * colBlockSize * kStrideW;
  dst += point * outputW;
  float b = bias ? bias[c] : 0;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  float32x4_t biasV = vdupq_n_f32(b);
#endif

  int w = 0;
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Interleave two float32x4 registers per stride offset per iteration.
  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 2;
  int limit = ((inputW - 1) / kUnroll) * kUnroll;

  for (; w < limit; w += kUnroll) {
    // Load one vector pair from each of the kStrideW column blocks.
    float32x4_t v0[kStrideW];
    float32x4_t v1[kStrideW];
    for (int i = 0; i < kStrideW; ++i) {
      v0[i] = vld1q_f32(src + i * colBlockSize);
      v1[i] = vld1q_f32(src + i * colBlockSize + 4);
    }

    // Add the per-channel bias.
    for (int i = 0; i < kStrideW; ++i) {
      v0[i] = vaddq_f32(v0[i], biasV);
      v1[i] = vaddq_f32(v1[i], biasV);
    }

    // Write the vectors back out in interleaved order.
    StoreInterleaved<float, kStrideW>::store(dst + 0 * kStrideW, v0);
    StoreInterleaved<float, kStrideW>::store(dst + 4 * kStrideW, v1);

    src += kUnroll;
    dst += kUnroll * kStrideW;
  }
#endif
  // Scalar epilogue.
  for (; w < inputW - 1; ++w) {
    float v[kStrideW];
    for (int i = 0; i < kStrideW; ++i) {
      v[i] = src[i * colBlockSize];
    }
    for (int i = 0; i < kStrideW; ++i) {
      v[i] += b;
    }
    StoreInterleaved<float, kStrideW>::store(dst, v);
    src += 1;
    dst += kStrideW;
  }
  // Drain the final input element round-robin across the kStrideW blocks.
  int outputPoint = (inputW - 1) * kStrideW;
  int block = 0;
  while (outputPoint < noAdjOutputW) {
    float v = src[block * colBlockSize];
    // ... (write v + b into dst, advance dst and outputPoint, bump block)
    if (block >= kStrideW) {
      block = 0;
      src += 1;
    }
  }
  // The remaining `adj` columns at the end of the row get only the bias.
  for (; outputPoint < outputW; ++outputPoint) {
    dst[0] = b;
    dst += 1;
  }
}
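// reinterleaveMultithreaded spreads the per-(channel, row) re-interleave work
// across the thread pool, using a FixedDivisor to split the flat tile index
// into its (c, h) coordinates.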
template <int N, typename T, typename Context>
void reinterleaveMultithreaded(
    /* buffer, bias, output, shape, and thread-pool arguments elided */) {
  size_t totalTiles = (size_t)outputC * outputH;
  FixedDivisor<int> divOutputH(outputH);

// REINTERLEAVE(N) expands to one reinterleaveRows<N>() call for the tile.
#define REINTERLEAVE(N) \
  reinterleaveRows<N>(  \
      /* arguments elided */)

  std::function<void(int, size_t)> fnReinterleave = [&](int threadId,
                                                        size_t tileId) {
    int c;
    int h;
    divOutputH.DivMod((int)tileId, &c, &h);
    // ... (invoke REINTERLEAVE for this (c, h) tile)
  };

  pool->run(fnReinterleave, totalTiles);
}
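// SumMultiple<N> accumulates N additional per-thread buffers into `acc`,
// four floats at a time with aligned NEON loads/stores plus a scalar tail.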
#if defined(__ARM_NEON__) || defined(__ARM_NEON)

template <int N>
struct SumMultiple {
  static void sumInto(float* acc, float** toSum, size_t size);
};

template <>
struct SumMultiple<1> {
  static void sumInto(float* acc, float** toSum, size_t size) {
    constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
    int limit = (size / kUnroll) * kUnroll;

    auto toSum0 = toSum[0];

    int i = 0;
    for (; i < limit; i += kUnroll) {
      float32x4_t v0 = vld1q_f32_aligned(acc + i);
      float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);

      v0 = vaddq_f32(v0, v1);

      vst1q_f32_aligned(acc + i, v0);
    }

    // Scalar tail.
    for (; i < size; ++i) {
      float v1 = toSum0[i];
      acc[i] += v1;
    }
  }
};
template <>
struct SumMultiple<2> {
  static void sumInto(float* acc, float** toSum, size_t size) {
    constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
    int limit = (size / kUnroll) * kUnroll;

    auto toSum0 = toSum[0];
    auto toSum1 = toSum[1];

    int i = 0;
    for (; i < limit; i += kUnroll) {
      float32x4_t v0 = vld1q_f32_aligned(acc + i);
      float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
      float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);

      v0 = vaddq_f32(v0, v1);
      v0 = vaddq_f32(v0, v2);

      vst1q_f32_aligned(acc + i, v0);
    }

    // Scalar tail.
    for (; i < size; ++i) {
      float v1 = toSum0[i];
      float v2 = toSum1[i];
      acc[i] += v1 + v2;
    }
  }
};
template <>
struct SumMultiple<3> {
  static void sumInto(float* acc, float** toSum, size_t size) {
    constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
    int limit = (size / kUnroll) * kUnroll;

    auto toSum0 = toSum[0];
    auto toSum1 = toSum[1];
    auto toSum2 = toSum[2];

    int i = 0;
    for (; i < limit; i += kUnroll) {
      float32x4_t v0 = vld1q_f32_aligned(acc + i);
      float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
      float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);
      float32x4_t v3 = vld1q_f32_aligned(toSum2 + i);

      v0 = vaddq_f32(v0, v1);
      v2 = vaddq_f32(v2, v3);
      v0 = vaddq_f32(v0, v2);

      vst1q_f32_aligned(acc + i, v0);
    }

    // Scalar tail.
    for (; i < size; ++i) {
      float v1 = toSum0[i];
      float v2 = toSum1[i];
      float v3 = toSum2[i];
      acc[i] += v1 + v2 + v3;
    }
  }
};

#endif
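// Reduces the per-thread partial outputs into `acc`: one to three extra
// buffers use the NEON specializations above; otherwise (or without NEON)
// the accumulation falls back to Eigen.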
void sumInto(float* acc, std::vector<float*>& toSum, size_t size) {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  if (toSum.size() == 1) {
    SumMultiple<1>::sumInto(acc, toSum.data(), size);
    return;
  } else if (toSum.size() == 2) {
    SumMultiple<2>::sumInto(acc, toSum.data(), size);
    return;
  } else if (toSum.size() == 3) {
    SumMultiple<3>::sumInto(acc, toSum.data(), size);
    return;
  }
#endif

  // Generic fallback: accumulate each buffer with Eigen.
  EigenVectorArrayMap<float> accT(acc, size);
  for (auto p : toSum) {
    accT += ConstEigenVectorArrayMap<float>(p, size);
  }
}
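// NCHW forward pass: every input row is processed as one tile (GEMM +
// col2im) into per-thread de-interleaved buffers, the buffers are reduced
// with sumInto, and the result is re-interleaved with the bias into Y.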
template <typename T, class Context>
bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const Tensor& X = Input(INPUT);
  auto& filter = Input(FILTER);
  const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
  CAFFE_ENFORCE(
      filter.dim32(0) == M,
      "filter number must be equal to input channel number");
  const int C = filter.dim32(1);
  CAFFE_ENFORCE(
      filter.dim32(2) == this->kernel_h(),
      "filter height must be equal to kernel height");
  CAFFE_ENFORCE(
      filter.dim32(3) == this->kernel_w(),
      "filter width must be equal to kernel width");
  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
    CAFFE_ENFORCE(
        bias.dim32(0) == C,
        "bias dimension must be equal to output channel number");
  }

  auto sizes = ConvTransposeUnpoolBase<Context>::GetOutputSize(X, C);
  Tensor* Y = Output(0, sizes, at::dtype<T>());

  const int outputH = Y->dim32(2);
  const int outputW = Y->dim32(3);
  const int outputPlaneSize = outputH * outputW;
  const int outputBatchElementSize = Y->dim32(1) * outputPlaneSize;

  auto Xdata = X.template data<T>();
  auto Ydata = Y->template mutable_data<T>();

  auto pool = ws_->GetThreadPool();
  auto numThreads = pool->getNumThreads();

  // Each thread gets its own de-interleaved Y buffer plus a column buffer for
  // the per-tile GEMM result.
  size_t colBlockSize = W + this->kernel_w() / this->stride_w();
  size_t threadYBufferSize = C * outputH * colBlockSize * this->stride_w();
  // Round up to a multiple of 4 floats so the NEON reductions can use aligned
  // loads and stores.
  size_t threadYBufferSizeAligned =
      ((C * outputH * colBlockSize * this->stride_w() + 3) / 4) * 4;
  size_t threadColBufferSize = C * this->kernel_h() * this->kernel_w() * W;

  // Run GEMM + col2im for one input-row tile into this thread's buffers.
  auto runLocalTile = [&](TensorCPU* threadBuffer,
                          int threadId,
                          int tileId) {
    auto localYData = threadBuffer->template mutable_data<T>() +
        threadId * threadYBufferSizeAligned;
    auto localColBufferData = threadBuffer->template mutable_data<T>() +
        numThreads * threadYBufferSizeAligned + threadId * threadColBufferSize;

    runTileContiguous<T, Context>(
        /* tile id, shape, stride and pad arguments elided */
        filter.template data<T>(),
        /* input pointer, localColBufferData, localYData, context elided */);
  };

  auto f = [&](Tensor* threadBuffer) {
    threadBuffer->Resize(
        numThreads * threadYBufferSizeAligned +
        numThreads * threadColBufferSize);

    // All but the first thread's Y buffer, to be reduced into the first one.
    std::vector<T*> toSum(numThreads - 1);
    for (int i = 1; i < numThreads; ++i) {
      toSum[i - 1] = threadBuffer->template mutable_data<T>() +
          i * threadYBufferSizeAligned;
    }

    for (auto image_id = 0; image_id < N; ++image_id) {
      // Zero all per-thread Y buffers.
      math::Set<T, Context>(
          numThreads * threadYBufferSizeAligned,
          T(0),
          threadBuffer->template mutable_data<T>(),
          &context_);

      // One tile per input row, spread over the thread pool.
      pool->run(
          [&](int threadId, int tileId) {
            runLocalTile(threadBuffer, threadId, tileId);
          },
          H);

      // Reduce the per-thread partial outputs into the first buffer.
      sumInto(
          threadBuffer->template mutable_data<T>(), toSum, threadYBufferSize);

// Re-interleave the reduced buffer (adding the bias) into the real output.
#define REINTERLEAVE(N)                                            \
  reinterleaveMultithreaded<N, T, Context>(                        \
      threadBuffer->template mutable_data<T>(),                    \
      InputSize() == 3 ? Input(BIAS).template data<T>() : nullptr, \
      /* remaining arguments elided */)

      if (this->stride_w() == 1) {
        REINTERLEAVE(1);
      } else if (this->stride_w() == 2) {
        REINTERLEAVE(2);
      } else if (this->stride_w() == 3) {
        REINTERLEAVE(3);
      } else if (this->stride_w() == 4) {
        REINTERLEAVE(4);
      }
#undef REINTERLEAVE

      // Advance to the next image in the batch.
      // ... (Xdata advanced by one input image)
      Ydata += Y->size() / Y->dim32(0);
    }
  };

  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
    runWithSharedBuffer<Context>(ws_, f);
  } else {
    // ... (run f on the op's own thread buffer)
  }

  return true;
}
template <typename T, class Context>
bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  CAFFE_THROW("Not implemented.");
}

#endif // CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_