conv_transpose_op_mobile_impl.h
// Copyright (c) 2016-present, Facebook, Inc.
// conv_transpose_op_mobile_impl.h is the templated implementation of the
// conv_transpose_op_mobile.h file.
#ifndef CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_
#define CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_

#include "caffe2/core/common.h"

#ifndef CAFFE2_MOBILE
#error "mobile build state not defined"
#endif

#if CAFFE2_MOBILE

#include "caffe2/core/logging.h"
#include "caffe2/operators/conv_op_shared.h"
#include "caffe2/operators/conv_transpose_op_mobile.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/fixed_divisor.h"
#include "caffe2/utils/math.h"

CAFFE2_DECLARE_bool(caffe2_force_shared_col_buffer);

namespace caffe2 {

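// runTileContiguous handles a single tile (one input row) of the transposed
// convolution: a GEMM expands that row into a column buffer, and a
// col2im-style pass accumulates the buffer into a per-thread output buffer
// stored in de-interleaved column blocks (see the layout notes below).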
template <typename T, typename Context>
void runTileContiguous(
    int tileId,
    int N,
    int M,
    int H,
    int W,
    int outputH,
    int outputW,
    int C,
    int kernelH,
    int kernelW,
    int strideH,
    int strideW,
    int padT,
    const T* filterData,
    const T* Xdata,
    T* colBufferData,
    T* Ydata,
    Context* context) {
  // The tile size is exactly the length of a single row
  int tileSize = W;

  auto kernelDataSize = C * kernelH * kernelW;
  auto currentTileStart = tileSize * tileId;

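  // The GEMM below computes colBuffer (C * kernelH * kernelW x W) =
  // filter^T * X_row: filter is (M x C * kernelH * kernelW) and is passed
  // transposed, while X_row gathers row `tileId` from each of the M input
  // channels (ldb == H * W is the stride between channels). beta == 0, so
  // the column buffer is overwritten rather than accumulated into.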
  // gemm tile
  math::GemmEx<T, Context>(
      CblasTrans,
      CblasNoTrans,
      kernelDataSize,
      tileSize,
      M,
      1,
      filterData,
      kernelDataSize,
      Xdata + currentTileStart,
      H * W,
      0,
      colBufferData,
      tileSize,
      context);

  // col2im tile
  // We assume that there is no padding in the columns (padL and padR
  // == 0).
  // FIXME: it is actually possible for us to handle padding, figure
  // out how to adjust the bounds

  // We write into Y in a de-interleaved fashion; in other words,
  // every column (mod strideW) == 0 together in one block,
  // every column (mod strideW) == 1 in another,
  // ... and so on.
  int colBlockSize = (W + kernelW / strideW);
  int numColBlocks = strideW;

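  // Layout example: with W == 3, kernelW == 2, strideW == 2 we get
  // colBlockSize == 4 and two blocks per output row. Block 0 holds output
  // columns {0, 2, 4} and block 1 holds {1, 3, 5}, so the row is stored as
  // [c0 c2 c4 __ | c1 c3 c5 __]; reinterleaveRows() later merges the blocks
  // back into the natural order [c0 c1 c2 c3 c4 c5].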
  for (int c = 0; c < kernelDataSize; ++c) {
    int w_offset = c % kernelW;
    int h_offset = (c / kernelW) % kernelH;
    int c_im = c / kernelH / kernelW;

    // Each row is a separate tile that we handle. First determine the
    // row into which we are writing the output.
    // We can properly handle padding for the rows.
    int rowY = tileId * strideH - padT + h_offset;

    // If this row is out of bounds, then skip it
    if (!math::is_a_ge_zero_and_a_lt_b(rowY, outputH)) {
      continue;
    }

    // FIXME: we don't actually handle a dynamic padL > 0
    constexpr int kPadL = 0;
    int colOffsetStart = -kPadL + w_offset;
    int colBlockY = colOffsetStart % strideW;

    // However, within a block we may not start writing at offset
    // 0. The offset at which we begin writing is determined by
    // colOffsetStart
    int colWithinBlockOffsetY = colOffsetStart / strideW;

    // So, this is where we begin reading/writing in Y
    int colY = colBlockY * colBlockSize + colWithinBlockOffsetY;

    // This is the complete offset into Y from the start
    // Each row has strideW blocks of size colBlockSize
    int offsetY = rowY * colBlockSize * numColBlocks + colY;

    T* colBufferPointer = colBufferData + c * tileSize;
    T* yPointer =
        Ydata + c_im * outputH * (colBlockSize * numColBlocks) + offsetY;

    int b = 0;
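    // On ARM NEON, accumulate the column buffer into Y sixteen floats at a
    // time, then four at a time; the scalar epilogue below picks up any
    // remainder (and the whole tile when NEON is unavailable).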
#ifdef __ARM_NEON__
    // We vectorize the loop within the row
    {
      constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 4;
      int limit = (tileSize / kUnroll) * kUnroll;

      for (; b < limit; b += kUnroll) {
        float32x4_t cb0 = vld1q_f32(colBufferPointer + 0);
        float32x4_t cb1 = vld1q_f32(colBufferPointer + 4);
        float32x4_t cb2 = vld1q_f32(colBufferPointer + 8);
        float32x4_t cb3 = vld1q_f32(colBufferPointer + 12);

        float32x4_t y0 = vld1q_f32(yPointer + 0);
        float32x4_t y1 = vld1q_f32(yPointer + 4);
        float32x4_t y2 = vld1q_f32(yPointer + 8);
        float32x4_t y3 = vld1q_f32(yPointer + 12);

        y0 = vaddq_f32(y0, cb0);
        y1 = vaddq_f32(y1, cb1);
        y2 = vaddq_f32(y2, cb2);
        y3 = vaddq_f32(y3, cb3);

        vst1q_f32(yPointer + 0, y0);
        vst1q_f32(yPointer + 4, y1);
        vst1q_f32(yPointer + 8, y2);
        vst1q_f32(yPointer + 12, y3);

        colBufferPointer += kUnroll;
        yPointer += kUnroll;
      }
    }

    {
      constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
      int limit = (tileSize / kUnroll) * kUnroll;

      for (; b < limit; b += kUnroll) {
        float32x4_t cb0 = vld1q_f32(colBufferPointer);
        float32x4_t y0 = vld1q_f32(yPointer);

        y0 = vaddq_f32(y0, cb0);

        vst1q_f32(yPointer, y0);

        colBufferPointer += kUnroll;
        yPointer += kUnroll;
      }
    }
#endif

    // Handle un-vectorizable epilogue
    for (; b < tileSize; ++b) {
      *yPointer += *colBufferPointer;
      ++yPointer;
      ++colBufferPointer;
    }
  }
}

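// StoreInterleaved<float, N> writes N values to consecutive, interleaved
// positions in memory. On ARM the vector overloads map directly to
// vst1q/vst2q/vst3q/vst4q_f32; the scalar overloads are the portable
// fallback used by the epilogue loops.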
template <typename T, int N>
struct StoreInterleaved {};

template <>
struct StoreInterleaved<float, 1> {
#ifdef __ARM_NEON__
  inline static void store(float* p, float32x4_t v[1]) {
    vst1q_f32(p, v[0]);
  }
#endif

  inline static void store(float* p, float v[1]) {
    p[0] = v[0];
  }
};

template <>
struct StoreInterleaved<float, 2> {
#ifdef __ARM_NEON__
  inline static void store(float* p, float32x4_t v[2]) {
    float32x4x2_t x = {{v[0], v[1]}};
    vst2q_f32(p, x);
  }
#endif

  inline static void store(float* p, float v[2]) {
    p[0] = v[0];
    p[1] = v[1];
  }
};

template <>
struct StoreInterleaved<float, 3> {
#ifdef __ARM_NEON__
  inline static void store(float* p, float32x4_t v[3]) {
    float32x4x3_t x = {{v[0], v[1], v[2]}};
    vst3q_f32(p, x);
  }
#endif

  inline static void store(float* p, float v[3]) {
    p[0] = v[0];
    p[1] = v[1];
    p[2] = v[2];
  }
};

template <>
struct StoreInterleaved<float, 4> {
#ifdef __ARM_NEON__
  inline static void store(float* p, float32x4_t v[4]) {
    float32x4x4_t x = {{v[0], v[1], v[2], v[3]}};
    vst4q_f32(p, x);
  }
#endif

  inline static void store(float* p, float v[4]) {
    p[0] = v[0];
    p[1] = v[1];
    p[2] = v[2];
    p[3] = v[3];
  }
};

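// reinterleaveRows merges the kStrideW de-interleaved column blocks of one
// (channel, row) pair back into natural column order in dst, adding the
// per-channel bias along the way. kStrideW is a template parameter so that
// StoreInterleaved can pick the matching vst{1,2,3,4}q_f32 instruction.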
template <int kStrideW>
void reinterleaveRows(
    const float* src,
    const float* bias,
    int c,
    int h,
    float* dst,
    int outputC,
    int outputH,
    int outputW,
    int inputW,
    int kernelW,
    int strideW,
    int adjH) {
  // Each row in src is of the form:
  // [w mod strideW == 0 elements]...[w mod strideW == strideW - 1
  // elements]
  // We need to re-interleave the values and write them in the output
  int colBlockSize = inputW + kernelW / kStrideW;
  int noAdjOutputW = (inputW - 1) * kStrideW + kernelW;

  int point = c * outputH + h;
  src += point * colBlockSize * kStrideW;
  dst += point * outputW;

  float b = bias ? bias[c] : 0;
#ifdef __ARM_NEON__
  float32x4_t biasV = vdupq_n_f32(b);
#endif

  int w = 0;
#ifdef __ARM_NEON__
  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 2;
  int limit = ((inputW - 1) / kUnroll) * kUnroll;

  for (; w < limit; w += kUnroll) {
    // We need to interleave in terms of kStrideW units
    float32x4_t v0[kStrideW];
    float32x4_t v1[kStrideW];

    for (int i = 0; i < kStrideW; ++i) {
      v0[i] = vld1q_f32(src + i * colBlockSize);
      v1[i] = vld1q_f32(src + i * colBlockSize + 4);
    }

    // add per-channel bias
    for (int i = 0; i < kStrideW; ++i) {
      v0[i] = vaddq_f32(v0[i], biasV);
      v1[i] = vaddq_f32(v1[i], biasV);
    }

    // Write interleaved into the output
    StoreInterleaved<float, kStrideW>::store(dst + 0 * kStrideW, v0);
    StoreInterleaved<float, kStrideW>::store(dst + 4 * kStrideW, v1);

    src += kUnroll;
    dst += kUnroll * kStrideW;
  }
#endif

  // Handle non-vectorizable remainder
  for (; w < inputW - 1; ++w) {
    float v[kStrideW];

    for (int i = 0; i < kStrideW; ++i) {
      v[i] = src[i * colBlockSize];
    }

    // add per-channel bias
    for (int i = 0; i < kStrideW; ++i) {
      v[i] += b;
    }

    // Write interleaved into the output
    StoreInterleaved<float, kStrideW>::store(dst, v);

    src += 1;
    dst += kStrideW;
  }

  // We have handled 0 .. (inputW - 1) * stride inclusive so far.
  // Handle the remainder
  int outputPoint = (inputW - 1) * kStrideW;
  int block = 0;

  // Output width may include adjustment into which we don't
  // write; ignore it
  while (outputPoint < noAdjOutputW) {
    float v = src[block * colBlockSize];
    dst[0] = v + b;
    ++outputPoint;
    dst += 1;

    ++block;
    if (block >= kStrideW) {
      block = 0;
      src += 1;
    }
  }

  // Remainder of the buffer comprised of just the `adj` must have
  // bias added
  for (; outputPoint < outputW; ++outputPoint) {
    dst[0] = b;
    dst += 1;
  }
}

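// reinterleaveMultithreaded farms the outputC * outputH rows out to the
// thread pool; FixedDivisor splits the flat tile index into a channel c and
// a row h, and each task re-interleaves exactly one output row.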
template <int N, typename T, typename Context>
void reinterleaveMultithreaded(
    const T* y0,
    const T* bias_data,
    T* y,
    int outputC,
    int outputH,
    int outputW,
    int inputW,
    int kernelW,
    int strideW,
    int adjH,
    ThreadPool* pool) {
  // # channels times height
  size_t totalTiles = (size_t)outputC * outputH;
  FixedDivisor<int> divOutputH(outputH);

#define REINTERLEAVE(N)    \
  do {                     \
    reinterleaveRows<N>(   \
        y0,                \
        bias_data,         \
        c,                 \
        h,                 \
        y,                 \
        outputC,           \
        outputH,           \
        outputW,           \
        inputW,            \
        kernelW,           \
        strideW,           \
        adjH);             \
  } while (false)

  std::function<void(int, size_t)> fnReinterleave = [&](int threadId,
                                                        size_t tileId) {
    int h;
    int c;
    divOutputH.divMod((int)tileId, c, h);

    REINTERLEAVE(N);
  };

#undef REINTERLEAVE

  pool->run(fnReinterleave, totalTiles);
}

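// SumMultiple<N> accumulates N source buffers into acc four floats at a
// time with NEON aligned loads/stores; this is why the per-thread Y buffers
// are rounded up to a multiple of four floats (16 bytes) below. A scalar
// tail handles the last few elements.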
#ifdef __ARM_NEON__
template <int N>
struct SumMultiple {
  static void sumInto(float* acc, float** toSum, size_t size);
};

template <>
struct SumMultiple<1> {
  static void sumInto(float* acc, float** toSum, size_t size) {
    constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
    int limit = (size / kUnroll) * kUnroll;

    auto toSum0 = toSum[0];

    size_t i = 0;
    for (; i < limit; i += kUnroll) {
      float32x4_t v0 = vld1q_f32_aligned(acc + i);
      float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);

      v0 = vaddq_f32(v0, v1);

      vst1q_f32_aligned(acc + i, v0);
    }

    for (; i < size; ++i) {
      float v0 = acc[i];
      float v1 = toSum0[i];

      v0 += v1;

      acc[i] = v0;
    }
  }
};

template <>
struct SumMultiple<2> {
  static void sumInto(float* acc, float** toSum, size_t size) {
    constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
    int limit = (size / kUnroll) * kUnroll;

    auto toSum0 = toSum[0];
    auto toSum1 = toSum[1];

    size_t i = 0;
    for (; i < limit; i += kUnroll) {
      float32x4_t v0 = vld1q_f32_aligned(acc + i);
      float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
      float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);

      v0 = vaddq_f32(v0, v1);
      v0 = vaddq_f32(v0, v2);

      vst1q_f32_aligned(acc + i, v0);
    }

    for (; i < size; ++i) {
      float v0 = acc[i];
      float v1 = toSum0[i];
      float v2 = toSum1[i];

      v0 += v1;
      v0 += v2;

      acc[i] = v0;
    }
  }
};

template <>
struct SumMultiple<3> {
  static void sumInto(float* acc, float** toSum, size_t size) {
    constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
    int limit = (size / kUnroll) * kUnroll;

    auto toSum0 = toSum[0];
    auto toSum1 = toSum[1];
    auto toSum2 = toSum[2];

    size_t i = 0;
    for (; i < limit; i += kUnroll) {
      float32x4_t v0 = vld1q_f32_aligned(acc + i);
      float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
      float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);
      float32x4_t v3 = vld1q_f32_aligned(toSum2 + i);

      v0 = vaddq_f32(v0, v1);
      v2 = vaddq_f32(v2, v3);
      v0 = vaddq_f32(v0, v2);

      vst1q_f32_aligned(acc + i, v0);
    }

    for (; i < size; ++i) {
      float v0 = acc[i];
      float v1 = toSum0[i];
      float v2 = toSum1[i];
      float v3 = toSum2[i];

      v0 += v1;
      v2 += v3;
      v0 += v2;

      acc[i] = v0;
    }
  }
};
#endif

// Performs acc[i] += sum_j toSum_j[i] pointwise
void sumInto(float* acc, std::vector<float*>& toSum, size_t size) {
#ifdef __ARM_NEON__
  if (toSum.size() == 1) {
    SumMultiple<1>::sumInto(acc, toSum.data(), size);
    return;
  } else if (toSum.size() == 2) {
    SumMultiple<2>::sumInto(acc, toSum.data(), size);
    return;
  } else if (toSum.size() == 3) {
    SumMultiple<3>::sumInto(acc, toSum.data(), size);
    return;
  }
#endif

  // Otherwise, use fallback implementation
  EigenVectorArrayMap<float> accT(acc, size);

  for (auto p : toSum) {
    accT += ConstEigenVectorArrayMap<float>(p, size);
  }
}

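// RunOnDeviceWithOrderNCHW: for each image in the batch, zero the per-thread
// output buffers, run one tile per input row on the thread pool (GEMM +
// col2im into each thread's private de-interleaved buffer), sum the worker
// buffers into the first buffer, and finally re-interleave into Y while
// adding the per-channel bias.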
template <typename T, class Context>
bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const Tensor<Context>& X = Input(INPUT);
  auto& filter = Input(FILTER);
  Tensor<Context>* Y = Output(0);
  const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
  CAFFE_ENFORCE(
      filter.dim32(0) == M,
      "filter number must be equal to input channel number");
  const int C = filter.dim32(1);
  CAFFE_ENFORCE(
      filter.dim32(2) == this->kernel_h(),
      "filter height must be equal to kernel height");
  CAFFE_ENFORCE(
      filter.dim32(3) == this->kernel_w(),
      "filter width must be equal to kernel width");
  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
    CAFFE_ENFORCE(
        bias.dim32(0) == C,
        "bias dimension must be equal to output channel number");
  }

  ConvTransposeUnpoolBase<Context>::SetOutputSize(X, Y, C);

  const int outputH = Y->dim32(2);
  const int outputW = Y->dim32(3);
  const int outputPlaneSize = outputH * outputW;
  const int outputBatchElementSize = Y->dim32(1) * outputPlaneSize;

  auto Xdata = X.template data<T>();
  auto Ydata = Y->template mutable_data<T>();

  auto pool = ws_->GetThreadPool();
  auto numThreads = pool->getNumThreads();

  // Initialize per-thread buffers for output
  // The main thread will write directly into the output Y, we just
  // need buffers for the worker threads
  size_t colBlockSize = W + this->kernel_w() / this->stride_w();
  size_t threadYBufferSize = C * outputH * colBlockSize * this->stride_w();
  // Require 16 byte alignment, so 4-element alignment as these are floats.
  size_t threadYBufferSizeAligned =
      ((C * outputH * colBlockSize * this->stride_w() + 3) / 4) * 4;
  size_t threadColBufferSize = C * this->kernel_h() * this->kernel_w() * W;

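  // The scratch tensor is laid out as numThreads Y buffers (each
  // threadYBufferSizeAligned floats) followed by numThreads column buffers
  // (each threadColBufferSize floats); runLocalTile indexes into both
  // regions by threadId.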
  // Work around GCC 4.9 bug when this is declared inside the inner lambda.
  auto runLocalTile = [&](TensorCPU* threadBuffer,
                          int threadId,
                          size_t tileId) {
    auto localYData = threadBuffer->template mutable_data<T>() +
        threadId * threadYBufferSizeAligned;

    auto localColBufferData = threadBuffer->template mutable_data<T>() +
        numThreads * threadYBufferSizeAligned + threadId * threadColBufferSize;

    runTileContiguous<T, Context>(
        tileId,
        N,
        M,
        H,
        W,
        outputH,
        outputW,
        C,
        this->kernel_h(),
        this->kernel_w(),
        this->stride_h(),
        this->stride_w(),
        this->pad_t(),
        filter.template data<T>(),
        Xdata,
        localColBufferData,
        localYData,
        &context_);
  };

  auto f = [&](Tensor<Context>* threadBuffer) {
    threadBuffer->Resize(
        numThreads * threadYBufferSizeAligned +
        numThreads * threadColBufferSize);
    // Group together thread buffers for accumulation
    std::vector<T*> toSum(numThreads - 1);
    for (int i = 1; i < numThreads; ++i) {
      toSum[i - 1] = threadBuffer->template mutable_data<T>() +
          i * threadYBufferSizeAligned;
    }

    for (auto image_id = 0; image_id < N; ++image_id) {
      // Each time through, we have to reset all per-thread output
      // buffers, since the output buffer is only per-batch element
      // The column buffers are overwritten by the matrix multiplication
      // each time, so we need not clear them out each round
      math::Set<T, Context>(
          numThreads * threadYBufferSizeAligned,
          0,
          threadBuffer->template mutable_data<T>(),
          &context_);

      // Run tiled gemm and col2im in our threadpool; all of these tiles
      // are guaranteed to be full tiles
      // Each tile handles a single row of the input
      pool->run(
          [&](int threadId, int tileId) {
            runLocalTile(threadBuffer, threadId, tileId);
          },
          H);

      // We need to accumulate the per-thread results into the output
      // Y; the first worker thread (main thread) already produced its
      // results in Y
      sumInto(
          threadBuffer->template mutable_data<T>(), toSum, threadYBufferSize);

// y0 now contains the final output, but it is in deinterleaved
// form. We have to re-interleave it to produce the final form in Y
// This operation also handles adding the per-channel bias.
#define REINTERLEAVE(N)                                              \
  do {                                                               \
    reinterleaveMultithreaded<N, T, Context>(                        \
        threadBuffer->template mutable_data<T>(),                    \
        InputSize() == 3 ? Input(BIAS).template data<T>() : nullptr, \
        Ydata,                                                       \
        Y->dim32(1),                                                 \
        Y->dim32(2),                                                 \
        Y->dim32(3),                                                 \
        W,                                                           \
        this->kernel_w(),                                            \
        this->stride_w(),                                            \
        this->adj_h(),                                               \
        pool);                                                       \
  } while (false)

      if (this->stride_w() == 1) {
        REINTERLEAVE(1);
      } else if (this->stride_w() == 2) {
        REINTERLEAVE(2);
      } else if (this->stride_w() == 3) {
        REINTERLEAVE(3);
      } else if (this->stride_w() == 4) {
        REINTERLEAVE(4);
      }
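      // Note: only stride_w() values 1 through 4 are dispatched above; any
      // other horizontal stride falls through without re-interleaving.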

#undef REINTERLEAVE

      Xdata += M * H * W;
      Ydata += Y->size() / Y->dim32(0);
    }
  };
  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
    runWithSharedBuffer<Context>(ws_, f);
  } else {
    f(&threadBuffer_);
  }

  return true;
}

template <typename T, class Context>
bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  CAFFE_THROW("Not implemented.");
}

} // namespace caffe2

#endif // CAFFE2_MOBILE

#endif // CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_