Caffe2 - C++ API
A deep learning, cross-platform ML framework
conv_transpose_op_mobile_impl.h
1 // conv_transpose_op_mobile_impl.h is the templated implementation of the
2 // conv_transpose_op_mobile.h file.
3 #ifndef CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_
4 #define CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_
5 
6 #include "caffe2/core/common.h"
7 
8 #ifdef C10_MOBILE
9 
10 #include "caffe2/core/logging.h"
11 #include "caffe2/operators/conv_op_shared.h"
12 #include "caffe2/operators/conv_transpose_op_mobile.h"
13 #include "caffe2/utils/cpu_neon.h"
14 #include "caffe2/utils/eigen_utils.h"
15 #include "caffe2/utils/fixed_divisor.h"
16 #include "caffe2/utils/math.h"
17 #include "caffe2/utils/math/utils.h"
18 
19 C10_DECLARE_bool(caffe2_force_shared_col_buffer);
20 
21 namespace caffe2 {
22 
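// runTileContiguous processes one input row (a "tile") of a single image:
// a GEMM against the transposed filter produces the column buffer for that
// row, and a col2im-style scatter then accumulates it into a de-interleaved
// per-thread output buffer of shape C x outputH x (colBlockSize * strideW).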
23 template <typename T, typename Context>
24 void runTileContiguous(
25  int tileId,
26  int N,
27  int M,
28  int H,
29  int W,
30  int outputH,
31  int outputW,
32  int C,
33  int kernelH,
34  int kernelW,
35  int strideH,
36  int strideW,
37  int padT,
38  const T* filterData,
39  const T* Xdata,
40  T* colBufferData,
41  T* Ydata,
42  Context* context) {
43  // The tile size is exactly the length of a single row
44  int tileSize = W;
45 
46  auto kernelDataSize = C * kernelH * kernelW;
47  auto currentTileStart = tileSize * tileId;
48 
49  // gemm tile
50  math::GemmEx<T, Context>(
51  CblasTrans,
52  CblasNoTrans,
53  kernelDataSize,
54  tileSize,
55  M,
56  1,
57  filterData,
58  kernelDataSize,
59  Xdata + currentTileStart,
60  H * W,
61  0,
62  colBufferData,
63  tileSize,
64  context);
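 // The GEMM above computes, for this single input row (tile):
 //   colBuffer[kernelDataSize x W] = filter^T[kernelDataSize x M] * Xrow[M x W]
 // where Xrow gathers row `tileId` from each of the M input channels
 // (hence the leading dimension of H * W on the X operand).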
65 
66  // col2im tile
67  // We assume that there is no padding in the columns (padL and padR
68  // == 0).
69  // FIXME: it is actually possible for us to handle padding, figure
70  // out how to adjust the bounds
71 
72  // We write into Y in a de-interleaved fashion; in other words,
73  // every column (mod strideW) == 0 together in one block,
74  // every column (mod strideW) == 1 in another,
75  // ... and so on.
76  int colBlockSize = (W + kernelW / strideW);
77  int numColBlocks = strideW;
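 // For example (illustrative values only): with W = 8, kernelW = 3 and
 // strideW = 2, colBlockSize = 8 + 3 / 2 = 9 and numColBlocks = 2, so each
 // output row of the intermediate buffer holds a block of 9 floats for the
 // even output columns followed by a block of 9 floats for the odd ones.
 // In general, position j of block b corresponds to output column
 // j * strideW + b.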
78 
79  for (int c = 0; c < kernelDataSize; ++c) {
80  int w_offset = c % kernelW;
81  int h_offset = (c / kernelW) % kernelH;
82  int c_im = c / kernelH / kernelW;
83 
84  // Each row is a separate tile that we handle. First determine the
85  // row into which we are writing the output.
86  // We can properly handle padding for the rows.
87  int rowY = tileId * strideH - padT + h_offset;
88 
89  // If this row is out of bounds, then skip it
90  if (!math::utils::IsAGeZeroAndALtB(rowY, outputH)) {
91  continue;
92  }
93 
94  // FIXME: we don't actually handle a dynamic padL > 0
95  constexpr int kPadL = 0;
96  int colOffsetStart = -kPadL + w_offset;
97  int colBlockY = colOffsetStart % strideW;
98 
99  // However, within a block we may not start writing at offset
100  // 0. The offset at which we begin writing is determined by
101  // colOffsetStart
102  int colWithinBlockOffsetY = colOffsetStart / strideW;
103 
104  // So, this is where we begin reading/writing in Y
105  int colY = colBlockY * colBlockSize + colWithinBlockOffsetY;
106 
107  // This is the complete offset into Y from the start
108  // Each row has strideW blocks of size colBlockSize
109  int offsetY = rowY * colBlockSize * numColBlocks + colY;
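 // Continuing the illustrative example above: with strideW = 2 and
 // w_offset = 1 (and kPadL == 0), colOffsetStart = 1, colBlockY = 1 and
 // colWithinBlockOffsetY = 0, so writing starts at the beginning of the
 // odd-column block, i.e. colY = 1 * colBlockSize + 0.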
110 
111  T* colBufferPointer = colBufferData + c * tileSize;
112  T* yPointer =
113  Ydata + c_im * outputH * (colBlockSize * numColBlocks) + offsetY;
114 
115  int b = 0;
116 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
117  // We vectorize the loop within the row
118  {
119  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 4;
120  int limit = (tileSize / kUnroll) * kUnroll;
121 
122  for (; b < limit; b += kUnroll) {
123  float32x4_t cb0 = vld1q_f32(colBufferPointer + 0);
124  float32x4_t cb1 = vld1q_f32(colBufferPointer + 4);
125  float32x4_t cb2 = vld1q_f32(colBufferPointer + 8);
126  float32x4_t cb3 = vld1q_f32(colBufferPointer + 12);
127 
128  float32x4_t y0 = vld1q_f32(yPointer + 0);
129  float32x4_t y1 = vld1q_f32(yPointer + 4);
130  float32x4_t y2 = vld1q_f32(yPointer + 8);
131  float32x4_t y3 = vld1q_f32(yPointer + 12);
132 
133  y0 = vaddq_f32(y0, cb0);
134  y1 = vaddq_f32(y1, cb1);
135  y2 = vaddq_f32(y2, cb2);
136  y3 = vaddq_f32(y3, cb3);
137 
138  vst1q_f32(yPointer + 0, y0);
139  vst1q_f32(yPointer + 4, y1);
140  vst1q_f32(yPointer + 8, y2);
141  vst1q_f32(yPointer + 12, y3);
142 
143  colBufferPointer += kUnroll;
144  yPointer += kUnroll;
145  }
146  }
147 
148  {
149  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
150  int limit = (tileSize / kUnroll) * kUnroll;
151 
152  for (; b < limit; b += kUnroll) {
153  float32x4_t cb0 = vld1q_f32(colBufferPointer);
154  float32x4_t y0 = vld1q_f32(yPointer);
155 
156  y0 = vaddq_f32(y0, cb0);
157 
158  vst1q_f32(yPointer, y0);
159 
160  colBufferPointer += kUnroll;
161  yPointer += kUnroll;
162  }
163  }
164 #endif
165 
166  // Handle un-vectorizable epilogue
167  for (; b < tileSize; ++b) {
168  *yPointer += *colBufferPointer;
169  ++yPointer;
170  ++colBufferPointer;
171  }
172  }
173 }
174 
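// StoreInterleaved<float, N> writes N de-interleaved lanes back out in
// interleaved order; on ARM it maps onto the NEON structured stores
// vst1q_f32 / vst2q_f32 / vst3q_f32 / vst4q_f32, and each specialization
// also provides a scalar fallback.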
175 template <typename T, int N>
176 struct StoreInterleaved {};
177 
178 template <>
179 struct StoreInterleaved<float, 1> {
180 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
181  inline static void store(float* p, float32x4_t v[1]) {
182  vst1q_f32(p, v[0]);
183  }
184 #endif
185 
186  inline static void store(float* p, float v[1]) {
187  p[0] = v[0];
188  }
189 };
190 
191 template <>
192 struct StoreInterleaved<float, 2> {
193 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
194  inline static void store(float* p, float32x4_t v[2]) {
195  float32x4x2_t x = {{v[0], v[1]}};
196  vst2q_f32(p, x);
197  }
198 #endif
199 
200  inline static void store(float* p, float v[2]) {
201  p[0] = v[0];
202  p[1] = v[1];
203  }
204 };
205 
206 template <>
207 struct StoreInterleaved<float, 3> {
208 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
209  inline static void store(float* p, float32x4_t v[3]) {
210  float32x4x3_t x = {{v[0], v[1], v[2]}};
211  vst3q_f32(p, x);
212  }
213 #endif
214 
215  inline static void store(float* p, float v[3]) {
216  p[0] = v[0];
217  p[1] = v[1];
218  p[2] = v[2];
219  }
220 };
221 
222 template <>
223 struct StoreInterleaved<float, 4> {
224 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
225  inline static void store(float* p, float32x4_t v[4]) {
226  float32x4x4_t x = {{v[0], v[1], v[2], v[3]}};
227  vst4q_f32(p, x);
228  }
229 #endif
230 
231  inline static void store(float* p, float v[4]) {
232  p[0] = v[0];
233  p[1] = v[1];
234  p[2] = v[2];
235  p[3] = v[3];
236  }
237 };
238 
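// reinterleaveRows converts one de-interleaved row (channel c, output row h)
// of the intermediate buffer back into the natural layout of Y, adding the
// per-channel bias as it goes; the width stride is a template parameter
// (kStrideW) so the interleaved stores can be generated per stride.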
239 template <int kStrideW>
240 void reinterleaveRows(
241  const float* src,
242  const float* bias,
243  int c,
244  int h,
245  float* dst,
246  int outputC,
247  int outputH,
248  int outputW,
249  int inputW,
250  int kernelW,
251  int strideW,
252  int adjH) {
253  // Each row in src is of the form:
254  // [w mod strideW == 0 elements]...[w mod strideW == strideW - 1
255  // elements]
256  // We need to re-interleave the values and write them in the output
257  int colBlockSize = inputW + kernelW / kStrideW;
258  int noAdjOutputW = (inputW - 1) * kStrideW + kernelW;
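 // noAdjOutputW is the transposed-convolution output width before the
 // output adjustment (`adj`) is added; the tail region
 // [noAdjOutputW, outputW) is filled with just the bias at the end of
 // this function.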
259 
260  int point = c * outputH + h;
261  src += point * colBlockSize * kStrideW;
262  dst += point * outputW;
263 
264  float b = bias ? bias[c] : 0;
265 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
266  float32x4_t biasV = vdupq_n_f32(b);
267 #endif
268 
269  int w = 0;
270 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
271  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float)) * 2;
272  int limit = ((inputW - 1) / kUnroll) * kUnroll;
273 
274  for (; w < limit; w += kUnroll) {
275  // We need to interleave in terms of kStrideW units
276  float32x4_t v0[kStrideW];
277  float32x4_t v1[kStrideW];
278 
279  for (int i = 0; i < kStrideW; ++i) {
280  v0[i] = vld1q_f32(src + i * colBlockSize);
281  v1[i] = vld1q_f32(src + i * colBlockSize + 4);
282  }
283 
284  // add per-channel bias
285  for (int i = 0; i < kStrideW; ++i) {
286  v0[i] = vaddq_f32(v0[i], biasV);
287  v1[i] = vaddq_f32(v1[i], biasV);
288  }
289 
290  // Write interleaved into the output
291  StoreInterleaved<float, kStrideW>::store(dst + 0 * kStrideW, v0);
292  StoreInterleaved<float, kStrideW>::store(dst + 4 * kStrideW, v1);
293 
294  src += kUnroll;
295  dst += kUnroll * kStrideW;
296  }
297 #endif
298 
299  // Handle non-vectorizable remainder
300  for (; w < inputW - 1; ++w) {
301  float v[kStrideW];
302 
303  for (int i = 0; i < kStrideW; ++i) {
304  v[i] = src[i * colBlockSize];
305  }
306 
307  // add per-channel bias
308  for (int i = 0; i < kStrideW; ++i) {
309  v[i] += b;
310  }
311 
312  // Write interleaved into the output
313  StoreInterleaved<float, kStrideW>::store(dst, v);
314 
315  src += 1;
316  dst += kStrideW;
317  }
318 
319  // We have handled output points 0 .. (inputW - 1) * strideW (exclusive) so far.
320  // Handle the remainder
321  int outputPoint = (inputW - 1) * kStrideW;
322  int block = 0;
323 
324  // Output width may include adjustment into which we don't
325  // write; ignore it
326  while (outputPoint < noAdjOutputW) {
327  float v = src[block * colBlockSize];
328  dst[0] = v + b;
329  ++outputPoint;
330  dst += 1;
331 
332  ++block;
333  if (block >= kStrideW) {
334  block = 0;
335  src += 1;
336  }
337  }
338 
339  // The remainder of the buffer, consisting of just the `adj` region,
340  // must still have the bias added
341  for (; outputPoint < outputW; ++outputPoint) {
342  dst[0] = b;
343  dst += 1;
344  }
345 }
346 
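// reinterleaveMultithreaded re-interleaves one (channel, row) pair per tile:
// the thread pool is given outputC * outputH tiles, and FixedDivisor recovers
// the channel c and row h from the flat tileId before dispatching to
// reinterleaveRows<N> (N is the width stride, fixed at compile time).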
347 template <int N, typename T, typename Context>
348 void reinterleaveMultithreaded(
349  const T* y0,
350  const T* bias_data,
351  T* y,
352  int outputC,
353  int outputH,
354  int outputW,
355  int inputW,
356  int kernelW,
357  int strideW,
358  int adjH,
359  ThreadPool* pool) {
360  // # channels times height
361  size_t totalTiles = (size_t)outputC * outputH;
362  FixedDivisor<int> divOutputH(outputH);
363 
364 #define REINTERLEAVE(N) \
365  do { \
366  reinterleaveRows<N>( \
367  y0, \
368  bias_data, \
369  c, \
370  h, \
371  y, \
372  outputC, \
373  outputH, \
374  outputW, \
375  inputW, \
376  kernelW, \
377  strideW, \
378  adjH); \
379  } while (false)
380 
381  std::function<void(int, size_t)> fnReinterleave = [&](int threadId,
382  size_t tileId) {
383  int h;
384  int c;
385  divOutputH.DivMod((int)tileId, &c, &h);
386 
387  REINTERLEAVE(N);
388  };
389 
390 #undef REINTERLEAVE
391 
392  pool->run(fnReinterleave, totalTiles);
393 }
394 
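// SumMultiple<N> adds N source buffers into acc using NEON loads/stores.
// It is specialized for N = 1, 2 and 3; sumInto below falls back to an
// Eigen loop for any other count, or when NEON is unavailable.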
395 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
396 template <int N>
397 struct SumMultiple {
398  static void sumInto(float* acc, float** toSum, size_t size);
399 };
400 
401 template <>
402 struct SumMultiple<1> {
403  static void sumInto(float* acc, float** toSum, size_t size) {
404  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
405  int limit = (size / kUnroll) * kUnroll;
406 
407  auto toSum0 = toSum[0];
408 
409  size_t i = 0;
410  for (; i < limit; i += kUnroll) {
411  float32x4_t v0 = vld1q_f32_aligned(acc + i);
412  float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
413 
414  v0 = vaddq_f32(v0, v1);
415 
416  vst1q_f32_aligned(acc + i, v0);
417  }
418 
419  for (; i < size; ++i) {
420  float v0 = acc[i];
421  float v1 = toSum0[i];
422 
423  v0 += v1;
424 
425  acc[i] = v0;
426  }
427  }
428 };
429 
430 template <>
431 struct SumMultiple<2> {
432  static void sumInto(float* acc, float** toSum, size_t size) {
433  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
434  int limit = (size / kUnroll) * kUnroll;
435 
436  auto toSum0 = toSum[0];
437  auto toSum1 = toSum[1];
438 
439  size_t i = 0;
440  for (; i < limit; i += kUnroll) {
441  float32x4_t v0 = vld1q_f32_aligned(acc + i);
442  float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
443  float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);
444 
445  v0 = vaddq_f32(v0, v1);
446  v0 = vaddq_f32(v0, v2);
447 
448  vst1q_f32_aligned(acc + i, v0);
449  }
450 
451  for (; i < size; ++i) {
452  float v0 = acc[i];
453  float v1 = toSum0[i];
454  float v2 = toSum1[i];
455 
456  v0 += v1;
457  v0 += v2;
458 
459  acc[i] = v0;
460  }
461  }
462 };
463 
464 template <>
465 struct SumMultiple<3> {
466  static void sumInto(float* acc, float** toSum, size_t size) {
467  constexpr int kUnroll = (sizeof(float32x4_t) / sizeof(float));
468  int limit = (size / kUnroll) * kUnroll;
469 
470  auto toSum0 = toSum[0];
471  auto toSum1 = toSum[1];
472  auto toSum2 = toSum[2];
473 
474  size_t i = 0;
475  for (; i < limit; i += kUnroll) {
476  float32x4_t v0 = vld1q_f32_aligned(acc + i);
477  float32x4_t v1 = vld1q_f32_aligned(toSum0 + i);
478  float32x4_t v2 = vld1q_f32_aligned(toSum1 + i);
479  float32x4_t v3 = vld1q_f32_aligned(toSum2 + i);
480 
481  v0 = vaddq_f32(v0, v1);
482  v2 = vaddq_f32(v2, v3);
483  v0 = vaddq_f32(v0, v2);
484 
485  vst1q_f32_aligned(acc + i, v0);
486  }
487 
488  for (; i < size; ++i) {
489  float v0 = acc[i];
490  float v1 = toSum0[i];
491  float v2 = toSum1[i];
492  float v3 = toSum2[i];
493 
494  v0 += v1;
495  v2 += v3;
496  v0 += v2;
497 
498  acc[i] = v0;
499  }
500  }
501 };
502 #endif
503 
504 // Performs acc[i] += sum_j toSum_j[i] pointwise
505 void sumInto(float* acc, std::vector<float*>& toSum, size_t size) {
506 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
507  if (toSum.size() == 1) {
508  SumMultiple<1>::sumInto(acc, toSum.data(), size);
509  return;
510  } else if (toSum.size() == 2) {
511  SumMultiple<2>::sumInto(acc, toSum.data(), size);
512  return;
513  } else if (toSum.size() == 3) {
514  SumMultiple<3>::sumInto(acc, toSum.data(), size);
515  return;
516  }
517 #endif
518 
519  // Otherwise, use fallback implementation
520  EigenVectorArrayMap<float> accT(acc, size);
521 
522  for (auto p : toSum) {
523  accT += ConstEigenVectorArrayMap<float>(p, size);
524  }
525 }
526 
527 template <typename T, class Context>
528 bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNCHW() {
529  const Tensor& X = Input(INPUT);
530  auto& filter = Input(FILTER);
531  const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
532  CAFFE_ENFORCE(filter.ndim() == 4, "filter must be 4D tensor");
533  CAFFE_ENFORCE(
534  filter.dim32(0) == M,
535  "filter number must be equal to input channel number");
536  const int C = filter.dim32(1);
537  CAFFE_ENFORCE(
538  filter.dim32(2) == this->kernel_h(),
539  "filter height must be equal to kernel height");
540  CAFFE_ENFORCE(
541  filter.dim32(3) == this->kernel_w(),
542  "filter width must be equal to kernel width");
543  if (InputSize() == 3) {
544  auto& bias = Input(BIAS);
545  CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
546  CAFFE_ENFORCE(
547  bias.dim32(0) == C,
548  "bias dimension must be equal to output channel number");
549  }
550 
551  auto sizes = ConvTransposeUnpoolBase<Context>::GetOutputSize(X, C);
552  Tensor* Y = Output(0, sizes, at::dtype<T>());
553 
554  const int outputH = Y->dim32(2);
555  const int outputW = Y->dim32(3);
556  const int outputPlaneSize = outputH * outputW;
557  const int outputBatchElementSize = Y->dim32(1) * outputPlaneSize;
558 
559  auto Xdata = X.template data<T>();
560  auto Ydata = Y->template mutable_data<T>();
561 
562  auto pool = ws_->GetThreadPool();
563  auto numThreads = pool->getNumThreads();
564 
565  // Initialize the per-thread buffers for the de-interleaved output.
566  // Every thread (including the main thread) writes into its own slice of
567  // threadBuffer; the slices are summed and re-interleaved into Y below.
568  size_t colBlockSize = W + this->kernel_w() / this->stride_w();
569  size_t threadYBufferSize = C * outputH * colBlockSize * this->stride_w();
570  // Require 16 byte alignment, so 4-element alignment as these are floats.
571  size_t threadYBufferSizeAligned =
572  ((C * outputH * colBlockSize * this->stride_w() + 3) / 4) * 4;
573  size_t threadColBufferSize = C * this->kernel_h() * this->kernel_w() * W;
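 // Layout of the shared scratch tensor (threadBuffer), in floats:
 //   [numThreads x threadYBufferSizeAligned]  per-thread de-interleaved Y
 //   [numThreads x threadColBufferSize]       per-thread column buffers
 // runLocalTile below indexes into it with exactly this layout.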
574 
575  // Work around GCC 4.9 bug when this is declared inside the inner lambda.
576  auto runLocalTile = [&](TensorCPU* threadBuffer,
577  int threadId,
578  size_t tileId) {
579  auto localYData = threadBuffer->template mutable_data<T>() +
580  threadId * threadYBufferSizeAligned;
581 
582  auto localColBufferData = threadBuffer->template mutable_data<T>() +
583  numThreads * threadYBufferSizeAligned + threadId * threadColBufferSize;
584 
585  runTileContiguous<T, Context>(
586  tileId,
587  N,
588  M,
589  H,
590  W,
591  outputH,
592  outputW,
593  C,
594  this->kernel_h(),
595  this->kernel_w(),
596  this->stride_h(),
597  this->stride_w(),
598  this->pad_t(),
599  filter.template data<T>(),
600  Xdata,
601  localColBufferData,
602  localYData,
603  &context_);
604  };
605 
606  auto f = [&](Tensor* threadBuffer) {
607  threadBuffer->Resize(
608  numThreads * threadYBufferSizeAligned +
609  numThreads * threadColBufferSize);
610  // Group together thread buffers for accumulation
611  std::vector<T*> toSum(numThreads - 1);
612  for (int i = 1; i < numThreads; ++i) {
613  toSum[i - 1] = threadBuffer->template mutable_data<T>() +
614  i * threadYBufferSizeAligned;
615  }
616 
617  for (auto image_id = 0; image_id < N; ++image_id) {
618  // Each time through, we have to reset all per-thread output
619  // buffers, since the output buffer is only per-batch element
620  // The column buffers are overwritten by the matrix multiplication
621  // each time, so we need not clear them out each round
622  math::Set<T, Context>(
623  numThreads * threadYBufferSizeAligned,
624  0,
625  threadBuffer->template mutable_data<T>(),
626  &context_);
627 
628  // Run tiled gemm and col2im in our threadpool; all of these tiles
629  // are guaranteed to be full tiles
630  // Each tile handles a single row of the input
631  pool->run(
632  [&](int threadId, int tileId) {
633  runLocalTile(threadBuffer, threadId, tileId);
634  },
635  H);
636 
637  // Accumulate the other threads' results into the first thread's
638  // buffer (the start of threadBuffer); Y itself is only written during
639  // the re-interleave step below.
640  sumInto(
641  threadBuffer->template mutable_data<T>(), toSum, threadYBufferSize);
642 
643 // y0 now contains the final output, but it is in deinterleaved
644 // form. We have to re-interleave it to produce the final form in Y
645 // This operation also handles adding the per-channel bias.
646 #define REINTERLEAVE(N) \
647  do { \
648  reinterleaveMultithreaded<N, T, Context>( \
649  threadBuffer->template mutable_data<T>(), \
650  InputSize() == 3 ? Input(BIAS).template data<T>() : nullptr, \
651  Ydata, \
652  Y->dim32(1), \
653  Y->dim32(2), \
654  Y->dim32(3), \
655  W, \
656  this->kernel_w(), \
657  this->stride_w(), \
658  this->adj_h(), \
659  pool); \
660  } while (false)
661 
662  if (this->stride_w() == 1) {
663  REINTERLEAVE(1);
664  } else if (this->stride_w() == 2) {
665  REINTERLEAVE(2);
666  } else if (this->stride_w() == 3) {
667  REINTERLEAVE(3);
668  } else if (this->stride_w() == 4) {
669  REINTERLEAVE(4);
670  }
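 // Note: only stride_w values 1 through 4 are handled by this
 // re-interleave step; the mobile path is written for these small strides.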
671 
672 #undef REINTERLEAVE
673 
674  Xdata += M * H * W;
675  Ydata += Y->size() / Y->dim32(0);
676  }
677  };
678  if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) {
679  runWithSharedBuffer<Context>(ws_, f);
680  } else {
681  f(&threadBuffer_);
682  }
683 
684  return true;
685 }
686 
687 template <typename T, class Context>
688 bool ConvTransposeMobileOp<T, Context>::RunOnDeviceWithOrderNHWC() {
689  CAFFE_THROW("Not implemented.");
690 }
691 
692 } // namespace caffe2
693 
694 #endif // C10_MOBILE
695 
696 #endif // CAFFE2_OPERATORS_CONV_TRANSPOSE_MOBILE_OP_IMPL_H_