Caffe2 - C++ API
A deep learning, cross-platform ML framework
depthwise3x3_conv_op.cc
#include "caffe2/core/context.h"
#include "caffe2/core/timer.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/operators/conv_pool_op_base.h"

#include "c10/macros/Macros.h"

#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif

C10_DEFINE_bool(caffe2_profile_depthwise, false, "");

namespace caffe2 {

namespace {
struct DepthwiseArgs {
  // Input layer dimensions
  int batch{0};
  int in_rows{0};
  int in_cols{0};
  int stride{0};
  int pad_rows{0};
  int pad_cols{0};

  // Output layer dimensions
  int out_rows{0};
  int out_cols{0};
};

#ifdef __ARM_NEON__
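// The helpers below implement the Winograd F(2x2, 3x3) fast convolution
//   Y = A^T [ (G g G^T) .* (B^T d B) ] A
// for a 4x4 input tile d and a 3x3 kernel g, with
//   B^T = | 1  0 -1  0 |    G = |  1    0    0  |    A^T = | 1  1  1  0 |
//         | 0  1  1  0 |        | 1/2  1/2  1/2 |          | 0  1 -1 -1 |
//         | 0 -1  1  0 |        | 1/2 -1/2  1/2 |
//         | 0  1  0 -1 |        |  0    0    1  |
// Each helper applies one of these transforms to the rows of a tile; a 4x4
// transpose between the two passes handles the column direction.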

static inline void winograd_f2k3_input_transform_inplace__neon(
    float32x4_t* d0,
    float32x4_t* d1,
    float32x4_t* d2,
    float32x4_t* d3) {
  float32x4_t wd0 = *d0 - *d2;
  float32x4_t wd1 = *d1 + *d2;
  float32x4_t wd2 = -*d1 + *d2;
  float32x4_t wd3 = *d1 - *d3;
  *d0 = wd0;
  *d1 = wd1;
  *d2 = wd2;
  *d3 = wd3;
}

static inline void winograd_f2k3_output_transform_inplace__neon(
    float32x4_t* m0,
    float32x4_t* m1,
    float32x4_t* m2,
    float32x4_t* m3) {
  *m0 = *m0 + *m1 + *m2;
  *m1 = *m1 - *m2 - *m3;
}

static inline float32x4_t
vmuladdq_f32(float32x4_t c, float32x4_t a, float32x4_t b) {
#if defined(__aarch64__)
  return vfmaq_f32(c, a, b);
#else
  return vmlaq_f32(c, a, b);
#endif
}

static inline float32x4_t
vmulsubq_f32(float32x4_t c, float32x4_t a, float32x4_t b) {
#if defined(__aarch64__)
  return vfmsq_f32(c, a, b);
#else
  return vmlsq_f32(c, a, b);
#endif
}

static inline void winograd_f2k3_kernel_transform__neon(
    const float32x4_t g0,
    const float32x4_t g1,
    const float32x4_t g2,
    float32x4_t* transform0,
    float32x4_t* transform1,
    float32x4_t* transform2,
    float32x4_t* transform3) {
  const float32x4_t const_half = vdupq_n_f32(0.5f);
  float32x4_t half_g0_plus_g2 = const_half * (g0 + g2);
  *transform0 = g0;
  *transform1 = vmuladdq_f32(half_g0_plus_g2, const_half, g1);
  *transform2 = vmulsubq_f32(half_g0_plus_g2, const_half, g1);
  *transform3 = g2;
}
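// v4f_transpose4x4__neon (below) transposes a 4x4 tile: vst4q_f32 stores the
// four rows interleaved lane by lane, so reading the result back as a
// float32x4x4_t yields the transposed tile.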

static inline float32x4x4_t v4f_transpose4x4__neon(float32x4x4_t m) {
  float32x4x4_t ret;
  vst4q_f32((float*)(&ret), m);
  return ret;
}

void runDepthwise3x3Conv(
    const DepthwiseArgs& args,
    const float* input,
    const float* kernel,
    const float* bias,
    float* output) {
  const float32x4_t vbias = vsetq_lane_f32(*bias, vdupq_n_f32(0.0), 1);
  float32x4x4_t kernel_tile;
  {
    const float32x4_t g0 = vld1q_f32(kernel);
    const float32x4_t g1 = vld1q_f32(kernel + 3);
    // g2[3] is junk
    const float32x4_t g2 =
        vextq_f32(vld1q_f32(kernel + 5), vld1q_f32(kernel + 5), 1);
    float32x4x4_t w;
    winograd_f2k3_kernel_transform__neon(
        g0, g1, g2, &w.val[0], &w.val[1], &w.val[2], &w.val[3]);
    w = v4f_transpose4x4__neon(w);

    winograd_f2k3_kernel_transform__neon(
        w.val[0],
        w.val[1],
        w.val[2],
        &kernel_tile.val[0],
        &kernel_tile.val[1],
        &kernel_tile.val[2],
        &kernel_tile.val[3]);
  }
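// TILE runs the Winograd pipeline on one 4x4 input tile: the two-pass input
// transform (rows, then columns via a transpose), the elementwise product
// with kernel_tile, a bias add at position (1, 1) of the product tile (the
// output transform spreads that entry as +bias into all four outputs), and
// the two-pass output transform.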

#define TILE \
  winograd_f2k3_input_transform_inplace__neon( \
      &input_tile.val[0], \
      &input_tile.val[1], \
      &input_tile.val[2], \
      &input_tile.val[3]); \
  input_tile = v4f_transpose4x4__neon(input_tile); \
  winograd_f2k3_input_transform_inplace__neon( \
      &input_tile.val[0], \
      &input_tile.val[1], \
      &input_tile.val[2], \
      &input_tile.val[3]); \
  \
  for (int row = 0; row < 4; ++row) { \
    input_tile.val[row] = \
        vmulq_f32(input_tile.val[row], kernel_tile.val[row]); \
  } \
  \
  input_tile.val[1] = input_tile.val[1] + vbias; \
  winograd_f2k3_output_transform_inplace__neon( \
      &input_tile.val[0], \
      &input_tile.val[1], \
      &input_tile.val[2], \
      &input_tile.val[3]); \
  input_tile = v4f_transpose4x4__neon(input_tile); \
  winograd_f2k3_output_transform_inplace__neon( \
      &input_tile.val[0], \
      &input_tile.val[1], \
      &input_tile.val[2], \
      &input_tile.val[3])

  // Non-padded regime.

  // Iterate over non-padded output tiles.
  // TODO: avoid spilling W by breaking out the non-padded vs padded case.
  for (int oth = 0; oth < (args.out_rows + 1) / 2; ++oth) {
    for (int otw = 0; otw < (args.out_cols + 1) / 2; ++otw) {
      // load input tile for [oth, otw];
      int ih = oth * 2 - args.pad_rows;
      int iw = otw * 2 - args.pad_cols;
      // fast-path, all accesses in-bounds
      if (C10_LIKELY(
              ih >= 0 && iw >= 0 && ih + 3 < args.in_rows &&
              iw + 3 < args.in_cols && 2 * oth + 1 < args.out_rows &&
              2 * otw + 1 < args.out_cols)) {
        float32x4x4_t input_tile;
        for (int row = 0; row < 4; ++row) {
          input_tile.val[row] =
              vld1q_f32(input + (ih + row) * args.in_cols + iw);
        }

        TILE;

        for (size_t row = 0; row < 2; ++row) {
          vst1_f32(
              output + (oth * 2 + row) * args.out_cols + otw * 2,
              vget_low_f32(input_tile.val[row]));
        }
      } else {
        float block[4][4];
        for (int row = 0; row < 4; ++row) {
          for (int col = 0; col < 4; ++col) {
            if (ih + row >= 0 && iw + col >= 0 && ih + row < args.in_rows &&
                iw + col < args.in_cols) {
              block[row][col] = input[(ih + row) * args.in_cols + iw + col];
            } else {
              block[row][col] = 0.0;
            }
          }
        }

        float32x4x4_t input_tile;
        for (int row = 0; row < 4; ++row) {
          input_tile.val[row] = vld1q_f32(&block[row][0]);
        }

        TILE;

        float oblock[2][2];
        for (int row = 0; row < 2; ++row) {
          vst1_f32(&oblock[row][0], vget_low_f32(input_tile.val[row]));
        }
        for (int row = 0; row < 2; ++row) {
          for (int col = 0; col < 2; ++col) {
            if (2 * oth + row < args.out_rows &&
                2 * otw + col < args.out_cols) {
              output[(2 * oth + row) * args.out_cols + 2 * otw + col] =
                  oblock[row][col];
            }
          }
        }
      }
    }
  }
}

#else
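// Portable fallback: the same Winograd kernel written with GCC/Clang vector
// extensions (a minimal subset of the psimd helpers), used when NEON is not
// available.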

#define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
typedef float psimd_f32 __attribute__((vector_size(16), aligned(1)));
typedef int psimd_s32 __attribute__((__vector_size__(16)));

PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
  *((psimd_f32*)address) = value;
}

PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
  return *((const psimd_f32*)address);
}

PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
  return (psimd_f32){c, c, c, c};
}

#if defined(__clang__)

PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
  return __builtin_shufflevector(a, b, 0, 4 + 0, 1, 4 + 1);
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
  return __builtin_shufflevector(a, b, 2, 4 + 2, 3, 4 + 3);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
  return __builtin_shufflevector(a, b, 0, 1, 4 + 0, 4 + 1);
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
  return __builtin_shufflevector(a, b, 2, 3, 4 + 2, 4 + 3);
}

#else

PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
  return __builtin_shuffle(a, b, (psimd_s32){0, 4 + 0, 1, 4 + 1});
}

PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
  return __builtin_shuffle(a, b, (psimd_s32){2, 4 + 2, 3, 4 + 3});
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
  return __builtin_shuffle(a, b, (psimd_s32){0, 1, 4 + 0, 4 + 1});
}

PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
  return __builtin_shuffle(a, b, (psimd_s32){2, 3, 4 + 2, 4 + 3});
}

#endif

static inline void psimd_transpose4x4_f32(
    const psimd_f32 row0,
    const psimd_f32 row1,
    const psimd_f32 row2,
    const psimd_f32 row3,
    psimd_f32* col0,
    psimd_f32* col1,
    psimd_f32* col2,
    psimd_f32* col3) {
  const psimd_f32 row01lo = psimd_interleave_lo_f32(row0, row1);
  const psimd_f32 row01hi = psimd_interleave_hi_f32(row0, row1);
  const psimd_f32 row23lo = psimd_interleave_lo_f32(row2, row3);
  const psimd_f32 row23hi = psimd_interleave_hi_f32(row2, row3);
  *col0 = psimd_concat_lo_f32(row01lo, row23lo);
  *col1 = psimd_concat_hi_f32(row01lo, row23lo);
  *col2 = psimd_concat_lo_f32(row01hi, row23hi);
  *col3 = psimd_concat_hi_f32(row01hi, row23hi);
}
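// The three Winograd transforms below mirror the NEON versions above:
// B^T d (input), G g (kernel), and A^T m (output), each applied to the rows
// of a tile; psimd_transpose4x4_f32 is used between the two passes to cover
// the columns.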

static inline void winograd_f2k3_input_transform(
    const psimd_f32 d0,
    const psimd_f32 d1,
    const psimd_f32 d2,
    const psimd_f32 d3,
    psimd_f32* transform0,
    psimd_f32* transform1,
    psimd_f32* transform2,
    psimd_f32* transform3) {
  *transform0 = d0 - d2;
  *transform1 = d1 + d2;
  *transform2 = -d1 + d2;
  *transform3 = d1 - d3;
}

static inline void winograd_f2k3_kernel_transform(
    const psimd_f32 g0,
    const psimd_f32 g1,
    const psimd_f32 g2,
    psimd_f32* transform0,
    psimd_f32* transform1,
    psimd_f32* transform2,
    psimd_f32* transform3) {
  const psimd_f32 const_half = psimd_splat_f32(0.5);
  const psimd_f32 half_g0_plus_g2 = const_half * (g0 + g2);
  *transform0 = g0;
  *transform1 = half_g0_plus_g2 + const_half * g1;
  *transform2 = half_g0_plus_g2 - const_half * g1;
  *transform3 = g2;
}

static inline void winograd_f2k3_output_transform(
    const psimd_f32 m0,
    const psimd_f32 m1,
    const psimd_f32 m2,
    const psimd_f32 m3,
    psimd_f32* output0,
    psimd_f32* output1) {
  *output0 = m0 + m1 + m2;
  *output1 = m1 - m2 - m3;
}

void runDepthwise3x3Conv(
    const DepthwiseArgs& args,
    const float* input,
    const float* kernel,
    const float* bias,
    float* output) {
  const psimd_f32 vbias = {0, *bias, 0, 0};
  const psimd_f32 g0 = psimd_load_f32(kernel);
  const psimd_f32 g1 = psimd_load_f32(kernel + 3);
  const psimd_f32 g5678 = psimd_load_f32(kernel + 5);
#ifdef __clang__
  const psimd_f32 g2 = __builtin_shufflevector(g5678, g5678, 1, 2, 3, -1);
#else
  const psimd_f32 g2 =
      __builtin_shuffle(g5678, g5678, (psimd_s32){1, 2, 3, -1});
#endif
  psimd_f32 w[4];
  winograd_f2k3_kernel_transform(g0, g1, g2, &w[0], &w[1], &w[2], &w[3]);
  psimd_transpose4x4_f32(w[0], w[1], w[2], w[3], &w[0], &w[1], &w[2], &w[3]);
  psimd_f32 wg[4];
  winograd_f2k3_kernel_transform(
      w[0], w[1], w[2], &wg[0], &wg[1], &wg[2], &wg[3]);

  // Iterate over output tiles.
  for (int oth = 0; oth < (args.out_rows + 1) / 2; ++oth) {
    for (int otw = 0; otw < (args.out_cols + 1) / 2; ++otw) {
      // load the 4x4 input tile for output tile [oth, otw]
      int ih = oth * 2 - args.pad_rows;
      int iw = otw * 2 - args.pad_cols;
      // gather the tile, zero-padding out-of-bounds elements
      float block[4][4];
      for (int row = 0; row < 4; ++row) {
        for (int col = 0; col < 4; ++col) {
          if (ih + row >= 0 && iw + col >= 0 && ih + row < args.in_rows &&
              iw + col < args.in_cols) {
            block[row][col] = input[(ih + row) * args.in_cols + iw + col];
          } else {
            block[row][col] = 0.0;
          }
        }
      }
      psimd_f32 wd[4];
      winograd_f2k3_input_transform(
          psimd_load_f32(&block[0]),
          psimd_load_f32(&block[1]),
          psimd_load_f32(&block[2]),
          psimd_load_f32(&block[3]),
          &wd[0],
          &wd[1],
          &wd[2],
          &wd[3]);
      psimd_transpose4x4_f32(
          wd[0], wd[1], wd[2], wd[3], &wd[0], &wd[1], &wd[2], &wd[3]);
      winograd_f2k3_input_transform(
          wd[0], wd[1], wd[2], wd[3], &wd[0], &wd[1], &wd[2], &wd[3]);

      for (int row = 0; row < 4; ++row) {
        wd[row] = wg[row] * wd[row];
      }
      wd[1] += vbias;
      psimd_f32 s[4] = {{0}};
      winograd_f2k3_output_transform(wd[0], wd[1], wd[2], wd[3], &s[0], &s[1]);
      psimd_transpose4x4_f32(
          s[0], s[1], s[2], s[3], &s[0], &s[1], &s[2], &s[3]);

      psimd_f32 t0, t1;
      winograd_f2k3_output_transform(s[0], s[1], s[2], s[3], &t0, &t1);

      float oblock[2][4];
      psimd_store_f32(&oblock[0], t0);
      psimd_store_f32(&oblock[1], t1);
      for (int row = 0; row < 2; ++row) {
        for (int col = 0; col < 2; ++col) {
          if (2 * oth + row >= 0 && 2 * otw + col >= 0 &&
              2 * oth + row < args.out_rows && 2 * otw + col < args.out_cols) {
            output[(2 * oth + row) * args.out_cols + 2 * otw + col] =
                oblock[row][col];
          }
        }
      }
    }
  }
}

#endif

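// Conv operator bound to the DEPTHWISE_3x3 engine: NCHW only, depthwise
// (group == input channels == output channels), 3x3 kernel, stride 1.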
class Depthwise3x3ConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  Depthwise3x3ConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws) {
    OPERATOR_NEEDS_FEATURE(
        this->order_ == StorageOrder::NCHW,
        "Depthwise3x3ConvOp only supports NCHW order");
    OPERATOR_NEEDS_FEATURE(this->group_ > 1);
    OPERATOR_NEEDS_FEATURE(this->kernel_w() == 3);
    OPERATOR_NEEDS_FEATURE(this->kernel_h() == 3);
    OPERATOR_NEEDS_FEATURE(this->stride_h() == 1);
    OPERATOR_NEEDS_FEATURE(this->stride_w() == 1);
  }

  bool RunOnDeviceWithOrderNCHW() override {
    const Tensor& X = Input(0);
    auto& filter = Input(1);
    const int N = X.dim32(0), C = X.dim32(1);
    CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
    const int M = filter.dim32(0);

    CAFFE_ENFORCE_EQ(M, X.dim32(1));
    CAFFE_ENFORCE_EQ(C, X.dim32(1));
    CAFFE_ENFORCE_EQ(C, this->group_);
    CAFFE_ENFORCE_EQ(M, this->group_);

    auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0));
    Tensor* Y = Output(0, sizes, at::dtype<float>());

    DepthwiseArgs args;
    args.batch = X.dim32(0);
    args.in_rows = X.dim32(2);
    args.in_cols = X.dim32(3);
    args.stride = this->stride_w();
    args.pad_rows = this->pad_t();
    args.pad_cols = this->pad_l();
    args.out_rows = Y->dim32(2);
    args.out_cols = Y->dim32(3);

    const auto G = this->group_;
    const auto IS = X.dim32(2) * X.dim32(3);
    const auto OS = Y->dim32(2) * Y->dim32(3);

    if (InputSize() != 3 && bias_.size() != M) {
      // no bias.
      bias_.Resize(M);
      math::Set<float, CPUContext>(
          M, 0.0, bias_.mutable_data<float>(), &context_);
    }
    const auto* bias =
        InputSize() == 3 ? Input(2).data<float>() : bias_.data<float>();

    auto f = [&](int n, int g) {
      runDepthwise3x3Conv(
          args,
          X.data<float>() + g * IS + n * G * IS,
          filter.data<float>() + g * 3 * 3,
          bias + g,
          Y->mutable_data<float>() + g * OS + n * G * OS);
    };

    Timer t;

#ifdef C10_MOBILE
    ws_->GetThreadPool()->run(
        [&](int, int n_g) {
          const int g = n_g / N;
          const int n = n_g % N;
          f(n, g);
        },
        N * G);
#else
    for (auto n = 0; n < N; ++n) {
      for (auto g = 0; g < G; ++g) {
        f(n, g);
      }
    }
#endif
    if (FLAGS_caffe2_profile_depthwise) {
      char buffer[1024];
      const double gmacs = double(
                               Y->dim32(2) * Y->dim32(3) * Y->dim32(1) *
                               kernel_w() * kernel_h()) /
          1.0E9;
      const double gflops = 2 * gmacs / t.Seconds();
      auto ret = snprintf(
          buffer,
          sizeof(buffer),
          "H: %3zu, W: %3zu, iC: %3zu, oC: %3zu, K: %1zu, S: %1zu, P: %1zu, GMACs: "
          "%4.2f, totalT: %6.3f, inputT: %6.3f, "
          "kernelT: %6.3f, blockT: %6.3f, outputT: %6.3f, GFLOPS: %6.3f",
          size_t(X.dim(2)),
          size_t(X.dim(3)),
          size_t(X.dim(1)),
          size_t(Y->dim(1)),
          size_t(kernel_w()),
          size_t(stride_w()),
          size_t(pad_t()),
          gmacs,
          t.Seconds() * 1E3,
          0 * 1E3,
          0 * 1E3,
          0 * 1E3,
          0 * 1E3,
          gflops);
      CAFFE_ENFORCE(ret > 0);
      LOG(INFO) << buffer;
    }
    return true;
  }

 private:
  Tensor bias_{CPU};
};

REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, DEPTHWISE_3x3, Depthwise3x3ConvOp);
} // namespace
} // namespace caffe2
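The registration above exposes this implementation as the DEPTHWISE_3x3 engine of the standard Conv operator. Below is a minimal sketch (not part of this file) of how a caller might route a depthwise convolution to it when building a NetDef in C++; it assumes the usual protobuf setters on OperatorDef plus the MakeArgument helper from caffe2/utils/proto_utils.h, and the blob names X, W, b, Y are placeholders.

// Hypothetical usage sketch: select the DEPTHWISE_3x3 engine for a
// stride-1, 3x3, NCHW depthwise convolution (group == input channels).
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/proto_utils.h"

void AddDepthwiseConv3x3(caffe2::NetDef* net, int channels) {
  caffe2::OperatorDef* op = net->add_op();
  op->set_type("Conv");
  op->set_engine("DEPTHWISE_3x3"); // matches REGISTER_CPU_OPERATOR_WITH_ENGINE
  op->add_input("X"); // NCHW input with C == channels
  op->add_input("W"); // filter of shape [channels, 1, 3, 3]
  op->add_input("b"); // optional bias of shape [channels]
  op->add_output("Y");
  op->add_arg()->CopyFrom(caffe2::MakeArgument<std::string>("order", "NCHW"));
  op->add_arg()->CopyFrom(caffe2::MakeArgument<int>("kernel", 3));
  op->add_arg()->CopyFrom(caffe2::MakeArgument<int>("stride", 1));
  op->add_arg()->CopyFrom(caffe2::MakeArgument<int>("pad", 1));
  // The operator requires group == input channels (depthwise).
  op->add_arg()->CopyFrom(caffe2::MakeArgument<int>("group", channels));
}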