// Depthwise 3x3 convolution operator for Caffe2 (NEON + portable-SIMD paths).
// NOTE(review): this view of the file is elided/garbled by extraction; the
// stray leading integers are original line numbers, not code.
1 #include "caffe2/core/context.h" 2 #include "caffe2/core/timer.h" 3 #include "caffe2/operators/conv_op.h" 4 #include "caffe2/operators/conv_pool_op_base.h" 6 #include "c10/macros/Macros.h" 12 C10_DEFINE_bool(caffe2_profile_depthwise,
// Command-line flag, default false: when set, RunOnDeviceWithOrderNCHW logs
// per-layer timing and GFLOPS (see the FLAGS_caffe2_profile_depthwise branch).
false,
"");
// Geometry bundle for one depthwise conv plane. Field list is elided in this
// view; uses below read batch, in_rows, in_cols, stride, pad_rows, pad_cols,
// out_rows and out_cols — TODO confirm against the full file.
17 struct DepthwiseArgs {
// Winograd F(2x2, 3x3) input transform (one application of B^T), performed in
// place on a 4x4 tile held as four float32x4_t rows.
static inline void winograd_f2k3_input_transform_inplace__neon(
    float32x4_t* d0,
    float32x4_t* d1,
    float32x4_t* d2,
    float32x4_t* d3) {
  // All four results are computed into temporaries before any write-back,
  // since every output reads rows that the assignments overwrite.
  const float32x4_t t0 = *d0 - *d2;
  const float32x4_t t1 = *d1 + *d2;
  const float32x4_t t2 = *d2 - *d1;
  const float32x4_t t3 = *d1 - *d3;
  *d0 = t0;
  *d1 = t1;
  *d2 = t2;
  *d3 = t3;
}
// Winograd F(2x2, 3x3) output transform (one application of A^T), in place.
// Only rows 0 and 1 carry the final result; m2/m3 are consumed as inputs only.
static inline void winograd_f2k3_output_transform_inplace__neon(
    float32x4_t* m0,
    float32x4_t* m1,
    float32x4_t* m2,
    float32x4_t* m3) {
  const float32x4_t row0 = vaddq_f32(vaddq_f32(*m0, *m1), *m2);
  const float32x4_t row1 = vsubq_f32(vsubq_f32(*m1, *m2), *m3);
  *m0 = row0;
  *m1 = row1;
}
// Returns c + a * b. On AArch64 this is a true fused multiply-add (single
// rounding); the AArch32 fallback is a multiply-accumulate.
static inline float32x4_t
vmuladdq_f32(float32x4_t c, float32x4_t a, float32x4_t b) {
#if defined(__aarch64__)
  return vfmaq_f32(c, a, b);
#else
  return vmlaq_f32(c, a, b);
#endif
}
// Returns c - a * b. On AArch64 this is a fused multiply-subtract; the
// AArch32 fallback is a multiply-subtract-accumulate.
static inline float32x4_t
vmulsubq_f32(float32x4_t c, float32x4_t a, float32x4_t b) {
#if defined(__aarch64__)
  return vfmsq_f32(c, a, b);
#else
  return vmlsq_f32(c, a, b);
#endif
}
// Winograd F(2x2, 3x3) kernel transform (one application of G): expands the
// three kernel rows g0..g2 into four transformed rows. Rows 0 and 3 are
// pass-throughs of g0 and g2; the middle rows are (g0 + g2)/2 +/- g1/2.
static inline void winograd_f2k3_kernel_transform__neon(
    const float32x4_t g0,
    const float32x4_t g1,
    const float32x4_t g2,
    float32x4_t* transform0,
    float32x4_t* transform1,
    float32x4_t* transform2,
    float32x4_t* transform3) {
  const float32x4_t half = vdupq_n_f32(0.5f);
  const float32x4_t half_g0_plus_g2 = half * (g0 + g2);
  *transform0 = g0;
  *transform1 = vmuladdq_f32(half_g0_plus_g2, half, g1);
  *transform2 = vmulsubq_f32(half_g0_plus_g2, half, g1);
  *transform3 = g2;
}
// 4x4 transpose of a tile held in a float32x4x4_t. vst4q_f32 interleaves the
// four source rows element-by-element on store, which is exactly a transpose
// when the destination is read back row-wise.
static inline float32x4x4_t v4f_transpose4x4__neon(float32x4x4_t m) {
  float32x4x4_t ret;
  vst4q_f32((float*)(&ret), m);
  return ret;
}
// NEON path: depthwise 3x3 convolution of one channel plane via the Winograd
// F(2x2, 3x3) minimal-filtering scheme — transform the kernel once, then per
// 2x2 output tile transform a 4x4 input tile, multiply element-wise, and
// inverse-transform. NOTE(review): the parameter list (input/kernel/bias/
// output pointers) and several statements are elided in this view.
98 void runDepthwise3x3Conv(
99 const DepthwiseArgs& args,
// Bias is placed in lane 1 only: it is added to transformed row 1 inside the
// tile macro below, and both output-transform rows include m1, so each output
// row receives the bias exactly once.
104 const float32x4_t vbias = vsetq_lane_f32(*bias, vdupq_n_f32(0.0), 1);
105 float32x4x4_t kernel_tile;
// Load the 3x3 kernel as three 4-lane rows. g2 is a load at kernel+5 rotated
// by one lane, so lanes 0..2 hold kernel[6..8] without reading past the end.
107 const float32x4_t g0 = vld1q_f32(kernel);
108 const float32x4_t g1 = vld1q_f32(kernel + 3);
110 const float32x4_t g2 =
111 vextq_f32(vld1q_f32(kernel + 5), vld1q_f32(kernel + 5), 1);
// Kernel transform applied twice (rows, then columns after a transpose)
// yields the full 4x4 transformed kernel tile.
113 winograd_f2k3_kernel_transform__neon(
114 g0, g1, g2, &w.val[0], &w.val[1], &w.val[2], &w.val[3]);
115 w = v4f_transpose4x4__neon(w);
117 winograd_f2k3_kernel_transform__neon(
124 &kernel_tile.val[3]);
// Per-tile macro (backslash continuations fused in this view): input
// transform in both dimensions, element-wise multiply with the transformed
// kernel, bias add on row 1, then the output transform in both dimensions.
128 winograd_f2k3_input_transform_inplace__neon( \ 129 &input_tile.val[0], \ 130 &input_tile.val[1], \ 131 &input_tile.val[2], \ 132 &input_tile.val[3]); \ 133 input_tile = v4f_transpose4x4__neon(input_tile); \ 134 winograd_f2k3_input_transform_inplace__neon( \ 135 &input_tile.val[0], \ 136 &input_tile.val[1], \ 137 &input_tile.val[2], \ 138 &input_tile.val[3]); \ 140 for (int row = 0; row < 4; ++row) { \ 141 input_tile.val[row] = \ 142 vmulq_f32(input_tile.val[row], kernel_tile.val[row]); \ 145 input_tile.val[1] = input_tile.val[1] + vbias; \ 146 winograd_f2k3_output_transform_inplace__neon( \ 147 &input_tile.val[0], \ 148 &input_tile.val[1], \ 149 &input_tile.val[2], \ 150 &input_tile.val[3]); \ 151 input_tile = v4f_transpose4x4__neon(input_tile); \ 152 winograd_f2k3_output_transform_inplace__neon( \ 153 &input_tile.val[0], \ 154 &input_tile.val[1], \ 155 162 for (
int oth = 0; oth < (args.out_rows + 1) / 2; ++oth) {
163 for (
int otw = 0; otw < (args.out_cols + 1) / 2; ++otw) {
// Top-left input coordinate of this 4x4 tile (may be negative under padding).
165 int ih = oth * 2 - args.pad_rows;
166 int iw = otw * 2 - args.pad_cols;
// Fast branch: the whole 4x4 input tile and 2x2 output tile are in bounds,
// so rows can be loaded and stored directly without edge handling.
169 ih >= 0 && iw >= 0 && ih + 3 < args.in_rows &&
170 iw + 3 < args.in_cols && 2 * oth + 1 < args.out_rows &&
171 2 * otw + 1 < args.out_cols
173 float32x4x4_t input_tile;
174 for (
int row = 0; row < 4; ++row) {
175 input_tile.val[row] =
176 vld1q_f32(input + (ih + row) * args.in_cols + iw);
// Store the low two lanes of the first two rows: the 2x2 output tile.
181 for (
size_t row = 0; row < 2; ++row) {
183 output + (oth * 2 + row) * args.out_cols + otw * 2,
184 vget_low_f32(input_tile.val[row]));
// Slow branch: gather the 4x4 tile into a zero-padded scratch block,
// clamping reads to the input bounds.
188 for (
int row = 0; row < 4; ++row) {
189 for (
int col = 0; col < 4; ++col) {
190 if (ih + row >= 0 && iw + col >= 0 && ih + row < args.in_rows &&
191 iw + col < args.in_cols) {
192 block[row][col] = input[(ih + row) * args.in_cols + iw + col];
194 block[row][col] = 0.0;
199 float32x4x4_t input_tile;
200 for (
int row = 0; row < 4; ++row) {
201 input_tile.val[row] = vld1q_f32(&block[row][0]);
// Spill the 2x2 result to a scratch block, then write back only the
// elements that fall inside the output bounds.
207 for (
int row = 0; row < 2; ++row) {
208 vst1_f32(&oblock[row][0], vget_low_f32(input_tile.val[row]));
210 for (
int row = 0; row < 2; ++row) {
211 for (
int col = 0; col < 2; ++col) {
212 if (2 * oth + row < args.out_rows &&
213 2 * otw + col < args.out_cols) {
214 output[(2 * oth + row) * args.out_cols + 2 * otw + col] =
// PSIMD_INTRINSIC marks the tiny portable-SIMD helpers below as file-local
// and force-inlined. psimd_f32 / psimd_s32 are 16-byte (4-lane) GCC/clang
// extension vectors; aligned(1) on psimd_f32 makes its loads/stores safe on
// unaligned addresses.
226 #define PSIMD_INTRINSIC inline static __attribute__((__always_inline__)) 227 typedef float psimd_f32 __attribute__((vector_size(16), aligned(1)));
228 typedef int psimd_s32 __attribute__((__vector_size__(16)));
230 PSIMD_INTRINSIC
void psimd_store_f32(
void* address, psimd_f32 value) {
231 *((psimd_f32*)address) = value;
234 PSIMD_INTRINSIC psimd_f32 psimd_load_f32(
const void* address) {
235 return *((
const psimd_f32*)address);
238 PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(
float c) {
239 return (psimd_f32){c, c, c, c};
242 #if defined(__clang__) 244 PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
245 return __builtin_shufflevector(a, b, 0, 4 + 0, 1, 4 + 1);
248 PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
249 return __builtin_shufflevector(a, b, 2, 4 + 2, 3, 4 + 3);
252 PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
253 return __builtin_shufflevector(a, b, 0, 1, 4 + 0, 4 + 1);
256 PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
257 return __builtin_shufflevector(a, b, 2, 3, 4 + 2, 4 + 3);
262 PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
263 return __builtin_shuffle(a, b, (psimd_s32){0, 4 + 0, 1, 4 + 1});
266 PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
267 return __builtin_shuffle(a, b, (psimd_s32){2, 4 + 2, 3, 4 + 3});
269 PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
270 return __builtin_shuffle(a, b, (psimd_s32){0, 1, 4 + 0, 4 + 1});
273 PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
274 return __builtin_shuffle(a, b, (psimd_s32){2, 3, 4 + 2, 4 + 3});
279 static inline void psimd_transpose4x4_f32(
280 const psimd_f32 row0,
281 const psimd_f32 row1,
282 const psimd_f32 row2,
283 const psimd_f32 row3,
288 const psimd_f32 row01lo = psimd_interleave_lo_f32(row0, row1);
289 const psimd_f32 row01hi = psimd_interleave_hi_f32(row0, row1);
290 const psimd_f32 row23lo = psimd_interleave_lo_f32(row2, row3);
291 const psimd_f32 row23hi = psimd_interleave_hi_f32(row2, row3);
292 *col0 = psimd_concat_lo_f32(row01lo, row23lo);
293 *col1 = psimd_concat_hi_f32(row01lo, row23lo);
294 *col2 = psimd_concat_lo_f32(row01hi, row23hi);
295 *col3 = psimd_concat_hi_f32(row01hi, row23hi);
298 static inline void winograd_f2k3_input_transform(
303 psimd_f32* transform0,
304 psimd_f32* transform1,
305 psimd_f32* transform2,
306 psimd_f32* transform3) {
307 *transform0 = d0 - d2;
308 *transform1 = d1 + d2;
309 *transform2 = -d1 + d2;
310 *transform3 = d1 - d3;
313 static inline void winograd_f2k3_kernel_transform(
317 psimd_f32* transform0,
318 psimd_f32* transform1,
319 psimd_f32* transform2,
320 psimd_f32* transform3) {
321 const psimd_f32 const_half = psimd_splat_f32(0.5);
322 const psimd_f32 half_g0_plus_g2 = const_half * (g0 + g2);
324 *transform1 = half_g0_plus_g2 + const_half * g1;
325 *transform2 = half_g0_plus_g2 - const_half * g1;
329 static inline void winograd_f2k3_output_transform(
335 psimd_f32* output1) {
336 *output0 = m0 + m1 + m2;
337 *output1 = m1 - m2 - m3;
// Portable-SIMD path: same Winograd F(2x2, 3x3) algorithm as the NEON
// version above, written with GCC/clang vector extensions. NOTE(review):
// the parameter list and several statements are elided in this view.
340 void runDepthwise3x3Conv(
341 const DepthwiseArgs& args,
// Bias in lane 1 only — both output-transform rows include m1, so each output
// row receives the bias exactly once (the vbias add itself sits on an elided
// line; confirm against the full file).
346 const psimd_f32 vbias = {0, *bias, 0, 0};
347 const psimd_f32 g0 = psimd_load_f32(kernel);
348 const psimd_f32 g1 = psimd_load_f32(kernel + 3);
// Load at kernel+5 and shift left one lane so g2 holds kernel[6..8] without
// reading past the 9-element kernel. The clang and gcc spellings differ
// (shufflevector vs shuffle); the preprocessor split is elided in this view.
349 const psimd_f32 g5678 = psimd_load_f32(kernel + 5);
351 const psimd_f32 g2 = __builtin_shufflevector(g5678, g5678, 1, 2, 3, -1);
354 __builtin_shuffle(g5678, g5678, (psimd_s32){1, 2, 3, -1});
// Kernel transform in both dimensions -> 4x4 transformed kernel tile wg.
357 winograd_f2k3_kernel_transform(g0, g1, g2, &w[0], &w[1], &w[2], &w[3]);
358 psimd_transpose4x4_f32(w[0], w[1], w[2], w[3], &w[0], &w[1], &w[2], &w[3]);
360 winograd_f2k3_kernel_transform(
361 w[0], w[1], w[2], &wg[0], &wg[1], &wg[2], &wg[3]);
// Iterate over 2x2 output tiles.
364 for (
int oth = 0; oth < (args.out_rows + 1) / 2; ++oth) {
365 for (
int otw = 0; otw < (args.out_cols + 1) / 2; ++otw) {
368 int ih = oth * 2 - args.pad_rows;
369 int iw = otw * 2 - args.pad_cols;
// Gather the 4x4 input tile into a zero-padded scratch block (this path
// always goes through the scratch copy, unlike the NEON fast branch).
372 for (
int row = 0; row < 4; ++row) {
373 for (
int col = 0; col < 4; ++col) {
374 if (ih + row >= 0 && iw + col >= 0 && ih + row < args.in_rows &&
375 iw + col < args.in_cols) {
376 block[row][col] = input[(ih + row) * args.in_cols + iw + col];
378 block[row][col] = 0.0;
// Input transform in both dimensions into wd.
383 winograd_f2k3_input_transform(
384 psimd_load_f32(&block[0]),
385 psimd_load_f32(&block[1]),
386 psimd_load_f32(&block[2]),
387 psimd_load_f32(&block[3]),
392 psimd_transpose4x4_f32(
393 wd[0], wd[1], wd[2], wd[3], &wd[0], &wd[1], &wd[2], &wd[3]);
394 winograd_f2k3_input_transform(
395 wd[0], wd[1], wd[2], wd[3], &wd[0], &wd[1], &wd[2], &wd[3]);
// Element-wise product with the transformed kernel.
397 for (
int row = 0; row < 4; ++row) {
398 wd[row] = wg[row] * wd[row];
// Output transform in both dimensions; only s[0]/s[1] (then t0/t1) carry
// real data, the rest of s stays zero for the transpose.
401 psimd_f32 s[4] = {{0}};
402 winograd_f2k3_output_transform(wd[0], wd[1], wd[2], wd[3], &s[0], &s[1]);
403 psimd_transpose4x4_f32(
404 s[0], s[1], s[2], s[3], &s[0], &s[1], &s[2], &s[3]);
407 winograd_f2k3_output_transform(s[0], s[1], s[2], s[3], &t0, &t1);
// Spill the 2x2 result and write back only the in-bounds elements.
410 psimd_store_f32(&oblock[0], t0);
411 psimd_store_f32(&oblock[1], t1);
412 for (
int row = 0; row < 2; ++row) {
413 for (
int col = 0; col < 2; ++col) {
414 if (2 * oth + row >= 0 && 2 * otw + col >= 0 &&
415 2 * oth + row < args.out_rows && 2 * otw + col < args.out_cols) {
416 output[(2 * oth + row) * args.out_cols + 2 * otw + col] =
// CPU operator backing Conv with engine DEPTHWISE_3x3: NCHW-only, grouped
// (depthwise) 3x3 stride-1 convolution. Each (image, channel) plane is
// apparently handed to runDepthwise3x3Conv (the call itself sits on elided
// lines). Several statements are elided in this view.
427 class Depthwise3x3ConvOp final :
public ConvPoolOpBase<CPUContext> {
429 USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
430 Depthwise3x3ConvOp(
const OperatorDef& operator_def, Workspace* ws)
431 : ConvPoolOpBase<CPUContext>(operator_def, ws) {
// Constructor-time feature checks: this engine only handles NCHW, grouped,
// 3x3, stride-1 convolutions.
432 OPERATOR_NEEDS_FEATURE(
433 this->order_ == StorageOrder::NCHW,
434 "Depthwise3x3ConvOp only supports NCHW order");
435 OPERATOR_NEEDS_FEATURE(this->group_ > 1);
436 OPERATOR_NEEDS_FEATURE(this->kernel_w() == 3);
437 OPERATOR_NEEDS_FEATURE(this->kernel_h() == 3);
438 OPERATOR_NEEDS_FEATURE(this->stride_h() == 1);
439 OPERATOR_NEEDS_FEATURE(this->stride_w() == 1);
442 bool RunOnDeviceWithOrderNCHW()
override {
443 const Tensor& X = Input(0);
444 auto& filter = Input(1);
445 const int N = X.dim32(0),
C = X.dim32(1);
446 CAFFE_ENFORCE_EQ(X.ndim(), filter.ndim());
447 const int M = filter.dim32(0);
// Depthwise invariant: output channels == input channels == group count.
449 CAFFE_ENFORCE_EQ(M, X.dim32(1));
450 CAFFE_ENFORCE_EQ(
C, X.dim32(1));
451 CAFFE_ENFORCE_EQ(
C, this->group_);
452 CAFFE_ENFORCE_EQ(M, this->group_);
454 auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0));
455 Tensor* Y = Output(0, sizes, at::dtype<float>());
// Fill the geometry bundle consumed by the per-plane kernel.
458 args.batch = X.dim32(0);
459 args.in_rows = X.dim32(2);
460 args.in_cols = X.dim32(3);
461 args.stride = this->stride_w();
462 args.pad_rows = this->pad_t();
463 args.pad_cols = this->pad_l();
464 args.out_rows = Y->dim32(2);
465 args.out_cols = Y->dim32(3);
// Per-plane strides: IS/OS are the input/output spatial sizes, G the groups.
467 const auto G = this->group_;
468 const auto IS = X.dim32(2) * X.dim32(3);
469 const auto OS = Y->dim32(2) * Y->dim32(3);
// Without a bias input, lazily materialize an all-zero bias vector so the
// kernel can always read through a bias pointer.
471 if (InputSize() != 3 && bias_.size() != M) {
474 math::Set<float, CPUContext>(
475 M, 0.0, bias_.mutable_data<
float>(), &context_);
478 InputSize() == 3 ? Input(2).data<
float>() : bias_.data<
float>();
// f(n, g): process channel plane g of image n (plane pointers computed from
// the per-plane strides above).
480 auto f = [&](
int n,
int g) {
483 X.data<
float>() + g * IS + n * G * IS,
484 filter.data<
float>() + g * 3 * 3,
486 Y->mutable_data<
float>() + g * OS + n * G * OS);
// Threaded path: a flat n_g index is split back into (n, g).
// NOTE(review): g = n_g / N, n = n_g % N is the reverse of what the name
// n_g suggests — confirm the flattening order against the full file.
492 ws_->GetThreadPool()->run(
494 const int g = n_g / N;
495 const int n = n_g % N;
// Serial fallback path over all images and groups.
500 for (
auto n = 0; n < N; ++n) {
501 for (
auto g = 0; g < G; ++g) {
// Optional profiling: log layer shape, GMACs and achieved GFLOPS
// (gated on the caffe2_profile_depthwise flag defined at the top of file).
506 if (FLAGS_caffe2_profile_depthwise) {
508 const double gmacs = double(
509 Y->dim32(2) * Y->dim32(3) * Y->dim32(1) *
510 kernel_w() * kernel_h()) /
512 const double gflops = 2 * gmacs / t.Seconds();
516 "H: %3zu, W: %3zu, iC: %3zu, oC: %3zu, K: %1zu, S: %1zu, P: %1zu, GMACs: " 517 "%4.2f, totalT: %6.3f, inputT: %6.3f, " 518 "kernelT: %6.3f, blockT: %6.3f, outputT: %6.3f, GFLOPS: %6.3f",
533 CAFFE_ENFORCE(ret > 0);
// Expose this implementation as the "DEPTHWISE_3x3" engine of the Conv op.
543 REGISTER_CPU_OPERATOR_WITH_ENGINE(
Conv, DEPTHWISE_3x3, Depthwise3x3ConvOp);
// A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
// (stray documentation text pulled in by extraction; commented out so it does not read as code)