Caffe2 - C++ API
A deep learning, cross platform ML framework
common.h
1 // !!!! PLEASE READ !!!!
2 // Minimize the headers included (even transitively) from _avx*.cc files: those
3 // files are compiled with platform-dependent compiler options, so functions
4 // defined in such headers can be reused by other translation units and cause
5 // illegal-instruction errors at run time.
6 
7 // Common utilities for writing performance kernels and easy dispatching of
8 // different backends.
9 /*
10 The general workflow shall be as follows, say we want to
11 implement a functionality called void foo(int a, float b).
12 
13 In foo.h, do:
14  void foo(int a, float b);
15 
16 In foo_avx512.cc, do:
17  void foo__avx512(int a, float b) {
18  [actual avx512 implementation]
19  }
20 
21 In foo_avx2.cc, do:
22  void foo__avx2(int a, float b) {
23  [actual avx2 implementation]
24  }
25 
26 In foo_avx.cc, do:
27  void foo__avx(int a, float b) {
28  [actual avx implementation]
29  }
30 
31 In foo.cc, do:
32  // The base implementation should *always* be provided.
33  void foo__base(int a, float b) {
34  [base, possibly slow implementation]
35  }
36  decltype(foo__base) foo__avx512;
37  decltype(foo__base) foo__avx2;
38  decltype(foo__base) foo__avx;
39  void foo(int a, float b) {
40  // You should always order things by their preference, faster
41  // implementations earlier in the function.
42  AVX512_DO(foo, a, b);
43  AVX2_DO(foo, a, b);
44  AVX_DO(foo, a, b);
45  BASE_DO(foo, a, b);
46  }
47 
48 */
49 // Details: this functionality basically covers the cases for both build time
50 // and run time architecture support.
51 //
52 // During build time:
53 // The build system should provide flags CAFFE2_PERF_WITH_AVX512,
54 // CAFFE2_PERF_WITH_AVX2, and CAFFE2_PERF_WITH_AVX that correspond to the
55 // __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__, and __AVX__ flags the
56 // compiler provides. Note that we do not use the compiler flags but rely on
57 // the build system flags, because the common files (like foo.cc above) will
58 // always be built without __AVX512F__, __AVX512DQ__, __AVX512VL__, __AVX2__
59 // and __AVX__.
60 // During run time:
61 // We use cpuid to identify CPU support and run the proper functions.
62 
63 #pragma once
64 
65 #include "caffe2/utils/cpuid.h"
66 
67 // DO macros: these should be used in your entry function, similar to foo()
68 // above, that routes implementations based on CPU capability.
69 
// BASE_DO(fn, args...): unconditional fallback. Expands to a statement that
// returns fn__base(args...) from the enclosing function, so it belongs last
// in the dispatcher, after every capability-gated *_DO macro.
#define BASE_DO(fn, ...) \
  return fn##__base(__VA_ARGS__);
71 
// AVX512_DO(funcname, args...)
//
// When the build enables AVX512 kernels (CAFFE2_PERF_WITH_AVX512) and
// GetCpuId() reports avx512f, avx512dq and avx512vl at run time, returns
// funcname__avx512(args...) from the enclosing function. Otherwise it does
// nothing and control continues with the next statement of the dispatcher.
// When the build flag is absent the macro expands to nothing, so the
// funcname__avx512 symbol is never referenced.
//
// The expansion is wrapped in its own compound statement so that the hidden
// `if` can never capture an `else` written after the macro invocation. Call
// sites may be written with or without a trailing semicolon.
#ifdef CAFFE2_PERF_WITH_AVX512
#define AVX512_DO(funcname, ...)                            \
  {                                                         \
    if (GetCpuId().avx512f() && GetCpuId().avx512dq() &&    \
        GetCpuId().avx512vl()) {                            \
      return funcname##__avx512(__VA_ARGS__);               \
    }                                                       \
  }
#else // CAFFE2_PERF_WITH_AVX512
#define AVX512_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX512
81 
// AVX2_DO(funcname, args...)
//   Returns funcname__avx2(args...) from the enclosing function when
//   GetCpuId() reports avx2 at run time.
//
// AVX2_FMA_DO(funcname, args...)
//   Returns funcname__avx2_fma(args...) from the enclosing function when
//   GetCpuId() reports both avx2 and fma at run time.
//
// Both macros are active only when the build defines CAFFE2_PERF_WITH_AVX2;
// otherwise they expand to nothing and the specialised symbols are never
// referenced. When the run-time check fails, control continues with the next
// statement of the dispatcher.
//
// Each expansion is wrapped in its own compound statement so that the hidden
// `if` can never capture an `else` written after the macro invocation. Call
// sites may be written with or without a trailing semicolon.
#ifdef CAFFE2_PERF_WITH_AVX2
#define AVX2_DO(funcname, ...)                  \
  {                                             \
    if (GetCpuId().avx2()) {                    \
      return funcname##__avx2(__VA_ARGS__);     \
    }                                           \
  }
#define AVX2_FMA_DO(funcname, ...)              \
  {                                             \
    if (GetCpuId().avx2() && GetCpuId().fma()) { \
      return funcname##__avx2_fma(__VA_ARGS__); \
    }                                           \
  }
#else // CAFFE2_PERF_WITH_AVX2
#define AVX2_DO(funcname, ...)
#define AVX2_FMA_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX2
95 
// AVX_DO(funcname, args...)
//   Returns funcname__avx(args...) from the enclosing function when
//   GetCpuId() reports avx at run time.
//
// AVX_F16C_DO(funcname, args...)
//   Returns funcname__avx_f16c(args...) from the enclosing function when
//   GetCpuId() reports both avx and f16c at run time.
//
// Both macros are active only when the build defines CAFFE2_PERF_WITH_AVX;
// otherwise they expand to nothing and the specialised symbols are never
// referenced. When the run-time check fails, control continues with the next
// statement of the dispatcher.
//
// Each expansion is wrapped in its own compound statement so that the hidden
// `if` can never capture an `else` written after the macro invocation. Call
// sites may be written with or without a trailing semicolon.
#ifdef CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...)                    \
  {                                              \
    if (GetCpuId().avx()) {                      \
      return funcname##__avx(__VA_ARGS__);       \
    }                                            \
  }
#define AVX_F16C_DO(funcname, ...)               \
  {                                              \
    if (GetCpuId().avx() && GetCpuId().f16c()) { \
      return funcname##__avx_f16c(__VA_ARGS__);  \
    }                                            \
  }
#else // CAFFE2_PERF_WITH_AVX
#define AVX_DO(funcname, ...)
#define AVX_F16C_DO(funcname, ...)
#endif // CAFFE2_PERF_WITH_AVX