Caffe2 - C++ API
A deep-learning, cross-platform ML framework
Macros.h
1 #ifndef C10_MACROS_MACROS_H_
2 #define C10_MACROS_MACROS_H_
3 
4 /* Main entry for c10/macros.
5  *
6  * In your code, include c10/macros/Macros.h directly, instead of individual
7  * files in this folder.
8  */
9 
10 // For build systems that do not directly depend on CMake and directly build
11 // from the source directory (such as Buck), one may not have a cmake_macros.h
12 // file at all. In this case, the build system is responsible for providing
13 // correct macro definitions corresponding to the cmake_macros.h.in file.
14 //
15 // In such scenarios, one should define the macro
16 // C10_USING_CUSTOM_GENERATED_MACROS
17 // to inform this header that it does not need to include the cmake_macros.h
18 // file.
19 
20 #ifndef C10_USING_CUSTOM_GENERATED_MACROS
21 #include "c10/macros/cmake_macros.h"
22 #endif // C10_USING_CUSTOM_GENERATED_MACROS
23 
24 #include "c10/macros/Export.h"
25 
// Deletes the copy constructor and the copy-assignment operator of
// `classname`. Use it inside the class body and terminate it with `;` -- the
// macro intentionally leaves the final declaration unterminated.
//
// Consequences worth knowing:
//  * the class can no longer be used in std containers that copy elements;
//  * because a constructor is now user-declared, the implicit default
//    constructor and the implicit move operations are not generated, so
//    declare them explicitly if the class needs them.
#define C10_DISABLE_COPY_AND_ASSIGN(classname) \
  classname(const classname&) = delete;        \
  classname& operator=(const classname&) = delete
31 
// Token pasting helpers.
//
// CONCAT_IMPL pastes its two arguments verbatim. MACRO_CONCAT adds one level
// of indirection so that macro arguments (e.g. __LINE__ or another macro) are
// fully expanded *before* they are pasted together.
#define CONCAT_IMPL(x, y) x##y
#define MACRO_CONCAT(x, y) CONCAT_IMPL(x, y)

// Forces one extra round of macro expansion on `args` and yields it unchanged.
#define MACRO_EXPAND(args) args
36 
// C10_NODISCARD - warn if a type or return value is discarded.
//
// Expands to:
//  * [[nodiscard]] when compiling as C++17 or newer and the compiler reports
//    support for it;
//  * [[clang::warn_unused_result]] for older C++ modes when the compiler
//    reports support for that attribute;
//  * nothing otherwise.
#define C10_NODISCARD
#if __cplusplus > 201402L && defined(__has_cpp_attribute)
#if __has_cpp_attribute(nodiscard)
#undef C10_NODISCARD
#define C10_NODISCARD [[nodiscard]]
#endif
// Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious
// error when __has_cpp_attribute is given a scoped attribute in C mode.
#elif __cplusplus && defined(__has_cpp_attribute)
#if __has_cpp_attribute(clang::warn_unused_result)
#undef C10_NODISCARD
#define C10_NODISCARD [[clang::warn_unused_result]]
#endif
#endif
52 
// C10_UNUSED - marks a declaration as intentionally unused so the compiler
// does not warn about it. Expands to nothing under MSVC and to
// __attribute__((__unused__)) everywhere else.
#ifdef _MSC_VER
#define C10_UNUSED
#else
#define C10_UNUSED __attribute__((__unused__))
#endif // _MSC_VER
59 
// Simply declare the namespaces, in case a dependent library wants to refer
// to the c10 namespace (or its cuda/hip sub-namespaces) without including any
// nontrivial files.
namespace c10 {} // namespace c10
namespace c10 { namespace cuda {} }
namespace c10 { namespace hip {} }

// Since C10 is the core library for caffe2 (and aten), we simply reroute
// all abstractions defined in c10 to be available in caffe2 and at as well.
// This is only for backwards compatibility. Please use the symbols from the
// c10 namespace where possible.
namespace caffe2 { using namespace c10; }
namespace at { using namespace c10; }
namespace at { namespace cuda { using namespace c10::cuda; }}

// WARNING!!! THIS IS A GIANT HACK!!!
// This line means you cannot simultaneously include c10/hip
// and c10/cuda and then use them from the at::cuda namespace.
// This is true in practice, because HIPIFY works inplace on
// files in ATen/cuda, so it assumes that c10::hip is available
// from at::cuda. This namespace makes that happen. When
// HIPIFY is no longer out-of-place, we can switch the cuda
// here to hip and everyone is happy.
namespace at { namespace cuda { using namespace c10::hip; }}
83 
// C10_NORETURN - declares that a function never returns to its caller.
// Uses the MSVC spelling (__declspec) under MSVC and the GNU-style attribute
// on every other compiler.
#if defined(_MSC_VER)
#define C10_NORETURN __declspec(noreturn)
#else
#define C10_NORETURN __attribute__((noreturn))
#endif
90 
// C10_LIKELY / C10_UNLIKELY - branch prediction hints.
//
// The macros supply their own parentheses, so they can be written directly
// after `if`:
//
//   if C10_LIKELY(some_expr) {
//     ...
//   }
//
// NB: static_cast to boolean is mandatory in C++, because __builtin_expect
// takes a long argument, which means you may trigger the wrong conversion
// without it.
//
// On compilers without __builtin_expect the macros reduce to the bare
// parenthesized expression and carry no hint.
#if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
#else
#define C10_LIKELY(expr) (expr)
#define C10_UNLIKELY(expr) (expr)
#endif
110 
111 #include <sstream>
112 #include <string>
113 
// Host/device function qualifiers and CUDA launch-bound helpers.
//
// When compiled by a CUDA or HIP compiler the C10_HOST / C10_DEVICE /
// C10_HOST_DEVICE macros expand to the matching qualifiers and the
// launch-bound helpers below are available; otherwise the three qualifier
// macros expand to nothing and the launch-bound helpers are not defined.
#if defined(__CUDACC__) || defined(__HIPCC__)
#include <cstdint> // uint32_t, used by the constants below

// Designates functions callable from the host (CPU) and the device (GPU)
#define C10_HOST_DEVICE __host__ __device__
#define C10_DEVICE __device__
#define C10_HOST __host__

// constants from (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications)
// The maximum number of threads per multiprocessor is 1024 for Turing architecture (7.5)
// but 2048 for previous architectures. You'll get warnings if you exceed these constants.
// Hence, the following macros adjust the input values from the user to resolve potential warnings.
// NOTE(review): `>= 750` also selects 1024 for architectures newer than
// Turing; the CUDA programming guide lists larger per-SM limits for some of
// them. The smaller value is conservative -- confirm whether that is intended.
#if __CUDA_ARCH__ >= 750
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024;
#else
constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048;
#endif
// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently
constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024;
// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block size.
// 256 is a good number for this fallback and should give good occupancy and
// versatility across all architectures.
constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;

// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it
//       turns out that although __launch_bounds__ can take constexpr, it
//       can't take a constexpr that has anything to do with templates.
//       Currently we use launch_bounds that depend on template arguments in
//       Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK and
//       C10_MIN_BLOCKS_PER_SM are kept as macros.
// Suppose you were planning to write __launch_bounds__(a, b), based on your performance tuning on a modern GPU.
// Instead, you should write __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)),
// which will also properly respect limits on old architectures.

// Yields `val` when it does not exceed CUDA_MAX_THREADS_PER_BLOCK, otherwise
// CUDA_THREADS_PER_BLOCK_FALLBACK.
#define C10_MAX_THREADS_PER_BLOCK(val)           \
  (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \
                                         : CUDA_THREADS_PER_BLOCK_FALLBACK)

// Yields `blocks_per_sm` when threads_per_block * blocks_per_sm fits within
// CUDA_MAX_THREADS_PER_SM, otherwise ceil(CUDA_MAX_THREADS_PER_SM /
// threads_per_block).
#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm)        \
  ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \
        ? (blocks_per_sm)                                              \
        : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) /       \
           (threads_per_block))))

// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__
// C10_LAUNCH_BOUNDS_0: default launch bounds that should give good occupancy
// and versatility across all architectures.
#define C10_LAUNCH_BOUNDS_0 __launch_bounds__(256, 4)
#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \
  __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))))
#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm)     \
  __launch_bounds__(                                                      \
      (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))),               \
      (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm))))
#else
#define C10_HOST_DEVICE
#define C10_HOST
#define C10_DEVICE
#endif
154 
// C10_HIP_HOST_DEVICE - expands to `__host__ __device__` only when
// __HIP_PLATFORM_HCC__ is defined, and to nothing on every other build.
#ifdef __HIP_PLATFORM_HCC__
#define C10_HIP_HOST_DEVICE __host__ __device__
#else
#define C10_HIP_HOST_DEVICE
#endif
160 
// Platform detection.
//
//   __ANDROID__                                  -> C10_ANDROID, C10_MOBILE
//   __APPLE__ with an iPhone/simulator target    -> C10_IOS, C10_MOBILE
//   __APPLE__ with TARGET_OS_MAC                 -> C10_IOS
//
// The TARGET_* macros are not defined by this header; on Apple platforms they
// normally come from <TargetConditionals.h>. If that header has not been
// included beforehand they evaluate to 0 here and none of the Apple branches
// fire.
// NOTE(review): the last branch defines C10_IOS (without C10_MOBILE) for any
// Apple target with TARGET_OS_MAC set, which per the trailing comment appears
// to cover macOS -- confirm that defining C10_IOS there is intended.
#if defined(__ANDROID__)
#define C10_ANDROID 1
#define C10_MOBILE 1
#elif (                   \
    defined(__APPLE__) && \
    (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
#define C10_IOS 1
#define C10_MOBILE 1
#elif (defined(__APPLE__) && TARGET_OS_MAC)
#define C10_IOS 1
#endif // ANDROID / IOS / MACOS
172 
// Portably determine if a type T is trivially copyable or not.
//
// GCC releases before 5 ship a standard library without
// std::is_trivially_copyable, so the __has_trivial_copy builtin is used
// there as an approximation. Clang also defines __GNUG__/__GNUC__ (reporting
// a GCC 4.x version) for compatibility, so it is excluded explicitly;
// otherwise it would always take the legacy path. `defined()` keeps the
// check clean on compilers that do not define __GNUG__ at all.
#include <type_traits> // std::is_trivially_copyable

#if defined(__GNUG__) && !defined(__clang__) && __GNUC__ < 5
#define C10_IS_TRIVIALLY_COPYABLE(T) __has_trivial_copy(T)
#else
#define C10_IS_TRIVIALLY_COPYABLE(T) std::is_trivially_copyable<T>::value
#endif
179 
180 #endif // C10_MACROS_MACROS_H_
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13
To register your own kernel for an operator, do in one (!) cpp file: C10_REGISTER_KERNEL(OperatorHand...
Definition: alias_info.h:7
Flush-To-Zero and Denormals-Are-Zero mode.