Caffe2 - C++ API
A deep learning, cross-platform ML framework
vec256_base.h
#pragma once

#include <cstring>
#include <functional>
#include <cmath>
#include <type_traits>
#include <bitset>

#include <ATen/Utils.h>
#include <ATen/native/Copy.h>
#include <ATen/NumericUtils.h>
#include <c10/util/C++17.h>

#if defined(__GNUC__)
#define __at_align32__ __attribute__((aligned(32)))
#elif defined(_WIN32)
#define __at_align32__ __declspec(align(32))
#else
#define __at_align32__
#endif

namespace at {
namespace vec256 {
// See Note [Acceptable use of anonymous namespace in header]
namespace {

template<size_t n> struct int_of_size;

#define DEFINE_INT_OF_SIZE(int_t) \
template<> struct int_of_size<sizeof(int_t)> { using type = int_t; }

DEFINE_INT_OF_SIZE(int64_t);
DEFINE_INT_OF_SIZE(int32_t);
DEFINE_INT_OF_SIZE(int16_t);
DEFINE_INT_OF_SIZE(int8_t);

#undef DEFINE_INT_OF_SIZE

template <typename T>
using int_same_size_t = typename int_of_size<sizeof(T)>::type;

// NOTE: If you specialize on a type, you must define all operations!

// emulates vectorized types
template <class T>
struct Vec256 {
private:
  T values[32 / sizeof(T)] = {0};
public:
  // Note [constexpr static function to avoid odr-usage compiler bug]
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // Why, you might ask, is size defined to be a static constexpr function,
  // rather than a more ordinary 'static constexpr int size;' variable?
  // The problem lies within ODR rules for static constexpr members versus
  // static constexpr functions. First, recall that this class (along with
  // all of its derivations) lives in an anonymous namespace: it is intended
  // to be *completely* inlined at its use-sites, because we need to compile
  // it multiple times for different instruction sets.
  //
  // Because of this constraint, we CANNOT provide a single definition for
  // any static members in this class; since we want to compile the class
  // multiple times, there wouldn't actually be any good place to put the
  // definition. Now here is the problem: if we ODR-use a static constexpr
  // member, we are *obligated* to provide a definition. Without the
  // definition, you get a compile error like:
  //
  //    relocation R_X86_64_PC32 against undefined symbol
  //    `_ZN2at6vec25612_GLOBAL__N_16Vec256IdE4sizeE' can not be used when
  //    making a shared object; recompile with -fPIC
  //
  // If this were C++17, we could replace the static constexpr variable with
  // an inline variable, which doesn't require a separate definition. But we
  // are not on C++17. So the next best thing is to replace the member with a
  // static constexpr (and therefore inline) function, which does not require
  // a definition either.
  //
  // Also, technically according to the C++ standard, we don't have to define
  // a constexpr variable if we never odr-use it. But it seems that some
  // versions of GCC/Clang have buggy determinations of whether an identifier
  // is odr-used, and in any case it's hard to tell whether a variable is
  // odr-used or not. So best to just cut the problem off at the root.
  static constexpr int size() {
    return 32 / sizeof(T);
  }
  Vec256() {}
  Vec256(T val) {
    for (int i = 0; i != size(); i++) {
      values[i] = val;
    }
  }
  template<typename... Args,
           typename = c10::guts::enable_if_t<(sizeof...(Args) == size())>>
  Vec256(Args... vals) : values{vals...} {}
  template <int64_t mask_>
  static Vec256<T> blend(const Vec256<T>& a, const Vec256<T>& b) {
    int64_t mask = mask_;
    Vec256 vec;
    for (int64_t i = 0; i < size(); i++) {
      if (mask & 0x01) {
        vec[i] = b[i];
      } else {
        vec[i] = a[i];
      }
      mask = mask >> 1;
    }
    return vec;
  }
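  // Illustrative usage (not part of the original header): with
  // Vec256<float> (8 lanes), bit i of the compile-time mask picks lane i
  // from b when set, and from a when clear:
  //
  //   Vec256<float> a(0.0f), b(1.0f);
  //   auto r = Vec256<float>::blend<0x0F>(a, b);  // {1, 1, 1, 1, 0, 0, 0, 0}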
  static Vec256<T> blendv(const Vec256<T>& a, const Vec256<T>& b,
                          const Vec256<T>& mask) {
    Vec256 vec;
    int_same_size_t<T> buffer[size()];
    mask.store(buffer);
    for (int64_t i = 0; i < size(); i++) {
      if (buffer[i] & 0x01) {
        vec[i] = b[i];
      } else {
        vec[i] = a[i];
      }
    }
    return vec;
  }
  static Vec256<T> arange(T base = static_cast<T>(0), T step = static_cast<T>(1)) {
    Vec256 vec;
    for (int64_t i = 0; i < size(); i++) {
      vec.values[i] = base + i * step;
    }
    return vec;
  }
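  // Illustrative usage (not part of the original header): arange fills the
  // lanes with an arithmetic progression, e.g. for Vec256<float>:
  //
  //   auto v = Vec256<float>::arange(10.0f, 2.0f);
  //   // v = {10, 12, 14, 16, 18, 20, 22, 24}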
  static Vec256<T> set(const Vec256<T>& a, const Vec256<T>& b, int64_t count = size()) {
    Vec256 vec;
    for (int64_t i = 0; i < size(); i++) {
      if (i < count) {
        vec[i] = b[i];
      } else {
        vec[i] = a[i];
      }
    }
    return vec;
  }
  static Vec256<T> loadu(const void* ptr) {
    Vec256 vec;
    std::memcpy(vec.values, ptr, 32);
    return vec;
  }
  static Vec256<T> loadu(const void* ptr, int64_t count) {
    Vec256 vec;
    std::memcpy(vec.values, ptr, count * sizeof(T));
    return vec;
  }
  void store(void* ptr, int count = size()) const {
    std::memcpy(ptr, values, count * sizeof(T));
  }
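  // Illustrative usage (not part of the original header): the count
  // parameters exist so that tails shorter than a full vector can be
  // loaded and stored safely. Assuming float:
  //
  //   float src[3] = {1.0f, 2.0f, 3.0f}, dst[3];
  //   auto v = Vec256<float>::loadu(src, 3);  // unread lanes remain zero
  //   v.store(dst, 3);                        // writes back only 3 floats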
  const T& operator[](int idx) const {
    return values[idx];
  }
  T& operator[](int idx) {
    return values[idx];
  }
  Vec256<T> map(T (*f)(T)) const {
    Vec256<T> ret;
    for (int64_t i = 0; i != size(); i++) {
      ret[i] = f(values[i]);
    }
    return ret;
  }
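  // Illustrative usage (not part of the original header): map applies a
  // scalar function lane-wise; the transcendental methods below are all
  // defined in terms of it. For example, with T = float:
  //
  //   Vec256<float> x(4.0f);
  //   auto y = x.map(std::sqrt);  // equivalent to x.sqrt(); every lane is 2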
  Vec256<T> abs() const {
    Vec256<T> ret;
    for (int64_t i = 0; i < size(); i++) {
      ret[i] = values[i] < 0 ? -values[i] : values[i];
    }
    return ret;
  }
  Vec256<T> acos() const {
    return map(std::acos);
  }
  Vec256<T> asin() const {
    return map(std::asin);
  }
  Vec256<T> atan() const {
    return map(std::atan);
  }
  Vec256<T> erf() const {
    return map(std::erf);
  }
  Vec256<T> erfc() const {
    return map(std::erfc);
  }
  Vec256<T> exp() const {
    return map(std::exp);
  }
  Vec256<T> expm1() const {
    return map(std::expm1);
  }
  Vec256<T> log() const {
    return map(std::log);
  }
  Vec256<T> log10() const {
    return map(std::log10);
  }
  Vec256<T> log1p() const {
    return map(std::log1p);
  }
  Vec256<T> log2() const {
    return map(std::log2);
  }
  Vec256<T> ceil() const {
    return map(std::ceil);
  }
  Vec256<T> cos() const {
    return map(std::cos);
  }
  Vec256<T> cosh() const {
    return map(std::cosh);
  }
  Vec256<T> floor() const {
    return map(std::floor);
  }
  Vec256<T> neg() const {
    return map([](T x) { return -x; });
  }
  Vec256<T> round() const {
    return map(std::nearbyint);
  }
  Vec256<T> sin() const {
    return map(std::sin);
  }
  Vec256<T> sinh() const {
    return map(std::sinh);
  }
  Vec256<T> tan() const {
    return map(std::tan);
  }
  Vec256<T> tanh() const {
    return map(std::tanh);
  }
  Vec256<T> trunc() const {
    return map(std::trunc);
  }
  Vec256<T> sqrt() const {
    return map(std::sqrt);
  }
  Vec256<T> reciprocal() const {
    return map([](T x) { return (T)(1) / x; });
  }
  Vec256<T> rsqrt() const {
    return map([](T x) { return 1 / std::sqrt(x); });
  }
  Vec256<T> pow(const Vec256<T> &exp) const {
    Vec256<T> ret;
    for (int64_t i = 0; i < size(); i++) {
      ret[i] = std::pow(values[i], exp[i]);
    }
    return ret;
  }
#define DEFINE_COMP(binary_pred) \
  Vec256<T> operator binary_pred(const Vec256<T> &other) const { \
    Vec256<T> vec; \
    for (int64_t i = 0; i != size(); i++) { \
      if (values[i] binary_pred other.values[i]) { \
        std::memset(static_cast<void*>(vec.values + i), 0xFF, sizeof(T)); \
      } else { \
        std::memset(static_cast<void*>(vec.values + i), 0, sizeof(T)); \
      } \
    } \
    return vec; \
  }
  DEFINE_COMP(==)
  DEFINE_COMP(!=)
  DEFINE_COMP(>=)
  DEFINE_COMP(<=)
  DEFINE_COMP(>)
  DEFINE_COMP(<)
#undef DEFINE_COMP

};
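// Illustrative usage (not part of the original header): the comparison
// operators above return a lane-wise bitmask (all bits set where the
// predicate holds), following the convention of SIMD compare instructions,
// so the result can feed directly into blendv. E.g. clamping from above:
//
//   auto x = Vec256<float>::arange(0.0f);                  // {0, 1, ..., 7}
//   Vec256<float> limit(3.0f);
//   auto y = Vec256<float>::blendv(x, limit, x > limit);   // {0, 1, 2, 3, 3, 3, 3, 3}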

template <class T> Vec256<T> inline operator+(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = a[i] + b[i];
  }
  return c;
}

template <class T> Vec256<T> inline operator-(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = a[i] - b[i];
  }
  return c;
}

template <class T> Vec256<T> inline operator*(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = a[i] * b[i];
  }
  return c;
}

template <class T> Vec256<T> inline operator/(const Vec256<T> &a, const Vec256<T> &b) __ubsan_ignore_float_divide_by_zero__ {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = a[i] / b[i];
  }
  return c;
}

template <class T> Vec256<T> inline operator||(
    const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = a[i] || b[i];
  }
  return c;
}
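// Illustrative usage (not part of the original header): the element-wise
// operators compose like ordinary scalar arithmetic:
//
//   Vec256<double> a(2.0), b(3.0), c(1.0);
//   auto d = a * b + c;  // every lane holds 7.0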

// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if
// either input is a NaN.
template <class T> Vec256<T> inline maximum(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = (a[i] > b[i]) ? a[i] : b[i];
    if (_isnan(a[i])) {
      // If either input is NaN, propagate a NaN.
      // NOTE: The case where b[i] was NaN is handled correctly by the naive
      // ternary operator above.
      c[i] = a[i];
    }
  }
  return c;
}

template <typename T>
inline T maximum(const T& a, const T& b) {
  T c = (a > b) ? a : b;
  if (_isnan(a)) {
    c = a;
  }
  return c;
}

// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if
// either input is a NaN.
template <class T> Vec256<T> inline minimum(const Vec256<T> &a, const Vec256<T> &b) {
  Vec256<T> c = Vec256<T>();
  for (int i = 0; i != Vec256<T>::size(); i++) {
    c[i] = (a[i] < b[i]) ? a[i] : b[i];
    if (_isnan(a[i])) {
      // If either input is NaN, propagate a NaN.
      // NOTE: The case where b[i] was NaN is handled correctly by the naive
      // ternary operator above.
      c[i] = a[i];
    }
  }
  return c;
}

template <typename T>
inline T minimum(const T& a, const T& b) {
  T c = (a < b) ? a : b;
  if (_isnan(a)) {
    c = a;
  }
  return c;
}
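// Illustrative usage (not part of the original header): unlike std::fmax /
// std::fmin, which return the non-NaN operand, these helpers propagate NaN
// from either side:
//
//   float nan = std::numeric_limits<float>::quiet_NaN();
//   std::fmax(nan, 1.0f);  // 1.0f
//   maximum(nan, 1.0f);    // NaN (caught by the explicit _isnan check)
//   maximum(1.0f, nan);    // NaN (1.0f > NaN is false, so b is returned)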

#define DEFINE_BITWISE_OP(op) \
template <class T> \
Vec256<T> inline operator op(const Vec256<T> &a, const Vec256<T> &b) { \
  using iT = int_same_size_t<T>; \
  iT buffer[Vec256<T>::size()]; \
  for (int64_t i = 0; i != Vec256<T>::size(); i++) { \
    auto a_val = a[i]; \
    auto b_val = b[i]; \
    iT *i_a_ptr = reinterpret_cast<iT*>(&a_val); \
    iT *i_b_ptr = reinterpret_cast<iT*>(&b_val); \
    buffer[i] = *i_a_ptr op *i_b_ptr; \
  } \
  return Vec256<T>::loadu(buffer); \
}
DEFINE_BITWISE_OP(&)
DEFINE_BITWISE_OP(|)
DEFINE_BITWISE_OP(^)
#undef DEFINE_BITWISE_OP
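// Illustrative usage (not part of the original header): because the lanes
// are reinterpreted as same-sized integers, these operators also work on
// floating-point vectors, e.g. flipping signs by XOR-ing with the sign bit:
//
//   Vec256<float> x(-1.5f);
//   Vec256<float> sign_bit(-0.0f);  // only the sign bit set in each lane
//   auto y = x ^ sign_bit;          // every lane holds +1.5f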

template <typename T>
inline T fmadd(const T& a, const T& b, const T& c) {
  return a * b + c;
}

template <int64_t scale = 1, typename T = void>
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
inline gather(T const* base_addr, const Vec256<int_same_size_t<T>>& vindex) {
  static constexpr int size = Vec256<T>::size();
  int_same_size_t<T> index_arr[size];
  vindex.store(static_cast<void*>(index_arr));
  T buffer[size];
  for (int64_t i = 0; i < size; i++) {
    buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
  }
  return Vec256<T>::loadu(static_cast<void*>(buffer));
}

template <int64_t scale = 1, typename T = void>
c10::guts::enable_if_t<scale == 1 || scale == 2 || scale == 4 || scale == 8, Vec256<T>>
inline mask_gather(const Vec256<T>& src, T const* base_addr,
                   const Vec256<int_same_size_t<T>>& vindex, Vec256<T>& mask) {
  static constexpr int size = Vec256<T>::size();
  T src_arr[size];
  int_same_size_t<T> mask_arr[size];  // use an int type so we can bitwise-and
  int_same_size_t<T> index_arr[size];
  src.store(static_cast<void*>(src_arr));
  mask.store(static_cast<void*>(mask_arr));
  vindex.store(static_cast<void*>(index_arr));
  T buffer[size];
  for (int64_t i = 0; i < size; i++) {
    if (mask_arr[i] & 0x01) {  // check the lowest bit of the lane's mask
      buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)];
    } else {
      buffer[i] = src_arr[i];
    }
  }
  mask = Vec256<T>();  // "zero out" the mask
  return Vec256<T>::loadu(static_cast<void*>(buffer));
}
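// Illustrative usage (not part of the original header): mirroring the AVX2
// gather intrinsics, `scale` is a byte multiplier, so lane i reads from
// byte offset vindex[i] * scale. With scale == sizeof(T), the indices are
// plain element indices:
//
//   float table[16];  // filled elsewhere
//   auto idx = Vec256<int32_t>::arange(0, 2);    // {0, 2, 4, ..., 14}
//   auto v = gather<sizeof(float)>(table, idx);  // every other table entry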

// Cast a given vector to another type without changing its bit representation.
// So a Vec<double> of 256 bits containing all ones can be cast to a
// Vec<int64_t> of 256 bits containing all ones (i.e., four negative 1s).
namespace {
  // There is a struct here because we don't have static_if and I can't
  // partially specialize a templated function.
  template<typename dst_t, typename src_t>
  struct CastImpl {
    static inline Vec256<dst_t> apply(const Vec256<src_t>& src) {
      src_t src_arr[Vec256<src_t>::size()];
      src.store(static_cast<void*>(src_arr));
      return Vec256<dst_t>::loadu(static_cast<const void*>(src_arr));
    }
  };

  template<typename scalar_t>
  struct CastImpl<scalar_t, scalar_t> {
    static inline Vec256<scalar_t> apply(const Vec256<scalar_t>& src) {
      return src;
    }
  };
}
template<typename dst_t, typename src_t>
Vec256<dst_t> cast(const Vec256<src_t>& src) {
  return CastImpl<dst_t, src_t>::apply(src);
}
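// Illustrative usage (not part of the original header): cast<> reinterprets
// bits, whereas convert_to_int_of_same_size below converts values:
//
//   Vec256<double> x(1.0);
//   auto bits = cast<int64_t>(x);                // 0x3FF0000000000000 per lane
//   auto ints = convert_to_int_of_same_size(x);  // 1 per lane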

template <typename T>
inline Vec256<int_same_size_t<T>> convert_to_int_of_same_size(const Vec256<T>& src) {
  static constexpr int size = Vec256<T>::size();
  T src_arr[size];
  src.store(static_cast<void*>(src_arr));
  int_same_size_t<T> buffer[size];
  for (int64_t i = 0; i < size; i++) {
    buffer[i] = static_cast<int_same_size_t<T>>(src_arr[i]);
  }
  return Vec256<int_same_size_t<T>>::loadu(static_cast<void*>(buffer));
}

// E.g., inputs:  a            Vec256<float> = {a0, b0, a1, b1, a2, b2, a3, b3}
//                b            Vec256<float> = {a4, b4, a5, b5, a6, b6, a7, b7}
//       returns:              Vec256<float> = {a0, a1, a2, a3, a4, a5, a6, a7}
//                             Vec256<float> = {b0, b1, b2, b3, b4, b5, b6, b7}
template <typename T>
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
deinterleave2(const Vec256<T>& a, const Vec256<T>& b) {
  static constexpr int size = Vec256<T>::size();
  static constexpr int half_size = size / 2;
  T a_arr[size];
  T b_arr[size];
  T buffer1[size];
  T buffer2[size];
  a.store(static_cast<void*>(a_arr));
  b.store(static_cast<void*>(b_arr));
  for (int64_t i = 0; i < half_size; i++) {
    buffer1[i] = a_arr[i * 2];
    buffer1[half_size + i] = b_arr[i * 2];
    buffer2[i] = a_arr[i * 2 + 1];
    buffer2[half_size + i] = b_arr[i * 2 + 1];
  }
  return std::make_pair(Vec256<T>::loadu(static_cast<void*>(buffer1)),
                        Vec256<T>::loadu(static_cast<void*>(buffer2)));
}

// inverse operation of deinterleave2
// E.g., inputs:  a            Vec256<float> = {a0, a1, a2, a3, a4, a5, a6, a7}
//                b            Vec256<float> = {b0, b1, b2, b3, b4, b5, b6, b7}
//       returns:              Vec256<float> = {a0, b0, a1, b1, a2, b2, a3, b3}
//                             Vec256<float> = {a4, b4, a5, b5, a6, b6, a7, b7}
template <typename T>
inline c10::guts::enable_if_t<Vec256<T>::size() % 2 == 0, std::pair<Vec256<T>, Vec256<T>>>
interleave2(const Vec256<T>& a, const Vec256<T>& b) {
  static constexpr int size = Vec256<T>::size();
  static constexpr int half_size = size / 2;
  T a_arr[size];
  T b_arr[size];
  T buffer1[size];
  T buffer2[size];
  a.store(static_cast<void*>(a_arr));
  b.store(static_cast<void*>(b_arr));
  for (int64_t i = 0; i < half_size; i++) {
    buffer1[i * 2] = a_arr[i];
    buffer1[i * 2 + 1] = b_arr[i];
    buffer2[i * 2] = a_arr[half_size + i];
    buffer2[i * 2 + 1] = b_arr[half_size + i];
  }
  return std::make_pair(Vec256<T>::loadu(static_cast<void*>(buffer1)),
                        Vec256<T>::loadu(static_cast<void*>(buffer2)));
}
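// Illustrative usage (not part of the original header): interleave2 and
// deinterleave2 are inverses. A typical use is splitting complex data laid
// out as {re0, im0, re1, im1, ...} into separate real and imaginary vectors:
//
//   auto parts = deinterleave2(lo, hi);  // {reals, imaginaries}
//   auto back = interleave2(parts.first, parts.second);
//   // back.first == lo and back.second == hi, element-wise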

template <typename src_T, typename dst_T>
void convert(const src_T *src, dst_T *dst, int64_t n) {
#ifndef _MSC_VER
# pragma unroll
#endif
  for (int64_t i = 0; i < n; i++) {
    *dst = static_cast<dst_T>(
        static_cast<at::native::inter_copy_type_t<dst_T>>(*src));
    src++;
    dst++;
  }
}
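// Illustrative usage (not part of the original header): convert performs an
// element-wise scalar conversion over a buffer, e.g. widening float to double:
//
//   float in[4] = {1.5f, 2.5f, 3.5f, 4.5f};
//   double out[4];
//   convert(in, out, 4);  // out = {1.5, 2.5, 3.5, 4.5}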

}}}