Caffe2 - C++ API
A deep learning, cross-platform ML framework
resource_strings.h
#pragma once

#include <torch/csrc/WindowsTorchApiMacro.h>
#include <torch/csrc/jit/code_template.h>

namespace torch {
namespace jit {
namespace fuser {
namespace cuda {

/* Because type_as does not check the type of its input, a fusion group can
have a non-fp32 tensor as input. Correct code is generated for this case, but
nvrtc does not know how to handle the int*_t integer types, so these typedefs
help it handle those cases. */

static auto type_declarations_template = CodeTemplate(R"(
typedef unsigned char uint8_t;
typedef signed char int8_t;
typedef short int int16_t;
typedef long long int int64_t;
${HalfHeader}
${RandHeader}

#define NAN __int_as_float(0x7fffffff)
#define POS_INFINITY __int_as_float(0x7f800000)
#define NEG_INFINITY __int_as_float(0xff800000)

typedef ${IndexType} IndexType;
template<typename T, size_t N>
struct TensorInfo {
  T* data;
  IndexType sizes[N];
  IndexType strides[N];
};
template<typename T>
struct TensorInfo<T, 0> {
  T* data;
};
)");

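// A minimal sketch (not part of this header) of how the declarations template
// above might be filled in. The placeholder values and the "unsigned int"
// index type are illustrative assumptions, not the fuser's actual choices:
//
//   TemplateEnv env;
//   env.s("IndexType", "unsigned int");
//   env.s("HalfHeader", half_support_literal);  // or "" when no half tensors
//   env.s("RandHeader", rand_support_literal);  // or "" when no RNG is needed
//   std::string decls = type_declarations_template.format(env);
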
// We rewrite the code for philox RNG from curand as nvrtc couldn't resolve the
// curand header correctly.
constexpr auto rand_support_literal = R"(

  class Philox {
  public:
    __device__ inline Philox(unsigned long long seed,
                             unsigned long long subsequence,
                             unsigned long long offset) {
      key.x = (unsigned int)seed;
      key.y = (unsigned int)(seed >> 32);
      counter = make_uint4(0, 0, 0, 0);
      counter.z = (unsigned int)(subsequence);
      counter.w = (unsigned int)(subsequence >> 32);
      STATE = 0;
      incr_n(offset / 4);
    }

    __device__ inline unsigned long operator()() {
      if(STATE == 0) {
        uint4 counter_ = counter;
        uint2 key_ = key;
        for(int i = 0; i < 9; i++) {
          counter_ = single_round(counter_, key_);
          key_.x += (kPhilox10A); key_.y += (kPhilox10B);
        }
        output = single_round(counter_, key_);
        incr();
      }
      unsigned long ret;
      switch(STATE) {
        case 0: ret = output.x; break;
        case 1: ret = output.y; break;
        case 2: ret = output.z; break;
        case 3: ret = output.w; break;
      }
      STATE = (STATE + 1) % 4;
      return ret;
    }

  private:
    uint4 counter;
    uint4 output;
    uint2 key;
    unsigned int STATE;
    __device__ inline void incr_n(unsigned long long n) {
      unsigned int nlo = (unsigned int)(n);
      unsigned int nhi = (unsigned int)(n >> 32);
      counter.x += nlo;
      if (counter.x < nlo)
        nhi++;
      counter.y += nhi;
      if (nhi <= counter.y)
        return;
      if (++counter.z)
        return;
      ++counter.w;
    }
    __device__ inline void incr() {
      if (++counter.x)
        return;
      if (++counter.y)
        return;
      if (++counter.z)
        return;
      ++counter.w;
    }
    __device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
                                      unsigned int *result_high) {
      *result_high = __umulhi(a, b);
      return a*b;
    }

    __device__ inline uint4 single_round(uint4 ctr, uint2 key) {
      unsigned int hi0;
      unsigned int hi1;
      unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
      unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);

      uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
      return ret;
    }

    static const unsigned long kPhilox10A = 0x9E3779B9;
    static const unsigned long kPhilox10B = 0xBB67AE85;
    static const unsigned long kPhiloxSA = 0xD2511F53;
    static const unsigned long kPhiloxSB = 0xCD9E8D57;
  };

  // Inverse of 2^32.
  #define M_RAN_INVM32 2.3283064e-10f
  __device__ __inline__ float uniform(unsigned int x) {
    return x * M_RAN_INVM32;
  }
)";

constexpr auto rand_param =
    ",unsigned long long seed, unsigned long long offset";

constexpr auto rand_init = R"(
  int idx = blockIdx.x*blockDim.x + threadIdx.x;
  Philox rnd(seed, idx, offset);
)";

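// A sketch of how a generated kernel body is expected to consume the support
// code above (the name r01 is illustrative): ${RandInit} constructs `rnd` once
// per thread, and each sampling site then draws a uniform value via
//
//   float r01 = uniform(rnd());   // uniform float in [0, 1)
//
// Four successive calls to rnd() return the four 32-bit words of one Philox
// round before a new round is computed.
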
static auto cuda_compilation_unit_template = CodeTemplate(R"(
${type_declarations}

extern "C" __global__
void ${kernelName}(IndexType totalElements, ${formals} ${RandParam}) {
  ${RandInit}
  for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
       linearIndex < totalElements;
       linearIndex += gridDim.x * blockDim.x) {
    // Convert `linearIndex` into an offset of tensor:
    ${tensorOffsets}
    // calculate the results
    ${kernelBody}
  }
}
)");

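// A minimal sketch (values are made up for illustration, not taken from the
// fuser's codegen) of how the two templates combine into the source string
// handed to nvrtc:
//
//   TemplateEnv decls;
//   decls.s("IndexType", "unsigned int");
//   decls.s("HalfHeader", half_support_literal);
//   decls.s("RandHeader", rand_support_literal);
//
//   TemplateEnv cu;
//   cu.s("type_declarations", type_declarations_template.format(decls));
//   cu.s("kernelName", "fused_kernel_0");
//   cu.s("formals", "TensorInfo<float, 1> t0, TensorInfo<float, 1> out");
//   cu.s("RandParam", rand_param);
//   cu.s("RandInit", rand_init);
//   cu.s("tensorOffsets", "IndexType offset = linearIndex;");
//   cu.s("kernelBody", "out.data[offset] = t0.data[offset] * uniform(rnd());");
//   std::string source = cuda_compilation_unit_template.format(cu);
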
// This snippet enables half support in the jit. Following the pattern for
// reductions, fp16 input data is immediately upconverted to float
// with __half2float(). All mathematical operations are done on float
// values, and if needed the intermediate float representation is
// converted to half with __float2half() when writing to a half tensor.
constexpr auto half_support_literal = R"(
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
#if defined(__cplusplus)
  struct __align__(2) __half {
    __host__ __device__ __half() { }

  protected:
    unsigned short __x;
  };

  /* All intrinsic functions are only available to nvcc compilers */
  #if defined(__CUDACC__)
    /* Definitions of intrinsics */
    __device__ __half __float2half(const float f) {
      __half val;
      asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f));
      return val;
    }

    __device__ float __half2float(const __half h) {
      float val;
      asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h)));
      return val;
    }
)"
// MSVC's preprocessor (but not the standard compiler) has a bug
// where it incorrectly tokenizes raw string literals, ending them when it
// sees a ". This causes the #endif in this string literal to be treated as
// a preprocessor token which, in turn, causes sccache on Windows CI to fail.
// See https://godbolt.org/z/eVTIJq as an example.
// This workaround uses string-pasting to separate the " and the #endif into
// different strings.
200 R"(
201  #endif /* defined(__CUDACC__) */
202 #endif /* defined(__cplusplus) */
203 #undef __HALF_TO_US
204 #undef __HALF_TO_CUS
205 
206 typedef __half half;
207 )";
208 
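// A sketch of the pattern described in the comment above, as it might appear
// in a generated kernel body when a half tensor is involved (the identifiers
// t0, out, and offset are illustrative):
//
//   float v = __half2float(t0.data[offset]);  // upconvert fp16 input
//   float r = v * v + 1.f;                    // compute in fp32
//   out.data[offset] = __float2half(r);       // downconvert when storing to half
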
} // namespace cuda
} // namespace fuser
} // namespace jit
} // namespace torch