Caffe2 - C++ API
A deep learning, cross platform ML framework
Related Pages
Modules
Data Structures
Files
C++ API
Python API
GitHub
File List
Globals
torch
csrc
jit
fuser
cuda
resource_strings.h
1
#pragma once
2
3
#include <torch/csrc/WindowsTorchApiMacro.h>
4
#include <torch/csrc/jit/code_template.h>
5
6
namespace
torch
{
7
namespace
jit {
8
namespace
fuser {
9
namespace
cuda {
10
11
/*with type_as not checking type of its input, a fusion group can have non-fp32
12
tensor as input. Correct code for this case is generated, however, nvrtc does
13
not know how to handle int*_t integer types, so typedefs help it handle those
14
cases*/
15
16
static
auto
type_declarations_template = CodeTemplate(R
"(
17
typedef unsigned char uint8_t;
18
typedef signed char int8_t;
19
typedef short int int16_t;
20
typedef long long int int64_t;
21
${HalfHeader}
22
${RandHeader}
23
24
#define NAN __int_as_float(0x7fffffff)
25
#define POS_INFINITY __int_as_float(0x7f800000)
26
#define NEG_INFINITY __int_as_float(0xff800000)
27
28
typedef ${IndexType} IndexType;
29
template<typename T, size_t N>
30
struct TensorInfo {
31
T* data;
32
IndexType sizes[N];
33
IndexType strides[N];
34
};
35
template<typename T>
36
struct TensorInfo<T, 0> {
37
T * data;
38
};
39
)");
40
41
// We rewrite the code for philox RNG from curand as nvrtc couldn't resolve the
42
// curand header correctly.
43
constexpr
auto
rand_support_literal = R
"(
44
45
class Philox {
46
public:
47
__device__ inline Philox(unsigned long long seed,
48
unsigned long long subsequence,
49
unsigned long long offset) {
50
key.x = (unsigned int)seed;
51
key.y = (unsigned int)(seed >> 32);
52
counter = make_uint4(0, 0, 0, 0);
53
counter.z = (unsigned int)(subsequence);
54
counter.w = (unsigned int)(subsequence >> 32);
55
STATE = 0;
56
incr_n(offset / 4);
57
}
58
59
__device__ inline unsigned long operator()() {
60
if(STATE == 0) {
61
uint4 counter_ = counter;
62
uint2 key_ = key;
63
for(int i = 0; i < 9; i++) {
64
counter_ = single_round(counter_, key_);
65
key_.x += (kPhilox10A); key_.y += (kPhilox10B);
66
}
67
output = single_round(counter_, key_);
68
incr();
69
}
70
unsigned long ret;
71
switch(STATE) {
72
case 0: ret = output.x; break;
73
case 1: ret = output.y; break;
74
case 2: ret = output.z; break;
75
case 3: ret = output.w; break;
76
}
77
STATE = (STATE + 1) % 4;
78
return ret;
79
}
80
81
private:
82
uint4 counter;
83
uint4 output;
84
uint2 key;
85
unsigned int STATE;
86
__device__ inline void incr_n(unsigned long long n) {
87
unsigned int nlo = (unsigned int)(n);
88
unsigned int nhi = (unsigned int)(n >> 32);
89
counter.x += nlo;
90
if (counter.x < nlo)
91
nhi++;
92
counter.y += nhi;
93
if (nhi <= counter.y)
94
return;
95
if (++counter.z)
96
return;
97
++counter.w;
98
}
99
__device__ inline void incr() {
100
if (++counter.x)
101
return;
102
if (++counter.y)
103
return;
104
if (++counter.z)
105
return;
106
++counter.w;
107
}
108
__device__ unsigned int mulhilo32(unsigned int a, unsigned int b,
109
unsigned int *result_high) {
110
*result_high = __umulhi(a, b);
111
return a*b;
112
}
113
114
__device__ inline uint4 single_round(uint4 ctr, uint2 key) {
115
unsigned int hi0;
116
unsigned int hi1;
117
unsigned int lo0 = mulhilo32(kPhiloxSA, ctr.x, &hi0);
118
unsigned int lo1 = mulhilo32(kPhiloxSB, ctr.z, &hi1);
119
120
uint4 ret = {hi1 ^ ctr.y ^ key.x, lo1, hi0 ^ ctr.w ^ key.y, lo0};
121
return ret;
122
}
123
124
static const unsigned long kPhilox10A = 0x9E3779B9;
125
static const unsigned long kPhilox10B = 0xBB67AE85;
126
static const unsigned long kPhiloxSA = 0xD2511F53;
127
static const unsigned long kPhiloxSB = 0xCD9E8D57;
128
};
129
130
// Inverse of 2^32.
131
#define M_RAN_INVM32 2.3283064e-10f
132
__device__ __inline__ float uniform(unsigned int x) {
133
return x * M_RAN_INVM32;
134
}
135
)";
136
137
constexpr
auto
rand_param =
138
",unsigned long long seed, unsigned long long offset"
;
139
140
constexpr
auto
rand_init = R
"(
141
int idx = blockIdx.x*blockDim.x + threadIdx.x;
142
Philox rnd(seed, idx, offset);
143
)";
144
145
static
auto
cuda_compilation_unit_template = CodeTemplate(R
"(
146
${type_declarations}
147
148
extern "C" __global__
149
void ${kernelName}(IndexType totalElements, ${formals} ${RandParam}) {
150
${RandInit}
151
for (IndexType linearIndex = blockIdx.x * blockDim.x + threadIdx.x;
152
linearIndex < totalElements;
153
linearIndex += gridDim.x * blockDim.x) {
154
// Convert `linearIndex` into an offset of tensor:
155
${tensorOffsets}
156
// calculate the results
157
${kernelBody}
158
}
159
}
160
)");
161
162
// This snippet enables half support in the jit. Following the pattern for
163
// reductions, fp16 input data is immediately upconverted to float
164
// with __half2float(). All mathematical operations are done on float
165
// values, and if needed the intermediate float representation is
166
// converted to half with __float2half() when writing to a half tensor.
167
constexpr
auto
half_support_literal = R
"(
168
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
169
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
170
#if defined(__cplusplus)
171
struct __align__(2) __half {
172
__host__ __device__ __half() { }
173
174
protected:
175
unsigned short __x;
176
};
177
178
/* All intrinsic functions are only available to nvcc compilers */
179
#if defined(__CUDACC__)
180
/* Definitions of intrinsics */
181
__device__ __half __float2half(const float f) {
182
__half val;
183
asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f));
184
return val;
185
}
186
187
__device__ float __half2float(const __half h) {
188
float val;
189
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h)));
190
return val;
191
}
192
)"
193
// MSVC's preprocesor (but not the standard compiler) has a bug
194
// where it incorrectly tokenizes raw string literals, ending when it sees a "
195
// this causes the #endif in this string literal to be treated as a preprocessor
196
// token which, in turn, cause sccache on windows CI to fail.
197
// See https://godbolt.org/z/eVTIJq as an example.
198
// This workaround uses string-pasting to separate the " and the #endif into different
199
// strings
200
R
"(
201
#endif /* defined(__CUDACC__) */
202
#endif /* defined(__cplusplus) */
203
#undef __HALF_TO_US
204
#undef __HALF_TO_CUS
205
206
typedef __half half;
207
)";
208
209
}
// namespace cuda
210
}
// namespace fuser
211
}
// namespace jit
212
}
// namespace torch
torch
Definition:
jit_type.h:17
Generated on Thu Mar 21 2019 13:06:23 for Caffe2 - C++ API by
1.8.11