Caffe2 - C++ API
A deep learning, cross-platform ML framework
conversions.h
#pragma once

#include <cstring> // for memcpy used by the CPU converters below

#include <caffe2/core/types.h>

#ifdef __CUDA_ARCH__
// Proxy for including cuda_fp16.h, because common_gpu.h
// has necessary diagnostic guards.
#include <caffe2/core/common_gpu.h>
#endif

#ifdef __CUDA_ARCH__
#define CONVERSIONS_DECL __host__ __device__ inline
#else
#define CONVERSIONS_DECL inline
#endif
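// Added annotation (not part of the original header): when __CUDA_ARCH__ is
// defined (nvcc device-compilation pass), CONVERSIONS_DECL expands to
// `__host__ __device__ inline`, so the To<>/Get<> converters below can be
// called from CUDA kernels as well as host code; otherwise it is plain
// `inline`.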

namespace caffe2 {

namespace convert {

namespace {
inline float16 cpu_float2half_rn(float f) {
  float16 ret;

  static_assert(
      sizeof(unsigned int) == sizeof(float),
      "Programming error sizeof(unsigned int) != sizeof(float)");

  // Copy the float bits into an unsigned int; memcpy avoids the
  // strict-aliasing issue of dereferencing a casted pointer and mirrors
  // cpu_half2float below.
  unsigned x;
  memcpy(&x, &f, sizeof(x));
  unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
  unsigned sign, exponent, mantissa;

  // Get rid of +NaN/-NaN case first.
  if (u > 0x7f800000) {
    ret.x = 0x7fffU;
    return ret;
  }

  sign = ((x >> 16) & 0x8000);

  // Get rid of +Inf/-Inf, +0/-0.
  if (u > 0x477fefff) {
    ret.x = sign | 0x7c00U;
    return ret;
  }
  if (u < 0x33000001) {
    ret.x = (sign | 0x0000);
    return ret;
  }

  exponent = ((u >> 23) & 0xff);
  mantissa = (u & 0x7fffff);

  if (exponent > 0x70) {
    shift = 13;
    exponent -= 0x70;
  } else {
    shift = 0x7e - exponent;
    exponent = 0;
    mantissa |= 0x800000;
  }
  lsb = (1 << shift);
  lsb_s1 = (lsb >> 1);
  lsb_m1 = (lsb - 1);

  // Round to nearest even.
  remainder = (mantissa & lsb_m1);
  mantissa >>= shift;
  if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
    ++mantissa;
    if (!(mantissa & 0x3ff)) {
      ++exponent;
      mantissa = 0;
    }
  }

  ret.x = (sign | (exponent << 10) | mantissa);

  return ret;
}

inline float cpu_half2float(float16 h) {
  unsigned sign = ((h.x >> 15) & 1);
  unsigned exponent = ((h.x >> 10) & 0x1f);
  unsigned mantissa = ((h.x & 0x3ff) << 13);

  if (exponent == 0x1f) { /* NaN or Inf */
    mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
    exponent = 0xff;
  } else if (!exponent) { /* Denorm or Zero */
    if (mantissa) {
      unsigned int msb;
      exponent = 0x71;
      do {
        msb = (mantissa & 0x400000);
        mantissa <<= 1; /* normalize */
        --exponent;
      } while (!msb);
      mantissa &= 0x7fffff; /* 1.mantissa is implicit */
    }
  } else {
    exponent += 0x70;
  }

  unsigned i = ((sign << 31) | (exponent << 23) | mantissa);
  float ret;
  memcpy(&ret, &i, sizeof(i));
  return ret;
}
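// Added annotation, illustrative round trip through the CPU converters above
// (the literal bit pattern is a worked example, not a value asserted by the
// original header):
//   float16 h = cpu_float2half_rn(1.5f); // expected h.x == 0x3e00
//   float f = cpu_half2float(h);         // expected f == 1.5f
// Inputs needing more than 10 mantissa bits are rounded to nearest even, so
// the round trip is lossy in general.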

} // anonymous namespace

#if __CUDACC__

#if CUDA_VERSION >= 9000
CONVERSIONS_DECL float16 halfToFloat16(half x) {
#ifdef __GNUC__
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC diagnostic push
#endif
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
  float16 r = *reinterpret_cast<float16*>(&x);
#ifdef __GNUC__
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC diagnostic pop
#endif
#endif // __GNUC__
  return r;
}

inline half float16ToHalf(const float16 x) {
  __half_raw hr;
  hr.x = x.x;
  half r(hr);
  return r;
}

inline half floatToHalf(const float x) {
  float16 xh = cpu_float2half_rn(x);
  return float16ToHalf(xh);
}

#else
inline float16 halfToFloat16(__half x) {
  float16 r;
  r.x = x.x;
  return r;
}

inline __half float16ToHalf(const float16 x) {
  __half r;
  r.x = x.x;
  return r;
}

inline half floatToHalf(const float x) {
  float16 xh = cpu_float2half_rn(x);
  return float16ToHalf(xh);
}
#endif // CUDA_VERSION

#endif // __CUDACC__
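// Added annotation, illustrative device-side sketch (`v` is a hypothetical
// float variable inside a kernel): with CUDA >= 9.0, halfToFloat16 can view a
// CUDA `half` as a caffe2::float16, e.g.
//   float16 f16 = halfToFloat16(__float2half(v));
// float16ToHalf / floatToHalf are the host-side helpers for the opposite
// direction.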

// general version: defer to static_cast
template <typename IN, typename OUT>
CONVERSIONS_DECL OUT To(const IN in) {
  return static_cast<OUT>(in);
}

// explicit for fp16
template <>
CONVERSIONS_DECL float16 To(const float in) {
#if __CUDA_ARCH__
  // hacky interface between C2 fp16 and CUDA
#if CUDA_VERSION >= 9000
  half rh = static_cast<half>(in);
  return halfToFloat16(rh);
#else
  float16 ret;
  ret.x = __float2half(in).x;
  return ret;
#endif // CUDA_VERSION >= 9000
#else
  return cpu_float2half_rn(in);
#endif
}

template <>
CONVERSIONS_DECL float To(const float16 in) {
#if __CUDA_ARCH__
#if CUDA_VERSION >= 9000
  __half_raw tmp;
#else
  __half tmp;
#endif
  tmp.x = in.x;
  return __half2float(tmp);
#else
  return cpu_half2float(in);
#endif
}

template <>
CONVERSIONS_DECL float To(const float in) {
  return in;
}
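// Added annotation, illustrative usage sketch (variable names are
// hypothetical): To<IN, OUT> is the generic conversion entry point, e.g.
//   float16 h = To<float, float16>(3.5f); // fp32 -> fp16
//   float f = To<float16, float>(h);      // fp16 -> fp32
// Other IN/OUT pairs fall through to the static_cast version above.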

template <typename OUT, typename IN>
CONVERSIONS_DECL OUT Get(IN x) {
  return static_cast<OUT>(x);
}

template <>
CONVERSIONS_DECL float Get(float16 x) {
  return To<float16, float>(x);
}

template <>
CONVERSIONS_DECL float16 Get(float x) {
  return To<float, float16>(x);
}
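// Added annotation, illustrative usage sketch (variable names are
// hypothetical): Get<OUT> deduces the input type and forwards to To<>, e.g.
//   float f = Get<float>(h);      // h is a float16
//   float16 h2 = Get<float16>(f);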

} // namespace convert

} // namespace caffe2

#undef CONVERSIONS_DECL