Caffe2 - C++ API
A deep learning, cross-platform ML framework
fully_connected_op_gpu.cc
// Copyright (c) 2016-present, Facebook, Inc.

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/fully_connected_op.h"

namespace caffe2 {

namespace {

constexpr int kFp16CUDADevicePropMajor = 6;

template <class FullyConnectedOp>
bool RunFullyConnectedOpOnCUDADevice(
    const bool float16_compute,
    FullyConnectedOp* op) {
  if (op->Input(0).template IsType<float>()) {
    return op->template DoRunWithType<
        float, // X
        float, // W
        float, // B
        float, // Y
        float>(); // Math
  } else if (op->Input(0).template IsType<float16>()) {
    if (float16_compute) {
      const cudaDeviceProp& prop = GetDeviceProperty(0);
      if (prop.major >= kFp16CUDADevicePropMajor) {
        return op->template DoRunWithType<
            float16, // X
            float16, // W
            float16, // B
            float16, // Y
            float16>(); // Math
      } else {
        LOG(INFO) << "CUDA Device does not support FP16 computation, "
                     "falling back to FP32.";
        return op->template DoRunWithType<
            float16, // X
            float16, // W
            float16, // B
            float16, // Y
            float>(); // Math
      }
    } else {
      return op->template DoRunWithType<
          float16, // X
          float16, // W
          float16, // B
          float16, // Y
          float>(); // Math
    }
  } else {
    CAFFE_THROW("Unsupported type");
  }
  return false;
}

template <class FullyConnectedGradientOp>
bool RunFullyConnectedGradientOpOnCUDADevice(
    const bool float16_compute,
    FullyConnectedGradientOp* op) {
  if (op->Input(0).template IsType<float>()) {
    return op->template DoRunWithType<
        float, // X
        float, // W
        float, // dY
        float, // B
        float, // dX
        float, // dW
        float, // dB
        float>(); // Math
  } else if (op->Input(0).template IsType<float16>()) {
    if (float16_compute) {
      const cudaDeviceProp& prop = GetDeviceProperty(0);
      if (prop.major >= kFp16CUDADevicePropMajor) {
        return op->template DoRunWithType<
            float16, // X
            float16, // W
            float16, // dY
            float16, // B
            float16, // dX
            float16, // dW
            float16, // dB
            float16>(); // Math
      } else {
        LOG(INFO) << "CUDA Device does not support FP16 computation, "
                     "falling back to FP32.";
        return op->template DoRunWithType<
            float16, // X
            float16, // W
            float16, // dY
            float16, // B
            float16, // dX
            float16, // dW
            float16, // dB
            float>(); // Math
      }
    } else {
      return op->template DoRunWithType<
          float16, // X
          float16, // W
          float16, // dY
          float16, // B
          float16, // dX
          float16, // dW
          float16, // dB
          float>(); // Math
    }
  } else {
    CAFFE_THROW("Unsupported type");
  }
  return false;
}

} // namespace

// RunFullyConnectedOpOnCUDADevice takes a pointer to the current op, and
// DoRunWithType dispatches to the implementation with the correct types.
template <>
bool FullyConnectedOp<CUDAContext>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(float16_compute_, this);
}

template <>
bool FullyConnectedOp<
    CUDAContext,
    DefaultEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(float16_compute_, this);
}

template <>
bool FullyConnectedGradientOp<CUDAContext>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
}

template <>
bool FullyConnectedGradientOp<
    CUDAContext,
    DefaultEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_, this);
}

#if CUDA_VERSION >= 9000

// These specializations must be defined; otherwise the TensorCore FC ops
// would fall back to the default FC implementation, which does not have
// fp16 support.

template <>
bool FullyConnectedOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(false /* float16_compute */, this);
}

template <>
bool FullyConnectedOp<
    CUDAContext,
    TensorCoreEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedOpOnCUDADevice(false /* float16_compute */, this);
}

template <>
bool FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(
      false /* float16_compute */, this);
}

template <>
bool FullyConnectedGradientOp<
    CUDAContext,
    TensorCoreEngine,
    false /* don't transpose weight */>::RunOnDevice() {
  return RunFullyConnectedGradientOpOnCUDADevice(
      false /* float16_compute */, this);
}

#endif

REGISTER_CUDA_OPERATOR(FC, FullyConnectedOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(FCGradient, FullyConnectedGradientOp<CUDAContext>);

REGISTER_CUDA_OPERATOR(
    FCTransposed,
    FullyConnectedOp<
        CUDAContext,
        DefaultEngine,
        false /* don't transpose weight */>);
REGISTER_CUDA_OPERATOR(
    FCTransposedGradient,
    FullyConnectedGradientOp<
        CUDAContext,
        DefaultEngine,
        false /* don't transpose weight */>);

#if CUDA_VERSION >= 9000
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FC,
    TENSORCORE,
    FullyConnectedOp<CUDAContext, TensorCoreEngine>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FCGradient,
    TENSORCORE,
    FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>);

REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FCTransposed,
    TENSORCORE,
    FullyConnectedOp<
        CUDAContext,
        TensorCoreEngine,
        false /* don't transpose weight */>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    FCTransposedGradient,
    TENSORCORE,
    FullyConnectedGradientOp<
        CUDAContext,
        TensorCoreEngine,
        false /* don't transpose weight */>);
#endif

} // namespace caffe2
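
For reference, below is a minimal sketch of how the FC operator registered above could be instantiated with the TensorCore engine from C++ client code. It is illustrative only: it assumes a Caffe2 build of this era where the CUDA device type enum is spelled caffe2::CUDA (later renamed PROTO_CUDA), and the function name RunFcWithTensorCores and the blob names X, W, b, and Y are placeholders.

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

// Sketch: build an OperatorDef that selects the TENSORCORE engine
// registered above (only available when CUDA_VERSION >= 9000).
void RunFcWithTensorCores() {
  caffe2::Workspace ws;

  caffe2::OperatorDef def;
  def.set_type("FC");
  def.set_engine("TENSORCORE");
  def.add_input("X"); // data; float16 tensors enable TensorCore math
  def.add_input("W"); // weights
  def.add_input("b"); // bias
  def.add_output("Y");
  def.mutable_device_option()->set_device_type(caffe2::CUDA);

  // The blobs X, W, and b must be created and filled with CUDA tensors
  // in `ws` before the operator can actually run.
  auto op = caffe2::CreateOperator(def, &ws);
  // op->Run() would then dispatch to
  // FullyConnectedOp<CUDAContext, TensorCoreEngine>::RunOnDevice().
}

Because the TensorCore specializations call RunFullyConnectedOpOnCUDADevice with float16_compute set to false, the Math type stays float even when the inputs are float16; the TensorCore path is selected through the engine registration rather than the float16_compute_ argument.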