1 #include "caffe2/core/common_gpu.h" 2 #include "caffe2/core/context_gpu.h" 3 #include "caffe2/operators/fully_connected_op.h" 9 template <
class FullyConnectedOp>
// Shared forward-pass dispatcher: picks a DoRunWithType instantiation of `op`
// based on the element type of op->Input(0).
//   float16_compute - consulted only on the at::Half path.
//   op              - operator whose DoRunWithType<...> is invoked.
// Returns the result of the selected DoRunWithType call; the function ends in
// CAFFE_THROW with the message "Unsupported type".
// NOTE(review): the template argument lists of every DoRunWithType call are
// not visible in this listing, so the exact types used per branch are
// unconfirmed.
10 bool RunFullyConnectedOpOnCUDADevice(
11 const bool float16_compute,
12 FullyConnectedOp* op) {
// float input: a single DoRunWithType call.
13 if (op->Input(0).template IsType<float>()) {
14 return op->template DoRunWithType<
20 }
else if (op->Input(0).template IsType<at::Half>()) {
21 if (float16_compute) {
// NOTE(review): `prop` is declared on a line not shown here; presumably the
// current device's cudaDeviceProp - confirm.
// The FP16 path is taken only when the device major version reaches
// kFp16CUDADevicePropMajor.
23 if (prop.major >= kFp16CUDADevicePropMajor) {
24 return op->template DoRunWithType<
// Device below the threshold: log the reason and run the FP32 fallback.
31 LOG(INFO) <<
"CUDA Device does not support FP16 computation, " 32 "falling back to FP32.";
33 return op->template DoRunWithType<
// NOTE(review): presumably the at::Half path when float16_compute is false;
// the enclosing `else` is not visible - confirm.
41 return op->template DoRunWithType<
// NOTE(review): presumably reached only when Input(0) is neither float nor
// at::Half; the enclosing `else` is not visible - confirm.
49 CAFFE_THROW(
"Unsupported type");
54 template <
class FullyConnectedGradientOp>
// Shared gradient-pass dispatcher: picks a DoRunWithType instantiation of
// `op` based on the element type of op->Input(0).
//   float16_compute - consulted only on the at::Half path.
//   op              - gradient operator whose DoRunWithType<...> is invoked.
// Returns the result of the selected DoRunWithType call; the function ends in
// CAFFE_THROW with the message "Unsupported type".
// NOTE(review): the template argument lists of every DoRunWithType call are
// not visible in this listing, so the exact types used per branch are
// unconfirmed.
55 bool RunFullyConnectedGradientOpOnCUDADevice(
56 const bool float16_compute,
57 FullyConnectedGradientOp* op) {
// float input: a single DoRunWithType call.
58 if (op->Input(0).template IsType<float>()) {
59 return op->template DoRunWithType<
68 }
else if (op->Input(0).template IsType<at::Half>()) {
69 if (float16_compute) {
// NOTE(review): `prop` is declared on a line not shown here; presumably the
// current device's cudaDeviceProp - confirm.
// The FP16 path is taken only when the device major version reaches
// kFp16CUDADevicePropMajor.
71 if (prop.major >= kFp16CUDADevicePropMajor) {
72 return op->template DoRunWithType<
// Device below the threshold: log the reason and run the FP32 fallback.
82 LOG(INFO) <<
"CUDA Device does not support FP16 computation, " 83 "falling back to FP32.";
84 return op->template DoRunWithType<
// NOTE(review): presumably the at::Half path when float16_compute is false;
// the enclosing `else` is not visible - confirm.
95 return op->template DoRunWithType<
// NOTE(review): presumably reached only when Input(0) is neither float nor
// at::Half; the enclosing `else` is not visible - confirm.
106 CAFFE_THROW(
"Unsupported type");
// RunOnDevice for FullyConnectedOp<CUDAContext>: forwards to the shared
// forward dispatcher, passing this operator's float16_compute_ member and
// `this`, and returns its result.
116 bool FullyConnectedOp<CUDAContext>::RunOnDevice() {
117 return RunFullyConnectedOpOnCUDADevice(float16_compute_,
this);
// RunOnDevice for a FullyConnectedOp specialization whose last template
// argument is `false`; forwards float16_compute_ and `this` to the shared
// forward dispatcher.
// NOTE(review): the template arguments before `false` are not visible in this
// listing; the meaning of the `false` flag should be confirmed against the
// class template declaration.
121 bool FullyConnectedOp<
124 false >::RunOnDevice() {
125 return RunFullyConnectedOpOnCUDADevice(float16_compute_,
this);
// RunOnDevice for FullyConnectedGradientOp<CUDAContext>: forwards to the
// shared gradient dispatcher, passing this operator's float16_compute_ member
// and `this`, and returns its result.
129 bool FullyConnectedGradientOp<CUDAContext>::RunOnDevice() {
130 return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_,
this);
// RunOnDevice for a FullyConnectedGradientOp specialization whose last
// template argument is `false`; forwards float16_compute_ and `this` to the
// shared gradient dispatcher.
// NOTE(review): the template arguments before `false` are not visible in this
// listing; the meaning of the `false` flag should be confirmed against the
// class template declaration.
134 bool FullyConnectedGradientOp<
137 false >::RunOnDevice() {
138 return RunFullyConnectedGradientOpOnCUDADevice(float16_compute_,
this);
// Compiled only when CUDA_VERSION >= 9000.
// RunOnDevice for the TensorCoreEngine variant of FullyConnectedOp: calls the
// shared forward dispatcher with its first argument (float16_compute)
// hard-coded to false instead of the float16_compute_ member used by the
// FullyConnectedOp<CUDAContext> variant.
141 #if CUDA_VERSION >= 9000 148 bool FullyConnectedOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
149 return RunFullyConnectedOpOnCUDADevice(
false ,
this);
// RunOnDevice for another FullyConnectedOp specialization whose last template
// argument is `false`; calls the shared forward dispatcher with
// float16_compute hard-coded to false.
// NOTE(review): the template arguments before `false` are not visible;
// presumably this is the TensorCoreEngine counterpart of the earlier
// `..., false>` specialization - confirm.
153 bool FullyConnectedOp<
156 false >::RunOnDevice() {
157 return RunFullyConnectedOpOnCUDADevice(
false ,
this);
// RunOnDevice for the TensorCoreEngine variant of FullyConnectedGradientOp:
// returns the result of the shared gradient dispatcher.
// NOTE(review): the call's arguments are not visible in this listing;
// presumably (false, this) like the forward TensorCoreEngine variant -
// confirm.
161 bool FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>::RunOnDevice() {
162 return RunFullyConnectedGradientOpOnCUDADevice(
// RunOnDevice for another FullyConnectedGradientOp specialization whose last
// template argument is `false`; returns the result of the shared gradient
// dispatcher.
// NOTE(review): neither the leading template arguments nor the call's
// arguments are visible in this listing - confirm both against the full file.
167 bool FullyConnectedGradientOp<
170 false >::RunOnDevice() {
171 return RunFullyConnectedGradientOpOnCUDADevice(
// CUDA operator registrations: FC is registered with
// FullyConnectedOp<CUDAContext> and FCGradient with
// FullyConnectedGradientOp<CUDAContext>.
177 REGISTER_CUDA_OPERATOR(
FC, FullyConnectedOp<CUDAContext>);
178 REGISTER_CUDA_OPERATOR(FCGradient, FullyConnectedGradientOp<CUDAContext>);
// NOTE(review): the arguments of the next registration are not visible in
// this listing.
180 REGISTER_CUDA_OPERATOR(
// FCTransposedGradient is registered with a FullyConnectedGradientOp
// specialization whose template arguments are not visible here.
186 REGISTER_CUDA_OPERATOR(
187 FCTransposedGradient,
188 FullyConnectedGradientOp<
// Compiled only when CUDA_VERSION >= 9000.
// Engine-specific registrations for the TensorCoreEngine operator variants.
// NOTE(review): the operator-name and engine-name arguments of most of these
// macro calls are not visible in this listing; only the implementation types
// and the FCTransposedGradient name can be read here.
193 #if CUDA_VERSION >= 9000 194 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
197 FullyConnectedOp<CUDAContext, TensorCoreEngine>);
198 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
201 FullyConnectedGradientOp<CUDAContext, TensorCoreEngine>);
// NOTE(review): all arguments of the next registration are missing from this
// listing.
203 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
// FCTransposedGradient with an engine argument (not visible) and a
// FullyConnectedGradientOp specialization whose template arguments are
// truncated.
210 REGISTER_CUDA_OPERATOR_WITH_ENGINE(
211 FCTransposedGradient,
213 FullyConnectedGradientOp<
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.