Caffe2 - C++ API
A deep learning, cross-platform ML framework
elementwise_rtc_gpu.cc
#include <limits>
#include <sstream>

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/cuda_rtc/common_rtc.h"

namespace caffe2 {
namespace {
class ElementwiseRTCFunction
    : public CudaRTCFunction<ElementwiseRTCFunction> {
 public:
  ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

  template <typename... Args>
  string KernelName(Args... /*args*/) {
    return name_;
  }

  template <typename... Args>
  string GetSource(Args... args);

 private:
  string name_;
};

template <>
string ElementwiseRTCFunction::GetSource(
    int input_size,
    int output_size,
    const string command_string) {
  std::stringstream ss;
  ss << "extern \"C\" __global__ void " << name_
     << "(const size_t nthreads, \n";
  // Insert the parameter list, separating parameters with commas. The
  // pre-decrement ensures that the last parameter is not followed by a
  // trailing comma, which would be a syntax error in the generated kernel.
  int remain_params = input_size + output_size;
  for (int i = 0; i < input_size; ++i) {
    ss << "const float* in" << i << ((--remain_params) ? ", \n" : "");
  }
  for (int i = 0; i < output_size; ++i) {
    ss << "float* out" << i << ((--remain_params) ? ", \n" : "");
  }
  ss << ") {\n"
        "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n"
        "index < nthreads; index += blockDim.x * gridDim.x) {\n"
     << command_string << "\n"
     << "}\n}";
  return ss.str();
}
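
// For example, with input_size == 2, output_size == 1, and a command_string
// of "out0[index] = in0[index] + in1[index];", GetSource() would emit roughly
// the following kernel source (the actual function name comes from
// GetUniqueName()):
//
//   extern "C" __global__ void <unique_name>(const size_t nthreads,
//   const float* in0,
//   const float* in1,
//   float* out0) {
//   for (int index = blockIdx.x * blockDim.x + threadIdx.x;
//   index < nthreads; index += blockDim.x * gridDim.x) {
//   out0[index] = in0[index] + in1[index];
//   }
//   }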
} // namespace

// A GPU operator that can generate limited elementwise operations.
// The CUDA kernel body is supplied at operator construction time through the
// "rtc_src" string argument and compiled with NVRTC.
class ElementwiseRTCOp final : public Operator<CUDAContext> {
 public:
  ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws) {
    const string src =
        OperatorBase::GetSingleArgument<string>("rtc_src", "");
    CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
    func_.Compile(InputSize(), OutputSize(), src);
  }
  ~ElementwiseRTCOp() override {}

  bool RunOnDevice() override {
    static_assert(
        sizeof(void*) == sizeof(size_t),
        "The argbuffer relies on the assumption that void* and "
        "size_t have the same size.");
    // Pack nthreads followed by the input and output pointers into one
    // contiguous argument buffer for the driver launch API.
    vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
    size_t* argBuffer = argBuffer_vec.data();
    CAFFE_ENFORCE(
        Input(0).numel() < std::numeric_limits<int>::max(),
        "The kernel function currently only supports int index.");
    argBuffer[0] = Input(0).numel();
    void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1);
    for (int i = 0; i < InputSize(); ++i) {
      ptr_buffer[i] = const_cast<float*>(Input(i).data<float>());
    }
    for (int i = 0; i < OutputSize(); ++i) {
      Output(i)->ResizeLike(Input(0));
      ptr_buffer[i + InputSize()] = Output(i)->mutable_data<float>();
    }
    // Size of the whole argument buffer in bytes (not of the pointer).
    size_t argBufferSize = argBuffer_vec.size() * sizeof(size_t);
    void* config[] = {
        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
        CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
        CU_LAUNCH_PARAM_END};
    func_.LaunchEx(
        CAFFE_GET_BLOCKS(Input(0).numel()),
        1,
        1,
        CAFFE_CUDA_NUM_THREADS,
        1,
        1,
        0,
        context_.cuda_stream(),
        config);
    return true;
  }

 private:
  ElementwiseRTCFunction func_;
};
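
// For reference, the packed argument buffer that RunOnDevice() hands to
// LaunchEx() through CU_LAUNCH_PARAM_BUFFER_POINTER looks as follows for a
// hypothetical call with two inputs and one output:
//
//   argBuffer[0] : nthreads      (size_t, Input(0).numel())
//   argBuffer[1] : in0 pointer   (const float*)
//   argBuffer[2] : in1 pointer   (const float*)
//   argBuffer[3] : out0 pointer  (float*)
//
// This layout matches the parameter list emitted by GetSource(), and the
// static_assert above guarantees that pointer and size_t entries occupy the
// same number of bytes in the buffer.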

namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
} // namespace

} // namespace caffe2
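
As a usage sketch only: the snippet below shows one way such an operator might be built and run from C++, assuming the protobuf accessors generated for OperatorDef, the PROTO_CUDA device constant, and the CreateOperator() factory declared in caffe2/core/operator.h. The blob names, the kernel body string, and the MakeRTCAddOp() helper are illustrative, not part of this file.

#include <memory>

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

namespace caffe2 {

// Hypothetical helper: builds an ElementwiseRTC operator that adds two
// float CUDA tensors element by element.
std::unique_ptr<OperatorBase> MakeRTCAddOp(Workspace* ws) {
  OperatorDef def;
  def.set_type("ElementwiseRTC");
  def.set_engine("NVRTC");  // selects ElementwiseRTCOp via the registration above
  def.add_input("in0");
  def.add_input("in1");
  def.add_output("out0");
  // Device option must be CUDA; the constant name depends on the Caffe2
  // version (CUDA vs. PROTO_CUDA).
  def.mutable_device_option()->set_device_type(PROTO_CUDA);
  // The kernel body sees "index", the inputs "in0"/"in1" and the output
  // "out0", exactly as generated by GetSource().
  Argument* arg = def.add_arg();
  arg->set_name("rtc_src");
  arg->set_s("out0[index] = in0[index] + in1[index];");
  // The blobs "in0" and "in1" must already hold float CUDA tensors of equal
  // size in *ws before the returned operator's Run() is called.
  return CreateOperator(def, ws);
}

} // namespace caffe2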