Caffe2 - C++ API
A deep learning, cross platform ML framework
elemenntwise_rtc_gpu.cc
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/cuda_rtc/common_rtc.h"

namespace caffe2 {
namespace {

class ElementwiseRTCFunction
    : public CudaRTCFunction<ElementwiseRTCFunction> {
 public:
  ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

  template <typename... Args>
  string KernelName(Args... /*args*/) {
    return name_;
  }

  template <typename... Args>
  string GetSource(Args... args);

 private:
  string name_;
};

template <>
string ElementwiseRTCFunction::GetSource(
    int input_size,
    int output_size,
    const string command_string) {
  std::stringstream ss;
  ss << "extern \"C\" __global__ void " << name_
     << "(const size_t nthreads, \n";
  // Insert the parameter list.
  int remain_params = input_size + output_size;
  for (int i = 0; i < input_size; ++i) {
    ss << "const float* in" << i << ((remain_params--) ? ", \n" : "");
  }
  for (int i = 0; i < output_size; ++i) {
    ss << "float* out" << i << ((remain_params--) ? ", \n" : "");
  }
  ss << ") {\n"
        "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n"
        "index < nthreads; index += blockDim.x * gridDim.x) {\n"
     << command_string << "\n"
     << "}\n}";
  return ss.str();
}
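
// A hedged illustration of the string GetSource() assembles (the kernel name
// comes from GetUniqueName() and is shown here as a placeholder; whitespace
// and separator details differ slightly): for a hypothetical call
//
//   GetSource(2, 1, "out0[index] = in0[index] + in1[index];")
//
// the emitted CUDA source has roughly this shape:
//
//   extern "C" __global__ void <unique_kernel_name>(const size_t nthreads,
//       const float* in0, const float* in1, float* out0) {
//     for (int index = blockIdx.x * blockDim.x + threadIdx.x;
//          index < nthreads; index += blockDim.x * gridDim.x) {
//       out0[index] = in0[index] + in1[index];
//     }
//   }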
} // namespace

// A GPU operator that can generate limited elementwise operations.
class ElementwiseRTCOp final : public Operator<CUDAContext> {
 public:
  ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CUDAContext>(operator_def, ws) {
    const string src =
        OperatorBase::GetSingleArgument<string>("rtc_src", "");
    CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
    func_.Compile(InputSize(), OutputSize(), src);
  }
  ~ElementwiseRTCOp() {}

  bool RunOnDevice() override {
    static_assert(
        sizeof(void*) == sizeof(size_t),
        "The argbuffer relies on the assumption that void* and "
        "size_t have the same size.");
    // The argument buffer is laid out as [nthreads, input pointers...,
    // output pointers...] and handed to the driver through the
    // CU_LAUNCH_PARAM_BUFFER_POINTER launch config below.
    vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
    size_t* argBuffer = argBuffer_vec.data();
    CAFFE_ENFORCE(
        Input(0).size() < std::numeric_limits<int>::max(),
        "The kernel function currently only supports int index.");
    argBuffer[0] = Input(0).size();
    void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1);
    for (int i = 0; i < InputSize(); ++i) {
      ptr_buffer[i] = const_cast<float*>(Input(i).data<float>());
    }
    for (int i = 0; i < OutputSize(); ++i) {
      // Every output is a float tensor shaped like the first input.
      Output(i)->ResizeLike(Input(0));
      ptr_buffer[i + InputSize()] = Output(i)->mutable_data<float>();
    }
    size_t argBufferSize = sizeof(argBuffer);
    void* config[] = {
        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
        CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
        CU_LAUNCH_PARAM_END};
    // CAFFE_GET_BLOCKS(N) computes the number of blocks needed to run N
    // threads (see caffe2/core/common_gpu.h).
    func_.LaunchEx(
        CAFFE_GET_BLOCKS(Input(0).size()), 1, 1,
        CAFFE_CUDA_NUM_THREADS, 1, 1,
        0, context_.cuda_stream(), config);
    return true;
  }

 private:
  ElementwiseRTCFunction func_;
};

namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
}

} // namespace caffe2
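
The operator above is registered as type "ElementwiseRTC" with engine "NVRTC", so it is only selected for CUDA operator defs that request that engine. Below is a minimal usage sketch, assuming a Caffe2 build of roughly this vintage, a workspace that already holds two equally sized float CUDA tensors under the placeholder blob names "X" and "Y", and a hypothetical helper RunElementwiseAddSketch; the rtc_src argument supplies only the loop body, which may refer to index, the inputs in0, in1, ... and the outputs out0, out1, ....

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

// Sketch: computes Z = X + Y elementwise on the GPU via the NVRTC engine.
void RunElementwiseAddSketch(caffe2::Workspace* ws) {
  caffe2::OperatorDef def;
  def.set_type("ElementwiseRTC");
  def.set_engine("NVRTC");
  def.add_input("X");   // placeholder blob names
  def.add_input("Y");
  def.add_output("Z");
  // The op is registered for CUDA only, so the def must target a CUDA device;
  // caffe2::CUDA comes from the DeviceType enum in caffe2.proto.
  def.mutable_device_option()->set_device_type(caffe2::CUDA);
  auto* arg = def.add_arg();
  arg->set_name("rtc_src");
  // Only the loop body is supplied; GetSource() wraps it in the grid-stride
  // loop shown earlier, so in0/in1/out0/index are in scope here.
  arg->set_s("out0[index] = in0[index] + in1[index];");
  CAFFE_ENFORCE(ws->RunOperatorOnce(def));
}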