1 #include "caffe2/core/common_gpu.h" 2 #include "caffe2/core/context_gpu.h" 3 #include "caffe2/core/operator.h" 4 #include "caffe2/cuda_rtc/common_rtc.h" 8 class ElementwiseRTCFunction
9 :
public CudaRTCFunction<ElementwiseRTCFunction> {
11 ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
13 template <
typename... Args>
14 string KernelName(Args... ) {
18 template <
typename... Args>
19 string GetSource(Args... args);
26 string ElementwiseRTCFunction::GetSource(
27 int input_size,
int output_size,
28 const string command_string) {
30 ss <<
"extern \"C\" __global__ void " << name_ <<
31 "(const size_t nthreads, \n";
33 int remain_params = input_size + output_size;
34 for (
int i = 0; i < input_size; ++i) {
35 ss <<
"const float* in" << i
36 << ((remain_params--) ?
", \n" :
"");
38 for (
int i = 0; i < output_size; ++i) {
39 ss <<
"float* out" << i
40 << ((remain_params--) ?
", \n" :
"");
43 "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n" 44 "index < nthreads; index += blockDim.x * gridDim.x) {\n" 45 << command_string <<
"\n" 78 const string src = OperatorBase::GetSingleArgument<string>(
80 CAFFE_ENFORCE(src.size(),
"Op should have a non-zero source code size.");
81 func_.Compile(InputSize(), OutputSize(), src);
85 bool RunOnDevice()
override {
86 static_assert(
sizeof(
void*) ==
sizeof(
size_t),
87 "The argbuffer relies on the assumption that void* and " 88 "size_t have the same size.");
89 vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
90 size_t* argBuffer = argBuffer_vec.data();
92 Input(0).numel() < std::numeric_limits<int>::max(),
93 "The kernel function currently only supports int index.");
94 argBuffer[0] = Input(0).numel();
95 void** ptr_buffer =
reinterpret_cast<void**
>(argBuffer + 1);
96 for (
int i = 0; i < InputSize(); ++i) {
97 ptr_buffer[i] =
const_cast<float*
>(Input(i).data<
float>());
99 for (
int i = 0; i < OutputSize(); ++i) {
100 Output(i)->ResizeLike(Input(0));
101 ptr_buffer[i + InputSize()] = Output(i)->mutable_data<
float>();
103 size_t argBufferSize =
sizeof(argBuffer);
105 CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
106 CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
113 CAFFE_CUDA_NUM_THREADS,
117 context_.cuda_stream(),
123 ElementwiseRTCFunction func_;
127 REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC,
ElementwiseRTCOp);
A GPU operator that can generate limited elementwise operations.
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
int CAFFE_GET_BLOCKS(const int N)
Compute the number of blocks needed to run N threads.