3 #include "caffe2/core/common_gpu.h" 4 #include "caffe2/core/context_gpu.h" 5 #include "caffe2/operators/pool_op.h" 6 #include "caffe2/cuda_rtc/common_rtc.h" 17 const char kMaxPoolForwardNCHWSource[] = R
"( 19 __global__ void %s(const float* bottom_data, float* top_data) { 20 const int nthreads = %d; 21 const int channels = %d; 22 const int height = %d; 24 const int pooled_height = %d; 25 const int pooled_width = %d; 26 const int kernel_h = %d; 27 const int kernel_w = %d; 28 const int stride_h = %d; 29 const int stride_w = %d; 32 for (int index = blockIdx.x * blockDim.x + threadIdx.x; 33 index < nthreads; index += blockDim.x * gridDim.x) { 34 int pw = index %% pooled_width; 35 int ph = (index / pooled_width) %% pooled_height; 36 int c = (index / (pooled_width * pooled_height)) %% channels; 37 int n = index / (pooled_width * pooled_height * channels); 38 int hstart = ph * stride_h - pad_t; 39 int wstart = pw * stride_w - pad_l; 40 int hend = min(hstart + kernel_h, height); 41 int wend = min(wstart + kernel_w, width); 42 hstart = max(hstart, 0); 43 wstart = max(wstart, 0); 44 float maxval = -1.0e37f; 45 const float* bdata_offset = bottom_data + n * channels * height * width; 46 for (int h = hstart; h < hend; ++h) { 47 for (int w = wstart; w < wend; ++w) { 49 bdata_offset[c * height * width + h * width + w], maxval); 52 top_data[index] = maxval; 58 const char kMaxPoolBackwardNCHWSource[] = R
"( 61 const float* const bottom_data, const float* const top_data, 62 const float* const top_diff, float* const bottom_diff) { 63 const int nthreads = %d; 65 const int channels = %d; 66 const int height = %d; 68 const int pooled_height = %d; 69 const int pooled_width = %d; 70 const int kernel_h = %d; 71 const int kernel_w = %d; 72 const int stride_h = %d; 73 const int stride_w = %d; 76 for (int index = blockIdx.x * blockDim.x + threadIdx.x; 77 index < nthreads; index += blockDim.x * gridDim.x) { 78 const int w = index %% width + pad_l; 79 const int h = (index / width) %% height + pad_t; 80 const int c = (index / width / height) %% channels; 81 const int n = index / width / height / channels; 82 const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; 83 const int phend = min(h / stride_h + 1, pooled_height); 84 const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1; 85 const int pwend = min(w / stride_w + 1, pooled_width); 86 const int top_offset = 87 (n * channels + c) * pooled_height * pooled_width; 88 bottom_diff[index] = 0; 89 for (int ph = phstart; ph < phend; ++ph) { 90 for (int pw = pwstart; pw < pwend; ++pw) { 91 int top_local_offset = top_offset + ph * pooled_width + pw; 92 if (bottom_data[index] == top_data[top_local_offset]) { 93 bottom_diff[index] += top_diff[top_local_offset]; 102 class MaxPoolRTCFunction :
public CudaRTCFunction<MaxPoolRTCFunction> {
104 MaxPoolRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
106 template <
typename... Args>
107 string KernelName(Args... ) {
111 template <
typename... Args>
112 string GetSource(Args... args);
118 class MaxPoolGradientRTCFunction
119 :
public CudaRTCFunction<MaxPoolGradientRTCFunction> {
121 MaxPoolGradientRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
123 template <
typename... Args>
124 string KernelName(Args... ) {
128 template <
typename... Args>
129 string GetSource(Args... args);
137 string MaxPoolRTCFunction::GetSource(
138 const int output_size,
142 const int pooled_height,
143 const int pooled_width,
151 int nbytes = snprintf(
152 buffer, 65536, kMaxPoolForwardNCHWSource, name_.c_str(), output_size,
153 channels, height, width, pooled_height, pooled_width, kernel_h, kernel_w,
154 stride_h, stride_w, pad_t, pad_l);
155 DCHECK_GE(nbytes, 0);
156 DCHECK_LT(nbytes, 65536);
157 return string(buffer);
161 string MaxPoolGradientRTCFunction::GetSource(
162 const int output_size,
167 const int pooled_height,
168 const int pooled_width,
176 int nbytes = snprintf(
177 buffer, 65536, kMaxPoolBackwardNCHWSource, name_.c_str(), output_size,
178 num, channels, height, width, pooled_height, pooled_width, kernel_h,
179 kernel_w, stride_h, stride_w, pad_t, pad_l);
180 DCHECK_GE(nbytes, 0);
181 DCHECK_LT(nbytes, 65536);
182 return string(buffer);
193 order_, StorageOrder::NCHW,
"Currently only NCHW is supported.");
197 bool RunOnDeviceWithOrderNCHW()
override {
200 auto* Y = Output(0, output_sizes, at::dtype<float>());
202 if (input_dims_ != X.sizes()) {
204 VLOG(1) <<
"MaxPool RTC recompiling";
205 CAFFE_ENFORCE_LT(Y->numel(), std::numeric_limits<int>::max());
207 static_cast<int>(Y->numel()),
219 input_dims_ = X.sizes().vec();
226 CAFFE_CUDA_NUM_THREADS,
230 context_.cuda_stream(),
232 Y->mutable_data<
float>());
236 bool RunOnDeviceWithOrderNHWC()
override {
237 LOG(FATAL) <<
"Not implemented.";
242 MaxPoolRTCFunction func_;
243 vector<int64_t> input_dims_;
251 order_, StorageOrder::NCHW,
"Currently only NCHW is supported.");
255 bool RunOnDeviceWithOrderNCHW()
override {
259 CAFFE_ENFORCE_EQ(dY.dim(), 4);
261 auto* dX = Output(0, X.sizes(), at::dtype<float>());
263 if (input_dims_ != X.sizes()) {
264 VLOG(1) <<
"MaxPoolGradient RTC recompiling";
265 CAFFE_ENFORCE_LT(X.numel(), std::numeric_limits<int>::max());
267 static_cast<int>(X.numel()),
280 input_dims_ = X.sizes().vec();
286 CAFFE_CUDA_NUM_THREADS,
290 context_.cuda_stream(),
294 dX->mutable_data<
float>());
298 bool RunOnDeviceWithOrderNHWC()
override {
299 LOG(FATAL) <<
"Not implemented.";
304 MaxPoolGradientRTCFunction func_;
305 vector<int64_t> input_dims_;
309 REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPool, NVRTC,
MaxPoolRTCOp);
310 REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPoolGradient, NVRTC,
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
int CAFFE_GET_BLOCKS(const int N)
Compute the number of blocks needed to run N threads.