#include <cstdio>
#include <limits>

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/pool_op.h"
#include "caffe2/cuda_rtc/common_rtc.h"

namespace caffe2 {
namespace {

// The max pool forward function in NCHW layout. All shape parameters are
// printed into the source as constants before NVRTC compiles it.
const char kMaxPoolForwardNCHWSource[] = R"(
extern "C"
__global__ void %s(const float* bottom_data, float* top_data) {
  const int nthreads = %d;
  const int channels = %d;
  const int height = %d;
  const int width = %d;
  const int pooled_height = %d;
  const int pooled_width = %d;
  const int kernel_h = %d;
  const int kernel_w = %d;
  const int stride_h = %d;
  const int stride_w = %d;
  const int pad_t = %d;
  const int pad_l = %d;
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < nthreads; index += blockDim.x * gridDim.x) {
    // Decompose the linear index into (n, c, ph, pw) output coordinates.
    int pw = index %% pooled_width;
    int ph = (index / pooled_width) %% pooled_height;
    int c = (index / (pooled_width * pooled_height)) %% channels;
    int n = index / (pooled_width * pooled_height * channels);
    // Clip the pooling window to the input extent.
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    float maxval = -1.0e37f;
    const float* bdata_offset = bottom_data + n * channels * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        maxval = fmaxf(
            bdata_offset[c * height * width + h * width + w], maxval);
      }
    }
    top_data[index] = maxval;
  }
}
)";
"(    61     const float* const bottom_data, const float* const top_data,    62     const float* const top_diff, float* const bottom_diff) {    63   const int nthreads = %d;    65   const int channels = %d;    66   const int height = %d;    68   const int pooled_height = %d;    69   const int pooled_width = %d;    70   const int kernel_h = %d;    71   const int kernel_w = %d;    72   const int stride_h = %d;    73   const int stride_w = %d;    76   for (int index = blockIdx.x * blockDim.x + threadIdx.x;    77        index < nthreads; index += blockDim.x * gridDim.x) {    78     const int w = index %% width + pad_l;    79     const int h = (index / width) %% height + pad_t;    80     const int c = (index / width / height) %% channels;    81     const int n = index / width / height / channels;    82     const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;    83     const int phend = min(h / stride_h + 1, pooled_height);    84     const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;    85     const int pwend = min(w / stride_w + 1, pooled_width);    86     const int top_offset =    87         (n * channels + c) * pooled_height * pooled_width;    88     bottom_diff[index] = 0;    89     for (int ph = phstart; ph < phend; ++ph) {    90       for (int pw = pwstart; pw < pwend; ++pw) {    91         int top_local_offset = top_offset + ph * pooled_width + pw;    92         if (bottom_data[index] == top_data[top_local_offset]) {    93           bottom_diff[index] += top_diff[top_local_offset];   102 class MaxPoolRTCFunction : 
public CudaRTCFunction<MaxPoolRTCFunction> {
   104   MaxPoolRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
   106   template <
typename... Args>
   107   string KernelName(Args... ) {
   111   template <
typename... Args>
   112   string GetSource(Args... args);
class MaxPoolGradientRTCFunction
    : public CudaRTCFunction<MaxPoolGradientRTCFunction> {
 public:
  MaxPoolGradientRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

  template <typename... Args>
  string KernelName(Args... /*args*/) {
    return name_;
  }

  template <typename... Args>
  string GetSource(Args... args);

 private:
  string name_;
};
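
// Both wrappers derive from the CRTP base CudaRTCFunction (declared in
// caffe2/cuda_rtc/common_rtc.h), which compiles the generated source with
// NVRTC in Compile() and keeps the resulting kernel handle for later
// Launch() calls; the derived classes only supply KernelName() and
// GetSource().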
template <>
string MaxPoolRTCFunction::GetSource(
    const int output_size,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l) {
  char buffer[65536];
  int nbytes = snprintf(
      buffer, 65536, kMaxPoolForwardNCHWSource, name_.c_str(), output_size,
      channels, height, width, pooled_height, pooled_width, kernel_h, kernel_w,
      stride_h, stride_w, pad_t, pad_l);
  DCHECK_GE(nbytes, 0);
  DCHECK_LT(nbytes, 65536);
  return string(buffer);
}
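
// Illustrative only: for a hypothetical 1x1x4x4 input pooled 2x2 with
// stride 2 and no padding, the snprintf above instantiates the template
// roughly as
//
//   extern "C"
//   __global__ void <unique name>(const float* bottom_data, float* top_data) {
//     const int nthreads = 4;
//     const int channels = 1;
//     const int height = 4;
//     ...
//
// i.e. every %d becomes a literal, so NVRTC compiles a fully
// shape-specialized kernel.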
template <>
string MaxPoolGradientRTCFunction::GetSource(
    const int output_size,
    const int num,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l) {
  char buffer[65536];
  int nbytes = snprintf(
      buffer, 65536, kMaxPoolBackwardNCHWSource, name_.c_str(), output_size,
      num, channels, height, width, pooled_height, pooled_width, kernel_h,
      kernel_w, stride_h, stride_w, pad_t, pad_l);
  DCHECK_GE(nbytes, 0);
  DCHECK_LT(nbytes, 65536);
  return string(buffer);
}

}  // namespace
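
// The operator wrappers below cache the input shape in input_dims_; the
// kernel is rebuilt through func_.Compile() only when that shape changes,
// and the previously compiled kernel is launched directly otherwise.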
class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  MaxPoolRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CUDAContext>(operator_def, ws) {
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Currently only NCHW is supported.");
  }
  ~MaxPoolRTCOp() override {}

  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto output_sizes =
        ConvPoolOpBase<CUDAContext>::GetOutputSize(X, X.dim32(1));
    auto* Y = Output(0, output_sizes, at::dtype<float>());

    if (input_dims_ != X.sizes()) {
      // Recompile the kernel for the new input shape.
      VLOG(1) << "MaxPool RTC recompiling";
      CAFFE_ENFORCE_LT(Y->numel(), std::numeric_limits<int>::max());
      func_.Compile(
          static_cast<int>(Y->numel()),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          Y->dim32(2),
          Y->dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l());
      input_dims_ = X.sizes().vec();
    }
    // Carry out the pooling computation.
    func_.Launch(
        CAFFE_GET_BLOCKS(Y->numel()),
        1,
        1,
        CAFFE_CUDA_NUM_THREADS,
        1,
        1,
        0,
        context_.cuda_stream(),
        X.data<float>(),
        Y->mutable_data<float>());
    return true;
  }

  bool RunOnDeviceWithOrderNHWC() override {
    LOG(FATAL) << "Not implemented.";
    return false;
  }

 private:
  MaxPoolRTCFunction func_;
  vector<int64_t> input_dims_;
};
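
// In the Launch() calls above and below, the argument order follows
// CudaRTCFunction::Launch in common_rtc.h: grid dimensions (x, y, z), block
// dimensions (x, y, z), shared memory bytes, the CUDA stream, and finally
// the kernel arguments. CAFFE_GET_BLOCKS(N) computes the number of blocks
// needed to cover N threads at CAFFE_CUDA_NUM_THREADS per block; the
// grid-stride loops in the kernels tolerate any positive grid size.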
class MaxPoolGradientRTCOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  MaxPoolGradientRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CUDAContext>(operator_def, ws) {
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Currently only NCHW is supported.");
  }
  ~MaxPoolGradientRTCOp() override {}

  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto& Y = Input(1);
    auto& dY = Input(2);
    CAFFE_ENFORCE_EQ(dY.dim(), 4);

    auto* dX = Output(0, X.sizes(), at::dtype<float>());
    ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
    if (input_dims_ != X.sizes()) {
      VLOG(1) << "MaxPoolGradient RTC recompiling";
      CAFFE_ENFORCE_LT(X.numel(), std::numeric_limits<int>::max());
      func_.Compile(
          static_cast<int>(X.numel()),
          X.dim32(0),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          dY.dim32(2),
          dY.dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l());
      input_dims_ = X.sizes().vec();
    }
    func_.Launch(
        CAFFE_GET_BLOCKS(X.numel()),
        1,
        1,
        CAFFE_CUDA_NUM_THREADS,
        1,
        1,
        0,
        context_.cuda_stream(),
        X.data<float>(),
        Y.data<float>(),
        dY.data<float>(),
        dX->mutable_data<float>());
    return true;
  }

  bool RunOnDeviceWithOrderNHWC() override {
    LOG(FATAL) << "Not implemented.";
    return false;
  }

 private:
  MaxPoolGradientRTCFunction func_;
  vector<int64_t> input_dims_;
};
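
// A minimal usage sketch (hypothetical, for illustration): these ops are
// selected by setting the engine on an otherwise ordinary MaxPool operator
// definition, e.g.
//
//   OperatorDef def = CreateOperatorDef(
//       "MaxPool", "", vector<string>{"X"}, vector<string>{"Y"},
//       vector<Argument>{MakeArgument<int>("kernel", 2),
//                        MakeArgument<int>("stride", 2)});
//   def.set_engine("NVRTC");
//   workspace.RunOperatorOnce(def);
//
// assuming a Workspace holding a float CUDA tensor blob "X" in NCHW layout
// and a CUDA device option on the def.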
namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPool, NVRTC, MaxPoolRTCOp);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    MaxPoolGradient, NVRTC, MaxPoolGradientRTCOp);
}  // namespace
}  // namespace caffe2