Caffe2 - C++ API
A deep learning, cross platform ML framework
pool_op_rtc_gpu.cc
#include <cstdio>

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/pool_op.h"
#include "caffe2/cuda_rtc/common_rtc.h"

namespace caffe2 {
namespace {
class AveragePool {};
class MaxPool {};
}  // namespace

namespace {

// The max pool forward kernel, with its parameters baked in as const ints
// when the source is instantiated via snprintf below.
const char kMaxPoolForwardNCHWSource[] = R"(
extern "C"
__global__ void %s(const float* bottom_data, float* top_data) {
  const int nthreads = %d;
  const int channels = %d;
  const int height = %d;
  const int width = %d;
  const int pooled_height = %d;
  const int pooled_width = %d;
  const int kernel_h = %d;
  const int kernel_w = %d;
  const int stride_h = %d;
  const int stride_w = %d;
  const int pad_t = %d;
  const int pad_l = %d;
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < nthreads; index += blockDim.x * gridDim.x) {
    // Decompose the linear output index into (n, c, ph, pw) in NCHW order.
    int pw = index %% pooled_width;
    int ph = (index / pooled_width) %% pooled_height;
    int c = (index / (pooled_width * pooled_height)) %% channels;
    int n = index / (pooled_width * pooled_height * channels);
    // Clip the pooling window to the input plane.
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    float maxval = -1.0e37f;
    const float* bdata_offset = bottom_data + n * channels * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        maxval = fmaxf(
            bdata_offset[c * height * width + h * width + w], maxval);
      }
    }
    top_data[index] = maxval;
  }
}
)";

// The max pool backward kernel, with its parameters baked in as const ints
// when the source is instantiated via snprintf below.
const char kMaxPoolBackwardNCHWSource[] = R"(
extern "C"
__global__ void %s(
    const float* const bottom_data, const float* const top_data,
    const float* const top_diff, float* const bottom_diff) {
  const int nthreads = %d;
  const int num = %d;
  const int channels = %d;
  const int height = %d;
  const int width = %d;
  const int pooled_height = %d;
  const int pooled_width = %d;
  const int kernel_h = %d;
  const int kernel_w = %d;
  const int stride_h = %d;
  const int stride_w = %d;
  const int pad_t = %d;
  const int pad_l = %d;
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < nthreads; index += blockDim.x * gridDim.x) {
    // Decompose the linear input index into (n, c, h, w), shifted by the pads.
    const int w = index %% width + pad_l;
    const int h = (index / width) %% height + pad_t;
    const int c = (index / width / height) %% channels;
    const int n = index / width / height / channels;
    // Range of pooling windows [phstart, phend) x [pwstart, pwend) whose
    // receptive field contains input pixel (h, w).
    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
    const int phend = min(h / stride_h + 1, pooled_height);
    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
    const int pwend = min(w / stride_w + 1, pooled_width);
    const int top_offset =
        (n * channels + c) * pooled_height * pooled_width;
    bottom_diff[index] = 0;
    for (int ph = phstart; ph < phend; ++ph) {
      for (int pw = pwstart; pw < pwend; ++pw) {
        int top_local_offset = top_offset + ph * pooled_width + pw;
        // Route the gradient to every input element that achieved the max.
        if (bottom_data[index] == top_data[top_local_offset]) {
          bottom_diff[index] += top_diff[top_local_offset];
        }
      }
    }
  }
}
)";


class MaxPoolRTCFunction : public CudaRTCFunction<MaxPoolRTCFunction> {
 public:
  MaxPoolRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

  template <typename... Args>
  string KernelName(Args... /*args*/) {
    return name_;
  }

  template <typename... Args>
  string GetSource(Args... args);

 private:
  string name_;
};

class MaxPoolGradientRTCFunction
    : public CudaRTCFunction<MaxPoolGradientRTCFunction> {
 public:
  MaxPoolGradientRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

  template <typename... Args>
  string KernelName(Args... /*args*/) {
    return name_;
  }

  template <typename... Args>
  string GetSource(Args... args);

 private:
  string name_;
};
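Both helpers derive from CudaRTCFunction (declared in caffe2/cuda_rtc/common_rtc.h), a CRTP base whose Compile(args...) asks the derived class for KernelName(args...) and GetSource(args...), runtime-compiles the returned CUDA source, and whose Launch(...) runs the resulting kernel on a given stream. The snippet below is not the Caffe2 implementation; it is a minimal sketch, using the public NVRTC and CUDA driver APIs, of the compile-and-launch work such a wrapper performs (a live CUDA context is assumed, as CUDAContext provides inside an operator).

// Illustrative only: roughly what an RTC helper does under the hood.
#include <cassert>
#include <string>
#include <vector>
#include <cuda.h>
#include <nvrtc.h>

#define CHECK_NVRTC(x) do { nvrtcResult r = (x); assert(r == NVRTC_SUCCESS); (void)r; } while (0)
#define CHECK_CU(x)    do { CUresult r = (x);    assert(r == CUDA_SUCCESS);  (void)r; } while (0)

// Compile a CUDA source string with NVRTC and fetch the named kernel.
CUfunction CompileKernel(const std::string& src, const std::string& name) {
  nvrtcProgram prog;
  CHECK_NVRTC(nvrtcCreateProgram(&prog, src.c_str(), "pool.cu", 0, nullptr, nullptr));
  CHECK_NVRTC(nvrtcCompileProgram(prog, 0, nullptr));
  size_t ptx_size = 0;
  CHECK_NVRTC(nvrtcGetPTXSize(prog, &ptx_size));
  std::vector<char> ptx(ptx_size);
  CHECK_NVRTC(nvrtcGetPTX(prog, ptx.data()));
  CHECK_NVRTC(nvrtcDestroyProgram(&prog));
  CUmodule module;
  CUfunction func;
  CHECK_CU(cuModuleLoadData(&module, ptx.data()));
  CHECK_CU(cuModuleGetFunction(&func, module, name.c_str()));
  return func;
}

// Launch the compiled kernel with a 1-D grid, mirroring the Launch() calls below.
void LaunchKernel(CUfunction func, int grid_x, int block_x, CUstream stream,
                  const float* bottom, float* top) {
  void* args[] = {&bottom, &top};
  CHECK_CU(cuLaunchKernel(func, grid_x, 1, 1, block_x, 1, 1,
                          /*sharedMemBytes=*/0, stream, args, nullptr));
}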


template <>
string MaxPoolRTCFunction::GetSource(
    const int output_size,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l) {
  char buffer[65536];
  int nbytes = snprintf(
      buffer, 65536, kMaxPoolForwardNCHWSource, name_.c_str(), output_size,
      channels, height, width, pooled_height, pooled_width, kernel_h, kernel_w,
      stride_h, stride_w, pad_t, pad_l);
  DCHECK_GE(nbytes, 0);
  DCHECK_LT(nbytes, 65536);
  return string(buffer);
}
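For concreteness, this is roughly the string GetSource() returns once snprintf has filled in the placeholders, for a hypothetical 1x1x4x4 input pooled with a 2x2 kernel, stride 2 and no padding (output 1x1x2x2, so nthreads = 4). The kernel name max_pool_f0 is made up here; the real name comes from GetUniqueName() in common_rtc.h. Note that every %% in the template collapses to a single % in the generated source.

extern "C"
__global__ void max_pool_f0(const float* bottom_data, float* top_data) {
  const int nthreads = 4;
  const int channels = 1;
  const int height = 4;
  const int width = 4;
  const int pooled_height = 2;
  const int pooled_width = 2;
  const int kernel_h = 2;
  const int kernel_w = 2;
  const int stride_h = 2;
  const int stride_w = 2;
  const int pad_t = 0;
  const int pad_l = 0;
  // ... loop body as in kMaxPoolForwardNCHWSource, with %% replaced by % ...
}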

template <>
string MaxPoolGradientRTCFunction::GetSource(
    const int output_size,
    const int num,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l) {
  char buffer[65536];
  int nbytes = snprintf(
      buffer, 65536, kMaxPoolBackwardNCHWSource, name_.c_str(), output_size,
      num, channels, height, width, pooled_height, pooled_width, kernel_h,
      kernel_w, stride_h, stride_w, pad_t, pad_l);
  DCHECK_GE(nbytes, 0);
  DCHECK_LT(nbytes, 65536);
  return string(buffer);
}

}  // namespace


class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  MaxPoolRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CUDAContext>(operator_def, ws) {
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Currently only NCHW is supported.");
  }
  ~MaxPoolRTCOp() {}

  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto* Y = Output(0);
    ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1));

    if (input_dims_ != X.dims()) {
      // Recompile the kernel whenever the input shape changes.
      VLOG(1) << "MaxPool RTC recompiling";
      CAFFE_ENFORCE_LT(Y->size(), std::numeric_limits<int>::max());
      func_.Compile(
          static_cast<int>(Y->size()),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          Y->dim32(2),
          Y->dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l());
      input_dims_ = X.dims();
    }
    // Carry out the pooling computation.
    func_.Launch(CAFFE_GET_BLOCKS(Y->size()), 1, 1, CAFFE_CUDA_NUM_THREADS,
                 1, 1, 0, context_.cuda_stream(),
                 X.data<float>(), Y->mutable_data<float>());
    return true;
  }

  bool RunOnDeviceWithOrderNHWC() override {
    LOG(FATAL) << "Not implemented.";
    return false;
  }

 private:
  MaxPoolRTCFunction func_;
  vector<TIndex> input_dims_;
};
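The Launch() call above runs one thread per output element: CAFFE_CUDA_NUM_THREADS threads per block and CAFFE_GET_BLOCKS(Y->size()) blocks, where CAFFE_GET_BLOCKS (defined in caffe2/core/common_gpu.h) computes the number of blocks needed to run N threads. A stand-in for that arithmetic, not the Caffe2 definition:

// Hypothetical stand-ins; the real constant and helper live in caffe2/core/common_gpu.h.
constexpr int kCudaNumThreads = 128;   // plays the role of CAFFE_CUDA_NUM_THREADS

inline int NumBlocks(const int n) {    // plays the role of CAFFE_GET_BLOCKS(n)
  // Ceiling division: enough blocks that blocks * threads >= n. The grid-stride
  // loop inside the generated kernel then handles any leftover threads safely.
  return (n + kCudaNumThreads - 1) / kCudaNumThreads;
}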

class MaxPoolGradientRTCOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  MaxPoolGradientRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CUDAContext>(operator_def, ws) {
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Currently only NCHW is supported.");
  }
  ~MaxPoolGradientRTCOp() {}

  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto& Y = Input(1);
    auto& dY = Input(2);
    CAFFE_ENFORCE_EQ(dY.ndim(), 4);
    auto* dX = Output(0);
    dX->ResizeLike(X);
    ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
    if (input_dims_ != X.dims()) {
      // Recompile the kernel whenever the input shape changes.
      VLOG(1) << "MaxPoolGradient RTC recompiling";
      CAFFE_ENFORCE_LT(X.size(), std::numeric_limits<int>::max());
      func_.Compile(
          static_cast<int>(X.size()),
          X.dim32(0),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          dY.dim32(2),
          dY.dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l());
      input_dims_ = X.dims();
    }
    func_.Launch(CAFFE_GET_BLOCKS(X.size()), 1, 1, CAFFE_CUDA_NUM_THREADS, 1, 1,
                 0, context_.cuda_stream(),
                 X.data<float>(), Y.data<float>(), dY.data<float>(),
                 dX->mutable_data<float>());
    return true;
  }

  bool RunOnDeviceWithOrderNHWC() override {
    LOG(FATAL) << "Not implemented.";
    return false;
  }

 private:
  MaxPoolGradientRTCFunction func_;
  vector<TIndex> input_dims_;
};

namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPool, NVRTC, MaxPoolRTCOp);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
    MaxPoolGradient, NVRTC, MaxPoolGradientRTCOp);
}  // namespace
}  // namespace caffe2
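With these registrations, the runtime-compiled path is chosen by requesting the NVRTC engine on an ordinary CUDA MaxPool operator. A minimal sketch of what that might look like from C++, assuming a CUDA build, a Workspace whose blob "X" already holds a float NCHW TensorCUDA, and illustrative blob and argument values:

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

void RunMaxPoolWithNVRTC(caffe2::Workspace* ws) {
  caffe2::OperatorDef def;
  def.set_type("MaxPool");
  def.set_engine("NVRTC");  // select MaxPoolRTCOp instead of the default CUDA operator
  def.add_input("X");
  def.add_output("Y");
  def.mutable_device_option()->set_device_type(caffe2::CUDA);
  // Pooling hyperparameters read by ConvPoolOpBase (illustrative values).
  auto* kernel = def.add_arg();
  kernel->set_name("kernel");
  kernel->set_i(2);
  auto* stride = def.add_arg();
  stride->set_name("stride");
  stride->set_i(2);
  // The first Run() compiles the kernel for the current input shape;
  // subsequent runs with the same shape reuse the compiled function.
  auto op = caffe2::CreateOperator(def, ws);
  CAFFE_ENFORCE(op->Run());
}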