Caffe2 - C++ API
A deep learning, cross platform ML framework
roi_align_op.cc
#include "roi_align_op.h"

#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"

#ifdef CAFFE2_USE_MKL
#include "caffe2/mkl/operators/operator_fallback_mkl.h"
#endif // CAFFE2_USE_MKL

namespace caffe2 {
namespace {

template <typename T>
struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  T w1;
  T w2;
  T w3;
  T w4;
};

template <typename T>
void pre_calc_for_bilinear_interpolate(
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int iy_upper,
    const int ix_upper,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // Handle sample points that fall outside the feature map boundary.
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }

          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}
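// Worked example of the bilinear weights computed above: for a sample point
// at y = 2.25, x = 3.5 (inside the map), y_low = 2, x_low = 3, ly = 0.25,
// lx = 0.5, hy = 0.75, hx = 0.5, so w1 = 0.375, w2 = 0.375, w3 = 0.125,
// w4 = 0.125 (the four weights sum to 1), and pos1..pos4 are the row-major
// offsets of the neighbors (y_low, x_low), (y_low, x_high), (y_high, x_low),
// (y_high, x_high).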

template <typename T>
void ROIAlignForward(
    const int nthreads,
    const T* bottom_data,
    const T& spatial_scale,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int sampling_ratio,
    const T* bottom_rois,
    int roi_cols,
    T* top_data,
    StorageOrder order) {
  DCHECK(roi_cols == 4 || roi_cols == 5);

  int n_rois = nthreads / channels / pooled_width / pooled_height;
  // (n, c, ph, pw) is an element in the pooled output
  // can be parallelized using omp
  // #pragma omp parallel for num_threads(32)
  for (int n = 0; n < n_rois; n++) {
    int index_n = n * channels * pooled_width * pooled_height;

    // roi could have 4 or 5 columns
    const T* offset_bottom_rois = bottom_rois + n * roi_cols;
    int roi_batch_ind = 0;
    if (roi_cols == 5) {
      roi_batch_ind = offset_bottom_rois[0];
      offset_bottom_rois++;
    }

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
    // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
    // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
    // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
    // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);

    // Force malformed ROIs to be 1x1
    T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
    T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral pooling
    int roi_bin_grid_h = (sampling_ratio > 0)
        ? sampling_ratio
        : ceil(roi_height / pooled_height); // e.g., = 2
    int roi_bin_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
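    // For example, with spatial_scale = 0.25 an RoI spanning 32x32 pixels of
    // the input image covers an 8x8 region of the feature map; with a 2x2
    // pooled output and sampling_ratio <= 0, roi_bin_grid_h = roi_bin_grid_w
    // = ceil(8 / 2) = 4, so count = 16 samples are averaged per output bin.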

    // We want to precalculate indices and weights shared by all channels;
    // this is the key point of the optimization.
    std::vector<PreCalc<T>> pre_calc(
        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
    pre_calc_for_bilinear_interpolate(
        height,
        width,
        pooled_height,
        pooled_width,
        roi_bin_grid_h,
        roi_bin_grid_w,
        roi_start_h,
        roi_start_w,
        bin_size_h,
        bin_size_w,
        roi_bin_grid_h,
        roi_bin_grid_w,
        pre_calc);

    if (order == StorageOrder::NCHW) {
      for (int c = 0; c < channels; c++) {
        int index_n_c = index_n + c * pooled_width * pooled_height;
        const T* offset_bottom_data =
            bottom_data + (roi_batch_ind * channels + c) * height * width;
        int pre_calc_index = 0;

        for (int ph = 0; ph < pooled_height; ph++) {
          for (int pw = 0; pw < pooled_width; pw++) {
            int index = index_n_c + ph * pooled_width + pw;

            T output_val = 0.;
            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
                PreCalc<T> pc = pre_calc[pre_calc_index];
                output_val += pc.w1 * offset_bottom_data[pc.pos1] +
                    pc.w2 * offset_bottom_data[pc.pos2] +
                    pc.w3 * offset_bottom_data[pc.pos3] +
                    pc.w4 * offset_bottom_data[pc.pos4];

                pre_calc_index += 1;
              }
            }
            output_val /= count;

            top_data[index] = output_val;
          } // for pw
        } // for ph
      } // for c
    } // if nchw

    if (order == StorageOrder::NHWC) {
      const T* offset_bottom_data =
          bottom_data + roi_batch_ind * channels * height * width;
      int pre_calc_index = 0;

      for (int ph = 0; ph < pooled_height; ph++) {
        for (int pw = 0; pw < pooled_width; pw++) {
          EVecXf output_vals = EVecXf::Zero(channels);

          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
              PreCalc<T> pc = pre_calc[pre_calc_index];

              ConstEigenVectorMap<T> data_1(
                  offset_bottom_data + channels * pc.pos1, channels);
              ConstEigenVectorMap<T> data_2(
                  offset_bottom_data + channels * pc.pos2, channels);
              ConstEigenVectorMap<T> data_3(
                  offset_bottom_data + channels * pc.pos3, channels);
              ConstEigenVectorMap<T> data_4(
                  offset_bottom_data + channels * pc.pos4, channels);

              output_vals += pc.w1 * data_1 + pc.w2 * data_2 + pc.w3 * data_3 +
                  pc.w4 * data_4;

              pre_calc_index += 1;
            }
          }
          output_vals /= count;

          int index_nhw = index_n + (ph * pooled_width + pw) * channels;
          std::memcpy(
              top_data + index_nhw, output_vals.data(), channels * sizeof(T));
        } // for pw
      } // for ph
    } // if nhwc

  } // for n
}
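// Indexing example for ROIAlignForward: with 2 RoIs, 256 channels, and a 7x7
// pooled output, nthreads = 2 * 256 * 7 * 7 = 25088 and n_rois is recovered
// as 25088 / 256 / 7 / 7 = 2; each RoI writes a contiguous block of
// channels * pooled_height * pooled_width values into top_data starting at
// index_n.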

} // namespace

template <>
bool RoIAlignOp<float, CPUContext>::RunOnDevice() {
  auto& X = Input(0); // Input data to pool, NCHW
  auto& R = Input(1); // RoIs
  auto* Y = Output(0); // RoI pooled data

  if (R.size() == 0) {
    // Handle empty rois
    if (order_ == StorageOrder::NCHW) {
      Y->Resize(0, X.dim32(1), pooled_height_, pooled_width_);
    } else if (order_ == StorageOrder::NHWC) {
      Y->Resize(0, pooled_height_, pooled_width_, X.dim32(3));
    }
    // The following mutable_data call is needed to allocate the tensor
    Y->mutable_data<float>();
    return true;
  }

  CAFFE_ENFORCE_EQ(R.ndim(), 2);
  // if R has 5 columns, the first column is the batch index; otherwise the
  // batch index is 0
  CAFFE_ENFORCE(R.dim32(1) == 4 || R.dim32(1) == 5);

  assert(sampling_ratio_ >= 0);

  if (order_ == StorageOrder::NCHW) {
    Y->Resize(R.dim32(0), X.dim32(1), pooled_height_, pooled_width_);
    int output_size = Y->size();
    ROIAlignForward<float>(
        output_size,
        X.data<float>(),
        spatial_scale_,
        X.dim32(1),
        X.dim32(2),
        X.dim32(3),
        pooled_height_,
        pooled_width_,
        sampling_ratio_,
        R.data<float>(),
        R.dim32(1),
        Y->mutable_data<float>(),
        order_);
  } else if (order_ == StorageOrder::NHWC) {
    Y->Resize(R.dim32(0), pooled_height_, pooled_width_, X.dim32(3));
    int output_size = Y->size();
    ROIAlignForward<float>(
        output_size,
        X.data<float>(),
        spatial_scale_,
        X.dim32(3),
        X.dim32(1),
        X.dim32(2),
        pooled_height_,
        pooled_width_,
        sampling_ratio_,
        R.data<float>(),
        R.dim32(1),
        Y->mutable_data<float>(),
        order_);
  }

  return true;
}

REGISTER_CPU_OPERATOR(RoIAlign, RoIAlignOp<float, CPUContext>);

#ifdef CAFFE2_HAS_MKL_DNN
REGISTER_MKL_OPERATOR(
    RoIAlign,
    mkl::MKLFallbackOp<RoIAlignOp<float, CPUContext>>);
#endif // CAFFE2_HAS_MKL_DNN

// Input: X, rois; Output: Y
OPERATOR_SCHEMA(RoIAlign)
    .NumInputs(2)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Region of Interest (RoI) align operation as used in Mask R-CNN.
)DOC")
    .Arg(
        "spatial_scale",
        "(float) default 1.0; Spatial scale of the input feature map X "
        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
        "w.r.t. the input image.")
    .Arg("pooled_h", "(int) default 1; Pooled output Y's height.")
    .Arg("pooled_w", "(int) default 1; Pooled output Y's width.")
    .Arg(
        "sampling_ratio",
        "(int) default -1; number of sampling points in the interpolation grid "
        "used to compute the output value of each pooled output bin. If > 0, "
        "then exactly sampling_ratio x sampling_ratio grid points are used. If "
        "<= 0, then an adaptive number of grid points is used (computed as "
        "ceil(roi_width / pooled_w), and likewise for height).")
    .Input(0, "X", "4D feature map input of shape (N, C, H, W).")
    .Input(
        1,
        "RoIs",
        "2D input of shape (R, 5) specifying R RoIs with five columns "
        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
        "coordinates are in the coordinate system of the input image.")
    .Output(
        0,
        "Y",
        "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
        "is a pooled feature map corresponding to the r-th RoI.");

} // namespace caffe2
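Example usage (not part of the file above): a minimal sketch of constructing and running the registered RoIAlign CPU operator through Caffe2's OperatorDef / Workspace / CreateOperator APIs. The blob names, shapes, RoI coordinates, and argument values are illustrative assumptions, and the tensor calls (TensorCPU, Resize, mutable_data) follow the pre-unification Caffe2 style used in this file; exact APIs vary across Caffe2 versions.

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"

void RunRoIAlignSketch() {
  caffe2::Workspace ws;

  // Feature map X: 1 image, 256 channels, 56x56 (e.g., stride-16 features).
  auto* X = ws.CreateBlob("X")->GetMutable<caffe2::TensorCPU>();
  X->Resize(1, 256, 56, 56);
  X->mutable_data<float>(); // allocate (contents left zero here)

  // One RoI row: (batch index, x1, y1, x2, y2) in input-image coordinates.
  auto* R = ws.CreateBlob("R")->GetMutable<caffe2::TensorCPU>();
  R->Resize(1, 5);
  float* r = R->mutable_data<float>();
  r[0] = 0.f; r[1] = 0.f; r[2] = 0.f; r[3] = 223.f; r[4] = 223.f;

  caffe2::OperatorDef def;
  def.set_type("RoIAlign");
  def.add_input("X");
  def.add_input("R");
  def.add_output("Y");
  def.add_arg()->CopyFrom(caffe2::MakeArgument("spatial_scale", 0.0625f));
  def.add_arg()->CopyFrom(caffe2::MakeArgument("pooled_h", 7));
  def.add_arg()->CopyFrom(caffe2::MakeArgument("pooled_w", 7));
  def.add_arg()->CopyFrom(caffe2::MakeArgument("sampling_ratio", 2));

  auto op = caffe2::CreateOperator(def, &ws);
  op->Run();
  // Y now holds a (1, 256, 7, 7) tensor; each of the 7x7 bins averages
  // 2x2 bilinearly interpolated samples, as implemented in ROIAlignForward.
}

The argument names used here are the ones declared in the OPERATOR_SCHEMA above.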