// Caffe2 - C++ API
// A deep learning, cross-platform ML framework
// roi_pool_op.cc
#include "roi_pool_op.h"

#include <cfloat>
#include <cmath>
20 
21 namespace caffe2 {
22 
23 using std::max;
24 using std::min;
25 
26 template <>
27 bool RoIPoolOp<float, CPUContext>::RunOnDevice() {
28  const auto& X = Input(0); // Input data to pool
29  const auto& R = Input(1); // RoIs
30  auto* Y = Output(0); // RoI pooled data
31  auto* A = is_test_ ? nullptr : Output(1); // argmaxes
32 
33  // Each ROI is of the form [batch_index x1 y1 x2 y2]
34  CAFFE_ENFORCE_EQ(R.dim32(1), 5);
35 
36  // TODO: Handle the storage_order properly to get the NCWH.
37  int batch_size = X.dim32(0);
38  int channels = X.dim32(1);
39  int height = X.dim32(2);
40  int width = X.dim32(3);
41  int num_rois = R.dim32(0);
42 
43  Y->Resize(num_rois, channels, pooled_height_, pooled_width_);
44  if (!is_test_) {
45  A->Resize(Y->dims());
46  }
47 
48  const float* Xdata = X.data<float>();
49  const float* rois = R.data<float>();
50  float* Ydata = Y->mutable_data<float>();
51  int* argmax_data = is_test_ ? nullptr : A->mutable_data<int>();
52 
53  // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
54  for (int n = 0; n < num_rois; ++n) {
55  int roi_batch_id = rois[0];
56  int roi_start_w = round(rois[1] * spatial_scale_);
57  int roi_start_h = round(rois[2] * spatial_scale_);
58  int roi_end_w = round(rois[3] * spatial_scale_);
59  int roi_end_h = round(rois[4] * spatial_scale_);
60  CAFFE_ENFORCE_GE(roi_batch_id, 0);
61  CAFFE_ENFORCE_LT(roi_batch_id, batch_size);
62 
63  // Force malformed ROIs to be 1x1
64  int roi_height = max(roi_end_h - roi_start_h + 1, 1);
65  int roi_width = max(roi_end_w - roi_start_w + 1, 1);
66 
67  const float bin_size_h =
68  static_cast<float>(roi_height) / static_cast<float>(pooled_height_);
69  const float bin_size_w =
70  static_cast<float>(roi_width) / static_cast<float>(pooled_width_);
71 
72  const float* batch_data = Xdata + roi_batch_id * X.size_from_dim(1);
73 
74  for (int c = 0; c < channels; ++c) {
75  for (int ph = 0; ph < pooled_height_; ++ph) {
76  for (int pw = 0; pw < pooled_width_; ++pw) {
77  // Compute pooling region for this output unit:
78  // start (included) = floor(ph * roi_height / pooled_height_)
79  // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
80  int hstart =
81  static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
82  int wstart =
83  static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
84  int hend =
85  static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
86  int wend =
87  static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
88 
89  // Add roi offsets and clip to input boundaries
90  hstart = min(max(hstart + roi_start_h, 0), height);
91  hend = min(max(hend + roi_start_h, 0), height);
92  wstart = min(max(wstart + roi_start_w, 0), width);
93  wend = min(max(wend + roi_start_w, 0), width);
94 
95  const int pool_index = ph * pooled_width_ + pw;
96 
97  // Define an empty pooling region to be zero
98  bool is_empty = (hend <= hstart) || (wend <= wstart);
99  Ydata[pool_index] = is_empty ? 0 : -FLT_MAX;
100  if (!is_test_) {
101  // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
102  argmax_data[pool_index] = -1;
103  }
104 
105  for (int h = hstart; h < hend; ++h) {
106  for (int w = wstart; w < wend; ++w) {
107  const int index = h * width + w;
108  if (batch_data[index] > Ydata[pool_index]) {
109  Ydata[pool_index] = batch_data[index];
110  if (!is_test_) {
111  argmax_data[pool_index] = index;
112  }
113  }
114  }
115  }
116  }
117  }
118  // Increment all data pointers by one channel
119  batch_data += X.size_from_dim(2);
120  Ydata += Y->size_from_dim(2);
121  if (!is_test_) {
122  argmax_data += A->size_from_dim(2);
123  }
124  }
125  // Increment ROI data pointer
126  rois += R.size_from_dim(1);
127  }
128 
129  return true;
130 }
131 
132 REGISTER_CPU_OPERATOR(RoIPool, RoIPoolOp<float, CPUContext>);
133 REGISTER_CPU_OPERATOR(RoIPoolGradient, RoIPoolGradientOp<float, CPUContext>);
134 
135 // Input: X, rois
136 // Output case #1: Y, argmaxes (train mode)
137 // Output case #2: Y (test mode)
138 OPERATOR_SCHEMA(RoIPool)
139  .NumInputs(2)
140  .NumOutputs({1, 2})
141  .TensorInferenceFunction([](const OperatorDef& def,
142  const vector<TensorShape>& in) {
143  ArgumentHelper helper(def);
144  const StorageOrder order = StringToStorageOrder(
145  helper.GetSingleArgument<string>("order", "NCHW"));
146  const TensorShape& X = in[0];
147  const int num_channels =
148  (order == StorageOrder::NCHW ? X.dims(1) : X.dims(3));
149  const TensorShape& R = in[1];
150  const int num_rois = R.dims(0);
151  const int pooled_height = helper.GetSingleArgument<int>("pooled_h", 1);
152  const int pooled_width = helper.GetSingleArgument<int>("pooled_w", 1);
153  TensorShape Y = CreateTensorShape(
154  vector<int>({num_rois, num_channels, pooled_height, pooled_width}),
155  X.data_type());
156 
157  bool is_test = helper.GetSingleArgument<int>(OpSchema::Arg_IsTest, 0);
158  if (!is_test) {
159  TensorShape argmaxes = Y;
160  argmaxes.set_data_type(TensorProto_DataType_INT32);
161  return vector<TensorShape>({Y, argmaxes});
162  } else {
163  return vector<TensorShape>({Y});
164  }
165  })
166  .SetDoc(R"DOC(
167 Carries out ROI Pooling for Faster-RCNN.
168 Depending on the mode, there are multiple output cases:
169 
170  Output case #1: Y, argmaxes (train mode)
171  Output case #2: Y (test mode)
172 )DOC")
173  .Arg(
174  "is_test",
175  "If set, run in test mode and skip computation of argmaxes (used for "
176  "gradient computation). Only one output tensor is produced. "
177  "(Default: false).")
178  .Arg("order", "A StorageOrder string (Default: \"NCHW\").")
179  .Arg("pooled_h", "The pooled output height (Default: 1).")
180  .Arg("pooled_w", "The pooled output width (Default: 1).")
181  .Arg(
182  "spatial_scale",
183  "Multiplicative spatial scale factor to translate ROI coords from "
184  "their input scale to the scale used when pooling (Default: 1.0).")
185  .Input(
186  0,
187  "X",
188  "The input 4-D tensor of data. Only NCHW order is currently supported.")
189  .Input(
190  1,
191  "rois",
192  "RoIs (Regions of Interest) to pool over. Should be a 2-D tensor of "
193  "shape (num_rois, 5) given as [[batch_id, x1, y1, x2, y2], ...].")
194  .Output(
195  0,
196  "Y",
197  "RoI pooled output 4-D tensor of shape "
198  "(num_rois, channels, pooled_h, pooled_w).")
199  .Output(
200  1,
201  "argmaxes",
202  "Argmaxes corresponding to indices in X used for gradient computation. "
203  "Only output if arg \"is_test\" is false.");
204 
205 // Input: X, rois, argmaxes, dY (aka "gradOutput")
206 // Output: dX (aka "gradInput")
207 OPERATOR_SCHEMA(RoIPoolGradient).NumInputs(4).NumOutputs(1);
208 
210  using GradientMakerBase::GradientMakerBase;
211  vector<OperatorDef> GetGradientDefs() override {
212  return SingleGradientDef(
213  "RoIPoolGradient",
214  "",
215  vector<string>{I(0), I(1), O(1), GO(0)},
216  vector<string>{GI(0)});
217  }
218 };
219 
220 REGISTER_GRADIENT(RoIPool, GetRoIPoolGradient);
221 
222 } // namespace caffe2
// Copyright (c) 2016-present, Facebook, Inc.
// Note: SingleGradientDef(const Args&... args) is a helper that creates one
// single operator def, which is usually the case for many simple gradients.