Caffe2 - C++ API
A deep learning, cross platform ML framework
generate_proposals_op.cc
1 #include "caffe2/operators/generate_proposals_op.h"
2 #include "caffe2/operators/generate_proposals_op_util_boxes.h"
3 #include "generate_proposals_op_util_nms.h"
4 
5 #ifdef CAFFE2_USE_MKL
6 #include "caffe2/mkl/operators/operator_fallback_mkl.h"
7 #endif // CAFFE2_USE_MKL
8 
9 namespace caffe2 {
10 
11 namespace {
12 
13 // Compute the 1-d index of a n-dimensional contiguous row-major tensor for
14 // a given n-dimensional index 'index'
15 size_t ComputeStartIndex(
16  const TensorCPU& tensor,
17  const std::vector<int>& index) {
18  DCHECK_EQ(index.size(), tensor.ndim());
19 
20  size_t ret = 0;
21  for (int i = 0; i < index.size(); i++) {
22  ret += index[i] * tensor.size_from_dim(i + 1);
23  }
24 
25  return ret;
26 }
27 
28 // Get a sub tensor view from 'tensor' using data pointer from 'tensor'
29 template <class T>
30 utils::ConstTensorView<T> GetSubTensorView(
31  const TensorCPU& tensor,
32  int dim0_start_index) {
33  DCHECK_EQ(tensor.meta().itemsize(), sizeof(T));
34 
35  if (tensor.size() == 0) {
36  return utils::ConstTensorView<T>(nullptr, {});
37  }
38 
39  std::vector<int> start_dims(tensor.ndim(), 0);
40  start_dims.at(0) = dim0_start_index;
41  auto st_idx = ComputeStartIndex(tensor, start_dims);
42  auto ptr = tensor.data<T>() + st_idx;
43 
44  auto& input_dims = tensor.dims();
45  std::vector<int> ret_dims(input_dims.begin() + 1, input_dims.end());
46 
47  utils::ConstTensorView<T> ret(ptr, ret_dims);
48  return ret;
49 }
50 
51 } // namespace
52 
53 namespace utils {
54 
55 ERMatXf ComputeAllAnchors(
56  const TensorCPU& anchors,
57  int height,
58  int width,
59  float feat_stride) {
60  const auto K = height * width;
61  const auto A = anchors.dim(0);
62 
63  ERMatXf shift_x = (ERVecXf::LinSpaced(width, 0.0, width - 1.0) * feat_stride)
64  .replicate(height, 1);
65  ERMatXf shift_y = (EVecXf::LinSpaced(height, 0.0, height - 1.0) * feat_stride)
66  .replicate(1, width);
67  Eigen::MatrixXf shifts(K, 4);
68  shifts << ConstEigenVectorMap<float>(shift_x.data(), shift_x.size()),
69  ConstEigenVectorMap<float>(shift_y.data(), shift_y.size()),
70  ConstEigenVectorMap<float>(shift_x.data(), shift_x.size()),
71  ConstEigenVectorMap<float>(shift_y.data(), shift_y.size());
72 
73  // Broacast anchors over shifts to enumerate all anchors at all positions
74  // in the (H, W) grid:
75  // - add A anchors of shape (1, A, 4) to
76  // - K shifts of shape (K, 1, 4) to get
77  // - all shifted anchors of shape (K, A, 4)
78  // - reshape to (K*A, 4) shifted anchors
79  ConstEigenMatrixMap<float> anchors_vec(
80  anchors.template data<float>(), 1, A * 4);
81  // equivalent to python code
82  // all_anchors = (
83  // self._model.anchors.reshape((1, A, 4)) +
84  // shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
85  // all_anchors = all_anchors.reshape((K * A, 4))
86  // all_anchors_vec: (K, A * 4)
87  ERMatXf all_anchors_vec =
88  anchors_vec.replicate(K, 1) + shifts.rowwise().replicate(A);
89 
90  // use the following to reshape to (K * A, 4)
91  // Eigen::Map<const ERMatXf> all_anchors(all_anchors_vec.data(), K * A, 4);
92 
93  return all_anchors_vec;
94 }
95 
96 } // namespace utils
97 
98 template <>
99 void GenerateProposalsOp<CPUContext>::ProposalsForOneImage(
100  const Eigen::Array3f& im_info,
101  const Eigen::Map<const ERMatXf>& all_anchors,
102  const utils::ConstTensorView<float>& bbox_deltas_tensor,
103  const utils::ConstTensorView<float>& scores_tensor,
104  ERArrXXf* out_boxes,
105  EArrXf* out_probs) const {
106  const auto& pre_nms_topN = rpn_pre_nms_topN_;
107  const auto& post_nms_topN = rpn_post_nms_topN_;
108  const auto& nms_thresh = rpn_nms_thresh_;
109  const auto& min_size = rpn_min_size_;
110 
111  // Transpose and reshape predicted bbox transformations to get them
112  // into the same order as the anchors:
113  // - bbox deltas will be (4 * A, H, W) format from conv output
114  // - transpose to (H, W, 4 * A)
115  // - reshape to (H * W * A, 4) where rows are ordered by (H, W, A)
116  // in slowest to fastest order to match the enumerated anchors
117  CAFFE_ENFORCE_EQ(bbox_deltas_tensor.ndim(), 3);
118  CAFFE_ENFORCE_EQ(bbox_deltas_tensor.dim(0) % 4, 0);
119  auto A = bbox_deltas_tensor.dim(0) / 4;
120  auto H = bbox_deltas_tensor.dim(1);
121  auto W = bbox_deltas_tensor.dim(2);
122  // equivalent to python code
123  // bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape((-1, 4))
124  ERArrXXf bbox_deltas(H * W * A, 4);
125  Eigen::Map<ERMatXf>(bbox_deltas.data(), H * W, 4 * A) =
126  Eigen::Map<const ERMatXf>(bbox_deltas_tensor.data(), A * 4, H * W)
127  .transpose();
128  CAFFE_ENFORCE_EQ(bbox_deltas.rows(), all_anchors.rows());
129 
130  // - scores are (A, H, W) format from conv output
131  // - transpose to (H, W, A)
132  // - reshape to (H * W * A, 1) where rows are ordered by (H, W, A)
133  // to match the order of anchors and bbox_deltas
134  CAFFE_ENFORCE_EQ(scores_tensor.ndim(), 3);
135  CAFFE_ENFORCE_EQ(scores_tensor.dims(), (vector<int>{A, H, W}));
136  // equivalent to python code
137  // scores = scores.transpose((1, 2, 0)).reshape((-1, 1))
138  EArrXf scores(scores_tensor.size());
139  Eigen::Map<ERMatXf>(scores.data(), H * W, A) =
140  Eigen::Map<const ERMatXf>(scores_tensor.data(), A, H * W).transpose();
141 
142  // Transform anchors into proposals via bbox transformations
143  static const std::vector<float> bbox_weights{1.0, 1.0, 1.0, 1.0};
144  auto proposals = utils::bbox_transform(
145  all_anchors.array(),
146  bbox_deltas,
147  bbox_weights,
148  utils::BBOX_XFORM_CLIP_DEFAULT,
149  correct_transform_coords_);
150 
151  // 2. clip proposals to image (may result in proposals with zero area
152  // that will be removed in the next step)
153  proposals = utils::clip_boxes(proposals, im_info[0], im_info[1]);
154 
155  // 3. remove predicted boxes with either height or width < min_size
156  auto keep = utils::filter_boxes(proposals, min_size, im_info);
157  DCHECK_LE(keep.size(), scores.size());
158 
159  // 4. sort all (proposal, score) pairs by score from highest to lowest
160  // 5. take top pre_nms_topN (e.g. 6000)
161  std::sort(keep.begin(), keep.end(), [&scores](int lhs, int rhs) {
162  return scores[lhs] > scores[rhs];
163  });
164 
165  if (pre_nms_topN > 0 && pre_nms_topN < keep.size()) {
166  keep.resize(pre_nms_topN);
167  }
168 
169  // 6. apply loose nms (e.g. threshold = 0.7)
170  // 7. take after_nms_topN (e.g. 300)
171  // 8. return the top proposals (-> RoIs top)
172  if (post_nms_topN > 0 && post_nms_topN < keep.size()) {
173  keep = utils::nms_cpu(proposals, scores, keep, nms_thresh, post_nms_topN);
174  } else {
175  keep = utils::nms_cpu(proposals, scores, keep, nms_thresh);
176  }
177 
178  // Generate outputs
179  utils::GetSubArrayRows(proposals, utils::AsEArrXt(keep), out_boxes);
180  utils::GetSubArray(scores, utils::AsEArrXt(keep), out_probs);
181 }
182 
183 template <>
184 bool GenerateProposalsOp<CPUContext>::RunOnDevice() {
185  const auto& scores = Input(0);
186  const auto& bbox_deltas = Input(1);
187  const auto& im_info_tensor = Input(2);
188  const auto& anchors = Input(3);
189  auto* out_rois = Output(0);
190  auto* out_rois_probs = Output(1);
191 
192  CAFFE_ENFORCE_EQ(scores.ndim(), 4, scores.ndim());
193  CAFFE_ENFORCE(scores.template IsType<float>(), scores.meta().name());
194  const auto num_images = scores.dim(0);
195  const auto A = scores.dim(1);
196  const auto height = scores.dim(2);
197  const auto width = scores.dim(3);
198  const auto K = height * width;
199 
200  // bbox_deltas: (num_images, A * 4, H, W)
201  CAFFE_ENFORCE_EQ(
202  bbox_deltas.dims(), (vector<TIndex>{num_images, 4 * A, height, width}));
203 
204  // im_info_tensor: (num_images, 3), format [height, width, scale; ...]
205  CAFFE_ENFORCE_EQ(im_info_tensor.dims(), (vector<TIndex>{num_images, 3}));
206  CAFFE_ENFORCE(
207  im_info_tensor.template IsType<float>(), im_info_tensor.meta().name());
208 
209  // anchors: (A, 4)
210  CAFFE_ENFORCE_EQ(anchors.dims(), (vector<TIndex>{A, 4}));
211  CAFFE_ENFORCE(anchors.template IsType<float>(), anchors.meta().name());
212 
213  // Broadcast the anchors to all pixels
214  auto all_anchors_vec =
215  utils::ComputeAllAnchors(anchors, height, width, feat_stride_);
216  Eigen::Map<const ERMatXf> all_anchors(all_anchors_vec.data(), K * A, 4);
217 
218  Eigen::Map<const ERArrXXf> im_info(
219  im_info_tensor.data<float>(),
220  im_info_tensor.dim(0),
221  im_info_tensor.dim(1));
222 
223  const int roi_col_count = 5;
224  out_rois->Resize(0, roi_col_count);
225  out_rois_probs->Resize(0);
226 
227  // Use openmp for acceleration?
228  for (int i = 0; i < num_images; i++) {
229  auto cur_im_info = im_info.row(i);
230  auto cur_bbox_deltas = GetSubTensorView<float>(bbox_deltas, i);
231  auto cur_scores = GetSubTensorView<float>(scores, i);
232 
233  ERArrXXf im_i_boxes;
234  EArrXf im_i_probs;
235  ProposalsForOneImage(
236  cur_im_info,
237  all_anchors,
238  cur_bbox_deltas,
239  cur_scores,
240  &im_i_boxes,
241  &im_i_probs);
242 
243  int csz = im_i_boxes.rows();
244  int cur_start_idx = out_rois->dim(0);
245 
246  out_rois->Extend(csz, 50, &context_);
247  out_rois_probs->Extend(csz, 50, &context_);
248 
249  // write rois
250  Eigen::Map<ERArrXXf> cur_rois(
251  out_rois->mutable_data<float>() + cur_start_idx * roi_col_count,
252  csz,
253  5);
254  cur_rois.col(0).setConstant(i);
255  cur_rois.block(0, 1, csz, 4) = im_i_boxes;
256 
257  // write rois_probs
258  Eigen::Map<EArrXf>(
259  out_rois_probs->mutable_data<float>() + cur_start_idx, csz) =
260  im_i_probs;
261  }
262 
263  return true;
264 }
265 
266 namespace {
267 
268 REGISTER_CPU_OPERATOR(GenerateProposals, GenerateProposalsOp<CPUContext>);
269 // For backward compatibility
270 REGISTER_CPU_OPERATOR(GenerateProposalsCPP, GenerateProposalsOp<CPUContext>);
271 
272 #ifdef CAFFE2_HAS_MKL_DNN
273 REGISTER_MKL_OPERATOR(
274  GenerateProposals,
275  mkl::MKLFallbackOp<GenerateProposalsOp<CPUContext>>);
276 // For backward compatibility
277 REGISTER_MKL_OPERATOR(
278  GenerateProposalsCPP,
279  mkl::MKLFallbackOp<GenerateProposalsOp<CPUContext>>);
280 #endif // CAFFE2_HAS_MKL_DNN
281 
282 OPERATOR_SCHEMA(GenerateProposals)
283  .NumInputs(4)
284  .NumOutputs(2)
285  .SetDoc(R"DOC(
286 Generate bounding box proposals for Faster RCNN. The propoasls are generated for
287 a list of images based on image score 'score', bounding box regression result
288 'deltas' as well as predefined bounding box shapes 'anchors'. Greedy
289 non-maximum suppression is applied to generate the final bounding boxes.
290 )DOC")
291  .Arg("spatial_scale", "(float) spatial scale")
292  .Arg("pre_nms_topN", "(int) RPN_PRE_NMS_TOP_N")
293  .Arg("post_nms_topN", "(int) RPN_POST_NMS_TOP_N")
294  .Arg("nms_thresh", "(float) RPN_NMS_THRESH")
295  .Arg("min_size", "(float) RPN_MIN_SIZE")
296  .Input(0, "scores", "Scores from conv layer, size (img_count, A, H, W)")
297  .Input(
298  1,
299  "bbox_deltas",
300  "Bounding box deltas from conv layer, "
301  "size (img_count, 4 * A, H, W)")
302  .Input(
303  2,
304  "im_info",
305  "Image info, size (img_count, 3), "
306  "format (height, width, scale)")
307  .Input(3, "anchors", "Bounding box anchors, size (A, 4)")
308  .Output(
309  0,
310  "rois",
311  "Proposals, size (n x 5), "
312  "format (image_index, x1, y1, x2, y2)")
313  .Output(1, "rois_probs", "scores of proposals, size (n)");
314 // For backward compatibility
315 OPERATOR_SCHEMA(GenerateProposalsCPP).NumInputs(4).NumOutputs(2);
316 
317 SHOULD_NOT_DO_GRADIENT(GenerateProposals);
318 // For backward compatibility
319 SHOULD_NOT_DO_GRADIENT(GenerateProposalsCPP);
320 
321 } // namespace
322 } // namespace caffe2
Copyright (c) 2016-present, Facebook, Inc.