Caffe2 - C++ API
A deep learning, cross platform ML framework
int8_roi_align_op.h
1 #ifndef CAFFE2_OPERATORS_INT8_ROI_ALIGN_OP_H_
2 #define CAFFE2_OPERATORS_INT8_ROI_ALIGN_OP_H_
3 
4 #include "caffe2/core/common.h"
5 #include "caffe2/core/context.h"
6 #include "caffe2/core/logging.h"
7 #include "caffe2/core/operator.h"
8 #include "caffe2/core/operator_schema.h"
9 #include "caffe2/core/tensor_int8.h"
10 #include "caffe2/operators/quantized/int8_utils.h"
11 #include "caffe2/utils/math.h"
12 
13 namespace caffe2 {
14 
15 namespace int8 {
16 
17 namespace {
18 
/**
 * One pre-computed bilinear-interpolation sample.
 *
 * pos1..pos4 are flattened (row * width + column) indices of the four
 * neighbouring feature-map pixels: (y_low, x_low), (y_low, x_high),
 * (y_high, x_low) and (y_high, x_high) respectively.
 * w1..w4 are the matching interpolation weights, scaled by 255 and rounded
 * so that they fit in a uint8_t.
 *
 * All members default to zero, so a default-constructed PreCalc describes an
 * "empty" sample (index 0, weight 0) instead of holding indeterminate values.
 */
struct PreCalc {
  int pos1 = 0;
  int pos2 = 0;
  int pos3 = 0;
  int pos4 = 0;
  uint8_t w1 = 0;
  uint8_t w2 = 0;
  uint8_t w3 = 0;
  uint8_t w4 = 0;
};
29 
30 void pre_calc_for_bilinear_interpolate(
31  const int height,
32  const int width,
33  const int pooled_height,
34  const int pooled_width,
35  const int iy_upper,
36  const int ix_upper,
37  float roi_start_h,
38  float roi_start_w,
39  float bin_size_h,
40  float bin_size_w,
41  int roi_bin_grid_h,
42  int roi_bin_grid_w,
43  std::vector<PreCalc>& pre_calc) {
44  int pre_calc_index = 0;
45  // boltnn use a smaller multiplier here. Sometimes w will shrink to 0.
46  const float w_multiplier = 255.0;
47  for (int ph = 0; ph < pooled_height; ph++) {
48  for (int pw = 0; pw < pooled_width; pw++) {
49  for (int iy = 0; iy < iy_upper; iy++) {
50  const float yy = roi_start_h + ph * bin_size_h +
51  static_cast<float>(iy + .5f) * bin_size_h /
52  static_cast<float>(roi_bin_grid_h); // e.g., 0.5, 1.5
53  for (int ix = 0; ix < ix_upper; ix++) {
54  const float xx = roi_start_w + pw * bin_size_w +
55  static_cast<float>(ix + .5f) * bin_size_w /
56  static_cast<float>(roi_bin_grid_w);
57 
58  float x = xx;
59  float y = yy;
60  // deal with: inverse elements are out of feature map boundary
61  if (y < -1.0 || y > height || x < -1.0 || x > width) {
62  // empty
63  PreCalc pc;
64  pc.pos1 = 0;
65  pc.pos2 = 0;
66  pc.pos3 = 0;
67  pc.pos4 = 0;
68  pc.w1 = 0;
69  pc.w2 = 0;
70  pc.w3 = 0;
71  pc.w4 = 0;
72  pre_calc[pre_calc_index] = pc;
73  pre_calc_index += 1;
74  continue;
75  }
76 
77  if (y <= 0) {
78  y = 0;
79  }
80  if (x <= 0) {
81  x = 0;
82  }
83 
84  int y_low = (int)y;
85  int x_low = (int)x;
86  int y_high;
87  int x_high;
88 
89  if (y_low >= height - 1) {
90  y_high = y_low = height - 1;
91  y = (float)y_low;
92  } else {
93  y_high = y_low + 1;
94  }
95 
96  if (x_low >= width - 1) {
97  x_high = x_low = width - 1;
98  x = (float)x_low;
99  } else {
100  x_high = x_low + 1;
101  }
102 
103  float ly = y - y_low;
104  float lx = x - x_low;
105  float hy = 1. - ly, hx = 1. - lx;
106  // w are not necessary 1
107  uint8_t w1 = static_cast<uint8_t>(Round(hy * hx * w_multiplier));
108  uint8_t w2 = static_cast<uint8_t>(Round(hy * lx * w_multiplier));
109  uint8_t w3 = static_cast<uint8_t>(Round(ly * hx * w_multiplier));
110  uint8_t w4 = static_cast<uint8_t>(Round(ly * lx * w_multiplier));
111 
112  // save weights and indeces
113  PreCalc pc;
114  pc.pos1 = y_low * width + x_low;
115  pc.pos2 = y_low * width + x_high;
116  pc.pos3 = y_high * width + x_low;
117  pc.pos4 = y_high * width + x_high;
118 
119  pc.w1 = w1;
120  pc.w2 = w2;
121  pc.w3 = w3;
122  pc.w4 = w4;
123  pre_calc[pre_calc_index] = pc;
124 
125  pre_calc_index += 1;
126  }
127  }
128  }
129  }
130 }
131 
132 void ROIAlignForward(
133  const int nthreads,
134  const uint8_t* bottom_data,
135  const float& spatial_scale,
136  const int channels,
137  const int height,
138  const int width,
139  const int pooled_height,
140  const int pooled_width,
141  const int sampling_ratio,
142  const float* bottom_rois,
143  int roi_cols,
144  uint8_t* top_data,
145  const float x_scale,
146  const float y_scale,
147  const int32_t x_offset,
148  const int32_t y_offset,
149  StorageOrder order) {
150  DCHECK(roi_cols == 4 || roi_cols == 5);
151 
152  int n_rois = nthreads / channels / pooled_width / pooled_height;
153 
154  for (int n = 0; n < n_rois; n++) {
155  int index_n = n * channels * pooled_width * pooled_height;
156 
157  // roi could have 4 or 5 columns
158  const float* offset_bottom_rois = bottom_rois + n * roi_cols;
159  int roi_batch_ind = 0;
160  if (roi_cols == 5) {
161  roi_batch_ind = offset_bottom_rois[0];
162  offset_bottom_rois++;
163  }
164 
165  // Do not using rounding; this implementation detail is critical
166  float roi_start_w = offset_bottom_rois[0] * spatial_scale;
167  float roi_start_h = offset_bottom_rois[1] * spatial_scale;
168  float roi_end_w = offset_bottom_rois[2] * spatial_scale;
169  float roi_end_h = offset_bottom_rois[3] * spatial_scale;
170 
171  // Force malformed ROIs to be 1x1
172  float roi_width = std::max(roi_end_w - roi_start_w, (float)1.);
173  float roi_height = std::max(roi_end_h - roi_start_h, (float)1.);
174  float bin_size_h =
175  static_cast<float>(roi_height) / static_cast<float>(pooled_height);
176  float bin_size_w =
177  static_cast<float>(roi_width) / static_cast<float>(pooled_width);
178 
179  // We use roi_bin_grid to sample the grid and mimic integral
180  int roi_bin_grid_h = (sampling_ratio > 0)
181  ? sampling_ratio
182  : ceil(roi_height / pooled_height); // e.g., = 2
183  int roi_bin_grid_w =
184  (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
185 
186  // We do average (integral) pooling inside a bin
187  const float count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
188 
189  // calculate multiplier
190  double real_multiplier = x_scale / (y_scale * 255.0 * count);
191  int32_t Y_multiplier;
192  int Y_shift;
193  QuantizeMultiplierSmallerThanOne(real_multiplier, &Y_multiplier, &Y_shift);
194 
195  // we want to precalculate indeces and weights shared by all chanels,
196  // this is the key point of optimiation
197  std::vector<PreCalc> pre_calc(
198  roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
199  pre_calc_for_bilinear_interpolate(
200  height,
201  width,
202  pooled_height,
203  pooled_width,
204  roi_bin_grid_h,
205  roi_bin_grid_w,
206  roi_start_h,
207  roi_start_w,
208  bin_size_h,
209  bin_size_w,
210  roi_bin_grid_h,
211  roi_bin_grid_w,
212  pre_calc);
213 
214  const uint8_t* offset_bottom_data =
215  bottom_data + roi_batch_ind * channels * height * width;
216  int pre_calc_index = 0;
217  for (int ph = 0; ph < pooled_height; ph++) {
218  for (int pw = 0; pw < pooled_width; pw++) {
219  vector<int32_t> acc_buffer(channels, 0);
220 
221  for (int iy = 0; iy < roi_bin_grid_h; iy++) {
222  for (int ix = 0; ix < roi_bin_grid_w; ix++) {
223  PreCalc pc = pre_calc[pre_calc_index];
224 
225  const uint8_t* data_1 = offset_bottom_data + channels * pc.pos1;
226  const uint8_t* data_2 = offset_bottom_data + channels * pc.pos2;
227  const uint8_t* data_3 = offset_bottom_data + channels * pc.pos3;
228  const uint8_t* data_4 = offset_bottom_data + channels * pc.pos4;
229  for (int c = 0; c < channels; ++c) {
230  acc_buffer[c] += (uint32_t)(pc.w1) * (uint32_t)(data_1[c]);
231  acc_buffer[c] += (uint32_t)(pc.w2) * (uint32_t)(data_2[c]);
232  acc_buffer[c] += (uint32_t)(pc.w3) * (uint32_t)(data_3[c]);
233  acc_buffer[c] += (uint32_t)(pc.w4) * (uint32_t)(data_4[c]);
234 
235  // w_1..4 are all multiplied by 255.0
236  acc_buffer[c] -= x_offset * 255.0;
237  }
238 
239  pre_calc_index += 1;
240  }
241  }
242  int index_nhw = index_n + (ph * pooled_width + pw) * channels;
243  uint8_t* out_ptr = top_data + index_nhw;
244  for (int c = 0; c < channels; ++c) {
245  int32_t a_mul = MultiplyByQuantizedMultiplierSmallerThanOne(
246  acc_buffer[c], Y_multiplier, Y_shift) +
247  y_offset;
248  int32_t clamped_a =
249  std::min<int32_t>(255, std::max<int32_t>(0, a_mul));
250  out_ptr[c] = static_cast<uint8_t>(clamped_a);
251  }
252  } // for pw
253  } // for ph
254  } // for n
255 }
256 
257 } // namespace
258 
259 class Int8RoIAlignOp final : public Operator<CPUContext> {
260  public:
261  template <class... Args>
262  explicit Int8RoIAlignOp(Args&&... args)
263  : Operator<CPUContext>(std::forward<Args>(args)...),
264  order_(StringToStorageOrder(
265  this->template GetSingleArgument<string>("order", "NHWC"))),
266  spatial_scale_(
267  this->template GetSingleArgument<float>("spatial_scale", 1.)),
268  pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
269  pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)),
270  sampling_ratio_(
271  this->template GetSingleArgument<int>("sampling_ratio", -1)) {
272  DCHECK_GT(spatial_scale_, 0);
273  DCHECK_GT(pooled_height_, 0);
274  DCHECK_GT(pooled_width_, 0);
275  DCHECK_GE(sampling_ratio_, 0);
276  // only supports NHWC
277  CAFFE_ENFORCE(order_ == StorageOrder::NHWC);
278  }
279 
280  bool RunOnDevice() override {
281  const auto& X = Inputs()[0]->template Get<Int8TensorCPU>(); // Input, NHWC
282  auto& R = Input(1); // RoIs
283  auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>(); // RoI pooled
284  // calculate multiplier
285  int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
286  auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
287  Y->scale = Y_scale;
288  Y->zero_point = Y_offset;
289 
290  if (R.numel() == 0) {
291  // Handle empty rois
292  Y->t.Resize(0, pooled_height_, pooled_width_, X.t.dim32(3));
293  // The following mutable_data calls are needed to allocate the tensors
294  Y->t.mutable_data<uint8_t>();
295  return true;
296  }
297 
298  CAFFE_ENFORCE_EQ(R.dim(), 2);
299  // if R has 5 columns, the first column is the index, otherwise 0
300  CAFFE_ENFORCE(R.dim32(1) == 4 || R.dim32(1) == 5);
301 
302  assert(sampling_ratio_ >= 0);
303 
304  // only supports NHWC now
306  &Y->t,
307  {R.dim32(0), pooled_height_, pooled_width_, X.t.dim32(3)},
308  at::dtype<uint8_t>().device(CPU));
309  int output_size = Y->t.numel();
310 
311  ROIAlignForward(
312  output_size,
313  X.t.data<uint8_t>(),
314  spatial_scale_,
315  X.t.dim32(3),
316  X.t.dim32(1),
317  X.t.dim32(2),
318  pooled_height_,
319  pooled_width_,
320  sampling_ratio_,
321  R.data<float>(),
322  R.dim32(1),
323  Y->t.mutable_data<uint8_t>(),
324  X.scale,
325  Y_scale,
326  X.zero_point,
327  Y_offset,
328  order_);
329 
330  return true;
331  }
332 
333  protected:
334  StorageOrder order_;
335  float spatial_scale_;
336  int pooled_height_;
337  int pooled_width_;
338  int sampling_ratio_;
339 };
340 
341 } // namespace int8
342 
343 } // namespace caffe2
344 
345 #endif // CAFFE2_OPERATORS_INT8_ROI_ALIGN_OP_H_
void ReinitializeTensor(Tensor *tensor, at::IntArrayRef dims, at::TensorOptions options)
Reinitialize a Tensor to given dims and options if necessary, note that this will not do anything if ...
Definition: tensor.cc:127
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13