Caffe2 - C++ API
A deep learning, cross-platform ML framework
resize_op.cc
#include "caffe2/operators/resize_op.h"

#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/math.h"

#ifdef CAFFE2_USE_MKLDNN
#include "caffe2/ideep/operators/operator_fallback_ideep.h"
#include "caffe2/ideep/utils/ideep_operator.h"
#endif

namespace caffe2 {

void resizeNearestNCHW2x(
    int batch_size,
    int num_channels,
    int input_height,
    int input_width,
    const float* input,
    float* output) {
  const int output_height = input_height * 2;
  const int output_width = input_width * 2;
  for (int n = 0; n < batch_size; ++n) {
    for (int c = 0; c < num_channels; ++c) {
      for (int y = 0; y < output_height; ++y) {
        const int in_y = y / 2;

#if defined(__ARM_NEON__) || defined(__ARM_NEON)
        int vecW = (input_width / 4) * 4; // round down
        int x = 0;
        for (; x < vecW; x += 4) {
          // load 0 1 2 3
          float32x4_t v = vld1q_f32(input + in_y * input_width + x);
          const int oidx = output_width * y + x * 2;
          float32x4x2_t v2 = {{v, v}};
          // store 00 11 22 33
          vst2q_f32(output + oidx + 0, v2);
        }

        // handle remainder
        for (; x < input_width; ++x) {
          const float v = input[in_y * input_width + x];
          const int oidx = output_width * y + x * 2;
          output[oidx + 0] = v;
          output[oidx + 1] = v;
        }
#else
        for (int x = 0; x < input_width; ++x) {
          const float v = input[in_y * input_width + x];
          const int oidx = output_width * y + x * 2;
          output[oidx + 0] = v;
          output[oidx + 1] = v;
        }
#endif
      }
      input += input_height * input_width;
      output += output_height * output_width;
    }
  }
}
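
// Illustrative example (editorial note, not part of the original source):
// for a single 2x3 plane, resizeNearestNCHW2x replicates every source
// pixel into a 2x2 block of the 4x6 output:
//
//   input            output
//   [1 2 3]   ->     [1 1 2 2 3 3]
//   [4 5 6]          [1 1 2 2 3 3]
//                    [4 4 5 5 6 6]
//                    [4 4 5 5 6 6]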

template <>
bool ResizeNearestOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  const auto& X = Input(0);

  const int batch_size = X.dim32(0),
            num_channels = X.dim32(1),
            input_height = X.dim32(2),
            input_width = X.dim32(3);
  if (InputSize() == 2) {
    const auto& scales = Input(1);
    CAFFE_ENFORCE_EQ(scales.dim(), 1);
    CAFFE_ENFORCE_EQ(scales.numel(), 2);
    const float* scales_data = scales.data<float>();
    height_scale_ = scales_data[0];
    width_scale_ = scales_data[1];
  }

  int output_width = input_width * width_scale_;
  int output_height = input_height * height_scale_;
  auto* Y = Output(
      0,
      {batch_size, num_channels, output_height, output_width},
      at::dtype<float>());

  const float* Xdata = X.data<float>();
  float* Ydata = Y->template mutable_data<float>();

  // Specialized implementation for fast 2x upsampling
  if (width_scale_ == 2.0 && height_scale_ == 2.0) {
    resizeNearestNCHW2x(
        batch_size, num_channels, input_height, input_width, Xdata, Ydata);
    return true;
  }

  for (int n = 0; n < batch_size; ++n) {
    for (int c = 0; c < num_channels; ++c) {
      for (int y = 0; y < output_height; ++y) {
        const int in_y = std::min((int)(y / height_scale_), (input_height - 1));
        for (int x = 0; x < output_width; ++x) {
          const int in_x = std::min((int)(x / width_scale_), (input_width - 1));
          Ydata[output_width * y + x] = Xdata[input_width * in_y + in_x];
        }
      }
      Xdata += input_height * input_width;
      Ydata += output_width * output_height;
    }
  }

  return true;
}
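
// Illustrative example (editorial note, not part of the original source):
// with width_scale_ = 1.5 and input_width = 4, output_width = 6 and the
// source column for each output column x is
//   in_x = min((int)(x / 1.5), 3)  ->  0, 0, 1, 2, 2, 3   for x = 0..5,
// so each source pixel is repeated once or twice along the width.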

template <>
bool ResizeNearestOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  const auto& X = Input(0);

  const int batch_size = X.dim32(0), input_height = X.dim32(1),
            input_width = X.dim32(2), num_channels = X.dim32(3);
  if (InputSize() == 2) {
    const auto& scales = Input(1);
    CAFFE_ENFORCE_EQ(scales.dim(), 1);
    CAFFE_ENFORCE_EQ(scales.numel(), 2);
    const float* scales_data = scales.data<float>();
    height_scale_ = scales_data[0];
    width_scale_ = scales_data[1];
  }

  int output_width = input_width * width_scale_;
  int output_height = input_height * height_scale_;

  const int output_width_stride = output_width * num_channels;
  const int input_width_stride = input_width * num_channels;

  auto* Y = Output(
      0,
      {batch_size, output_height, output_width, num_channels},
      at::dtype<float>());

  const float* Xdata = X.data<float>();
  float* Ydata = Y->template mutable_data<float>();

  for (int n = 0; n < batch_size; ++n) {
    for (int y = 0; y < output_height; ++y) {
      const int in_y = std::min((int)(y / height_scale_), (input_height - 1));
      for (int x = 0; x < output_width; ++x) {
        const int in_x = std::min((int)(x / width_scale_), (input_width - 1));
        std::memcpy(
            &Ydata[output_width_stride * y + num_channels * x],
            &Xdata[input_width_stride * in_y + num_channels * in_x],
            num_channels * sizeof(float));
      }
    }
    Xdata += input_height * input_width_stride;
    Ydata += output_height * output_width_stride;
  }

  return true;
}
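
// Editorial note: in NHWC layout the channel values of a pixel are
// contiguous in memory, so the inner loop above copies all num_channels
// floats of an output pixel with a single std::memcpy instead of looping
// over channels as the NCHW path does.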

template <>
bool ResizeNearestOp<float, CPUContext>::RunOnDevice() {
  switch (order_) {
    case StorageOrder::NHWC:
      return RunOnDeviceWithOrderNHWC();
    case StorageOrder::NCHW:
      return RunOnDeviceWithOrderNCHW();
    default:
      CAFFE_THROW("Unknown Storage order: ", order_);
  }
}

template <>
bool ResizeNearestGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  const auto& dY = Input(0);
  const auto& X = Input(1);

  const auto inputDims = dY.sizes();
  CAFFE_ENFORCE_EQ(4, inputDims.size());
  const int batch_size = dY.dim32(0),
            num_channels = dY.dim32(1),
            input_height = dY.dim32(2),
            input_width = dY.dim32(3);
  const int output_height = X.dim32(2);
  const int output_width = X.dim32(3);
  if (InputSize() == 3) {
    const auto& scales = Input(2);
    CAFFE_ENFORCE_EQ(scales.dim(), 1);
    CAFFE_ENFORCE_EQ(scales.numel(), 2);
    const float* scales_data = scales.data<float>();
    height_scale_ = scales_data[0];
    width_scale_ = scales_data[1];
  }
  auto* dX = Output(
      0,
      {batch_size, num_channels, output_height, output_width},
      at::dtype<float>());
  math::Set<float, CPUContext>(
      dX->numel(), 0.0f, dX->template mutable_data<float>(), &context_);

  const float* dYdata = dY.data<float>();
  float* dXdata = dX->template mutable_data<float>();

  for (int n = 0; n < batch_size; ++n) {
    for (int c = 0; c < num_channels; ++c) {
      for (int y = 0; y < input_height; ++y) {
        const int out_y =
            std::min((int)(y / height_scale_), (output_height - 1));
        for (int x = 0; x < input_width; ++x) {
          const int out_x =
              std::min((int)(x / width_scale_), (output_width - 1));
          dXdata[output_width * out_y + out_x] += dYdata[input_width * y + x];
        }
      }
      dYdata += input_height * input_width;
      dXdata += output_height * output_width;
    }
  }

  return true;
}
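
// Editorial note: the gradient is a scatter-add. Here "input_*" refers to
// the spatial size of dY (the gradient of the upsampled output) and
// "output_*" to that of dX (the gradient w.r.t. X), so each dY element is
// accumulated into the dX cell its forward output was sampled from; with
// 2x upsampling, each dX element receives the sum of a 2x2 block of dY.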

template <>
bool ResizeNearestGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  const auto& dY = Input(0);
  const auto& X = Input(1);

  const auto inputDims = dY.sizes();
  CAFFE_ENFORCE_EQ(4, inputDims.size());
  const int batch_size = dY.dim32(0), input_height = dY.dim32(1),
            input_width = dY.dim32(2), num_channels = dY.dim32(3);
  const int output_height = X.dim32(1);
  const int output_width = X.dim32(2);
  if (InputSize() == 3) {
    const auto& scales = Input(2);
    CAFFE_ENFORCE_EQ(scales.dim(), 1);
    CAFFE_ENFORCE_EQ(scales.numel(), 2);
    const float* scales_data = scales.data<float>();
    height_scale_ = scales_data[0];
    width_scale_ = scales_data[1];
  }
  auto* dX = Output(
      0,
      {batch_size, output_height, output_width, num_channels},
      at::dtype<float>());
  math::Set<float, CPUContext>(
      dX->numel(), 0.0f, dX->template mutable_data<float>(), &context_);

  const int output_width_stride = output_width * num_channels;
  const int input_width_stride = input_width * num_channels;

  const float* dYdata = dY.data<float>();
  float* dXdata = dX->template mutable_data<float>();

  for (int n = 0; n < batch_size; ++n) {
    for (int y = 0; y < input_height; ++y) {
      const int out_y = std::min((int)(y / height_scale_), (output_height - 1));
      for (int x = 0; x < input_width; ++x) {
        const int out_x = std::min((int)(x / width_scale_), (output_width - 1));

        float* dXdata_c0 =
            dXdata + output_width_stride * out_y + num_channels * out_x;
        const float* dYdata_c0 =
            dYdata + input_width_stride * y + num_channels * x;

        for (int c = 0; c < num_channels; ++c) {
          dXdata_c0[c] += dYdata_c0[c];
        }
      }
    }
    dYdata += input_height * input_width_stride;
    dXdata += output_height * output_width_stride;
  }

  return true;
}

template <>
bool ResizeNearestGradientOp<float, CPUContext>::RunOnDevice() {
  switch (order_) {
    case StorageOrder::NHWC:
      return RunOnDeviceWithOrderNHWC();
    case StorageOrder::NCHW:
      return RunOnDeviceWithOrderNCHW();
    default:
      CAFFE_THROW("Unknown Storage order: ", order_);
  }
}
REGISTER_CPU_OPERATOR(ResizeNearest, ResizeNearestOp<float, CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    ResizeNearestGradient,
    ResizeNearestGradientOp<float, CPUContext>);

#ifdef CAFFE2_USE_MKLDNN
REGISTER_IDEEP_OPERATOR(
    ResizeNearest,
    IDEEPFallbackOp<ResizeNearestOp<float, CPUContext>>);
#endif

// Input: X, output: Y
OPERATOR_SCHEMA(ResizeNearest)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .Arg("width_scale", "Scale along width dimension")
    .Arg("height_scale", "Scale along height dimension")
    .SetDoc(R"DOC(
Resizes the spatial dimensions of the input using nearest neighbor
interpolation. The `width_scale` and `height_scale` arguments
control the size of the output, which is given by:
output_width = floor(input_width * width_scale)
output_height = floor(input_height * height_scale)
)DOC")
    .Input(0, "X", "Input tensor")
    .Input(
        1,
        "scales", // the hack to support onnx spec
        "1D, 2-element, Scales tensor, [height_scale, width_scale]")
    .Output(0, "Y", "Output tensor")
    .InheritOnnxSchema("Upsample");
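
// Illustrative example (editorial note, not part of the original source):
// a 1x3x20x30 NCHW input with height_scale = 1.5 and width_scale = 2
// produces an output of shape 1x3x30x60, since floor(20 * 1.5) = 30 and
// floor(30 * 2) = 60.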

// Input: dY, output: dX
GRADIENT_OPERATOR_SCHEMA(ResizeNearestGradient)
    .NumInputs(2, 3)
    .NumOutputs(1)
    .Arg("width_scale", "Scale along width dimension")
    .Arg("height_scale", "Scale along height dimension");

class GetResizeNearestGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    if (def_.input().size() == 2) {
      // this is a hack to support the second input as dynamic
      // width_scale and height_scale to align with onnx change
      return SingleGradientDef(
          "ResizeNearestGradient",
          "",
          vector<string>{GO(0), I(0), I(1)},
          vector<string>{GI(0)});
    }
    return SingleGradientDef(
        "ResizeNearestGradient",
        "",
        vector<string>{GO(0), I(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(ResizeNearest, GetResizeNearestGradient);

} // namespace caffe2