Caffe2 - C++ API
A deep learning, cross-platform ML framework
local_response_normalization_op.cc
#include "caffe2/operators/local_response_normalization_op.h"

namespace caffe2 {

template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  // Note(Yangqing): this one is copied from my Caffe implementation.
  auto& X = Input(0);

  DCHECK_EQ(X.dim(), 4);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);
  const int image_size = C * H * W;
  const float* Xdata = X.data<float>();
  auto* Y = Output(0, X.sizes(), at::dtype<float>());
  float* Ydata = Y->template mutable_data<float>();

  if (OutputSize() > 1) {
    scale_ = Output(1);
  } else {
    if (!scale_) {
      scale_ = &local_scale_tensor_;
    }
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->template mutable_data<float>();
  math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
  Tensor padded_square(vector<int64_t>{C + size_ - 1, H, W}, CPU);
  float* padded_square_data = padded_square.template mutable_data<float>();
  math::Set<float, CPUContext>(
      padded_square.numel(), 0., padded_square_data, &context_);
  const float alpha_over_size = alpha_ / size_;
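  // Each scale value is bias_ + alpha_over_size * (sum of squared inputs over a
  // window of size_ channels); the loops below maintain that sum incrementally,
  // adding the new head channel and subtracting the old tail channel.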
  // go through the images
  for (int n = 0; n < N; ++n) {
    // compute the padded square
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_square_data + pre_pad_ * H * W,
                                 &context_);
    // Create the first channel scale
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_square_data + c * H * W,
          scale_data + image_size * n, &context_);
    }
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // copy previous scale
      context_.CopyFromCPU<float>(
          H * W, this_scale_slice - H * W, this_scale_slice);
      // add head
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_square_data + (c + size_ - 1) * H * W,
          this_scale_slice, &context_);
      // subtract tail
      math::Axpy<float, CPUContext>(
          H * W, -alpha_over_size, padded_square_data + (c - 1) * H * W,
          this_scale_slice, &context_);
    }
  }
  math::Powx<float, CPUContext>(
      X.numel(), scale_data, -beta_, Ydata, &context_);
  math::Mul<float, CPUContext>(X.numel(), Ydata, Xdata, Ydata, &context_);
  return true;
}

template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  // Note(Yangqing): This one is copied from my Decaf implementation. How many
  // variants have I written...?
  auto& X = Input(0);

  DCHECK_EQ(X.dim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  auto* Y = Output(0, X.sizes(), at::dtype<float>());
  float* Ydata = Y->template mutable_data<float>();

  if (OutputSize() > 1) {
    scale_ = Output(1);
  } else {
    if (!scale_) {
      scale_ = &local_scale_tensor_;
    }
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->template mutable_data<float>();

  Tensor padded_square(vector<int64_t>(1, C + size_ - 1), CPU);
  float* padded_square_data = padded_square.template mutable_data<float>();
  math::Set<float, CPUContext>(
      padded_square.numel(), 0., padded_square_data, &context_);
  const float alpha_over_size = alpha_ / size_;

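  // In NHWC layout each contiguous run of C values is one spatial position, so
  // the input is viewed as num_rows (= N * H * W) rows of C channels; for every
  // row, a running sum over the zero-padded, pre-scaled squares yields the scale.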
  for (int n = 0; n < num_rows; ++n) {
    for (int c = 0; c < C; ++c) {
      padded_square_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_square_data[i];
    }
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_square_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_square_data[c];
    }
  }
  math::Powx<float, CPUContext>(
      X.numel(), scale_data, -beta_, Ydata, &context_);
  math::Mul<float, CPUContext>(X.numel(), Ydata, Xdata, Ydata, &context_);
  return true;
}

template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);

  DCHECK_EQ(X.dim(), 4);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);
  const int image_size = C * H * W;
  // Loosely checking the size, assuming that the shapes will be the same as
  // long as the sizes check out.
  DCHECK_EQ(X.numel(), Y.numel());
  DCHECK_EQ(X.numel(), dY.numel());
  auto* dX = Output(0, X.sizes(), at::dtype<float>());

  const float* Xdata = X.data<float>();
  const float* Ydata = Y.data<float>();
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->template mutable_data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->template mutable_data<float>();

  Tensor padded_ratio(vector<int64_t>{C + size_ - 1, H, W}, CPU);
  float* padded_ratio_data = padded_ratio.template mutable_data<float>();
  // Compute scale (copied from LRNOp) - reusing padded_ratio
  math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(
      padded_ratio.numel(), 0., padded_ratio_data, &context_);
  const float alpha_over_size = alpha_ / size_;
  // go through the images
  for (int n = 0; n < N; ++n) {
    // compute the padded square
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_ratio_data + pre_pad_ * H * W,
                                 &context_);
    // Create the first channel scale
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + c * H * W,
          scale_data + image_size * n, &context_);
    }
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // copy previous scale
      context_.CopyFromCPU<float>(
          H * W, this_scale_slice - H * W, this_scale_slice);
      // add head
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + (c + size_ - 1) * H * W,
          this_scale_slice, &context_);
      // subtract tail
      math::Axpy<float, CPUContext>(
          H * W, -alpha_over_size, padded_ratio_data + (c - 1) * H * W,
          this_scale_slice, &context_);
    }
  }

  math::Set<float, CPUContext>(
      padded_ratio.numel(), 0., padded_ratio_data, &context_);
  Tensor accum_ratio(vector<int64_t>{H, W}, CPU);
  float* accum_ratio_data = accum_ratio.template mutable_data<float>();

  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const int inverse_pre_pad = size_ - (size_ + 1) / 2;

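  // Backward pass: for each element i,
  //   dX_i = dY_i * scale_i^(-beta_)
  //          - cache_ratio * X_i * sum_j(dY_j * Y_j / scale_j),
  // where the sum runs over the channels j whose normalization window contains
  // channel i; padded_ratio holds dY * Y / scale and accum_ratio the window sum.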
  int offset = 0;
  for (int n = 0; n < N; ++n) {
    // first, compute diff_i * y_i / s_i
    math::Mul<float, CPUContext>(
        image_size, dYdata + offset, Ydata + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    math::Div<float, CPUContext>(
        image_size, padded_ratio_data + inverse_pre_pad * H * W,
        scale_data + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    // Now, compute the accumulated ratios and the bottom diff
    math::Set<float, CPUContext>(
        accum_ratio.numel(), 0., accum_ratio_data, &context_);
    for (int c = 0; c < size_ - 1; ++c) {
      math::Axpy<float, CPUContext>(H * W, 1,
                                    padded_ratio_data + c * H * W,
                                    accum_ratio_data, &context_);
    }
    for (int c = 0; c < C; ++c) {
      for (int hw = 0; hw < H * W; ++hw) {
        accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
        dXdata[offset] =
            dYdata[offset] * pow(scale_data[offset], -beta_) -
            cache_ratio * accum_ratio_data[hw] * Xdata[offset];
        accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
        ++offset;
      }
    }
  }
  return true;
}

template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);

  DCHECK_EQ(X.dim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  // Loosely checking the size, assuming that the shapes will be the same as
  // long as the sizes check out.
  DCHECK_EQ(X.numel(), Y.numel());
  DCHECK_EQ(X.numel(), dY.numel());
  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  Tensor padded_ratio(vector<int64_t>(1, C + size_ - 1), CPU);
  float* padded_ratio_data = padded_ratio.template mutable_data<float>();
  float* scale_data = scale_->template mutable_data<float>();
  // Compute scale (copied from LRNOp) - reusing padded_ratio
  math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(
      padded_ratio.numel(), 0., padded_ratio_data, &context_);
  const float alpha_over_size = alpha_ / size_;

  for (int n = 0; n < num_rows; ++n) {
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_ratio_data[i];
    }
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_ratio_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_ratio_data[c];
    }
  }

  math::Set<float, CPUContext>(
      padded_ratio.numel(), 0., padded_ratio_data, &context_);
  // the ratio 2*alpha*beta/size
  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const float* Ydata = Y.data<float>();

  const float* dYdata = dY.data<float>();
  float* dXdata = dX->template mutable_data<float>();
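  // Same per-element gradient as the NCHW path, computed row by row:
  //   dX = dY * scale^(-beta_) - cache_ratio * X * (window sum of dY * Y / scale).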
  for (int n = 0; n < num_rows; ++n) {
    const int offset = n * C;
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
    }
    float accum_ratio = 0.;
    for (int c = 0; c < size_ - 1; ++c) {
      accum_ratio += padded_ratio_data[c];
    }
    for (int c = 0; c < C; ++c) {
      accum_ratio += padded_ratio_data[c + size_ - 1];
      dXdata[offset + c] =
          dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
          cache_ratio * Xdata[offset + c] * accum_ratio;
      accum_ratio -= padded_ratio_data[c];
    }
  }
  return true;
}

REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(LRN)
    .NumInputs(1)
    .NumOutputs(1, 2)
    .SetDoc(R"DOC(

`LRN` applies Local Response Normalization to an input blob. This operation performs
a kind of "lateral inhibition" by normalizing over local input regions, where
normalization is applied across channels. This operator is typically used to
normalize an unbounded activation (such as ReLU). The output shape is the same as
the input shape. The `brew` module has a wrapper for this operator for use in a
`ModelHelper` object.

The formula for LRN is as follows:

$$b_{c} = a_{c}\left(\text{bias} + \frac{\alpha}{n}\sum_{c'=\max(0,\,c-n/2)}^{\min(N-1,\,c+n/2)} a_{c'}^2\right)^{-\beta}$$

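The sketch below is a rough NumPy rendering of this formula for an NHWC tensor, meant only as a reference (the helper name `lrn_reference` is made up here, and for even `size` the operator's exact window offsets may differ slightly):

```
import numpy as np

def lrn_reference(X, size, alpha, beta, bias):
    # X is laid out NHWC; normalization runs over the channel (last) axis.
    # Channels outside [0, C-1] contribute nothing, mirroring zero padding.
    C = X.shape[-1]
    scale = np.full(X.shape, bias, dtype=X.dtype)
    for c in range(C):
        lo = max(0, c - size // 2)
        hi = min(C - 1, c + size // 2)
        scale[..., c] += (alpha / size) * np.sum(X[..., lo:hi + 1] ** 2, axis=-1)
    return X * scale ** (-beta), scale
```

With the single-channel input of the example below (`size=11`, `alpha=0.001`, `beta=0.5`, `bias=2.0`), this should approximately reproduce `Y` and `Y_scale`.
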
GitHub Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.cc


<details>

<summary> <b>Example</b> </summary>

**Code**

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()

op = core.CreateOperator("LRN",
    ["X"],
    ["Y", "Y_scale"],
    size=11,
    alpha=0.001,
    beta=0.5,
    bias=2.0,
    order="NHWC"
)

workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32))  # NHWC
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
print("Y_scale:\n", workspace.FetchBlob("Y_scale"))
```

**Result**

```
X:
 [[[[ 0.72985137]
   [-0.3753357 ]
   [ 2.7344604 ]
   [-0.5937792 ]
   [ 0.38440478]
   [-2.1659644 ]]

  [[-0.92846817]
   [-0.9996144 ]
   [ 0.212943 ]
   [-1.968045 ]
   [-0.77839696]
   [ 0.45492038]]

  [[-0.11263168]
   [ 1.9901097 ]
   [ 0.19275683]
   [ 0.15630436]
   [ 0.7536298 ]
   [-0.77339894]]

  [[ 0.8353551 ]
   [-0.7784452 ]
   [ 1.779317 ]
   [ 0.22421335]
   [ 1.3846219 ]
   [-3.0546608 ]]

  [[ 0.09977621]
   [ 2.2071757 ]
   [ 0.79971045]
   [ 3.563886 ]
   [-0.7169287 ]
   [ 0.77170426]]

  [[-1.4296649 ]
   [ 0.19181213]
   [ 0.45961624]
   [-1.0201577 ]
   [ 0.62854475]
   [-0.6395456 ]]]]

Y:
 [[[[ 0.5160766 ]
   [-0.26540157]
   [ 1.9332271 ]
   [-0.41986194]
   [ 0.27181432]
   [-1.5314047 ]]

  [[-0.6565133 ]
   [-0.7068181 ]
   [ 0.15057328]
   [-1.3914955 ]
   [-0.5504022 ]
   [ 0.32167578]]

  [[-0.0796426 ]
   [ 1.4070934 ]
   [ 0.13629955]
   [ 0.11052381]
   [ 0.53288984]
   [-0.5468682 ]]

  [[ 0.5906759 ]
   [-0.5504363 ]
   [ 1.2580767 ]
   [ 0.1585426 ]
   [ 0.9790328 ]
   [-2.1595135 ]]

  [[ 0.07055242]
   [ 1.5605361 ]
   [ 0.5654725 ]
   [ 2.5193207 ]
   [-0.50693923]
   [ 0.54567 ]]

  [[-1.0108787 ]
   [ 0.13563155]
   [ 0.3249962 ]
   [-0.72134334]
   [ 0.44444424]
   [-0.45222285]]]]
Y_scale:
 [[[[2.0000484]
   [2.0000129]
   [2.0006797]
   [2.000032 ]
   [2.0000134]
   [2.0004265]]

  [[2.0000784]
   [2.0000908]
   [2.000004 ]
   [2.0003521]
   [2.000055 ]
   [2.0000188]]

  [[2.0000012]
   [2.00036 ]
   [2.0000033]
   [2.0000021]
   [2.0000517]
   [2.0000544]]

  [[2.0000634]
   [2.000055 ]
   [2.0002878]
   [2.0000045]
   [2.0001743]
   [2.0008483]]

  [[2.000001 ]
   [2.000443 ]
   [2.0000582]
   [2.0011547]
   [2.0000467]
   [2.0000541]]

  [[2.0001857]
   [2.0000033]
   [2.0000193]
   [2.0000947]
   [2.000036 ]
   [2.0000372]]]]
```

</details>

)DOC")
    .Arg(
        "size",
        "*(type: int; default: 0)* Number of neighboring channels to sum over for normalization.")
    .Arg(
        "alpha",
        "*(type: float; default: 0)* Multiplicative (scaling) factor.")
    .Arg("beta", "*(type: float; default: 0)* Exponent.")
    .Arg("bias", "*(type: float; default: 1.0)* Additive factor.")
    .Arg("order", "*(type: string; default: 'NCHW')* Order of blob dimensions.")
    .Input(0, "X", "*(type: Tensor`<float>`)* Input data tensor (ReLU output).")
    .Output(0, "Y", "*(type: Tensor`<float>`)* Output tensor.")
    .Output(1, "Y_scale", "*(type: Tensor`<float>`)* Output scale.")
    .InheritOnnxSchema();
OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1);

class GetLRNGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "LRNGradient", "",
        vector<string>{I(0), O(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(LRN, GetLRNGradient);
} // namespace caffe2