Caffe2 - C++ API
A deep learning, cross-platform ML framework
local_response_normalization_op.cc
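For reference, both forward kernels below compute the same quantity; this note is editorial and not part of the original file. For every element x_i, with window(i) the set of size_ channels around the channel of element i at the same spatial location (the zero-initialized padded_square buffer supplies the zero padding at the channel boundaries, written starting at the pre_pad_ offset):

\[
  \mathrm{scale}_i = \mathrm{bias} + \frac{\alpha}{\mathrm{size}} \sum_{j \in \mathrm{window}(i)} x_j^2,
  \qquad
  y_i = x_i \cdot \mathrm{scale}_i^{-\beta}
\]

The scale tensor is either exposed as the optional second output or kept in the locally owned local_scale_tensor_.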
#include "caffe2/operators/local_response_normalization_op.h"

namespace caffe2 {

template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  // Note(Yangqing): this one is copied from my Caffe implementation.
  auto& X = Input(0);
  auto* Y = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);
  const int image_size = C * H * W;
  const float* Xdata = X.data<float>();
  Y->ResizeLike(X);
  float* Ydata = Y->mutable_data<float>();

  if (OutputSize() > 1) {
    scale_ = Output(1);
  } else {
    if (!scale_) {
      scale_ = &local_scale_tensor_;
    }
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->mutable_data<float>();
  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
  TensorCPU padded_square(
      vector<TIndex>{C + size_ - 1, H, W});
  float* padded_square_data = padded_square.mutable_data<float>();
  math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;
  // go through the images
  for (int n = 0; n < N; ++n) {
    // compute the padded square
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_square_data + pre_pad_ * H * W,
                                 &context_);
    // Create the first channel scale
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_square_data + c * H * W,
          scale_data + image_size * n, &context_);
    }
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // copy previous scale
      context_.Copy<float, CPUContext, CPUContext>(
          H * W, this_scale_slice - H * W, this_scale_slice);
      // add head
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_square_data + (c + size_ - 1) * H * W,
          this_scale_slice, &context_);
      // subtract tail
      math::Axpy<float, CPUContext>(
          H * W, -alpha_over_size, padded_square_data + (c - 1) * H * W,
          this_scale_slice, &context_);
    }
  }
  math::Powx<float, CPUContext>(
      X.size(), scale_data, -beta_, Ydata, &context_);
  math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &context_);
  return true;
}

template<>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  // Note(Yangqing): This one is copied from my Decaf implementation. How many
  // variants have I written...?
  auto& X = Input(0);
  auto* Y = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  Y->ResizeLike(X);
  float* Ydata = Y->mutable_data<float>();

  if (OutputSize() > 1) {
    scale_ = Output(1);
  } else {
    if (!scale_) {
      scale_ = &local_scale_tensor_;
    }
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->mutable_data<float>();

  TensorCPU padded_square(vector<TIndex>(1, C + size_ - 1));
  float* padded_square_data = padded_square.mutable_data<float>();
  math::Set<float, CPUContext>(padded_square.size(), 0., padded_square_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;

  for (int n = 0; n < num_rows; ++n) {
    for (int c = 0; c < C; ++c) {
      padded_square_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_square_data[i];
    }
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_square_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_square_data[c];
    }
  }
  math::Powx<float, CPUContext>(
      X.size(), scale_data, -beta_, Ydata, &context_);
  math::Mul<float, CPUContext>(X.size(), Ydata, Xdata, Ydata, &context_);
  return true;
}

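The two backward kernels that follow recompute scale exactly as above and then evaluate the gradient below; this note is again editorial and not part of the original file. Writing dL/dy for the incoming gradient dY:

\[
  \frac{\partial L}{\partial x_i}
  = \frac{\partial L}{\partial y_i} \cdot \mathrm{scale}_i^{-\beta}
  - \frac{2\alpha\beta}{\mathrm{size}} \, x_i
    \sum_{j \in \mathrm{window}(i)} \frac{\partial L}{\partial y_j} \cdot \frac{y_j}{\mathrm{scale}_j}
\]

The constant 2*alpha*beta/size appears in the code as cache_ratio, and the window sum is maintained incrementally in accum_ratio, mirroring the add-head/subtract-tail sliding sum used in the forward pass.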
template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);
  const int image_size = C * H * W;
  // Loosely checking the size, assuming that the shapes will be the same as
  // long as the sizes check out.
  DCHECK_EQ(X.size(), Y.size());
  DCHECK_EQ(X.size(), dY.size());
  dX->ResizeLike(X);

  const float* Xdata = X.data<float>();
  const float* Ydata = Y.data<float>();
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->mutable_data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();

  TensorCPU padded_ratio(
      vector<TIndex>{C + size_ - 1, H, W});
  float* padded_ratio_data = padded_ratio.mutable_data<float>();
  // Compute scale (copied from LRNOp) - reusing padded_ratio
  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;
  // go through the images
  for (int n = 0; n < N; ++n) {
    // compute the padded square
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_ratio_data + pre_pad_ * H * W,
                                 &context_);
    // Create the first channel scale
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + c * H * W,
          scale_data + image_size * n, &context_);
    }
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // copy previous scale
      context_.Copy<float, CPUContext, CPUContext>(
          H * W, this_scale_slice - H * W, this_scale_slice);
      // add head
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + (c + size_ - 1) * H * W,
          this_scale_slice, &context_);
      // subtract tail
      math::Axpy<float, CPUContext>(
          H * W, -alpha_over_size, padded_ratio_data + (c - 1) * H * W,
          this_scale_slice, &context_);
    }
  }

  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  TensorCPU accum_ratio(vector<TIndex>{H, W});
  float* accum_ratio_data = accum_ratio.mutable_data<float>();

  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const int inverse_pre_pad = size_ - (size_ + 1) / 2;

  int offset = 0;
  for (int n = 0; n < N; ++n) {
    // first, compute diff_i * y_i / s_i
    math::Mul<float, CPUContext>(
        image_size, dYdata + offset, Ydata + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    math::Div<float, CPUContext>(
        image_size, padded_ratio_data + inverse_pre_pad * H * W,
        scale_data + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    // Now, compute the accumulated ratios and the bottom diff
    math::Set<float, CPUContext>(accum_ratio.size(), 0., accum_ratio_data,
                                 &context_);
    for (int c = 0; c < size_ - 1; ++c) {
      math::Axpy<float, CPUContext>(H * W, 1,
                                    padded_ratio_data + c * H * W,
                                    accum_ratio_data, &context_);
    }
    for (int c = 0; c < C; ++c) {
      for (int hw = 0; hw < H * W; ++hw) {
        accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
        dXdata[offset] =
            dYdata[offset] * pow(scale_data[offset], -beta_) -
            cache_ratio * accum_ratio_data[hw] * Xdata[offset];
        accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
        ++offset;
      }
    }
  }
  return true;
}

template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);
  auto* dX = Output(0);
  DCHECK_EQ(X.ndim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  // Loosely checking the size, assuming that the shapes will be the same as
  // long as the sizes check out.
  DCHECK_EQ(X.size(), Y.size());
  DCHECK_EQ(X.size(), dY.size());
  dX->ResizeLike(X);
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  TensorCPU padded_ratio(vector<TIndex>(1, C + size_ - 1));
  float* padded_ratio_data = padded_ratio.mutable_data<float>();
  float* scale_data = scale_->mutable_data<float>();
  // Compute scale (copied from LRNOp) - reusing padded_ratio
  math::Set<float, CPUContext>(X.size(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  const float alpha_over_size = alpha_ / size_;

  for (int n = 0; n < num_rows; ++n) {
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_ratio_data[i];
    }
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_ratio_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_ratio_data[c];
    }
  }

  math::Set<float, CPUContext>(padded_ratio.size(), 0., padded_ratio_data,
                               &context_);
  // the ratio 2*alpha*beta/size
  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const float* Ydata = Y.data<float>();

  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  for (int n = 0; n < num_rows; ++n) {
    const int offset = n * C;
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
    }
    float accum_ratio = 0.;
    for (int c = 0; c < size_ - 1; ++c) {
      accum_ratio += padded_ratio_data[c];
    }
    for (int c = 0; c < C; ++c) {
      accum_ratio += padded_ratio_data[c + size_ - 1];
      dXdata[offset + c] =
          dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
          cache_ratio * Xdata[offset + c] * accum_ratio;
      accum_ratio -= padded_ratio_data[c];
    }
  }
  return true;
}

REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(LRN).NumInputs(1).NumOutputs(1, 2);
OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1);

class GetLRNGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "LRNGradient", "",
        vector<string>{I(0), O(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(LRN, GetLRNGradient);
}  // namespace caffe2
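As a usage illustration (editorial, not part of the file), the sketch below builds an OperatorDef for the registered "LRN" operator and runs it on a CPU workspace. It assumes the classic Caffe2 core headers and the MakeArgument helper from caffe2/utils/proto_utils.h; exact header paths and helper names can vary between Caffe2 versions, so treat them as assumptions rather than a definitive recipe.

// Editorial sketch: drive the LRN operator registered above from C++.
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"  // assumed location of MakeArgument

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::Workspace ws;

  // Feed an NCHW float tensor into the blob that will become input X.
  auto* X = ws.CreateBlob("X")->GetMutable<caffe2::TensorCPU>();
  X->Resize(1, 8, 4, 4);  // N, C, H, W
  float* Xdata = X->mutable_data<float>();
  for (int i = 0; i < X->size(); ++i) {
    Xdata[i] = 0.01f * i;
  }

  // Describe the operator; `size` is the channel window width.
  caffe2::OperatorDef def;
  def.set_type("LRN");
  def.add_input("X");
  def.add_output("Y");
  def.add_arg()->CopyFrom(caffe2::MakeArgument<int>("size", 5));
  def.add_arg()->CopyFrom(caffe2::MakeArgument<float>("alpha", 1e-4f));
  def.add_arg()->CopyFrom(caffe2::MakeArgument<float>("beta", 0.75f));
  def.add_arg()->CopyFrom(caffe2::MakeArgument<float>("bias", 1.0f));
  def.add_arg()->CopyFrom(caffe2::MakeArgument<std::string>("order", "NCHW"));

  // Instantiate and run; the normalized result lands in the "Y" blob.
  std::unique_ptr<caffe2::OperatorBase> op = caffe2::CreateOperator(def, &ws);
  op->Run();
  const auto& Y = ws.GetBlob("Y")->Get<caffe2::TensorCPU>();
  LOG(INFO) << "Y[0] = " << Y.data<float>()[0];
  return 0;
}

Adding a second output name to def (for example "scale") would expose the intermediate scale tensor as well, which is what the NumOutputs(1, 2) schema and the OutputSize() > 1 branch in the forward kernels allow.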