1 #include "caffe2/operators/local_response_normalization_op.h" 6 bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
10 DCHECK_EQ(X.dim(), 4);
11 const int N = X.dim32(0);
12 const int C = X.dim32(1);
13 const int H = X.dim32(2);
14 const int W = X.dim32(3);
15 const int image_size = C * H * W;
16 const float* Xdata = X.data<
float>();
17 auto* Y = Output(0, X.sizes(), at::dtype<float>());
18 float* Ydata = Y->template mutable_data<float>();
  if (OutputSize() > 1) {
    scale_ = Output(1, X.sizes(), at::dtype<float>());
  } else {
    if (!scale_) {
      scale_ = &local_scale_tensor_;
    }
    scale_->ResizeLike(X);
  }
  float* scale_data = scale_->template mutable_data<float>();
  math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
  Tensor padded_square(vector<int64_t>{C + size_ - 1, H, W}, CPU);
  float* padded_square_data = padded_square.template mutable_data<float>();
  math::Set<float, CPUContext>(
      padded_square.numel(), 0., padded_square_data, &context_);
  const float alpha_over_size = alpha_ / size_;
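  // Scale computation: scale = bias_ + alpha_over_size * (sum of squared
  // inputs over the local channel window). Instead of re-summing the window
  // for every channel, the loop below keeps a sliding-window sum per image:
  // channel 0's scale is seeded from the first size_ slices of the
  // zero-padded squares, and every later channel reuses the previous
  // channel's scale, adding the slice that enters the window and subtracting
  // the slice that leaves it.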
  // go through the images
  for (int n = 0; n < N; ++n) {
    // compute the padded square
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_square_data + pre_pad_ * H * W,
                                 &context_);
    // create the first channel scale
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_square_data + c * H * W,
          scale_data + image_size * n, &context_);
    }
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // copy the previous channel's scale
      context_.CopyFromCPU<float>(
          H * W, this_scale_slice - H * W, this_scale_slice);
      // add the head of the window
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_square_data + (c + size_ - 1) * H * W,
          this_scale_slice, &context_);
      // subtract the tail of the window
      math::Axpy<float, CPUContext>(
          H * W, -alpha_over_size, padded_square_data + (c - 1) * H * W,
          this_scale_slice, &context_);
    }
  }
  // Y = X * scale^{-beta}
  math::Powx<float, CPUContext>(
      X.numel(), scale_data, -beta_, Ydata, &context_);
  math::Mul<float, CPUContext>(X.numel(), Ydata, Xdata, Ydata, &context_);
  return true;
}

template <>
bool LRNOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);

  DCHECK_EQ(X.dim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  auto* Y = Output(0, X.sizes(), at::dtype<float>());
  float* Ydata = Y->template mutable_data<float>();
  if (OutputSize() > 1) {
    scale_ = Output(1, X.sizes(), at::dtype<float>());
  } else {
    if (!scale_) {
      scale_ = &local_scale_tensor_;
    }
    scale_->ResizeLike(X);
  }
  float* scale_data = scale_->template mutable_data<float>();
  Tensor padded_square(vector<int64_t>(1, C + size_ - 1), CPU);
  float* padded_square_data = padded_square.template mutable_data<float>();
  math::Set<float, CPUContext>(
      padded_square.numel(), 0., padded_square_data, &context_);
  const float alpha_over_size = alpha_ / size_;
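  // In NHWC layout the channels of one (n, h, w) position are contiguous, so
  // the per-channel scale can be computed with a single running scalar per
  // row: square the row into the zero-padded buffer (already multiplied by
  // alpha_over_size), seed the sum with the first size_ - 1 entries, then for
  // each channel add the entry entering the window and subtract the one that
  // leaves it.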
  for (int n = 0; n < num_rows; ++n) {
    for (int c = 0; c < C; ++c) {
      padded_square_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_square_data[i];
    }
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_square_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_square_data[c];
    }
  }
  math::Powx<float, CPUContext>(
      X.numel(), scale_data, -beta_, Ydata, &context_);
  math::Mul<float, CPUContext>(X.numel(), Ydata, Xdata, Ydata, &context_);
  return true;
}

template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);

  DCHECK_EQ(X.dim(), 4);
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);
  const int image_size = C * H * W;
  // Loosely check the sizes; the shapes are assumed to match if the sizes do.
  DCHECK_EQ(X.numel(), Y.numel());
  DCHECK_EQ(X.numel(), dY.numel());
  auto* dX = Output(0, X.sizes(), at::dtype<float>());

  const float* Xdata = X.data<float>();
  const float* Ydata = Y.data<float>();
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  float* scale_data = scale_->template mutable_data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->template mutable_data<float>();
  Tensor padded_ratio(vector<int64_t>{C + size_ - 1, H, W}, CPU);
  float* padded_ratio_data = padded_ratio.template mutable_data<float>();
  math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(
      padded_ratio.numel(), 0., padded_ratio_data, &context_);
  const float alpha_over_size = alpha_ / size_;
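  // The gradient needs the same scale tensor as the forward pass, so it is
  // recomputed here with the identical sliding-window scheme used in
  // RunOnDeviceWithOrderNCHW, reusing padded_ratio as scratch space for the
  // squared inputs.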
  for (int n = 0; n < N; ++n) {
    math::Sqr<float, CPUContext>(image_size, Xdata + image_size * n,
                                 padded_ratio_data + pre_pad_ * H * W,
                                 &context_);
    for (int c = 0; c < size_; ++c) {
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + c * H * W,
          scale_data + image_size * n, &context_);
    }
    for (int c = 1; c < C; ++c) {
      float* this_scale_slice = scale_data + n * image_size + c * H * W;
      // copy the previous channel's scale
      context_.CopyFromCPU<float>(
          H * W, this_scale_slice - H * W, this_scale_slice);
      // add the head of the window
      math::Axpy<float, CPUContext>(
          H * W, alpha_over_size, padded_ratio_data + (c + size_ - 1) * H * W,
          this_scale_slice, &context_);
      // subtract the tail of the window
      math::Axpy<float, CPUContext>(
          H * W, -alpha_over_size, padded_ratio_data + (c - 1) * H * W,
          this_scale_slice, &context_);
    }
  }
  math::Set<float, CPUContext>(
      padded_ratio.numel(), 0., padded_ratio_data, &context_);
  Tensor accum_ratio(vector<int64_t>{H, W}, CPU);
  float* accum_ratio_data = accum_ratio.template mutable_data<float>();
  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const int inverse_pre_pad = size_ - (size_ + 1) / 2;
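  // Backward pass. With y_c = x_c * s_c^{-beta} and
  // s_c = bias_ + alpha_over_size * (sum of x^2 over the window), the
  // gradient with respect to the input is
  //   dx_c = dy_c * s_c^{-beta}
  //          - (2 * alpha_ * beta_ / size_) * x_c *
  //            sum over channels c' whose window contains c of
  //            (dy_c' * y_c' / s_c').
  // cache_ratio holds the constant factor, padded_ratio holds the
  // dy * y / s terms (zero-padded so the sliding-window accumulation can be
  // reused), and inverse_pre_pad positions each image's ratios inside the
  // padded buffer.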
  int offset = 0;
  for (int n = 0; n < N; ++n) {
    // compute dY * Y / scale for this image
    math::Mul<float, CPUContext>(
        image_size, dYdata + offset, Ydata + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    math::Div<float, CPUContext>(
        image_size, padded_ratio_data + inverse_pre_pad * H * W,
        scale_data + offset,
        padded_ratio_data + inverse_pre_pad * H * W, &context_);
    // accumulate the ratios over the window and compute the input gradient
    math::Set<float, CPUContext>(
        accum_ratio.numel(), 0., accum_ratio_data, &context_);
    for (int c = 0; c < size_ - 1; ++c) {
      math::Axpy<float, CPUContext>(H * W, 1,
                                    padded_ratio_data + c * H * W,
                                    accum_ratio_data, &context_);
    }
    for (int c = 0; c < C; ++c) {
      for (int hw = 0; hw < H * W; ++hw) {
        accum_ratio_data[hw] += padded_ratio_data[(c + size_ - 1) * H * W + hw];
        dXdata[offset] =
            dYdata[offset] * pow(scale_data[offset], -beta_) -
            cache_ratio * accum_ratio_data[hw] * Xdata[offset];
        accum_ratio_data[hw] -= padded_ratio_data[c * H * W + hw];
        ++offset;
      }
    }
  }
  return true;
}

template <>
bool LRNGradientOp<float, CPUContext>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);

  DCHECK_EQ(X.dim(), 4);
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const int num_rows = N * H * W;
  const float* Xdata = X.data<float>();
  // Loosely check the sizes; the shapes are assumed to match if the sizes do.
  DCHECK_EQ(X.numel(), Y.numel());
  DCHECK_EQ(X.numel(), dY.numel());
  auto* dX = Output(0, X.sizes(), at::dtype<float>());
  if (!scale_) {
    scale_ = &local_scale_tensor_;
  }
  scale_->ResizeLike(X);
  Tensor padded_ratio(vector<int64_t>(1, C + size_ - 1), CPU);
  float* padded_ratio_data = padded_ratio.template mutable_data<float>();
  float* scale_data = scale_->template mutable_data<float>();
  // recompute the scale tensor, as in the forward NHWC pass
  math::Set<float, CPUContext>(X.numel(), bias_, scale_data, &context_);
  math::Set<float, CPUContext>(
      padded_ratio.numel(), 0., padded_ratio_data, &context_);
  const float alpha_over_size = alpha_ / size_;
  for (int n = 0; n < num_rows; ++n) {
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Xdata[n * C + c] * Xdata[n * C + c] * alpha_over_size;
    }
    float accum_scale = 0.;
    for (int i = 0; i < size_ - 1; ++i) {
      accum_scale += padded_ratio_data[i];
    }
    for (int c = 0; c < C; ++c) {
      accum_scale += padded_ratio_data[c + size_ - 1];
      scale_data[n * C + c] = bias_ + accum_scale;
      accum_scale -= padded_ratio_data[c];
    }
  }
  math::Set<float, CPUContext>(
      padded_ratio.numel(), 0., padded_ratio_data, &context_);
  const float cache_ratio = 2. * alpha_ * beta_ / size_;
  const float* Ydata = Y.data<float>();
  const float* dYdata = dY.data<float>();
  float* dXdata = dX->template mutable_data<float>();
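  // Same gradient formula as the NCHW path:
  //   dx_c = dy_c * s_c^{-beta}
  //          - cache_ratio * x_c * (sum of dy * y / s over the channels
  //            whose window contains c),
  // evaluated with a per-row running sum over the zero-padded ratio buffer.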
  for (int n = 0; n < num_rows; ++n) {
    const int offset = n * C;
    for (int c = 0; c < C; ++c) {
      padded_ratio_data[c + pre_pad_] =
          Ydata[offset + c] * dYdata[offset + c] / scale_data[offset + c];
    }
    float accum_ratio = 0.;
    for (int c = 0; c < size_ - 1; ++c) {
      accum_ratio += padded_ratio_data[c];
    }
    for (int c = 0; c < C; ++c) {
      accum_ratio += padded_ratio_data[c + size_ - 1];
      dXdata[offset + c] =
          dYdata[offset + c] * pow(scale_data[offset + c], -beta_) -
          cache_ratio * Xdata[offset + c] * accum_ratio;
      accum_ratio -= padded_ratio_data[c];
    }
  }
  return true;
}

REGISTER_CPU_OPERATOR(LRN, LRNOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(LRNGradient, LRNGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(LRN)
    .NumInputs(1)
    .NumOutputs(1, 2)
    .SetDoc(R"DOC(

`LRN` applies Local Response Normalization to an input blob. This operation
performs a kind of "lateral inhibition" by normalizing over local input
regions, where normalization is applied across channels. This operator is
typically used to normalize an unbounded activation (such as ReLU). The output
shape is the same as the input shape. The `brew` module has a wrapper for this
operator for use in a `ModelHelper` object.

The formula for LRN is as follows:

$$b_{c} = a_{c}(bias + \frac{\alpha}{n}\sum_{c'=max(0,c-n/2)}^{min(N-1,c+n/2)} a_{c'}^2 )^{-\beta}$$

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/local_response_normalization_op.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator("LRN",
     ["X"],
     ["Y", "Y_scale"],
     size=11,
     alpha=0.001,
     beta=0.5,
     bias=2.0,
     order="NHWC"
)

workspace.FeedBlob("X", np.random.randn(1, 6, 6, 1).astype(np.float32)) # NCHW
print("X:\n", workspace.FetchBlob("X"), "\n")
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
print("Y_scale:\n", workspace.FetchBlob("Y_scale"))

```

</details>

)DOC")
    .Arg(
        "size",
        "*(type: int; default: 0)* Amount of neighboring channels to sum over for normalization.")
492 "*(type: float; default: 0)* Multiplicative (scaling) factor.")
493 .Arg(
"beta",
"*(type: float; default: 0)* Exponent.")
494 .Arg(
"bias",
"*(type: float; default: 1.0)* Additive factor.")
495 .Arg(
"order",
"*(type: float; default: 'NCHW')* Order of blob dimensions.")
496 .Input(0,
"X",
"*(type: Tensor`<float>`)* Input data tensor (ReLU output).")
497 .Output(0,
"Y",
"*(type: Tensor`<float>`)* Output tensor.")
498 .Output(1,
"Y_scale",
"*(type: Tensor`<float>`)* Output scale.")
499 .InheritOnnxSchema();
OPERATOR_SCHEMA(LRNGradient).NumInputs(3).NumOutputs(1);

class GetLRNGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "LRNGradient",
        "",
        vector<string>{I(0), O(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(LRN, GetLRNGradient);

} // namespace caffe2