Caffe2 - C++ API
A deep learning, cross-platform ML framework
instance_norm_gradient_op.cc
// Copyright (c) 2016-present, Facebook, Inc.

#include "caffe2/operators/instance_norm_op.h"

namespace caffe2 {

template <typename T, typename Context>
bool InstanceNormGradientOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  const auto& input = Input(INPUT);
  const auto& scale = Input(SCALE);
  const auto& bias = Input(BIAS);
  const auto& output_grad = Input(OUTPUT_GRAD);
  const auto& mean = InputSize() >= 5 ? Input(MEAN) : mean_;
  const auto& inv_stdev = InputSize() >= 6 ? Input(INV_STDEV) : inv_stdev_;
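  // When the forward op saved its statistics, MEAN and INV_STDEV arrive as
  // extra inputs; otherwise the member buffers mean_ / inv_stdev_ serve as
  // scratch space and are recomputed below.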
  auto input_grad = Output(INPUT_GRAD);
  auto scale_grad = Output(SCALE_GRAD);
  auto bias_grad = Output(BIAS_GRAD);
  CAFFE_ENFORCE_EQ(4, input.ndim());
  const int N = input.dim32(0);
  const int H = input.dim32(1);
  const int W = input.dim32(2);
  const int C = input.dim32(3);
  CAFFE_ENFORCE_EQ(1, scale.ndim());
  CAFFE_ENFORCE_EQ(C, scale.dim32(0));
  CAFFE_ENFORCE_EQ(1, bias.ndim());
  CAFFE_ENFORCE_EQ(C, bias.dim32(0));
  CAFFE_ENFORCE_EQ(4, output_grad.ndim());
  CAFFE_ENFORCE_EQ(N, output_grad.dim32(0));
  CAFFE_ENFORCE_EQ(H, output_grad.dim32(1));
  CAFFE_ENFORCE_EQ(W, output_grad.dim32(2));
  CAFFE_ENFORCE_EQ(C, output_grad.dim32(3));
  input_grad->ResizeLike(input);
  scale_grad->ResizeLike(scale);
  bias_grad->ResizeLike(bias);

  ConstEigenVectorArrayMap<T> scale_arr(scale.template data<T>(), C);
  ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), C);
  EigenVectorArrayMap<T> scale_grad_arr(
      scale_grad->template mutable_data<T>(), C);
  EigenVectorArrayMap<T> bias_grad_arr(
      bias_grad->template mutable_data<T>(), C);

  // Zero the accumulators before the per-instance loop below;
  // mutable_data does not zero-initialize the buffer.
  scale_grad_arr.setZero();
  bias_grad_arr.setZero();

  // Resize before we get into the per-instance loop
  if (InputSize() < 5) {
    mean_.Resize(N, C);
  }
  if (InputSize() < 6) {
    inv_stdev_.Resize(N, C);
  }

  // looping over per-instance and using Eigen blocks to extract out
  // a chunk of channels
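  // Layout note: Eigen maps are column-major, so mapping one instance's NHWC
  // data as a C x (H * W) array puts each pixel's C contiguous channel
  // values in one column; rowwise() ops then reduce per channel.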
  for (int n = 0; n < N; ++n) {
    // All Eigen mats and arrs in here are per-instance.
    ConstEigenArrayMap<T> input_mat(
        input.template data<T>() + n * C * H * W, C, H * W);
    ConstEigenArrayMap<T> output_grad_mat(
        output_grad.template data<T>() + n * C * H * W, C, H * W);
    EigenArrayMap<T> input_grad_mat(
        input_grad->template mutable_data<T>() + n * C * H * W, C, H * W);

    // Compute mean if it wasn't passed in
    if (InputSize() < 5) {
      EigenVectorArrayMap<T> mean_mutable_arr(
          mean_.template mutable_data<T>() + n * C, C);
      mean_mutable_arr = input_mat.rowwise().mean();
    }
    CAFFE_ENFORCE_EQ(2, mean.ndim());
    CAFFE_ENFORCE_EQ(N, mean.dim32(0));
    CAFFE_ENFORCE_EQ(C, mean.dim32(1));
    ConstEigenVectorArrayMap<T> mean_arr(mean.template data<T>() + n * C, C);

    // subtract mean
    input_grad_mat = input_mat.colwise() - mean_arr;

    // Compute 1 / stdev if it wasn't passed in
    if (InputSize() < 6) {
      EigenVectorArrayMap<T> inv_stdev_mutable_arr(
          inv_stdev_.template mutable_data<T>() + n * C, C);

      // Square the diffs along each channel and take the mean to get var
      inv_stdev_mutable_arr = input_grad_mat.pow(2).rowwise().mean();
      // sqrt to get stdev and take the inverse
      inv_stdev_mutable_arr =
          (inv_stdev_mutable_arr + epsilon_).sqrt().inverse();
    }
    CAFFE_ENFORCE_EQ(2, inv_stdev.ndim());
    CAFFE_ENFORCE_EQ(N, inv_stdev.dim32(0));
    CAFFE_ENFORCE_EQ(C, inv_stdev.dim32(1));

    ConstEigenVectorArrayMap<T> inv_stdev_arr(
        inv_stdev.template data<T>() + n * C, C);

    // for each channel
    // dl/dbias = sum_j dl/dy_j
    bias_grad_arr += output_grad_mat.rowwise().sum();
    // for each channel
    // dl/dscale = sum_j dl/dy_j (x_j - mu) / stdev
    scale_grad_arr +=
        ((input_grad_mat.colwise() * inv_stdev_arr) * output_grad_mat)
            .rowwise()
            .sum();

    // dl/dx_j = this gross thing
    // Derived gradient and manually massaged it to minimize extra storage
    // and number of vectorized calls. Verified it with the autograd package
    // in python.
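    // Combining the steps below into one expression (mu and stdev are the
    // per-feature-map statistics, s is the per-channel scale, and mean_k
    // runs over the H * W locations):
    //   dl/dx_j = s / stdev * (dl/dy_j - mean_k(dl/dy_k)
    //             - (x_j - mu) / stdev^2 * mean_k(dl/dy_k * (x_k - mu)))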

    // a = -1/(HW) sum_j dl/dy_j * (x_j - mu) / stdev^3
    const auto temp = (inv_stdev_arr.pow(3) *
                       (input_grad_mat * output_grad_mat).rowwise().mean() *
                       -1).eval();
    // b_j = a * (x_j - mu)
    input_grad_mat.colwise() *= temp;

    // c_j = b_j + dl/dy_j / stdev
    input_grad_mat += output_grad_mat.colwise() * inv_stdev_arr;

    // dl/dx_j = s * (c_j - mean(c_j))
    const auto result_mean = input_grad_mat.rowwise().mean().eval();
    input_grad_mat.colwise() -= result_mean;
    input_grad_mat.colwise() *= scale_arr;
  }

  return true;
}

template <typename T, typename Context>
bool InstanceNormGradientOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const auto& input = Input(INPUT);
  const auto& scale = Input(SCALE);
  const auto& bias = Input(BIAS);
  const auto& output_grad = Input(OUTPUT_GRAD);
  const auto& mean = InputSize() >= 5 ? Input(MEAN) : mean_;
  const auto& inv_stdev = InputSize() >= 6 ? Input(INV_STDEV) : inv_stdev_;
  auto input_grad = Output(INPUT_GRAD);
  auto scale_grad = Output(SCALE_GRAD);
  auto bias_grad = Output(BIAS_GRAD);
  CAFFE_ENFORCE_EQ(4, input.ndim());
  const int N = input.dim32(0);
  const int C = input.dim32(1);
  const int H = input.dim32(2);
  const int W = input.dim32(3);
  CAFFE_ENFORCE_EQ(1, scale.ndim());
  CAFFE_ENFORCE_EQ(C, scale.dim32(0));
  CAFFE_ENFORCE_EQ(1, bias.ndim());
  CAFFE_ENFORCE_EQ(C, bias.dim32(0));
  CAFFE_ENFORCE_EQ(4, output_grad.ndim());
  CAFFE_ENFORCE_EQ(N, output_grad.dim32(0));
  CAFFE_ENFORCE_EQ(C, output_grad.dim32(1));
  CAFFE_ENFORCE_EQ(H, output_grad.dim32(2));
  CAFFE_ENFORCE_EQ(W, output_grad.dim32(3));
  input_grad->ResizeLike(input);
  scale_grad->ResizeLike(scale);
  bias_grad->ResizeLike(bias);

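  // Layout note: in NCHW each feature map is H * W contiguous values, so the
  // (H * W) x (N * C) column-major maps below hold one feature map per
  // column; colwise() ops then reduce per (n, c) pair.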
  ConstEigenArrayMap<T> input_mat(input.template data<T>(), H * W, N * C);
  ConstEigenVectorArrayMap<T> scale_arr(scale.template data<T>(), C);
  ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), C);
  ConstEigenArrayMap<T> output_grad_mat(
      output_grad.template data<T>(), H * W, N * C);

  EigenArrayMap<T> input_grad_mat(
      input_grad->template mutable_data<T>(), H * W, N * C);
  EigenVectorArrayMap<T> scale_grad_arr(
      scale_grad->template mutable_data<T>(), C);
  EigenVectorArrayMap<T> bias_grad_arr(
      bias_grad->template mutable_data<T>(), C);

  // Compute mean if it wasn't passed in
  if (InputSize() < 5) {
    mean_.Resize(N, C);
    EigenVectorArrayMap<T> mean_mutable_arr(
        mean_.template mutable_data<T>(), N * C);
    mean_mutable_arr = input_mat.colwise().mean();
  }
  CAFFE_ENFORCE_EQ(2, mean.ndim());
  CAFFE_ENFORCE_EQ(N, mean.dim32(0));
  CAFFE_ENFORCE_EQ(C, mean.dim32(1));
  ConstEigenVectorArrayMap<T> mean_arr(mean.template data<T>(), N * C);

  // subtract mean
  input_grad_mat = input_mat.rowwise() - mean_arr.transpose();

  // compute 1 / stdev if not passed in
  if (InputSize() < 6) {
    inv_stdev_.Resize(N, C);
    EigenVectorArrayMap<T> inv_stdev_mutable_arr(
        inv_stdev_.template mutable_data<T>(), N * C);

    // Square the diffs along each column and take mean to get var
    inv_stdev_mutable_arr = input_grad_mat.pow(2).colwise().mean();
    // sqrt to get stdev and then invert
    inv_stdev_mutable_arr = (inv_stdev_mutable_arr + epsilon_).sqrt().inverse();
  }
  CAFFE_ENFORCE_EQ(2, inv_stdev.ndim());
  CAFFE_ENFORCE_EQ(N, inv_stdev.dim32(0));
  CAFFE_ENFORCE_EQ(C, inv_stdev.dim32(1));

  ConstEigenVectorArrayMap<T> inv_stdev_arr(
      inv_stdev.template data<T>(), N * C);

  // See the comments in the NHWC version about these gradients. The scale and
  // bias grads are about the same, but the input grads no longer slice out
  // one instance at a time and instead vectorize across all N * C feature
  // maps.
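  // Each block(0, n * C, H * W, C) below selects the C feature-map columns of
  // instance n, so the colwise sums accumulate per-channel totals across all
  // instances.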

  // scale and bias gradients
  scale_grad_arr.setZero();
  bias_grad_arr.setZero();
  for (int n = 0; n < N; ++n) {
    scale_grad_arr += ((input_grad_mat.rowwise() * inv_stdev_arr.transpose()) *
                       output_grad_mat)
                          .block(0, n * C, H * W, C)
                          .colwise()
                          .sum();
    bias_grad_arr += output_grad_mat.block(0, n * C, H * W, C).colwise().sum();
  }

  // input gradient
  const auto temp = ((inv_stdev_arr.pow(3).transpose() *
                      (input_grad_mat * output_grad_mat).colwise().mean()) *
                     -1).eval();
  input_grad_mat.rowwise() *= temp;

  input_grad_mat += output_grad_mat.rowwise() * inv_stdev_arr.transpose();

  const auto result_mean = input_grad_mat.colwise().mean().eval();
  input_grad_mat.rowwise() -= result_mean;

  for (int n = 0; n < N; ++n) {
    input_grad_mat.block(0, n * C, H * W, C).rowwise() *= scale_arr.transpose();
  }

  return true;
}

class GetInstanceNormGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    vector<string> inputs{I(0), I(1), I(2), GO(0)};
    if (def_.output_size() >= 2) {
      inputs.push_back(O(1));
    }
    if (def_.output_size() >= 3) {
      inputs.push_back(O(2));
    }
    return SingleGradientDef(
        "InstanceNormGradient",
        "",
        inputs,
        vector<string>{GI(0), GI(1), GI(2)});
  }
};
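// The gradient op thus takes (input, scale, bias, dY) plus, when the forward
// op exported them, its saved mean (output 1) and inverse stdev (output 2);
// hence NumInputs(4, 6) in the schema below.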

REGISTER_CPU_OPERATOR(
    InstanceNormGradient,
    InstanceNormGradientOp<float, CPUContext>);

OPERATOR_SCHEMA(InstanceNormGradient).NumInputs(4, 6).NumOutputs(3);

REGISTER_GRADIENT(InstanceNorm, GetInstanceNormGradient);
} // namespace caffe2