// Caffe2 - C++ API
// A deep learning, cross-platform ML framework.
// instance_norm_op.cc
17 #include "caffe2/operators/instance_norm_op.h"
18 
19 namespace caffe2 {
20 
// Here live two separate implementations of the forward pass of
// instance normalization, one for NHWC order and the other for NCHW order.
// Two implementations allow us to make use of Eigen vectorized operations
// without an expensive tensor transpose operation.
25 
26 template <typename T, typename Context>
27 bool InstanceNormOp<T, Context>::RunOnDeviceWithOrderNHWC() {
28  const auto& X = Input(INPUT);
29  auto* Y = Output(OUTPUT);
30  CAFFE_ENFORCE(Y != &X, "Can't run InstanceNorm NHWC in-place");
31  auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_;
32  auto* inv_stdev = OutputSize() > 1 ? Output(INV_STDEV) : &inv_stdev_;
33  const int N = X.dim32(0);
34  const int H = X.dim32(1);
35  const int W = X.dim32(2);
36  const int C = X.dim32(3);
37  const size_t offset = H * W * C;
38 
39  CAFFE_ENFORCE_EQ(Input(SCALE).size(), C);
40  CAFFE_ENFORCE_EQ(Input(BIAS).size(), C);
41 
42  Y->ResizeLike(X);
43  mean->Resize(N, C);
44  inv_stdev->Resize(N, C);
45  ConstEigenVectorArrayMap<T> scale(Input(SCALE).template data<T>(), C);
46  ConstEigenVectorArrayMap<T> bias(Input(BIAS).template data<T>(), C);
47  for (int n = 0; n < N; ++n) {
48  ConstEigenArrayMap<T> Xmat(X.template data<T>() + offset * n, C, H * W);
49  EigenArrayMap<T> Ymat(Y->template mutable_data<T>() + offset * n, C, H * W);
50  EigenVectorArrayMap<T> mean_arr(
51  mean->template mutable_data<T>() + n * C, C);
52  EigenVectorArrayMap<T> inv_stdev_arr(
53  inv_stdev->template mutable_data<T>() + n * C, C);
54 
55  // The following effectively does the row wise mean computation:
56  // mean_arr = Xmat.rowwise().mean();
57  // but manually vectorizes over columns.
58  mean_arr = Xmat.col(0);
59  for (int i = 1; i < H * W; ++i) {
60  mean_arr += Xmat.col(i);
61  }
62  mean_arr *= 1. / (H * W);
63  Ymat = Xmat.colwise() - mean_arr;
64  // The following effectively does row wise squared norm computation,
65  // but manually vectorizes over columns similar to the mean case.
66  inv_stdev_arr = Ymat.col(0) * Ymat.col(0);
67  for (int i = 1; i < H * W; ++i) {
68  inv_stdev_arr += Ymat.col(i) * Ymat.col(i);
69  }
70  inv_stdev_arr = (inv_stdev_arr / (H * W) + epsilon_).sqrt().inverse();
71  Ymat = (Ymat.colwise() * (inv_stdev_arr * scale)).colwise() + bias;
72  }
73  return true;
74 }
75 
76 template <typename T, typename Context>
77 bool InstanceNormOp<T, Context>::RunOnDeviceWithOrderNCHW() {
78  const auto& X = Input(INPUT);
79  const auto& scale = Input(SCALE);
80  const auto& bias = Input(BIAS);
81  auto* Y = Output(OUTPUT);
82  auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_;
83  auto* inv_stdev = OutputSize() > 1 ? Output(INV_STDEV) : &inv_stdev_;
84  const int N = X.dim32(0);
85  const int C = X.dim32(1);
86  const int H = X.dim32(2);
87  const int W = X.dim32(3);
88 
89  CAFFE_ENFORCE_EQ(scale.size(), C);
90  CAFFE_ENFORCE_EQ(bias.size(), C);
91 
92  Y->ResizeLike(X);
93  mean->Resize(N, C);
94  inv_stdev->Resize(N, C);
95 
96  const auto* Xdata = X.template data<T>();
97  auto* Ydata = Y->template mutable_data<T>();
98  const auto* scale_data = scale.template data<T>();
99  const auto* bias_data = bias.template data<T>();
100  auto* mean_data = mean->template mutable_data<T>();
101  auto* inv_stdev_data = inv_stdev->template mutable_data<T>();
102 
103  // TODO: benchmark parallelization strategies.
104  for (auto i = 0; i < N * C; ++i) {
105  ConstEigenVectorArrayMap<T> Xi(Xdata + H * W * i, H * W);
106  const T Xi_mean = Xi.mean();
107  const T squared_norm = (Xi - Xi_mean).matrix().squaredNorm();
108  const T inv_stdev = 1.0 / std::sqrt(squared_norm / (H * W) + epsilon_);
109  mean_data[i] = Xi_mean;
110  inv_stdev_data[i] = inv_stdev;
111  EigenVectorArrayMap<T> Yi(Ydata + H * W * i, H * W);
112  const T channel_scale = inv_stdev * scale_data[i % C];
113  const T channel_shift = bias_data[i % C] - Xi_mean * channel_scale;
114  Yi = Xi * channel_scale + channel_shift;
115  }
116 
117  return true;
118 }
119 
// Register the CPU implementation (float only) under the operator name
// "InstanceNorm".
REGISTER_CPU_OPERATOR(InstanceNorm, InstanceNormOp<float, CPUContext>);

// Schema: 3 inputs (input, scale, bias), 1-3 outputs (output plus optional
// saved mean / saved inverse-stdev for the gradient pass). In-place
// operation on input 0 is allowed (NCHW only; the NHWC kernel enforces
// out-of-place at runtime).
OPERATOR_SCHEMA(InstanceNorm)
    .NumInputs(3)
    .NumOutputs(1, 3)
    .AllowInplace({{0, 0}})
    .SetDoc(R"DOC(
Carries out instance normalization as described in the paper
https://arxiv.org/abs/1607.08022. Depending on the mode it is being run,
there are multiple cases for the number of outputs, which we list below:

  * Output case #1: output
  * Output case #2: output, saved_mean
    - don't use, doesn't make sense but won't crash
  * Output case #3: output, saved_mean, saved_inv_stdev
    - Makes sense for training only

For training mode, type 3 is faster in the sense that for the backward
pass, it is able to reuse the saved mean and inv_stdev in the gradient
computation.
)DOC")
    .Arg("epsilon", "The epsilon value to use to avoid division by zero.")
    .Arg("order", "A StorageOrder string.")
    .Input(
        0,
        "input",
        "The input 4-dimensional tensor of shape NCHW or NHWC depending "
        "on the order parameter.")
    .Input(1, "scale", "The input 1-dimensional scale tensor of size C.")
    .Input(2, "bias", "The input 1-dimensional bias tensor of size C.")
    .Output(
        0,
        "output",
        "The output 4-dimensional tensor of the same shape as input.")
    .Output(
        1,
        "saved_mean",
        "Optional saved mean used during training to speed up gradient "
        "computation. Should not be used for testing.")
    .Output(
        2,
        "saved_inv_stdev",
        "Optional saved inverse stdev used during training to speed up "
        "gradient computation. Should not be used for testing.");
164 
165 } // namespace caffe2
// Copyright (c) 2016-present, Facebook, Inc.