Caffe2 - C++ API
A deep learning, cross-platform ML framework
instance_norm_op.cc
#include "caffe2/operators/instance_norm_op.h"
#include "caffe2/utils/eigen_utils.h"

namespace caffe2 {

// Here live two separate implementations of the forward and backward passes
// of instance normalization, one for NHWC order and the other for NCHW order.
// Two implementations allow us to make use of Eigen vectorized operations
// without an expensive tensor transpose operation.

template <typename T, typename Context>
bool InstanceNormOp<T, Context>::RunOnDeviceWithOrderNHWC() {
  const auto& X = Input(INPUT);

  CAFFE_ENFORCE(
      !IsInputOutputAlias(INPUT, OUTPUT),
      "Can't run InstanceNorm NHWC in-place");
  auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_;
  auto* inv_stdev = OutputSize() > 1 ? Output(INV_STDEV) : &inv_stdev_;
  const int N = X.dim32(0);
  const int H = X.dim32(1);
  const int W = X.dim32(2);
  const int C = X.dim32(3);
  const size_t offset = H * W * C;

  CAFFE_ENFORCE_EQ(Input(SCALE).numel(), C);
  CAFFE_ENFORCE_EQ(Input(BIAS).numel(), C);

  auto* Y = Output(OUTPUT, X.sizes(), at::dtype<T>());
  mean->Resize(N, C);
  inv_stdev->Resize(N, C);
  ConstEigenVectorArrayMap<T> scale(Input(SCALE).template data<T>(), C);
  ConstEigenVectorArrayMap<T> bias(Input(BIAS).template data<T>(), C);
  for (int n = 0; n < N; ++n) {
    ConstEigenArrayMap<T> Xmat(X.template data<T>() + offset * n, C, H * W);
    EigenArrayMap<T> Ymat(Y->template mutable_data<T>() + offset * n, C, H * W);
    EigenVectorArrayMap<T> mean_arr(
        mean->template mutable_data<T>() + n * C, C);
    EigenVectorArrayMap<T> inv_stdev_arr(
        inv_stdev->template mutable_data<T>() + n * C, C);

    // The following effectively does the row-wise mean computation:
    //   mean_arr = Xmat.rowwise().mean();
    // but manually vectorizes over columns.
    mean_arr = Xmat.col(0);
    for (int i = 1; i < H * W; ++i) {
      mean_arr += Xmat.col(i);
    }
    mean_arr *= 1. / (H * W);
    Ymat = Xmat.colwise() - mean_arr;
    // The following effectively does the row-wise squared norm computation,
    // but manually vectorizes over columns similar to the mean case.
    inv_stdev_arr = Ymat.col(0) * Ymat.col(0);
    for (int i = 1; i < H * W; ++i) {
      inv_stdev_arr += Ymat.col(i) * Ymat.col(i);
    }
    inv_stdev_arr = (inv_stdev_arr / (H * W) + epsilon_).sqrt().inverse();
    Ymat = (Ymat.colwise() * (inv_stdev_arr * scale)).colwise() + bias;
  }
  return true;
}

template <typename T, typename Context>
bool InstanceNormOp<T, Context>::RunOnDeviceWithOrderNCHW() {
  const auto& X = Input(INPUT);
  const auto& scale = Input(SCALE);
  const auto& bias = Input(BIAS);

  auto* mean = OutputSize() > 1 ? Output(MEAN) : &mean_;
  auto* inv_stdev = OutputSize() > 1 ? Output(INV_STDEV) : &inv_stdev_;
  const int N = X.dim32(0);
  const int C = X.dim32(1);
  const int H = X.dim32(2);
  const int W = X.dim32(3);

  CAFFE_ENFORCE_EQ(scale.numel(), C);
  CAFFE_ENFORCE_EQ(bias.numel(), C);

  auto* Y = Output(OUTPUT, X.sizes(), at::dtype<T>());
  mean->Resize(N, C);
  inv_stdev->Resize(N, C);

  const auto* Xdata = X.template data<T>();
  auto* Ydata = Y->template mutable_data<T>();
  const auto* scale_data = scale.template data<T>();
  const auto* bias_data = bias.template data<T>();
  auto* mean_data = mean->template mutable_data<T>();
  auto* inv_stdev_data = inv_stdev->template mutable_data<T>();

  // TODO: benchmark parallelization strategies.
  for (auto i = 0; i < N * C; ++i) {
    ConstEigenVectorArrayMap<T> Xi(Xdata + H * W * i, H * W);
    const T Xi_mean = Xi.mean();
    const T squared_norm = (Xi - Xi_mean).matrix().squaredNorm();
    const T inv_stdev = 1.0 / std::sqrt(squared_norm / (H * W) + epsilon_);
    mean_data[i] = Xi_mean;
    inv_stdev_data[i] = inv_stdev;
    EigenVectorArrayMap<T> Yi(Ydata + H * W * i, H * W);
    // Fold the normalization into a single fused multiply-add:
    //   (Xi - mean) * inv_stdev * scale + bias
    //     == Xi * (inv_stdev * scale) + (bias - mean * inv_stdev * scale)
    const T channel_scale = inv_stdev * scale_data[i % C];
    const T channel_shift = bias_data[i % C] - Xi_mean * channel_scale;
    Yi = Xi * channel_scale + channel_shift;
  }

  return true;
}

REGISTER_CPU_OPERATOR(InstanceNorm, InstanceNormOp<float, CPUContext>);

OPERATOR_SCHEMA(InstanceNorm)
    .NumInputs(3)
    .NumOutputs(1, 3)
    .AllowInplace({{0, 0}})
    .SetDoc(R"DOC(
The *InstanceNorm* op applies Instance Normalization over a 4D input as described in [Instance Normalization: The Missing Ingredient for Fast Stylization](https://arxiv.org/abs/1607.08022).

$$output = \frac{input-\mu_{input}}{\sqrt{\sigma_{input}^2 + \epsilon}}*scale + bias$$

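As a concrete reference for this formula, the same computation is easy to express in NumPy. The sketch below mirrors the NCHW path (per-sample, per-channel statistics, with epsilon inside the square root); it is an illustrative reimplementation, not the operator's own code:

```
import numpy as np

def instance_norm_nchw(x, scale, bias, epsilon=1e-5):
    # Illustrative sketch only. x: (N, C, H, W); scale, bias: length-C arrays.
    mean = x.mean(axis=(2, 3), keepdims=True)        # per-(n, c) mean
    var = ((x - mean) ** 2).mean(axis=(2, 3), keepdims=True)
    inv_stdev = 1.0 / np.sqrt(var + epsilon)         # epsilon inside the sqrt
    s = scale.reshape(1, -1, 1, 1)
    b = bias.reshape(1, -1, 1, 1)
    return (x - mean) * inv_stdev * s + b
```
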
Note that two of the outputs are optional, so there are three output cases for this op. Case 1: output; Case 2: output, saved_mean; Case 3: output, saved_mean, saved_inv_stdev. The third case is selected simply by naming all three output blobs when the operator is created, as sketched below.

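A minimal sketch of the three-output case, using the same Python API as the example further down:

```
# Case 3: also capture the optional saved_mean and saved_inv_stdev outputs.
op = core.CreateOperator(
    "InstanceNorm",
    ["input", "scale", "bias"],
    ["output", "saved_mean", "saved_inv_stdev"],
    epsilon=1e-5,
)
```
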
Github Links:

- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/instance_norm_op.h
- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/instance_norm_op.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()

op = core.CreateOperator(
    "InstanceNorm",
    ["input", "scale", "bias"],
    ["output"],
    epsilon=1e-5,
)

workspace.FeedBlob("input", np.random.randn(2, 1, 3, 3).astype(np.float32))
print("input:\n", workspace.FetchBlob("input"), "\n")

workspace.FeedBlob("scale", np.array([1.5]).astype(np.float32))
print("scale: ", workspace.FetchBlob("scale"))

workspace.FeedBlob("bias", np.array([1.]).astype(np.float32))
print("bias: ", workspace.FetchBlob("bias"))

workspace.RunOperatorOnce(op)
print("output:\n", workspace.FetchBlob("output"))
```

**Result**

```
input:
 [[[[ 0.97856593 -1.1832817  -0.2540021 ]
   [-1.3315694  -0.7485018   0.3787225 ]
   [-0.6826597  -1.4637762   0.57116514]]]


 [[[-0.44948956  0.85544354 -0.9315333 ]
   [-0.37202677 -0.22266895 -0.27194235]
   [ 0.4948163  -0.7296504   1.3393803 ]]]]

scale: [1.5]
bias: [1.]
output:
 [[[[ 3.5017493  -0.3791256   1.2890853 ]
   [-0.6453266   0.40137637  2.4249308 ]
   [ 0.5195738  -0.8826599   2.7703972 ]]]


 [[[ 0.12639964  2.856744   -0.8821926 ]
   [ 0.28847694  0.60098207  0.49788612]
   [ 2.1021945  -0.45978796  3.869297  ]]]]
```

</details>

)DOC")
    .Arg("epsilon", "*(type: float; default: 1e-5)* The epsilon value to use to avoid division by zero.")
    .Arg("order", "*(type: string; default: \"NCHW\")* Specifies the order of the input data blob, where $N$ is batch size, $C$ is number of channels, $H$ is spatial height, and $W$ is spatial width. The only other valid option is \"NHWC\".")
    .Input(0, "input", "The input 4-dimensional NCHW tensor to be operated on.")
    .Input(1, "scale", "The input 1-dimensional scale tensor of size *C*.")
    .Input(2, "bias", "The input 1-dimensional bias tensor of size *C*.")
    .Output(
        0,
        "output",
        "The output 4-dimensional tensor of the same shape as input.")
    .Output(
        1,
        "saved_mean",
        "(Optional) Saved mean used during training to speed up gradient computation. Should not be used for testing.")
    .Output(
        2,
        "saved_inv_stdev",
        "(Optional) Saved inverse stdev used during training to speed up gradient computation. Should not be used for testing.");

} // namespace caffe2