Caffe2 - C++ API
A deep learning, cross-platform ML framework
layer_norm_op.cc
#include "caffe2/operators/layer_norm_op.h"

#include <ATen/core/dispatch/KernelRegistration.h>
#include <ATen/core/dispatch/OpSchemaRegistration.h>
#include <c10/core/Tensor.h>

#include "caffe2/core/operator_c10wrapper.h"
#include "caffe2/utils/eigen_utils.h"

namespace caffe2 {

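// Computes the fused per-vector parameters for the forward pass:
//   scale  = 1 / sqrt(var + epsilon)
//   stddev = sqrt(var + epsilon)   (note rsqrt(v) * v == sqrt(v))
//   bias   = -mean * scale
// so that normalization reduces to the affine form Y = X * scale + bias,
// i.e. (X - mean) / stddev.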
template <>
template <typename T>
void LayerNormOp<CPUContext>::ComputeStdDevAndFusedParams(
    const int N,
    const T* mean,
    const T* var,
    T* stddev,
    T* scale,
    T* bias,
    float epsilon,
    CPUContext* /*context*/) {
  ConstEigenVectorArrayMap<T> var_arr(var, N);
  EigenVectorArrayMap<T> stddev_arr(stddev, N);
  EigenVectorArrayMap<T> scale_arr(scale, N);
  scale_arr = (var_arr + static_cast<T>(epsilon)).rsqrt();
  stddev_arr = scale_arr * (var_arr + static_cast<T>(epsilon));
  EigenVectorArrayMap<T>(bias, N) =
      -scale_arr * ConstEigenVectorArrayMap<T>(mean, N);
}

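// X and Y are treated as N x M column-major arrays, with each of the M
// columns holding one feature vector of length N. Column i is scaled by
// scale[i] and shifted by bias[i]: Y[:, i] = X[:, i] * scale[i] + bias[i].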
template <>
template <typename T>
void LayerNormOp<CPUContext>::LayerNormForward(
    const int M,
    const int N,
    const T* X,
    const T* scale,
    const T* bias,
    T* Y,
    CPUContext* context) {
  EigenArrayMap<T>(Y, N, M) =
      (ConstEigenArrayMap<T>(X, N, M).rowwise() *
       ConstEigenVectorArrayMap<T>(scale, M).transpose())
          .rowwise() +
      ConstEigenVectorArrayMap<T>(bias, M).transpose();
}

REGISTER_CPU_OPERATOR(LayerNorm, LayerNormOp<CPUContext>);

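// Row-wise reductions feeding the backward pass, one value per feature
// vector (column i occupies dY[i * N] .. dY[i * N + N - 1]):
//   ds[i] = sum_j dY[i * N + j] * X[i * N + j]
//   db[i] = sum_j dY[i * N + j]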
template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::ComputeInternalGradients(
    const int M,
    const int N,
    const T* dY,
    const T* X,
    T* ds,
    T* db) {
  ConstEigenArrayMap<T> dY_arr(dY, N, M);
  ConstEigenArrayMap<T> X_arr(X, N, M);
  for (int i = 0; i < M; ++i) {
    ds[i] = (dY_arr.col(i) * X_arr.col(i)).sum();
    db[i] = dY_arr.col(i).sum();
  }
}

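// Folds the layer-norm gradient into three per-vector coefficients so the
// backward pass becomes one fused elementwise sweep,
//   dX = dY_scale * dY + X_scale * X + bias,
// where, with rsig = 1 / sigma:
//   dY_scale = rsig
//   X_scale  = (db * mean - ds) * rsig^3 / N
//   bias     = -X_scale * mean - db * rsig / N
// This is the standard result of differentiating (X - mean) / sigma while
// accounting for the dependence of mean and sigma on X.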
template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::ComputeFusedParams(
    const int M,
    const int N,
    const T* mean,
    const T* sig,
    const T* ds,
    const T* db,
    T* dY_scale,
    T* X_scale,
    T* bias) {
  const T scale = T(1) / static_cast<T>(N);
  ConstEigenVectorArrayMap<T> mean_arr(mean, M);
  ConstEigenVectorArrayMap<T> ds_arr(ds, M);
  ConstEigenVectorArrayMap<T> db_arr(db, M);
  EigenVectorArrayMap<T> rsig_arr(dY_scale, M);
  EigenVectorArrayMap<T> X_scale_arr(X_scale, M);
  rsig_arr = ConstEigenVectorArrayMap<T>(sig, M).inverse();
  X_scale_arr = (db_arr * mean_arr - ds_arr) * rsig_arr.cube() * scale;
  EigenVectorArrayMap<T>(bias, M) =
      -X_scale_arr * mean_arr - db_arr * rsig_arr * scale;
}

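// Applies the fused coefficients column-wise, mirroring LayerNormForward:
// dX[:, i] = dY[:, i] * dY_scale[i] + X[:, i] * X_scale[i] + bias[i].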
template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::LayerNormBackward(
    const int M,
    const int N,
    const T* dY_scale,
    const T* dY,
    const T* X_scale,
    const T* X,
    const T* bias,
    T* dX) {
  EigenArrayMap<T>(dX, N, M) =
      (ConstEigenArrayMap<T>(dY, N, M).rowwise() *
           ConstEigenVectorArrayMap<T>(dY_scale, M).transpose() +
       ConstEigenArrayMap<T>(X, N, M).rowwise() *
           ConstEigenVectorArrayMap<T>(X_scale, M).transpose())
          .rowwise() +
      ConstEigenVectorArrayMap<T>(bias, M).transpose();
}

OPERATOR_SCHEMA(LayerNormGradient).NumInputs(5).NumOutputs(1);

REGISTER_CPU_OPERATOR(LayerNormGradient, LayerNormGradientOp<CPUContext>);

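// Hooks LayerNormGradient up to LayerNorm: the gradient op takes the output
// gradient GO(0), the three forward outputs O(0), O(1), O(2) (output, mean,
// stddev) and the forward input I(0) -- five inputs, matching NumInputs(5)
// above -- and emits the input gradient GI(0).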
namespace {

class GetLayerNormGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  std::vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "LayerNormGradient",
        "",
        std::vector<std::string>{GO(0), O(0), O(1), O(2), I(0)},
        std::vector<std::string>{GI(0)});
  }
};

} // namespace

REGISTER_GRADIENT(LayerNorm, GetLayerNormGradient);

OPERATOR_SCHEMA(LayerNorm)
    .NumInputs(1)
    .NumOutputs(3)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      std::vector<TensorShape> out(3);
      auto input_dims_long = GetDimsVector(in[0]);
      std::vector<int> input_dims(
          input_dims_long.begin(), input_dims_long.end());
      out[0] = CreateTensorShape(input_dims, TensorProto::FLOAT);

      ArgumentHelper helper(def);

      auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
      const auto canonical_axis =
          canonical_axis_index_(axis, in[0].dims().size());
      std::vector<int> stat_dims(
          input_dims.begin(), input_dims.begin() + canonical_axis);
      stat_dims.push_back(1);
      out[1] = CreateTensorShape(stat_dims, TensorProto::FLOAT);
      out[2] = CreateTensorShape(stat_dims, TensorProto::FLOAT);
      return out;
    })
    .SetDoc(R"DOC(
Computes layer normalization as described in https://arxiv.org/pdf/1607.06450.pdf.
Given an input vector x \in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}],
this op treats dimensions a_k through a_{n-1} as feature vectors. For each
feature vector, the op computes the mean and standard deviation. Then,
it returns the normalized values (with respect to the feature vector).

Note that this op does not contain the scale and bias terms described in the
paper. Simply follow this op with an FC op to add those. Concretely, this op
implements:

h = \frac{1}{\sigma}(a - \mu)
where \mu = \frac{1}{H}\sum_{i=1}^{H} a_i
and \sigma = \sqrt{\frac{1}{H}\sum_{i=1}^{H}(a_i - \mu)^2}
where H is the number of hidden units (i.e. the product of dimensions from
'axis' to the end).
)DOC")
    .Arg(
        "axis",
        "(int) defaults to 1. Describes the axis of the inputs; defaults to "
        "one because the 0th axis most likely describes the batch size.")
    .Arg(
        "epsilon",
        "(float) defaults to 0.001. Small value added to the variance before "
        "taking the square root. This prevents division by zero.")
    .Input(
        0,
        "input",
        "Input tensor which layer normalization will be applied to")
    .Output(0, "output", "Normalized values")
    .Output(1, "mean", "Mean values for each feature vector")
    .Output(2, "stddev", "Standard deviations for each feature vector");


} // namespace caffe2

C10_REGISTER_CAFFE2_OPERATOR_CPU(
    LayerNorm,
    (std::vector<c10::Argument>{
        c10::Argument("input"),
        c10::Argument("axis", c10::IntType::get()),
        c10::Argument("epsilon", c10::FloatType::get())}),
    (std::vector<c10::Argument>{c10::Argument("output"),
                                c10::Argument("mean"),
                                c10::Argument("stdev")}),
    caffe2::LayerNormOp<caffe2::CPUContext>);

namespace caffe2 {
REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU(
    _c10_ops::LayerNorm(),
    C10LayerNorm_DontUseThisOpYet);
}
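For reference, here is a minimal sketch of driving the registered operator through the standard Caffe2 workspace API. The blob names, the 2 x 4 input shape, and the printed output are illustrative only and are not part of this file:

#include <cstdio>

#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

int main(int argc, char** argv) {
  caffe2::GlobalInit(&argc, &argv);
  caffe2::Workspace ws;

  // Two samples of four features each; with axis = 1 (the default), each
  // row is one feature vector and is normalized independently.
  auto* x = caffe2::BlobGetMutableTensor(ws.CreateBlob("input"), caffe2::CPU);
  x->Resize(2, 4);
  float* x_data = x->mutable_data<float>();
  for (int i = 0; i < 8; ++i) {
    x_data[i] = static_cast<float>(i);
  }

  caffe2::OperatorDef def;
  def.set_type("LayerNorm");
  def.add_input("input");
  def.add_output("output");
  def.add_output("mean");
  def.add_output("stddev");

  CAFFE_ENFORCE(ws.RunOperatorOnce(def));

  // Each row of "output" now has approximately zero mean and unit standard
  // deviation; "mean" and "stddev" hold the per-row statistics.
  const auto& y = ws.GetBlob("output")->Get<caffe2::Tensor>();
  const float* y_data = y.data<float>();
  for (int i = 0; i < y.numel(); ++i) {
    std::printf("%f\n", y_data[i]);
  }
  return 0;
}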