1 #include "caffe2/operators/layer_norm_op.h" 3 #include <ATen/core/dispatch/KernelRegistration.h> 4 #include <ATen/core/dispatch/OpSchemaRegistration.h> 5 #include <c10/core/Tensor.h> 7 #include "caffe2/core/operator_c10wrapper.h" 8 #include "caffe2/utils/eigen_utils.h" 14 void LayerNormOp<CPUContext>::ComputeStdDevAndFusedParams(
  ConstEigenVectorArrayMap<T> var_arr(var, N);
  EigenVectorArrayMap<T> stddev_arr(stddev, N);
  EigenVectorArrayMap<T> scale_arr(scale, N);
  scale_arr = (var_arr + static_cast<T>(epsilon)).rsqrt();
  // scale * (var + epsilon) == (var + epsilon) / sqrt(var + epsilon)
  //                         == sqrt(var + epsilon), the regularized stddev.
  stddev_arr = scale_arr * (var_arr + static_cast<T>(epsilon));
  EigenVectorArrayMap<T>(bias, N) =
      -scale_arr * ConstEigenVectorArrayMap<T>(mean, N);
}
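
// With scale = 1 / sqrt(var + epsilon) and bias = -scale * mean, the
// normalization collapses into a single affine map per feature vector:
//   Y = scale * X + bias = (X - mean) / sqrt(var + epsilon),
// which matches h = (a - mu) / sigma in the schema docs below.
// LayerNormForward applies this transform in one vectorized Eigen expression.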

template <>
template <typename T>
void LayerNormOp<CPUContext>::LayerNormForward(
    const int M,
    const int N,
    const T* X,
    const T* scale,
    const T* bias,
    T* Y,
    CPUContext* context) {
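  // Eigen maps the row-major [M, N] buffer as a column-major N x M array,
  // so each feature vector is one column; the .rowwise() ops broadcast the
  // length-M scale and bias vectors across the N elements of each column.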
  EigenArrayMap<T>(Y, N, M) =
      (ConstEigenArrayMap<T>(X, N, M).rowwise() *
       ConstEigenVectorArrayMap<T>(scale, M).transpose())
          .rowwise() +
      ConstEigenVectorArrayMap<T>(bias, M).transpose();
}

REGISTER_CPU_OPERATOR(LayerNorm, LayerNormOp<CPUContext>);

template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::ComputeInternalGradients(
    const int M,
    const int N,
    const T* dY,
    const T* X,
    T* ds,
    T* db) {
  ConstEigenArrayMap<T> dY_arr(dY, N, M);
  ConstEigenArrayMap<T> X_arr(X, N, M);
  // Per-feature-vector reductions consumed by ComputeFusedParams:
  //   ds[i] = sum(dY_i * X_i),  db[i] = sum(dY_i).
  for (int i = 0; i < M; ++i) {
    ds[i] = (dY_arr.col(i) * X_arr.col(i)).sum();
    db[i] = dY_arr.col(i).sum();
  }
}
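
// Backward derivation sketch: for one feature vector with mean mu and
// regularized standard deviation sigma, differentiating
// Y = (X - mu) / sigma through both mu and sigma yields the fused form
//   dX = dY_scale * dY + X_scale * X + bias,
// with
//   dY_scale = 1 / sigma,
//   X_scale  = (db * mu - ds) / (N * sigma^3),
//   bias     = -X_scale * mu - db / (N * sigma),
// where ds = sum(dY * X) and db = sum(dY) are the reductions computed
// above. ComputeFusedParams evaluates these three coefficient vectors.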

template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::ComputeFusedParams(
    const int M,
    const int N,
    const T* mean,
    const T* sig,
    const T* ds,
    const T* db,
    T* dY_scale,
    T* X_scale,
    T* bias) {
  const T scale = T(1) / static_cast<T>(N);
  ConstEigenVectorArrayMap<T> mean_arr(mean, M);
  ConstEigenVectorArrayMap<T> ds_arr(ds, M);
  ConstEigenVectorArrayMap<T> db_arr(db, M);
  EigenVectorArrayMap<T> rsig_arr(dY_scale, M);
  EigenVectorArrayMap<T> X_scale_arr(X_scale, M);
  rsig_arr = ConstEigenVectorArrayMap<T>(sig, M).inverse();
  X_scale_arr = (db_arr * mean_arr - ds_arr) * rsig_arr.cube() * scale;
  EigenVectorArrayMap<T>(bias, M) =
      -X_scale_arr * mean_arr - db_arr * rsig_arr * scale;
}
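
// Note that rsig_arr above aliases the dY_scale output buffer, so writing
// 1 / sigma into it both feeds the cube() and product terms here and
// produces the final coefficient applied to dY in LayerNormBackward.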

template <>
template <typename T>
void LayerNormGradientOp<CPUContext>::LayerNormBackward(
    const int M,
    const int N,
    const T* dY_scale,
    const T* dY,
    const T* X_scale,
    const T* X,
    const T* bias,
    T* dX) {
  EigenArrayMap<T>(dX, N, M) =
      (ConstEigenArrayMap<T>(dY, N, M).rowwise() *
           ConstEigenVectorArrayMap<T>(dY_scale, M).transpose() +
       ConstEigenArrayMap<T>(X, N, M).rowwise() *
           ConstEigenVectorArrayMap<T>(X_scale, M).transpose())
          .rowwise() +
      ConstEigenVectorArrayMap<T>(bias, M).transpose();
}

OPERATOR_SCHEMA(LayerNormGradient).NumInputs(5).NumOutputs(1);

REGISTER_CPU_OPERATOR(LayerNormGradient, LayerNormGradientOp<CPUContext>);

namespace {

class GetLayerNormGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  std::vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "LayerNormGradient",
        "",
        std::vector<std::string>{GO(0), O(0), O(1), O(2), I(0)},
        std::vector<std::string>{GI(0)});
  }
};

} // namespace
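
// Blob wiring for LayerNormGradient's five inputs, in order:
//   GO(0) = gradient of the normalized output, O(0) = output Y,
//   O(1) = mean, O(2) = stddev, I(0) = original input X;
// the single output GI(0) is the gradient with respect to X.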

REGISTER_GRADIENT(LayerNorm, GetLayerNormGradient);

OPERATOR_SCHEMA(LayerNorm)
    .NumInputs(1)
    .NumOutputs(3)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      std::vector<TensorShape> out(3);
      auto input_dims_long = GetDimsVector(in[0]);
      std::vector<int> input_dims(
          input_dims_long.begin(), input_dims_long.end());
      out[0] = CreateTensorShape(input_dims, TensorProto::FLOAT);

      ArgumentHelper helper(def);
      auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
      const auto canonical_axis =
          canonical_axis_index_(axis, in[0].dims().size());
      std::vector<int> stat_dims(
          input_dims.begin(), input_dims.begin() + canonical_axis);
      stat_dims.push_back(1);
      out[1] = CreateTensorShape(stat_dims, TensorProto::FLOAT);
      out[2] = CreateTensorShape(stat_dims, TensorProto::FLOAT);
      return out;
    })
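    // Shape example (hypothetical sizes): for input [4, 5, 6] and
    // axis == 2, out[0] keeps shape [4, 5, 6], while the per-vector
    // statistics out[1] and out[2] get shape [4, 5, 1].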
    .SetDoc(R"DOC(
Computes layer normalization as described in https://arxiv.org/pdf/1607.06450.pdf.
Given an input vector x \in [a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}],
this op treats dimensions a_k through a_{n-1} as feature vectors. For each
feature vector, the op computes the mean and standard deviation. Then,
it returns the normalized values (with respect to the feature vector).

Note that this op does not contain the scale and bias terms described in the
paper. Simply follow this op with an FC op to add those. Concretely, this op
implements:

h = \frac{1}{\sigma}(a - \mu)
where \mu = \frac{1}{H}\sum_{i=1}^{H} a_i
and \sigma = \sqrt{\frac{1}{H}\sum_{i=1}^{H}(a_i - \mu)^2}
where H is the number of hidden units (i.e. the product of dimensions from
'axis' to the end).
)DOC")
    .Arg(
        "axis",
        "(int) default to 1; Describes axis of the inputs. Defaults to one "
        "because the 0th axis most likely describes the batch size")
179 "(float) default to 0.001. Small value to be added to the stdev when" 180 " dividing out by that value. This prevents division by zero.")
184 "Input tensor which layer normalization will be applied to")
    .Output(0, "output", "Normalized values")
    .Output(1, "mean", "Mean values for each feature vector")
    .Output(2, "stddev", "Standard deviations for each feature vector");

} // namespace caffe2

// Register the LayerNorm operator with the c10 dispatcher so it is callable
// through the unified ATen/c10 operator registry.
C10_REGISTER_CAFFE2_OPERATOR_CPU(
    LayerNorm,
    (std::vector<c10::Argument>{
        c10::Argument("input"),
        c10::Argument("axis", c10::IntType::get()),
        c10::Argument("epsilon", c10::FloatType::get())}),
    (std::vector<c10::Argument>{
        c10::Argument("output"),
        c10::Argument("mean"),
        c10::Argument("stddev")}),
    caffe2::LayerNormOp<caffe2::CPUContext>)

namespace caffe2 {

// Expose the c10 op back to Caffe2 under a placeholder name until the c10
// path is ready for general use.
REGISTER_C10_OPERATOR_FOR_CAFFE2_DISPATCH_CPU(
    _c10_ops::LayerNorm(),
    C10LayerNorm_DontUseThisOpYet);

} // namespace caffe2