Caffe2 - C++ API
A deep learning, cross-platform ML framework
lars_op.cc
#include "caffe2/sgd/lars_op.h"

namespace caffe2 {

template <>
void LarsOp<float, CPUContext>::ComputeLearningRate(
    const float* wd,
    const float* trust,
    const float* lr_max,
    float offset,
    float lr_min,
    float* X_norm,
    float* dX_norm,
    float* lr_rescaled) {
  float val = 1.0;

  if (*X_norm > 0) {
    val = (*trust) / (*dX_norm / *X_norm + (*wd) + offset);
  }
  *lr_rescaled = fmaxf(fminf(val, *lr_max), lr_min);
}
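// Worked example (hypothetical values, for illustration): with *trust = 1e-3,
// *wd = 1e-4, offset = 1e-5, *X_norm = 2.0, and *dX_norm = 0.5, the formula
// gives val = 1e-3 / (0.5 / 2.0 + 1e-4 + 1e-5) ~= 4.0e-3, which fminf/fmaxf
// then clamp into [lr_min, *lr_max].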

REGISTER_CPU_OPERATOR(Lars, LarsOp<float, CPUContext>);

OPERATOR_SCHEMA(Lars)
    .NumInputs(5)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Implements Layer-wise Adaptive Rate Scaling (LARS) with clipping. Before weight
decay is applied, given a parameter tensor X and its gradient dX, the local
learning rate for X is

local_lr = trust * norm(X) / ( norm(dX) + wd * norm(X) + offset * norm(X) )

         = trust / ( norm(dX) / norm(X) + wd + offset ),

where offset is a preset hyper-parameter that avoids numerical issues and trust
indicates how much we trust the layer to change its parameters during one
update. In this implementation we use the L2 norm, and the computed local
learning rate is clipped to the upper bound lr_max and the lower bound lr_min:

local_lr = min(local_lr, lr_max) and local_lr = max(local_lr, lr_min)

)DOC")
    .Input(0, "X", "Parameter tensor")
    .Input(1, "dX", "Gradient tensor")
    .Input(2, "wd", "Weight decay")
    .Input(3, "trust", "Trust coefficient")
    .Input(4, "lr_max", "Upper bound of learning rate")
    .Output(0, "lr_rescaled", "Rescaled local learning rate")
    .Arg("offset", "Rescaling offset parameter")
    .Arg("lr_min", "Minimum learning rate for clipping");

SHOULD_NOT_DO_GRADIENT(Lars);
} // namespace caffe2
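
The arithmetic above is easy to check in isolation. Below is a minimal
standalone sketch, not part of the Caffe2 source: it mirrors the formula in
ComputeLearningRate, and every input value is a hypothetical example chosen
for illustration.

#include <cmath>
#include <cstdio>

// Standalone mirror of LarsOp<float, CPUContext>::ComputeLearningRate.
// Only the formula is taken from the operator above; the inputs below
// are made-up example values.
float ComputeLearningRate(
    float wd,       // weight decay
    float trust,    // trust coefficient
    float lr_max,   // upper clipping bound
    float offset,   // rescaling offset guarding against tiny denominators
    float lr_min,   // lower clipping bound
    float X_norm,   // L2 norm of the parameter tensor X
    float dX_norm)  // L2 norm of the gradient dX
{
  float val = 1.0f;
  if (X_norm > 0) {
    val = trust / (dX_norm / X_norm + wd + offset);
  }
  return std::fmax(std::fmin(val, lr_max), lr_min);
}

int main() {
  // 1e-3 / (0.5 / 2.0 + 1e-4 + 1e-5) ~= 4.0e-3 lies inside [1e-6, 1.0],
  // so no clipping takes effect for this input.
  std::printf(
      "rescaled lr = %g\n",
      ComputeLearningRate(1e-4f, 1e-3f, 1.0f, 1e-5f, 1e-6f, 2.0f, 0.5f));

  // A zero-norm parameter tensor skips the formula; the default val = 1.0
  // is clipped to at most lr_max.
  std::printf(
      "rescaled lr (zero norm) = %g\n",
      ComputeLearningRate(1e-4f, 1e-3f, 1.0f, 1e-5f, 1e-6f, 0.0f, 0.5f));
  return 0;
}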
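
The schema maps one-to-one onto an OperatorDef protobuf: five input blobs, one
output blob, and two float arguments. As a sketch of that mapping (the blob
names and argument values here are hypothetical, and this only constructs the
operator definition rather than running it):

#include "caffe2/proto/caffe2.pb.h"

// Builds an OperatorDef matching the Lars schema above. The blob names and
// argument values are made-up examples.
caffe2::OperatorDef MakeLarsDef() {
  caffe2::OperatorDef def;
  def.set_type("Lars");
  for (const char* in : {"X", "dX", "wd", "trust", "lr_max"}) {
    def.add_input(in);
  }
  def.add_output("lr_rescaled");

  auto* offset = def.add_arg();
  offset->set_name("offset");
  offset->set_f(1e-5f);  // rescaling offset (hypothetical value)

  auto* lr_min = def.add_arg();
  lr_min->set_name("lr_min");
  lr_min->set_f(1e-6f);  // minimum learning rate (hypothetical value)
  return def;
}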