1 #include "caffe2/sgd/lars_op.h" 6 void LarsOp<float, CPUContext>::ComputeLearningRate(
18 val = (*trust) / (*dX_norm / *X_norm + (*wd) + offset);
20 *lr_rescaled = fmaxf(fminf(val, *lr_max), lr_min);
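// Worked example (illustrative values, not from the original source): with
// norm(X) = 4, norm(dX) = 2, wd = 0.01, trust = 0.001, and offset = 1e-5,
//   val = 0.001 / (2 / 4 + 0.01 + 1e-5) ~= 0.00196,
// which the fmaxf/fminf pair above then clips into [lr_min, lr_max].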
REGISTER_CPU_OPERATOR(Lars, LarsOp<float, CPUContext>);
OPERATOR_SCHEMA(Lars)
    .NumInputs(5)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Implement Layer-wise Adaptive Rate Scaling (LARS) with clipping. Before adding
weight decay, given a parameter tensor X and its gradient dX, the local learning
rate for X is computed as

    local_lr = trust * norm(X) / ( norm(dX) + wd * norm(X) + offset * norm(X) )

             = trust / ( norm(dX) / norm(X) + wd + offset ),

where offset is a preset hyper-parameter to avoid numerical issues and trust
indicates how much we trust the layer to change its parameters during one update.
In this implementation we use the L2 norm, and the computed local learning rate
is clipped to the upper bound lr_max and the lower bound lr_min:

    local_lr = min(local_lr, lr_max) and local_lr = max(local_lr, lr_min).
)DOC")
    .Input(0, "X", "Parameter tensor")
    .Input(1, "dX", "Gradient tensor")
    .Input(2, "wd", "Weight decay")
    .Input(3, "trust", "Trust")
    .Input(4, "lr_max", "Upper bound of learning rate")
    .Output(0, "lr_rescaled", "Rescaled local learning rate")
    .Arg("offset", "Rescaling offset parameter")
    .Arg("lr_min", "Minimum learning rate for clipping");
SHOULD_NOT_DO_GRADIENT(Lars);

} // namespace caffe2
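// Usage sketch (illustrative, not part of the original file): the registered
// operator can be driven through Caffe2's generic C++ operator API. The blob
// names ("X", "dX", "wd", "trust", "lr_max", "lr_rescaled") are placeholders
// chosen for this sketch.
//
//   caffe2::Workspace ws;
//   // ... create and fill the five input blobs as CPU tensors ...
//   caffe2::OperatorDef def;
//   def.set_type("Lars");
//   for (const char* name : {"X", "dX", "wd", "trust", "lr_max"}) {
//     def.add_input(name);
//   }
//   def.add_output("lr_rescaled");
//   auto* offset_arg = def.add_arg();
//   offset_arg->set_name("offset");
//   offset_arg->set_f(1e-5f);
//   auto op = caffe2::CreateOperator(def, &ws);
//   op->Run();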