// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// yellowfin_op.cc

#include "caffe2/sgd/yellowfin_op.h"

namespace caffe2 {
20 
21 REGISTER_CPU_OPERATOR(YellowFin, YellowFinOp<float, CPUContext>);
22 OPERATOR_SCHEMA(YellowFin)
23  .NumInputs(10)
24  .NumOutputs(8)
25  .AllowInplace(
26  {{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}})
27  .SetDoc(R"DOC(
28 
29 Computes the YellowFin update (https://arxiv.org/abs/1706.03471) and performs
30 momentum SGD optimization step. lr and mu are not being shared between
31 parameters. curv_win, g_avg, g2_avg and scalars_memory are just auxiliary
32 memory for computing moving averages (see the publication). Takes arguments
33 beta: coefficient for moving averages,
34 curv_win_width: timeframe when average squared gradient is being stored,
35 epsilon: for numerical purposes,
36 nesterov and zero_debias for debias of moving average.
37 
38 )DOC")
39  .Input(0, "param", "Parameters to be updated")
40  .Input(1, "moment", "Momentum")
41  .Input(2, "lr", "Learning rate")
42  .Input(3, "mu", "Momentum coefficient")
43  .Input(4, "curv_win", "Memory for latest curvature ranges")
44  .Input(5, "g_avg", "Moving average of gradient")
45  .Input(6, "g2_avg", "Moving average of squared gradient")
46  .Input(7, "scalars_memory", "Memory for stateful scalars")
47  .Input(8, "grad", "Gradient computed")
48  .Input(9, "iter", "Iteration number")
49  .Output(0, "output_param", "Parameters to be updated")
50  .Output(1, "output_moment", "Momentum")
51  .Output(2, "output_lr", "Output learning rate")
52  .Output(3, "output_mu", "Output momentum coefficient")
53  .Output(4, "output_curv_win", "Output memory for latest curvature ranges")
54  .Output(5, "output_g_avg", "Output moving average of gradient")
55  .Output(6, "output_g2_avg", "Output moving average of squared gradient")
56  .Output(7, "output_scalars_memory", "Output memory for stateful scalars")
57  .Arg("beta", "Default 0.999")
58  .Arg("curv_win_width", "Default 20")
59  .Arg("epsilon", "Default 1e-6")
60  .Arg("nesterov", "Default false")
61  .Arg("zero_debias", "Default true");
62 
63 SHOULD_NOT_DO_GRADIENT(YellowFin);
64 
65 #define CAFFE2_YELLOWFIN_GETLRMU(T) \
66  template <> \
67  void YellowFinOp<T, CPUContext>::GetLrMu() { \
68  const T curv_ratio = std::sqrt(*g_norm2_max_deb_ / *g_norm2_min_deb_); \
69  const T mu_limit = (curv_ratio - 1.0f) / (curv_ratio + 1.0f); \
70  const T pre_p = *distance_deb_ * *g_norm2_min_deb_; \
71  const T p = (pre_p * pre_p) / (2.0f * *variance_); \
72  const T w3 = (-std::sqrt(p * p + 4.0f / 27.0f * p * p * p) - p) / 2.0f; \
73  const T w3_sign = w3 > 0.0f ? 1.0f : -1.0f; \
74  const T w = w3_sign * std::pow(std::abs(w3), 1.0f / 3.0f); \
75  const T y = w - p / 3.0f / w; \
76  const T root = y + 1.0f; \
77  *mu_ = std::max(root * root, mu_limit * mu_limit); \
78  *lr_ = std::pow(1.0f - std::sqrt(*mu_), 2) / *g_norm2_min_deb_; \
79  MovingAverage(1, mu_, mu_avg_, mu_avg_out_, mu_deb_); \
80  MovingAverage(1, lr_, lr_avg_, lr_avg_out_, lr_deb_); \
81  }
82 
83 CAFFE2_YELLOWFIN_GETLRMU(float)
84 #undef CAFFE2_YELLOWFIN_GETLRMU
85 
86 // Usually moment_ == moment_out_ && param_ == param_out_
87 #define CAFFE2_YELLOWFIN_MOMENTUMSGDUPDATE(T) \
88  template <> \
89  void YellowFinOp<T, CPUContext>::MomentumSgdUpdate() { \
90  const T mu = *mu_avg_out_; \
91  const T lr = *lr_avg_out_; \
92  if (!nesterov_) { \
93  for (int i = 0; i < D_; ++i) { \
94  moment_out_[i] = mu * moment_[i] + lr * grad_[i]; \
95  param_out_[i] = param_[i] - moment_out_[i]; \
96  } \
97  } else { \
98  for (int i = 0; i < D_; ++i) { \
99  const T moment_i = moment_[i]; \
100  moment_out_[i] = mu * moment_i + lr * grad_[i]; \
101  param_out_[i] = param_[i] - (1 + mu) * moment_out_[i] + mu * moment_i; \
102  } \
103  } \
104  }
105 
106 CAFFE2_YELLOWFIN_MOMENTUMSGDUPDATE(float)
107 #undef CAFFE2_YELLOWFIN_MOMENTUMSGDUPDATE
108 
} // namespace caffe2

// Copyright (c) 2016-present, Facebook, Inc.