// Caffe2 - C++ API
// A deep learning, cross platform ML framework
// yellowfin_op.cc

#include "caffe2/sgd/yellowfin_op.h"

namespace caffe2 {
20 
21 REGISTER_CPU_OPERATOR(YellowFin, YellowFinOp<float, CPUContext>);
22 OPERATOR_SCHEMA(YellowFin)
23  .NumInputs(10)
24  .NumOutputs(8)
25  .AllowInplace(
26  {{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {7, 7}})
27  .SetDoc(R"DOC(
28 
29 Computes the YellowFin update (https://arxiv.org/abs/1706.03471) and performs
30 momentum SGD optimization step. lr and mu are not being shared between
31 parameters. curv_win, g_avg, g2_avg and scalars_memory are just auxiliary
32 memory for computing moving averages (see the publication). Takes arguments
33 beta: coefficient for moving averages,
34 curv_win_width: timeframe when average squared gradient is being stored,
35 epsilon: for numerical purposes,
36 nesterov and zero_debias for debias of moving average.
37 
38 )DOC")
39  .Input(0, "param", "Parameters to be updated")
40  .Input(1, "moment", "Momentum")
41  .Input(2, "lr", "Learning rate")
42  .Input(3, "mu", "Momentum coefficient")
43  .Input(4, "curv_win", "Memory for latest curvature ranges")
44  .Input(5, "g_avg", "Moving average of gradient")
45  .Input(6, "g2_avg", "Moving average of squared gradient")
46  .Input(7, "scalars_memory", "Memory for stateful scalars")
47  .Input(8, "grad", "Gradient computed")
48  .Input(9, "iter", "Iteration number")
49  .Output(0, "output_param", "Parameters to be updated")
50  .Output(1, "output_moment", "Momentum")
51  .Output(2, "output_lr", "Output learning rate")
52  .Output(3, "output_mu", "Output momentum coefficient")
53  .Output(4, "output_curv_win", "Output memory for latest curvature ranges")
54  .Output(5, "output_g_avg", "Output moving average of gradient")
55  .Output(6, "output_g2_avg", "Output moving average of squared gradient")
56  .Output(7, "output_scalars_memory", "Output memory for stateful scalars")
57  .Arg("beta", "Default 0.999")
58  .Arg("curv_win_width", "Default 20")
59  .Arg("epsilon", "Default 1e-6")
60  .Arg("nesterov", "Default false")
61  .Arg("zero_debias", "Default true");
62 
63 SHOULD_NOT_DO_GRADIENT(YellowFin);
64 
65 #define CAFFE2_YELLOWFIN_GETLRMU(T) \
66  template <> \
67  void YellowFinOp<T, CPUContext>::GetLrMu() { \
68  const T curv_ratio = std::sqrt(*g_norm2_max_deb_ / *g_norm2_min_deb_); \
69  const T mu_limit = (curv_ratio - 1.0f) / (curv_ratio + 1.0f); \
70  const T pre_p = *distance_deb_ * *g_norm2_min_deb_; \
71  const T p = (pre_p * pre_p) / (2.0f * *variance_); \
72  const T w3 = (-std::sqrt(p * p + 4.0f / 27.0f * p * p * p) - p) / 2.0f; \
73  const T w3_sign = w3 > 0.0f ? 1.0f : -1.0f; \
74  const T w = w3_sign * std::pow(std::abs(w3), 1.0f / 3.0f); \
75  const T y = w - p / 3.0f / w; \
76  const T root = y + 1.0f; \
77  *mu_ = std::max(root * root, mu_limit * mu_limit); \
78  *lr_ = std::pow(1.0f - std::sqrt(*mu_), 2) / *g_norm2_min_deb_; \
79  MovingAverage(1, mu_, mu_avg_, mu_avg_out_, mu_deb_); \
80  MovingAverage(1, lr_, lr_avg_, lr_avg_out_, lr_deb_); \
81  }
82 
83 CAFFE2_YELLOWFIN_GETLRMU(float)
84 #undef CAFFE2_YELLOWFIN_GETLRMU
85 
86 // Usually moment_ == moment_out_ && param_ == param_out_
87 #define CAFFE2_YELLOWFIN_MOMENTUMSGDUPDATE(T) \
88  template <> \
89  void YellowFinOp<T, CPUContext>::MomentumSgdUpdate() { \
90  const T mu = *mu_avg_out_; \
91  const T lr = *lr_avg_out_; \
92  if (!nesterov_) { \
93  for (int i = 0; i < D_; ++i) { \
94  moment_out_[i] = mu * moment_[i] + lr * grad_[i]; \
95  param_out_[i] = param_[i] - moment_out_[i]; \
96  } \
97  } else { \
98  for (int i = 0; i < D_; ++i) { \
99  const T moment_i = moment_[i]; \
100  moment_out_[i] = mu * moment_i + lr * grad_[i]; \
101  param_out_[i] = param_[i] - (1 + mu) * moment_out_[i] + mu * moment_i; \
102  } \
103  } \
104  }
105 
106 CAFFE2_YELLOWFIN_MOMENTUMSGDUPDATE(float)
107 #undef CAFFE2_YELLOWFIN_MOMENTUMSGDUPDATE
108 
} // namespace caffe2

// Copyright (c) 2016-present, Facebook, Inc.