Caffe2 - C++ API
A deep learning, cross platform ML framework
yellowfin_op.h
1 // YellowFin: An automatic tuner for momentum SGD
2 // (https://arxiv.org/abs/1706.03471)
3 // The YellowFinOp tunes learning rate and momentum and performs momentum SGD
4 // steps. The learning rate and momentum are separate for any matrix of
5 // parameters.
6 
7 #pragma once
8 
#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>

#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
13 
14 namespace caffe2 {
15 
16 template <typename T, class Context>
17 class YellowFinOp final : public Operator<Context> {
18  public:
19  USE_OPERATOR_CONTEXT_FUNCTIONS;
20  YellowFinOp(const OperatorDef& operator_def, Workspace* ws)
21  : Operator<Context>(operator_def, ws),
22  curv_win_width_(
23  this->template GetSingleArgument<int>("curv_win_width", 20)),
24  nesterov_(this->template GetSingleArgument<int>("nesterov", false)),
25  zero_debias_(
26  this->template GetSingleArgument<bool>("zero_debias", true)),
27  epsilon_(this->template GetSingleArgument<T>("epsilon", 1e-6f)),
28  beta_(this->template GetSingleArgument<T>("beta", 0.999f)) {}
29 
30  protected:
31  // GetLrMu and MomentumSgdUpdate have different implementations for GPU and
32  // CPU. All other methods are generic.
33  void GetLrMu();
34  void MomentumSgdUpdate();
35 
36  void AfterApply() {
37  // g
38  MovingAverage(D_, grad_, g_avg_, g_avg_out_, g_deb_);
39  // g2
40  math::Mul(D_, grad_, grad_, aux_vector_, &context_);
41  MovingAverage(D_, aux_vector_, g2_avg_, g2_avg_out_, g2_deb_);
42  // g_norm2
43  math::Dot(D_, grad_, grad_, g_norm2_, &context_);
44  math::Maximum(1, epsilon_, g_norm2_, g_norm2_, &context_);
45  MovingAverage(1, g_norm2_, g_norm2_avg_, g_norm2_avg_out_, g_norm2_deb_);
46  // g_norm
47  math::Sqrt(1, g_norm2_, g_norm_, &context_);
48  MovingAverage(1, g_norm_, g_norm_avg_, g_norm_avg_out_, g_norm_deb_);
49  math::Maximum(1, epsilon_, g_norm_deb_, g_norm_deb_, &context_);
50  // Curvature range: g_norm2_min, g_norm2_max
51  math::CopyVector(curv_win_width_, curv_win_, curv_win_out_, &context_);
52  T* curv_win_cell = curv_win_out_ + (iter_ - 1) % curv_win_width_;
53  math::Log(1, g_norm2_, curv_win_cell, &context_);
54  int valid_end = std::min(curv_win_width_, iter_);
55  math::ReduceMin(
56  valid_end, curv_win_out_, g_norm2_min_, &scratch_tensor_, &context_);
57  math::ReduceMax(
58  valid_end, curv_win_out_, g_norm2_max_, &scratch_tensor_, &context_);
59  MovingAverage(
60  1,
61  g_norm2_min_,
62  g_norm2_min_avg_,
63  g_norm2_min_avg_out_,
64  g_norm2_min_deb_);
65  MovingAverage(
66  1,
67  g_norm2_max_,
68  g_norm2_max_avg_,
69  g_norm2_max_avg_out_,
70  g_norm2_max_deb_);
71  math::Exp(1, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
72  math::Exp(1, g_norm2_max_deb_, g_norm2_max_deb_, &context_);
73  math::Maximum(1, epsilon_, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
74  math::Maximum(1, epsilon_, g_norm2_max_deb_, g_norm2_max_deb_, &context_);
75  // Gradient variance
76  math::Dot(D_, g_deb_, g_deb_, aux_scalar_, &context_);
77 
78  math::Sub(1, g_norm2_deb_, aux_scalar_, variance_, &context_);
79  math::Maximum(1, epsilon_, variance_, variance_, &context_);
80  // Distance to opt
81  math::Div(1, g_norm_avg_out_, g_norm2_avg_out_, distance_, &context_);
82  MovingAverage(
83  1, distance_, distance_avg_, distance_avg_out_, distance_deb_);
84  if (iter_ > 1) {
85  GetLrMu();
86  }
87  }
88 
89  void MovingAverage(
90  const int N,
91  const T* elt,
92  const T* avg,
93  T* new_avg,
94  T* debias_avg) {
95  const T one = 1;
96  math::Scale(N, beta_, avg, new_avg, &context_);
97  math::Axpy(N, one - beta_, elt, new_avg, &context_);
98  math::Scale(N, debias_factor_, new_avg, debias_avg, &context_);
99  }
100 
101  T ZeroDebiasFactor() {
102  if (zero_debias_) {
103  const T one = 1;
104  return one / (one - std::pow(beta_, iter_));
105  } else {
106  return 1;
107  }
108  }
109 
110  public:
111  bool RunOnDevice() override {
112 // Iter live on the CPU
113 
114 #define CAFFE2_YF_READ_INPUT(INPUT_NAME, VAR_NAME) \
115  const auto& VAR_NAME##_tensor = Input(INPUT_NAME); \
116  VAR_NAME##_ = VAR_NAME##_tensor.template data<T>();
117 
118 CAFFE2_YF_READ_INPUT(PARAM, param)
119 CAFFE2_YF_READ_INPUT(MOMENT, moment)
120 CAFFE2_YF_READ_INPUT(LR_AVG, lr_avg)
121 CAFFE2_YF_READ_INPUT(MU_AVG, mu_avg)
122 CAFFE2_YF_READ_INPUT(CURV_WIN, curv_win)
123 CAFFE2_YF_READ_INPUT(G_AVG, g_avg)
124 CAFFE2_YF_READ_INPUT(G2_AVG, g2_avg)
125 CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
126 CAFFE2_YF_READ_INPUT(GRAD, grad)
127 #undef CAFFE2_YF_READ_OUTPUT
128 
129 CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU));
130 CAFFE_ENFORCE_EQ(lr_avg_tensor.numel(), 1);
131 CAFFE_ENFORCE_EQ(mu_avg_tensor.numel(), 1);
132 CAFFE_ENFORCE_EQ(param_tensor.dim(), moment_tensor.dim());
133 CAFFE_ENFORCE_EQ(param_tensor.dim(), g_avg_tensor.dim());
134 CAFFE_ENFORCE_EQ(param_tensor.dim(), g2_avg_tensor.dim());
135 CAFFE_ENFORCE_EQ(param_tensor.dim(), grad_tensor.dim());
136 for (int i = 0; i < param_tensor.dim(); ++i) {
137  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
138  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
139  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
140  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), grad_tensor.dim32(i));
141 }
142 
143  iter_ = OperatorBase::Input<Tensor>(ITER, CPU).template data<int64_t>()[0];
144 
145  D_ = param_tensor.numel();
146 
147  // Input data - persistent memory for internal scalars
148  // Note: Memory for these scalars is being allocated during initialization
149  // of the network. If you want to add / remove a scalar, make a
150  // suitable change of memory size in the initialization.
151  const T* memory_it = scalars_memory_ - 1;
152  g_norm_avg_ = ++memory_it;
153  g_norm2_avg_ = ++memory_it;
154  g_norm2_min_avg_ = ++memory_it;
155  g_norm2_max_avg_ = ++memory_it;
156  distance_avg_ = ++memory_it;
157 
158 // Output data
159 
160 #define CAFFE2_YF_READ_OUTPUT(OUTPUT_NAME, VAR_NAME) \
161  auto VAR_NAME##_out_tensor = \
162  Output(OUTPUT_##OUTPUT_NAME, VAR_NAME##_tensor.sizes(), at::dtype<T>()); \
163  VAR_NAME##_out_ = VAR_NAME##_out_tensor->template mutable_data<T>();
164 
165  CAFFE2_YF_READ_OUTPUT(PARAM, param)
166  CAFFE2_YF_READ_OUTPUT(MOMENT, moment)
167  CAFFE2_YF_READ_OUTPUT(LR_AVG, lr_avg)
168  CAFFE2_YF_READ_OUTPUT(MU_AVG, mu_avg)
169  CAFFE2_YF_READ_OUTPUT(CURV_WIN, curv_win)
170  CAFFE2_YF_READ_OUTPUT(G_AVG, g_avg)
171  CAFFE2_YF_READ_OUTPUT(G2_AVG, g2_avg)
172  CAFFE2_YF_READ_OUTPUT(SCALARS_MEMORY, scalars_memory)
173 #undef CAFFE2_YF_READ_OUTPUT
174 
175  T* out_memory_it = scalars_memory_out_ - 1;
176  g_norm_avg_out_ = ++out_memory_it;
177  g_norm2_avg_out_ = ++out_memory_it;
178  g_norm2_min_avg_out_ = ++out_memory_it;
179  g_norm2_max_avg_out_ = ++out_memory_it;
180  distance_avg_out_ = ++out_memory_it;
181 
182 #define CAFFE2_YF_INIT_VECTOR(NAME) \
183  ReinitializeTensor(&NAME##_tensor_, {D_}, at::dtype<T>().device(Context::GetDeviceType())); \
184  NAME##_ = NAME##_tensor_.template mutable_data<T>();
185 
186  CAFFE2_YF_INIT_VECTOR(aux_vector)
187  CAFFE2_YF_INIT_VECTOR(g_deb)
188  CAFFE2_YF_INIT_VECTOR(g2_deb)
189  CAFFE2_YF_INIT_VECTOR(g_deb2)
190 #undef CAFFE2_YF_INIT_VECTOR
191 
192 #define CAFFE2_YF_INIT_SCALAR(NAME) \
193  ReinitializeTensor(&NAME##_tensor_, {1}, at::dtype<T>().device(Context::GetDeviceType())); \
194  NAME##_ = NAME##_tensor_.template mutable_data<T>();
195 
196  CAFFE2_YF_INIT_SCALAR(aux_scalar)
197  CAFFE2_YF_INIT_SCALAR(distance)
198  CAFFE2_YF_INIT_SCALAR(distance_deb)
199  CAFFE2_YF_INIT_SCALAR(g_norm)
200  CAFFE2_YF_INIT_SCALAR(g_norm_deb)
201  CAFFE2_YF_INIT_SCALAR(g_norm2)
202  CAFFE2_YF_INIT_SCALAR(g_norm2_max)
203  CAFFE2_YF_INIT_SCALAR(g_norm2_max_deb)
204  CAFFE2_YF_INIT_SCALAR(g_norm2_min)
205  CAFFE2_YF_INIT_SCALAR(g_norm2_min_deb)
206  CAFFE2_YF_INIT_SCALAR(g_norm2_deb)
207  CAFFE2_YF_INIT_SCALAR(lr)
208  CAFFE2_YF_INIT_SCALAR(lr_deb)
209  CAFFE2_YF_INIT_SCALAR(mu_deb)
210  CAFFE2_YF_INIT_SCALAR(mu)
211  CAFFE2_YF_INIT_SCALAR(variance)
212 #undef CAFFE2_YF_INIT_SCALAR
213 
214  debias_factor_ = ZeroDebiasFactor();
215  MomentumSgdUpdate();
216  AfterApply();
217  return true;
218  }
219 
220  protected:
221  int curv_win_width_;
222  bool nesterov_;
223  bool zero_debias_;
224 
225  T epsilon_;
226  T beta_;
227  T debias_factor_;
228 
229  int D_;
230 
231 // Temporary memory on device, listed all variables used in calculations
232 #define CAFFE2_YF_DEFINE_TENSOR(NAME) \
233  Tensor NAME##_tensor_; \
234  T* NAME##_;
235 
236  CAFFE2_YF_DEFINE_TENSOR(aux_vector)
237  CAFFE2_YF_DEFINE_TENSOR(g_deb)
238  CAFFE2_YF_DEFINE_TENSOR(g2_deb)
239  CAFFE2_YF_DEFINE_TENSOR(g_deb2)
240 
241  CAFFE2_YF_DEFINE_TENSOR(aux_scalar)
242  CAFFE2_YF_DEFINE_TENSOR(distance)
243  CAFFE2_YF_DEFINE_TENSOR(distance_deb)
244  CAFFE2_YF_DEFINE_TENSOR(g_norm)
245  CAFFE2_YF_DEFINE_TENSOR(g_norm_deb)
246  CAFFE2_YF_DEFINE_TENSOR(g_norm2)
247  CAFFE2_YF_DEFINE_TENSOR(g_norm2_deb)
248  CAFFE2_YF_DEFINE_TENSOR(g_norm2_max)
249  CAFFE2_YF_DEFINE_TENSOR(g_norm2_max_deb)
250  CAFFE2_YF_DEFINE_TENSOR(g_norm2_min)
251  CAFFE2_YF_DEFINE_TENSOR(g_norm2_min_deb)
252  CAFFE2_YF_DEFINE_TENSOR(lr)
253  CAFFE2_YF_DEFINE_TENSOR(lr_deb)
254  CAFFE2_YF_DEFINE_TENSOR(mu)
255  CAFFE2_YF_DEFINE_TENSOR(mu_deb)
256  CAFFE2_YF_DEFINE_TENSOR(variance)
257 
258  Tensor scratch_tensor_{Context::GetDeviceType()};
259 
260 #undef CAFFE2_YF_DEFINE_TENSOR
261 
262  // Input tensors' data
263  const T* param_;
264  const T* moment_;
265  const T* lr_avg_;
266  const T* mu_avg_;
267  const T* curv_win_;
268  const T* g_avg_;
269  const T* g2_avg_;
270  const T* scalars_memory_;
271  const T* grad_;
272  int iter_;
273 
274  // Scalar data from scalars_memory_ input tensor
275  const T* g_norm_avg_;
276  const T* g_norm2_avg_;
277  const T* g_norm2_min_avg_;
278  const T* g_norm2_max_avg_;
279  const T* distance_avg_;
280 
281  // Output tensors' data
282 
283  T* param_out_;
284  T* moment_out_;
285  T* lr_avg_out_;
286  T* mu_avg_out_;
287  T* curv_win_out_;
288  T* g_avg_out_;
289  T* g2_avg_out_;
290  T* scalars_memory_out_;
291 
292  // Scalar data from scalars_memory_ output tensor
293  T* g_norm_avg_out_;
294  T* g_norm2_avg_out_;
295  T* g_norm2_min_avg_out_;
296  T* g_norm2_max_avg_out_;
297  T* distance_avg_out_;
298 
299  INPUT_TAGS(
300  PARAM,
301  MOMENT,
302  LR_AVG,
303  MU_AVG,
304  CURV_WIN,
305  G_AVG,
306  G2_AVG,
307  SCALARS_MEMORY,
308  GRAD,
309  ITER);
310  OUTPUT_TAGS(
311  OUTPUT_PARAM,
312  OUTPUT_MOMENT,
313  OUTPUT_LR_AVG,
314  OUTPUT_MU_AVG,
315  OUTPUT_CURV_WIN,
316  OUTPUT_G_AVG,
317  OUTPUT_G2_AVG,
318  OUTPUT_SCALARS_MEMORY);
319 };
320 
321 } // namespace caffe2
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:47
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13