Caffe2 - C++ API
A deep learning, cross platform ML framework
yellowfin_op.h
1 
17 // YellowFin: An automatic tuner for momentum SGD
18 // (https://arxiv.org/abs/1706.03471)
19 // The YellowFinOp tunes learning rate and momentum and performs momentum SGD
20 // steps. The learning rate and momentum are separate for any matrix of
21 // parameters.
22 
23 #pragma once
24 
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
29 
30 namespace caffe2 {
31 
32 template <typename T, class Context>
33 class YellowFinOp final : public Operator<Context> {
34  public:
35  USE_OPERATOR_CONTEXT_FUNCTIONS;
36  YellowFinOp(const OperatorDef& operator_def, Workspace* ws)
37  : Operator<Context>(operator_def, ws),
38  curv_win_width_(
39  OperatorBase::GetSingleArgument<int>("curv_win_width", 20)),
40  nesterov_(OperatorBase::GetSingleArgument<int>("nesterov", false)),
41  zero_debias_(
42  OperatorBase::GetSingleArgument<bool>("zero_debias", true)),
43  epsilon_(OperatorBase::GetSingleArgument<T>("epsilon", 1e-6f)),
44  beta_(OperatorBase::GetSingleArgument<T>("beta", 0.999f)) {}
45 
46  protected:
47  // GetLrMu and MomentumSgdUpdate have different implementations for GPU and
48  // CPU. All other methods are generic.
49  void GetLrMu();
50  void MomentumSgdUpdate();
51 
52  void AfterApply() {
53  // g
54  MovingAverage(D_, grad_, g_avg_, g_avg_out_, g_deb_);
55  // g2
56  math::Mul(D_, grad_, grad_, aux_vector_, &context_);
57  MovingAverage(D_, aux_vector_, g2_avg_, g2_avg_out_, g2_deb_);
58  // g_norm2
59  math::Dot(D_, grad_, grad_, g_norm2_, &context_);
60  math::Maximum(1, epsilon_, g_norm2_, g_norm2_, &context_);
61  MovingAverage(1, g_norm2_, g_norm2_avg_, g_norm2_avg_out_, g_norm2_deb_);
62  // g_norm
63  math::Sqrt(1, g_norm2_, g_norm_, &context_);
64  MovingAverage(1, g_norm_, g_norm_avg_, g_norm_avg_out_, g_norm_deb_);
65  math::Maximum(1, epsilon_, g_norm_deb_, g_norm_deb_, &context_);
66  // Curvature range: g_norm2_min, g_norm2_max
67  math::CopyVector(curv_win_width_, curv_win_, curv_win_out_, &context_);
68  T* curv_win_cell = curv_win_out_ + (iter_ - 1) % curv_win_width_;
69  math::Log(1, g_norm2_, curv_win_cell, &context_);
70  int valid_end = std::min(curv_win_width_, iter_);
71  math::ReduceMin(
72  valid_end, curv_win_out_, g_norm2_min_, &scratch_tensor_, &context_);
73  math::ReduceMax(
74  valid_end, curv_win_out_, g_norm2_max_, &scratch_tensor_, &context_);
75  MovingAverage(
76  1,
77  g_norm2_min_,
78  g_norm2_min_avg_,
79  g_norm2_min_avg_out_,
80  g_norm2_min_deb_);
81  MovingAverage(
82  1,
83  g_norm2_max_,
84  g_norm2_max_avg_,
85  g_norm2_max_avg_out_,
86  g_norm2_max_deb_);
87  math::Exp(1, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
88  math::Exp(1, g_norm2_max_deb_, g_norm2_max_deb_, &context_);
89  math::Maximum(1, epsilon_, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
90  math::Maximum(1, epsilon_, g_norm2_max_deb_, g_norm2_max_deb_, &context_);
91  // Gradient variance
92  math::Dot(D_, g_deb_, g_deb_, aux_scalar_, &context_);
93 
94  math::Sub(1, g_norm2_deb_, aux_scalar_, variance_, &context_);
95  math::Maximum(1, epsilon_, variance_, variance_, &context_);
96  // Distance to opt
97  math::Div(1, g_norm_avg_out_, g_norm2_avg_out_, distance_, &context_);
98  MovingAverage(
99  1, distance_, distance_avg_, distance_avg_out_, distance_deb_);
100  if (iter_ > 1) {
101  GetLrMu();
102  }
103  }
104 
105  void MovingAverage(
106  const int N,
107  const T* elt,
108  const T* avg,
109  T* new_avg,
110  T* debias_avg) {
111  const T one = 1;
112  math::Scale(N, beta_, avg, new_avg, &context_);
113  math::Axpy(N, one - beta_, elt, new_avg, &context_);
114  math::Scale(N, debias_factor_, new_avg, debias_avg, &context_);
115  }
116 
117  T ZeroDebiasFactor() {
118  if (zero_debias_) {
119  const T one = 1;
120  return one / (one - std::pow(beta_, iter_));
121  } else {
122  return 1;
123  }
124  }
125 
126  public:
127  bool RunOnDevice() override {
128 // Iter live on the CPU
129 
130 #define CAFFE2_YF_READ_INPUT(INPUT_NAME, VAR_NAME) \
131  const auto VAR_NAME##_tensor = Input(INPUT_NAME); \
132  VAR_NAME##_ = VAR_NAME##_tensor.template data<T>();
133 
134  CAFFE2_YF_READ_INPUT(PARAM, param)
135  CAFFE2_YF_READ_INPUT(MOMENT, moment)
136  CAFFE2_YF_READ_INPUT(LR_AVG, lr_avg)
137  CAFFE2_YF_READ_INPUT(MU_AVG, mu_avg)
138  CAFFE2_YF_READ_INPUT(CURV_WIN, curv_win)
139  CAFFE2_YF_READ_INPUT(G_AVG, g_avg)
140  CAFFE2_YF_READ_INPUT(G2_AVG, g2_avg)
141  CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
142  CAFFE2_YF_READ_INPUT(GRAD, grad)
143 #undef CAFFE2_YF_READ_OUTPUT
144 
145  CAFFE_ENFORCE(OperatorBase::InputIsType<TensorCPU>(ITER));
146  CAFFE_ENFORCE_EQ(lr_avg_tensor.size(), 1);
147  CAFFE_ENFORCE_EQ(mu_avg_tensor.size(), 1);
148  CAFFE_ENFORCE_EQ(param_tensor.ndim(), moment_tensor.ndim());
149  CAFFE_ENFORCE_EQ(param_tensor.ndim(), g_avg_tensor.ndim());
150  CAFFE_ENFORCE_EQ(param_tensor.ndim(), g2_avg_tensor.ndim());
151  CAFFE_ENFORCE_EQ(param_tensor.ndim(), grad_tensor.ndim());
152  for (int i = 0; i < param_tensor.ndim(); ++i) {
153  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
154  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
155  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
156  CAFFE_ENFORCE_EQ(param_tensor.dim32(i), grad_tensor.dim32(i));
157  }
158 
159  iter_ = OperatorBase::Input<TensorCPU>(ITER).template data<int64_t>()[0];
160 
161  D_ = param_tensor.size();
162 
163  // Input data - persistent memory for internal scalars
164  // Note: Memory for these scalars is being allocated during initialization
165  // of the network. If you want to add / remove a scalar, make a
166  // suitable change of memory size in the initialization.
167  const T* memory_it = scalars_memory_ - 1;
168  g_norm_avg_ = ++memory_it;
169  g_norm2_avg_ = ++memory_it;
170  g_norm2_min_avg_ = ++memory_it;
171  g_norm2_max_avg_ = ++memory_it;
172  distance_avg_ = ++memory_it;
173 
174 // Output data
175 
176 #define CAFFE2_YF_READ_OUTPUT(OUTPUT_NAME, VAR_NAME) \
177  auto VAR_NAME##_out_tensor = Output(OUTPUT_##OUTPUT_NAME); \
178  VAR_NAME##_out_tensor->ResizeLike(VAR_NAME##_tensor); \
179  VAR_NAME##_out_ = VAR_NAME##_out_tensor->template mutable_data<T>();
180 
181  CAFFE2_YF_READ_OUTPUT(PARAM, param)
182  CAFFE2_YF_READ_OUTPUT(MOMENT, moment)
183  CAFFE2_YF_READ_OUTPUT(LR_AVG, lr_avg)
184  CAFFE2_YF_READ_OUTPUT(MU_AVG, mu_avg)
185  CAFFE2_YF_READ_OUTPUT(CURV_WIN, curv_win)
186  CAFFE2_YF_READ_OUTPUT(G_AVG, g_avg)
187  CAFFE2_YF_READ_OUTPUT(G2_AVG, g2_avg)
188  CAFFE2_YF_READ_OUTPUT(SCALARS_MEMORY, scalars_memory)
189 #undef CAFFE2_YF_READ_OUTPUT
190 
191  T* out_memory_it = scalars_memory_out_ - 1;
192  g_norm_avg_out_ = ++out_memory_it;
193  g_norm2_avg_out_ = ++out_memory_it;
194  g_norm2_min_avg_out_ = ++out_memory_it;
195  g_norm2_max_avg_out_ = ++out_memory_it;
196  distance_avg_out_ = ++out_memory_it;
197 
198 #define CAFFE2_YF_INIT_VECTOR(NAME) \
199  NAME##_tensor_.Resize(D_); \
200  NAME##_ = NAME##_tensor_.template mutable_data<T>();
201 
202  CAFFE2_YF_INIT_VECTOR(aux_vector)
203  CAFFE2_YF_INIT_VECTOR(g_deb)
204  CAFFE2_YF_INIT_VECTOR(g2_deb)
205  CAFFE2_YF_INIT_VECTOR(g_deb2)
206 #undef CAFFE2_YF_INIT_VECTOR
207 
208 #define CAFFE2_YF_INIT_SCALAR(NAME) \
209  NAME##_tensor_.Resize(1); \
210  NAME##_ = NAME##_tensor_.template mutable_data<T>();
211 
212  CAFFE2_YF_INIT_SCALAR(aux_scalar)
213  CAFFE2_YF_INIT_SCALAR(distance)
214  CAFFE2_YF_INIT_SCALAR(distance_deb)
215  CAFFE2_YF_INIT_SCALAR(g_norm)
216  CAFFE2_YF_INIT_SCALAR(g_norm_deb)
217  CAFFE2_YF_INIT_SCALAR(g_norm2)
218  CAFFE2_YF_INIT_SCALAR(g_norm2_max)
219  CAFFE2_YF_INIT_SCALAR(g_norm2_max_deb)
220  CAFFE2_YF_INIT_SCALAR(g_norm2_min)
221  CAFFE2_YF_INIT_SCALAR(g_norm2_min_deb)
222  CAFFE2_YF_INIT_SCALAR(g_norm2_deb)
223  CAFFE2_YF_INIT_SCALAR(lr)
224  CAFFE2_YF_INIT_SCALAR(lr_deb)
225  CAFFE2_YF_INIT_SCALAR(mu_deb)
226  CAFFE2_YF_INIT_SCALAR(mu)
227  CAFFE2_YF_INIT_SCALAR(variance)
228 #undef CAFFE2_YF_INIT_SCALAR
229 
230  debias_factor_ = ZeroDebiasFactor();
231  MomentumSgdUpdate();
232  AfterApply();
233  return true;
234  }
235 
236  protected:
237  int curv_win_width_;
238  bool nesterov_;
239  bool zero_debias_;
240 
241  T epsilon_;
242  T beta_;
243  T debias_factor_;
244 
245  int D_;
246 
247 // Temporary memory on device, listed all variables used in calculations
248 #define CAFFE2_YF_DEFINE_TENSOR(NAME) \
249  Tensor<Context> NAME##_tensor_; \
250  T* NAME##_;
251 
252  CAFFE2_YF_DEFINE_TENSOR(aux_vector)
253  CAFFE2_YF_DEFINE_TENSOR(g_deb)
254  CAFFE2_YF_DEFINE_TENSOR(g2_deb)
255  CAFFE2_YF_DEFINE_TENSOR(g_deb2)
256 
257  CAFFE2_YF_DEFINE_TENSOR(aux_scalar)
258  CAFFE2_YF_DEFINE_TENSOR(distance)
259  CAFFE2_YF_DEFINE_TENSOR(distance_deb)
260  CAFFE2_YF_DEFINE_TENSOR(g_norm)
261  CAFFE2_YF_DEFINE_TENSOR(g_norm_deb)
262  CAFFE2_YF_DEFINE_TENSOR(g_norm2)
263  CAFFE2_YF_DEFINE_TENSOR(g_norm2_deb)
264  CAFFE2_YF_DEFINE_TENSOR(g_norm2_max)
265  CAFFE2_YF_DEFINE_TENSOR(g_norm2_max_deb)
266  CAFFE2_YF_DEFINE_TENSOR(g_norm2_min)
267  CAFFE2_YF_DEFINE_TENSOR(g_norm2_min_deb)
268  CAFFE2_YF_DEFINE_TENSOR(lr)
269  CAFFE2_YF_DEFINE_TENSOR(lr_deb)
270  CAFFE2_YF_DEFINE_TENSOR(mu)
271  CAFFE2_YF_DEFINE_TENSOR(mu_deb)
272  CAFFE2_YF_DEFINE_TENSOR(variance)
273 
274  Tensor<Context> scratch_tensor_;
275 
276 #undef CAFFE2_YF_DEFINE_TENSOR
277 
278  // Input tensors' data
279  const T* param_;
280  const T* moment_;
281  const T* lr_avg_;
282  const T* mu_avg_;
283  const T* curv_win_;
284  const T* g_avg_;
285  const T* g2_avg_;
286  const T* scalars_memory_;
287  const T* grad_;
288  int iter_;
289 
290  // Scalar data from scalars_memory_ input tensor
291  const T* g_norm_avg_;
292  const T* g_norm2_avg_;
293  const T* g_norm2_min_avg_;
294  const T* g_norm2_max_avg_;
295  const T* distance_avg_;
296 
297  // Output tensors' data
298 
299  T* param_out_;
300  T* moment_out_;
301  T* lr_avg_out_;
302  T* mu_avg_out_;
303  T* curv_win_out_;
304  T* g_avg_out_;
305  T* g2_avg_out_;
306  T* scalars_memory_out_;
307 
308  // Scalar data from scalars_memory_ output tensor
309  T* g_norm_avg_out_;
310  T* g_norm2_avg_out_;
311  T* g_norm2_min_avg_out_;
312  T* g_norm2_max_avg_out_;
313  T* distance_avg_out_;
314 
315  INPUT_TAGS(
316  PARAM,
317  MOMENT,
318  LR_AVG,
319  MU_AVG,
320  CURV_WIN,
321  G_AVG,
322  G2_AVG,
323  SCALARS_MEMORY,
324  GRAD,
325  ITER);
326  OUTPUT_TAGS(
327  OUTPUT_PARAM,
328  OUTPUT_MOMENT,
329  OUTPUT_LR_AVG,
330  OUTPUT_MU_AVG,
331  OUTPUT_CURV_WIN,
332  OUTPUT_G_AVG,
333  OUTPUT_G2_AVG,
334  OUTPUT_SCALARS_MEMORY);
335 };
336 
337 } // namespace caffe2
Tensor is the basic class in Caffe2 that stores a contiguous memory with its shape information...
Definition: tensor.h:109
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:63
Copyright (c) 2016-present, Facebook, Inc.