11 #include "caffe2/core/operator.h" 12 #include "caffe2/utils/math.h" 16 template <
typename T,
class Context>
19 USE_OPERATOR_CONTEXT_FUNCTIONS;
23 this->
template GetSingleArgument<int>(
"curv_win_width", 20)),
24 nesterov_(this->
template GetSingleArgument<int>(
"nesterov",
false)),
26 this->
template GetSingleArgument<bool>(
"zero_debias",
true)),
27 epsilon_(this->
template GetSingleArgument<T>(
"epsilon", 1e-6f)),
28 beta_(this->
template GetSingleArgument<T>(
"beta", 0.999f)) {}
34 void MomentumSgdUpdate();
38 MovingAverage(D_, grad_, g_avg_, g_avg_out_, g_deb_);
40 math::Mul(D_, grad_, grad_, aux_vector_, &context_);
41 MovingAverage(D_, aux_vector_, g2_avg_, g2_avg_out_, g2_deb_);
43 math::Dot(D_, grad_, grad_, g_norm2_, &context_);
44 math::Maximum(1, epsilon_, g_norm2_, g_norm2_, &context_);
45 MovingAverage(1, g_norm2_, g_norm2_avg_, g_norm2_avg_out_, g_norm2_deb_);
47 math::Sqrt(1, g_norm2_, g_norm_, &context_);
48 MovingAverage(1, g_norm_, g_norm_avg_, g_norm_avg_out_, g_norm_deb_);
49 math::Maximum(1, epsilon_, g_norm_deb_, g_norm_deb_, &context_);
51 math::CopyVector(curv_win_width_, curv_win_, curv_win_out_, &context_);
52 T* curv_win_cell = curv_win_out_ + (iter_ - 1) % curv_win_width_;
53 math::Log(1, g_norm2_, curv_win_cell, &context_);
54 int valid_end = std::min(curv_win_width_, iter_);
56 valid_end, curv_win_out_, g_norm2_min_, &scratch_tensor_, &context_);
58 valid_end, curv_win_out_, g_norm2_max_, &scratch_tensor_, &context_);
71 math::Exp(1, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
72 math::Exp(1, g_norm2_max_deb_, g_norm2_max_deb_, &context_);
73 math::Maximum(1, epsilon_, g_norm2_min_deb_, g_norm2_min_deb_, &context_);
74 math::Maximum(1, epsilon_, g_norm2_max_deb_, g_norm2_max_deb_, &context_);
76 math::Dot(D_, g_deb_, g_deb_, aux_scalar_, &context_);
78 math::Sub(1, g_norm2_deb_, aux_scalar_, variance_, &context_);
79 math::Maximum(1, epsilon_, variance_, variance_, &context_);
81 math::Div(1, g_norm_avg_out_, g_norm2_avg_out_, distance_, &context_);
83 1, distance_, distance_avg_, distance_avg_out_, distance_deb_);
96 math::Scale(N, beta_, avg, new_avg, &context_);
97 math::Axpy(N, one - beta_, elt, new_avg, &context_);
98 math::Scale(N, debias_factor_, new_avg, debias_avg, &context_);
101 T ZeroDebiasFactor() {
104 return one / (one - std::pow(beta_, iter_));
111 bool RunOnDevice()
override {
114 #define CAFFE2_YF_READ_INPUT(INPUT_NAME, VAR_NAME) \ 115 const auto& VAR_NAME##_tensor = Input(INPUT_NAME); \ 116 VAR_NAME##_ = VAR_NAME##_tensor.template data<T>(); 118 CAFFE2_YF_READ_INPUT(PARAM, param)
119 CAFFE2_YF_READ_INPUT(MOMENT, moment)
120 CAFFE2_YF_READ_INPUT(LR_AVG, lr_avg)
121 CAFFE2_YF_READ_INPUT(MU_AVG, mu_avg)
122 CAFFE2_YF_READ_INPUT(CURV_WIN, curv_win)
123 CAFFE2_YF_READ_INPUT(G_AVG, g_avg)
124 CAFFE2_YF_READ_INPUT(G2_AVG, g2_avg)
125 CAFFE2_YF_READ_INPUT(SCALARS_MEMORY, scalars_memory)
126 CAFFE2_YF_READ_INPUT(GRAD, grad)
127 #undef CAFFE2_YF_READ_OUTPUT 129 CAFFE_ENFORCE(OperatorBase::InputIsTensorType(ITER, CPU));
130 CAFFE_ENFORCE_EQ(lr_avg_tensor.numel(), 1);
131 CAFFE_ENFORCE_EQ(mu_avg_tensor.numel(), 1);
132 CAFFE_ENFORCE_EQ(param_tensor.dim(), moment_tensor.dim());
133 CAFFE_ENFORCE_EQ(param_tensor.dim(), g_avg_tensor.dim());
134 CAFFE_ENFORCE_EQ(param_tensor.dim(), g2_avg_tensor.dim());
135 CAFFE_ENFORCE_EQ(param_tensor.dim(), grad_tensor.dim());
136 for (
int i = 0; i < param_tensor.dim(); ++i) {
137 CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i));
138 CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i));
139 CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i));
140 CAFFE_ENFORCE_EQ(param_tensor.dim32(i), grad_tensor.dim32(i));
143 iter_ = OperatorBase::Input<Tensor>(ITER, CPU).
template data<int64_t>()[0];
145 D_ = param_tensor.numel();
151 const T* memory_it = scalars_memory_ - 1;
152 g_norm_avg_ = ++memory_it;
153 g_norm2_avg_ = ++memory_it;
154 g_norm2_min_avg_ = ++memory_it;
155 g_norm2_max_avg_ = ++memory_it;
156 distance_avg_ = ++memory_it;
160 #define CAFFE2_YF_READ_OUTPUT(OUTPUT_NAME, VAR_NAME) \ 161 auto VAR_NAME##_out_tensor = \ 162 Output(OUTPUT_##OUTPUT_NAME, VAR_NAME##_tensor.sizes(), at::dtype<T>()); \ 163 VAR_NAME##_out_ = VAR_NAME##_out_tensor->template mutable_data<T>(); 165 CAFFE2_YF_READ_OUTPUT(PARAM, param)
166 CAFFE2_YF_READ_OUTPUT(MOMENT, moment)
167 CAFFE2_YF_READ_OUTPUT(LR_AVG, lr_avg)
168 CAFFE2_YF_READ_OUTPUT(MU_AVG, mu_avg)
169 CAFFE2_YF_READ_OUTPUT(CURV_WIN, curv_win)
170 CAFFE2_YF_READ_OUTPUT(G_AVG, g_avg)
171 CAFFE2_YF_READ_OUTPUT(G2_AVG, g2_avg)
172 CAFFE2_YF_READ_OUTPUT(SCALARS_MEMORY, scalars_memory)
173 #undef CAFFE2_YF_READ_OUTPUT 175 T* out_memory_it = scalars_memory_out_ - 1;
176 g_norm_avg_out_ = ++out_memory_it;
177 g_norm2_avg_out_ = ++out_memory_it;
178 g_norm2_min_avg_out_ = ++out_memory_it;
179 g_norm2_max_avg_out_ = ++out_memory_it;
180 distance_avg_out_ = ++out_memory_it;
182 #define CAFFE2_YF_INIT_VECTOR(NAME) \ 183 ReinitializeTensor(&NAME##_tensor_, {D_}, at::dtype<T>().device(Context::GetDeviceType())); \ 184 NAME##_ = NAME##_tensor_.template mutable_data<T>(); 186 CAFFE2_YF_INIT_VECTOR(aux_vector)
187 CAFFE2_YF_INIT_VECTOR(g_deb)
188 CAFFE2_YF_INIT_VECTOR(g2_deb)
189 CAFFE2_YF_INIT_VECTOR(g_deb2)
190 #undef CAFFE2_YF_INIT_VECTOR 192 #define CAFFE2_YF_INIT_SCALAR(NAME) \ 193 ReinitializeTensor(&NAME##_tensor_, {1}, at::dtype<T>().device(Context::GetDeviceType())); \ 194 NAME##_ = NAME##_tensor_.template mutable_data<T>(); 196 CAFFE2_YF_INIT_SCALAR(aux_scalar)
197 CAFFE2_YF_INIT_SCALAR(distance)
198 CAFFE2_YF_INIT_SCALAR(distance_deb)
199 CAFFE2_YF_INIT_SCALAR(g_norm)
200 CAFFE2_YF_INIT_SCALAR(g_norm_deb)
201 CAFFE2_YF_INIT_SCALAR(g_norm2)
202 CAFFE2_YF_INIT_SCALAR(g_norm2_max)
203 CAFFE2_YF_INIT_SCALAR(g_norm2_max_deb)
204 CAFFE2_YF_INIT_SCALAR(g_norm2_min)
205 CAFFE2_YF_INIT_SCALAR(g_norm2_min_deb)
206 CAFFE2_YF_INIT_SCALAR(g_norm2_deb)
207 CAFFE2_YF_INIT_SCALAR(lr)
208 CAFFE2_YF_INIT_SCALAR(lr_deb)
209 CAFFE2_YF_INIT_SCALAR(mu_deb)
210 CAFFE2_YF_INIT_SCALAR(mu)
211 CAFFE2_YF_INIT_SCALAR(variance)
212 #undef CAFFE2_YF_INIT_SCALAR 214 debias_factor_ = ZeroDebiasFactor();
232 #define CAFFE2_YF_DEFINE_TENSOR(NAME) \ 233 Tensor NAME##_tensor_; \ 236 CAFFE2_YF_DEFINE_TENSOR(aux_vector)
237 CAFFE2_YF_DEFINE_TENSOR(g_deb)
238 CAFFE2_YF_DEFINE_TENSOR(g2_deb)
239 CAFFE2_YF_DEFINE_TENSOR(g_deb2)
241 CAFFE2_YF_DEFINE_TENSOR(aux_scalar)
242 CAFFE2_YF_DEFINE_TENSOR(distance)
243 CAFFE2_YF_DEFINE_TENSOR(distance_deb)
244 CAFFE2_YF_DEFINE_TENSOR(g_norm)
245 CAFFE2_YF_DEFINE_TENSOR(g_norm_deb)
246 CAFFE2_YF_DEFINE_TENSOR(g_norm2)
247 CAFFE2_YF_DEFINE_TENSOR(g_norm2_deb)
248 CAFFE2_YF_DEFINE_TENSOR(g_norm2_max)
249 CAFFE2_YF_DEFINE_TENSOR(g_norm2_max_deb)
250 CAFFE2_YF_DEFINE_TENSOR(g_norm2_min)
251 CAFFE2_YF_DEFINE_TENSOR(g_norm2_min_deb)
252 CAFFE2_YF_DEFINE_TENSOR(lr)
253 CAFFE2_YF_DEFINE_TENSOR(lr_deb)
254 CAFFE2_YF_DEFINE_TENSOR(mu)
255 CAFFE2_YF_DEFINE_TENSOR(mu_deb)
256 CAFFE2_YF_DEFINE_TENSOR(variance)
258 Tensor scratch_tensor_{Context::GetDeviceType()};
260 #undef CAFFE2_YF_DEFINE_TENSOR 270 const T* scalars_memory_;
275 const T* g_norm_avg_;
276 const T* g_norm2_avg_;
277 const T* g_norm2_min_avg_;
278 const T* g_norm2_max_avg_;
279 const T* distance_avg_;
290 T* scalars_memory_out_;
295 T* g_norm2_min_avg_out_;
296 T* g_norm2_max_avg_out_;
297 T* distance_avg_out_;
318 OUTPUT_SCALARS_MEMORY);
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...