1 #include "caffe2/sgd/adadelta_op.h" 5 REGISTER_CPU_OPERATOR(Adadelta, AdadeltaOp<CPUContext>);
6 OPERATOR_SCHEMA(Adadelta)
9 .AllowInplace({{0, 0}, {1, 1}, {2, 2}})
12 Computes the AdaDelta update (https://arxiv.org/abs/1212.5701) for an input 13 gradient and accumulated history of squared gradients. Concretely, given 14 inputs (param, moment, moment_delta, grad, learning_rate), computes: 16 new_moment = moment * decay + square(grad) * (1 - decay) 17 new_grad = sqrt(moment_delta + epsilon) / sqrt(new_moment + epsilon) * grad 18 new_param = param + learning_rate * new_grad 19 new_moment_delta = moment_delta * decay + square(new_grad) * (1 - decay) 21 and returns (new_param, new_moment, new_moment_delta). 24 .Input(0, "param",
"Parameters to be updated")
25 .Input(1,
"moment",
"Average of squared gradients")
26 .Input(2,
"moment_delta",
"Average of squared parameter updates")
27 .Input(3,
"grad",
"Gradient computed")
28 .Input(4,
"lr",
"Learning rate")
29 .Output(0,
"output_param",
"Updated parameters")
30 .Output(1,
"output_moment",
"Updated average squared gradient")
33 "output_moment_delta",
34 "Updated average of squared parameter updates")
35 .Arg(
"epsilon",
"Default 1e-5")
38 "Default 0.95, the squared gradient sum is decayed by this factor.");
40 REGISTER_CPU_OPERATOR(SparseAdadelta, SparseAdadeltaOp<CPUContext>);
41 OPERATOR_SCHEMA(SparseAdadelta)
44 .EnforceOneToOneInplace()
47 Given inputs (param, moment, moment_delta, indices, grad, lr), 48 runs the dense AdaDelta update on (param, grad, moment[indices], 49 moment_delta[indices], lr), and returns (new_param, new_moment, 50 new_moment_delta) as in the dense case. 53 .Input(0, "param",
"Parameters to be updated")
54 .Input(1,
"moment",
"Average of squared gradients")
55 .Input(2,
"moment_delta",
"Average of squared parameter updates")
56 .Input(3,
"indices",
"Sparse indices")
57 .Input(4,
"grad",
"Gradient computed")
58 .Input(5,
"lr",
"learning rate")
59 .Output(0,
"output_param",
"Updated parameters")
60 .Output(1,
"output_moment",
"Updated average squared gradient")
63 "output_moment_delta",
64 "Updated average of squared parameter updates")
65 .Arg(
"epsilon",
"Default 1e-5")
68 "Default 0.95, the squared gradient sum is decayed by this factor.");
70 SHOULD_NOT_DO_GRADIENT(Adadelta);
71 SHOULD_NOT_DO_GRADIENT(SparseAdadelta);
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...