from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from collections import namedtuple, defaultdict
from past.builtins import basestring

import logging

import numpy as np

from caffe2.python import core, scope, utils, workspace
from caffe2.python.modeling import parameter_info
from caffe2.proto import caffe2_pb2

_LEARNING_RATE_INJECTION = "lr_injection"

AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"])
_optimizer_instance_count = defaultdict(int)

FP16_ENGINES = ["SIMD_Q_FP16", "SIMD_Q_STOC_FP16", "SIMD_Q_STOC_MKL_FP16"]

logger = logging.getLogger(__name__)

class Optimizer(object):
    def __init__(self):
        self._aux_params = AuxOptimizerParams(local=[], shared=[])
        self._instance_num = _optimizer_instance_count[self.__class__.__name__]
        _optimizer_instance_count[self.__class__.__name__] += 1
        self._lr_multiplier = None
        self._local_lr_multiplier = None
        self._local_lr_multiplier_on_gpu = False
    '''
    Adds optimization operators to the net for a given parameter and its
    gradient. The parameter is specified either by 'param' being a
    ParameterInfo object (in which case param.grad has to be set), or by
    'param' being a BlobReference and 'grad' being a BlobReference for its
    gradient.
    '''
    def __call__(self, net, param_init_net, param, grad=None):
        if grad is None:
            assert isinstance(param, parameter_info.ParameterInfo), (
                "Expected parameter to be of type ParameterInfo, got {}".format(
                    param))
            assert param.grad is not None
        else:
            if isinstance(param, basestring):
                param = core.BlobReference(param)
            param = parameter_info.ParameterInfo(
                param_id=None, param=param, grad=grad)

        self._run(net, param_init_net, param)

    def _run(self, net, param_init_net, param_info):
        raise Exception("Not Implemented")
    def get_cpu_blob_name(self, base_str, node_name=''):
        classname = self.__class__.__name__
        return '%s_%d_%s%s_cpu' % (
            classname, self._instance_num, base_str, node_name)

    def get_gpu_blob_name(self, base_str, gpu_id, node_name):
        classname = self.__class__.__name__
        return '%s_%d_%s%s_gpu%d' % (
            classname, self._instance_num, base_str, node_name, gpu_id,
        )
    def make_unique_blob_name(self, base_str):
        """
        Returns a blob name that will be unique to the current device
        and optimizer instance.
        """
        current_scope = scope.CurrentDeviceScope()
        if current_scope is None:
            return self.get_cpu_blob_name(base_str)
        if core.IsGPUDeviceType(current_scope.device_type):
            return self.get_gpu_blob_name(
                base_str, current_scope.device_id, current_scope.node_name)
        return self.get_cpu_blob_name(base_str, current_scope.node_name)
    def build_lr(self, net, param_init_net, base_learning_rate,
                 learning_rate_blob=None, policy="fixed",
                 iter_val=0, **kwargs):
        if learning_rate_blob is None:
            learning_rate_blob = self.make_unique_blob_name('lr')

        iteration = utils.BuildUniqueMutexIter(
            param_init_net, net, iter_val=iter_val)

        if not net.BlobIsDefined(learning_rate_blob):
            # Since we are minimizing, the learning rate is set to be negative.
            lr = net.LearningRate(
                [iteration],
                learning_rate_blob,
                base_lr=-base_learning_rate,
                policy=policy,
                **kwargs
            )
        else:
            lr = net.GetBlobRef(learning_rate_blob)

        if self._lr_multiplier is not None:
            lr_multiplier = net.CopyFromCPUInput(
                self._lr_multiplier, self.make_unique_blob_name('lr_multiplier')
            )
            lr = net.Mul(
                [lr, lr_multiplier],
                self.make_unique_blob_name('scaled_lr'),
                broadcast=1,
            )

        if self._local_lr_multiplier is not None:
            current_scope = scope.CurrentDeviceScope()
            if (current_scope is not None
                    and core.IsGPUDeviceType(current_scope.device_type)
                    and not self._local_lr_multiplier_on_gpu):
                local_lr_multiplier = net.CopyFromCPUInput(
                    self._local_lr_multiplier,
                    self.make_unique_blob_name('local_lr_multiplier')
                )
            else:
                local_lr_multiplier = self._local_lr_multiplier

            lr = net.Mul(
                [lr, local_lr_multiplier],
                self.make_unique_blob_name('local_scaled_lr'),
                broadcast=1,
            )

        return lr, iteration
    def add_lr_multiplier(self, lr_multiplier):
        """
        Set the global learning rate multiplier. If a multiplier already
        existed, this will overwrite the existing multiplier. The multiplier is
        used for all future calls to _run(), unless it is overwritten.
        """
        self._lr_multiplier = lr_multiplier

    def _add_local_lr_multiplier(self, local_lr_multiplier, is_gpu_blob=False):
        """
        Set the local learning rate multiplier. This local multiplier is
        multiplied with the global learning rate multiplier if it exists. As
        with the global learning rate multiplier, this multiplier will be
        used for all future calls to _run(), so please call
        _clear_local_lr_multiplier() at the beginning of the optimizer's _run()
        before optionally calling this function.
        """
        self._local_lr_multiplier = local_lr_multiplier
        self._local_lr_multiplier_on_gpu = is_gpu_blob

    def _clear_local_lr_multiplier(self):
        self._local_lr_multiplier = None
        self._local_lr_multiplier_on_gpu = False
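
    # Example (sketch, an illustration rather than original code): a caller can
    # scale every learning rate produced by build_lr() by feeding a one-element
    # CPU blob and registering it as the global multiplier. The blob name
    # "my_lr_multiplier" is hypothetical.
    #
    #     import numpy as np
    #     from caffe2.python import workspace
    #
    #     workspace.FeedBlob("my_lr_multiplier",
    #                        np.array([0.5], dtype=np.float32))
    #     opt = SgdOptimizer(base_learning_rate=0.1)
    #     opt.add_lr_multiplier("my_lr_multiplier")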
    @staticmethod
    def dedup(net, sparse_dedup_aggregator, grad):
        assert isinstance(grad, core.GradientSlice), (
            "Dedup only works for sparse gradient, got {}".format(grad))
        if sparse_dedup_aggregator:
            return net.DeduplicateGradientSlices(
                grad, aggregator=sparse_dedup_aggregator)
        else:
            return grad

    def get_auxiliary_parameters(self):
        """Returns a list of auxiliary parameters.

        Returns:
            aux_params: A namedtuple, AuxParams.

            aux_params.local stores a list of blobs. Each blob is a local
            auxiliary parameter. A local auxiliary parameter is a parameter in
            parallel to a learning rate parameter. Take adagrad as an example,
            the local auxiliary parameter is the squared sum parameter, because
            every learning rate has a squared sum associated with it.

            aux_params.shared also stores a list of blobs. Each blob is a shared
            auxiliary parameter. A shared auxiliary parameter is a parameter
            that is shared across all the learning rate parameters. Take adam as
            an example, the iteration parameter is a shared parameter, because
            all the learning rates share the same iteration parameter.
        """
        return self._aux_params

    def scale_learning_rate(self, *args, **kwargs):
        raise NotImplementedError(
            "Optimizer needs to implement `scale_learning_rate` method.")
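
    # Example (sketch, assuming an already-built optimizer instance `opt` and a
    # populated workspace): the auxiliary parameters are what a checkpoint has
    # to save in addition to the model parameters themselves.
    #
    #     aux = opt.get_auxiliary_parameters()
    #     blobs_to_checkpoint = list(aux.local) + list(aux.shared)
    #     values = {str(b): workspace.FetchBlob(b) for b in blobs_to_checkpoint}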
    def create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max):
        wd = param_init_net.ConstantFill(
            [], "weight_decay", shape=[1], value=weight_decay)
        trust = param_init_net.ConstantFill(
            [], "trust", shape=[1], value=trust)
        lr_max = param_init_net.ConstantFill(
            [], "lr_max", shape=[1], value=lr_max)
        return wd, trust, lr_max
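
    # Example (sketch, not part of the original file): a subclass only has to
    # override _run() and emit its update operators into `net`, keeping any
    # state blobs in `param_init_net`. The class below is a hypothetical
    # illustration, not an existing caffe2 optimizer.
    #
    #     class PlainSgdOptimizer(Optimizer):
    #         def __init__(self, base_learning_rate=0.01):
    #             super(PlainSgdOptimizer, self).__init__()
    #             self.base_learning_rate = base_learning_rate
    #
    #         def _run(self, net, param_init_net, param_info):
    #             # build_lr negates the rate, so `lr` is negative and the
    #             # WeightedSum below performs gradient descent.
    #             lr, _ = self.build_lr(
    #                 net, param_init_net,
    #                 base_learning_rate=self.base_learning_rate)
    #             ONE = param_init_net.ConstantFill(
    #                 [], self.make_unique_blob_name("ONE"), shape=[1], value=1.0)
    #             net.WeightedSum(
    #                 [param_info.blob, ONE, param_info.grad, lr],
    #                 param_info.blob)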

class SgdOptimizer(Optimizer):
    def __init__(self, base_learning_rate=0.01, policy='fixed',
                 momentum=0.0, nesterov=1, sparse_dedup_aggregator=None,
                 lars=None, **kwargs):
        super(SgdOptimizer, self).__init__()
        self.base_learning_rate = base_learning_rate
        self.policy = policy
        self.momentum = momentum
        self.nesterov = nesterov
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.lars = lars
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad
        if self.base_learning_rate == 0:
            return
        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        self._clear_local_lr_multiplier()

        if self.lars is not None and not isinstance(grad, core.GradientSlice):
            assert self.lars >= 0, (
                'Lars offset must be nonnegative, got {}'.format(self.lars))
            wd, trust, lr_max = self.create_lars_inputs(
                param_init_net, 0.0, 1.0, np.finfo(np.float32).max)
            lr_lars_multiplier = net.Lars(
                [param, grad, wd, trust, lr_max],
                self.make_unique_blob_name(str(param) + "_lars"),
                offset=self.lars,
                lr_min=0.0)
            current_scope = scope.CurrentDeviceScope()
            self._add_local_lr_multiplier(
                lr_lars_multiplier,
                is_gpu_blob=(current_scope is not None
                             and core.IsGPUDeviceType(current_scope.device_type)),
            )

        # We need a negative sign for the LR when it is used directly with
        # WeightedSum below.
        lr_sign = -1 if self.momentum else 1
        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.base_learning_rate * lr_sign,
            policy=self.policy,
            **(self.init_kwargs)
        )

        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        # Each device needs its own ONE blob, so the name carries the device.
        ONE = param_init_net.ConstantFill(
            [],
            "ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name),
            shape=[1],
            value=1.0
        )
        self._aux_params.shared.append(ONE)

        if self.momentum > 0:
            momentum_data = param_init_net.ConstantFill(
                param, str(param) + "_momentum", value=0.)
            self._aux_params.local.append(momentum_data)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.momentum > 0.:
                net.SparseMomentumSGDUpdate(
                    [grad.values, momentum_data, lr, param, grad.indices],
                    [grad.values, momentum_data, param],
                    momentum=self.momentum,
                    nesterov=self.nesterov)
            else:
                net.ScatterWeightedSum(
                    [param, ONE, grad.indices, grad.values, lr],
                    param
                )
        else:
            if self.momentum > 0.:
                net.MomentumSGDUpdate(
                    [grad, momentum_data, lr, param],
                    [grad, momentum_data, param],
                    momentum=self.momentum,
                    nesterov=self.nesterov)
            else:
                coeff = lr
                net.WeightedSum(
                    [param, ONE, grad, coeff],
                    param
                )

    def scale_learning_rate(self, scale):
        self.base_learning_rate *= scale
        return
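
# Example (sketch, assuming `model` is a model_helper.ModelHelper whose
# gradients were already added via model.AddGradientOperators([...])): the
# usual entry point is the build_sgd() helper defined near the bottom of this
# module rather than instantiating SgdOptimizer directly.
#
#     optimizer.build_sgd(
#         model,
#         base_learning_rate=0.1,
#         policy="step", stepsize=1, gamma=0.999,  # forwarded to LearningRate
#         momentum=0.9, nesterov=1,
#     )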

class MultiPrecisionSgdOptimizer(SgdOptimizer):
    def __init__(self, base_learning_rate=0.1, momentum=0.0,
                 policy="fixed", nesterov=1, sparse_dedup_aggregator=None,
                 **kwargs):
        super(MultiPrecisionSgdOptimizer, self).__init__(
            base_learning_rate=base_learning_rate,
            policy=policy,
            momentum=momentum,
            nesterov=nesterov,
            sparse_dedup_aggregator=sparse_dedup_aggregator,
            **kwargs
        )
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        param_fp32 = param_info.blob_copy[core.DataType.FLOAT] \
            if param_info.blob_copy is not None else None

        # If this is a plain FP32 parameter, fall back to the base class.
        if param_fp32 is None:
            return SgdOptimizer._run(self, net, param_init_net, param_info)

        grad = param_info.grad
        if self.base_learning_rate == 0:
            return
        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=-self.base_learning_rate,
            policy=self.policy,
            **(self.init_kwargs)
        )

        momentum_data = param_init_net.ConstantFill(
            param_fp32, str(param) + "_momentum", value=0.)
        self._aux_params.local.append(momentum_data)

        assert not isinstance(grad, core.GradientSlice), (
            "MultiPrecisionSgd does not support sparse gradients")

        # Copy the gradient to FP32, update in FP32, then copy back to FP16.
        grad_fp32 = net.HalfToFloat(grad, grad + "_fp32")

        net.MomentumSGDUpdate(
            [grad_fp32, momentum_data, lr, param_fp32],
            [grad_fp32, momentum_data, param_fp32],
            momentum=self.momentum,
            nesterov=self.nesterov)

        net.FloatToHalf(param_fp32, param)
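
# Example (sketch, assuming an FP16 model in which each half-precision weight
# carries an FP32 master copy in param_info.blob_copy): callers normally use
# the build_multi_precision_sgd() helper defined below.
#
#     optimizer.build_multi_precision_sgd(
#         model, base_learning_rate=0.1, momentum=0.9, nesterov=1)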

class FP16SgdOptimizer(SgdOptimizer):
    def __init__(self, base_learning_rate=0.1, momentum=0.0,
                 policy="fixed", nesterov=1, weight_decay=0.0001,
                 sparse_dedup_aggregator=None,
                 **kwargs):
        super(FP16SgdOptimizer, self).__init__(
            base_learning_rate=base_learning_rate,
            policy=policy,
            momentum=momentum,
            nesterov=nesterov,
            sparse_dedup_aggregator=sparse_dedup_aggregator,
            **kwargs
        )
        self.weight_decay = weight_decay
    def _run(self, net, param_init_net, param_info, fp32_update=False):

        fp32_update_flag = 0
        param_name = str(param_info.blob)

        # Parameters from SpatialBN (name contains "spatbn") are kept in FP32
        # and therefore get a 32-bit update.
        if param_name.find("spatbn") != -1:
            fp32_update = True

        if fp32_update:
            # Doing a 32-bit update; treat param_info.blob as the FP32 copy.
            fp32_update_flag = 1
            param = param_info.blob
            param_fp32 = param_info.blob
        else:
            if param_info.blob_copy is None:
                fp32_update_flag = 1
                param = param_info.blob
                param_fp32 = param_info.blob
            else:
                if core.DataType.FLOAT in param_info.blob_copy:
                    param = param_info.blob
                    param_fp32 = param_info.blob_copy[core.DataType.FLOAT]
                elif core.DataType.FLOAT16 in param_info.blob_copy:
                    param = param_info.blob_copy[core.DataType.FLOAT16]
                    param_fp32 = param_info.blob
                else:
                    assert False, (
                        "Unrecognized parameter format to be updated "
                        "by FP16 Optimizer. Parameter: {}".format(param_info.name)
                    )

        grad = param_info.grad

        if self.base_learning_rate == 0:
            return
        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=-self.base_learning_rate,
            policy=self.policy,
            **(self.init_kwargs)
        )

        momentum_data_fp32 = param_init_net.ConstantFill(
            param_fp32, str(param) + "_momentum_fp32", value=0.)
        momentum_data = param_init_net.FloatToHalf(
            momentum_data_fp32, str(param) + "_momentum")
        self._aux_params.local.append(momentum_data)

        assert not isinstance(grad, core.GradientSlice), (
            "FP16Sgd does not support sparse gradients")

        if fp32_update_flag == 0:
            net.FP16MomentumSGDUpdate(
                [grad, momentum_data, lr, param],
                [grad, momentum_data, param],
                momentum=self.momentum,
                nesterov=self.nesterov,
                weight_decay=self.weight_decay)
        else:
            net.FP32MomentumSGDUpdate(
                [grad, momentum_data_fp32, lr, param],
                [grad, momentum_data_fp32, param],
                momentum=self.momentum,
                nesterov=self.nesterov,
                weight_decay=self.weight_decay)

class WeightDecayBuilder(Optimizer):
    def __init__(self, weight_decay):
        self.weight_decay = weight_decay

    def _run(self, net, param_init_net, param_info):
        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        ONE = param_init_net.ConstantFill(
            [],
            "ONE_{}_{}".format(dev.device_type, dev.device_id),
            shape=[1],
            value=1.0
        )
        WD = param_init_net.ConstantFill(
            [], "wd_{}_{}".format(dev.device_type, dev.device_id),
            shape=[1], value=self.weight_decay
        )

        if isinstance(param_info.grad, core.GradientSlice):
            raise ValueError(
                "Weight decay does not yet support sparse gradients")
        else:
            net.WeightedSum(
                [param_info.grad, ONE, param_info.blob, WD],
                param_info.grad,
            )

class AdagradOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, decay=1, policy="fixed",
                 sparse_dedup_aggregator=None, rowWise=False, engine='',
                 lars=None, output_effective_lr=False,
                 output_effective_lr_and_update=False, **kwargs):
        super(AdagradOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.decay = decay
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.rowWise = rowWise
        self.engine = engine
        self.lars = lars
        self.output_effective_lr = output_effective_lr
        self.output_effective_lr_and_update = output_effective_lr_and_update
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        self._clear_local_lr_multiplier()

        if self.lars is not None and not isinstance(grad, core.GradientSlice):
            assert self.lars >= 0, (
                'Lars offset must be nonnegative, got {}'.format(self.lars))
            wd, trust, lr_max = self.create_lars_inputs(
                param_init_net, 0.0, 1.0, np.finfo(np.float32).max)
            lr_lars_multiplier = net.Lars(
                [param, grad, wd, trust, lr_max],
                self.make_unique_blob_name(str(param) + "_lars"),
                offset=self.lars,
                lr_min=0.0)
            current_scope = scope.CurrentDeviceScope()
            self._add_local_lr_multiplier(
                lr_lars_multiplier,
                is_gpu_blob=(current_scope is not None
                             and core.IsGPUDeviceType(current_scope.device_type)),
            )

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        if self.rowWise:
            logger.info(
                "Using engine {} for rowWise Adagrad".format(self.engine))

            shapes, types = workspace.InferShapesAndTypes([param_init_net])
            if str(param) not in shapes:
                # Shape inference is not available for this param; fall back
                # to Shape/Slice to get the number of rows.
                shape = param_init_net.Shape(param, str(param) + "_shape")
                num_rows = param_init_net.Slice(
                    [shape],
                    str(shape) + "_numrows",
                    starts=[0], ends=[1]
                )
                param_squared_sum = param_init_net.ConstantFill(
                    num_rows,
                    str(param) + "_avg_squared_sum",
                    input_as_shape=1,
                    value=0.0
                )
            else:
                param_squared_sum = param_init_net.ConstantFill(
                    [],
                    str(param) + "_avg_squared_sum",
                    shape=[shapes[str(param)][0]],
                    value=0.0
                )
        else:
            logger.info(
                "Using engine {} for regular Adagrad".format(self.engine))

            if self.engine in FP16_ENGINES:
                shapes, types = workspace.InferShapesAndTypes([param_init_net])
                assert str(param) in shapes, shapes
                shape = shapes[str(param)]

                param_squared_sum = param_init_net.Float16ConstantFill(
                    [],
                    str(param) + "_squared_sum",
                    value=0.0,
                    shape=shape,
                )
            else:
                param_squared_sum = param_init_net.ConstantFill(
                    [param],
                    str(param) + "_squared_sum",
                    value=0.0
                )

        self._aux_params.local.append(param_squared_sum)

        if self.rowWise:
            assert isinstance(grad, core.GradientSlice),\
                'If SparseAdagrad with rowWise=True, gradient must be '\
                'a gradientslice. Please ensure that rowWise is not enabled '\
                'for the dense Adagrad optimizer, as it is not supported.'
        if isinstance(grad, core.GradientSlice):
            assert self.decay == 1.,\
                'Decay is not implemented for SparseAdagrad and must be set to 1'
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.rowWise:
                op = 'RowWiseSparseAdagrad'
            else:
                op = 'SparseAdagrad'
            net.__getattr__(op)(
                [param, param_squared_sum, grad.indices, grad.values, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )
        else:
            output_args = [param, param_squared_sum]
            if self.output_effective_lr_and_update:
                output_args.append(str(param) + '_effective_lr')
                output_args.append(str(param) + '_update')
            elif self.output_effective_lr:
                output_args.append(str(param) + '_effective_lr')

            net.Adagrad(
                [param, param_squared_sum, grad, lr],
                output_args,
                epsilon=self.epsilon,
                decay=float(self.decay),
                engine=self.engine
            )

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return
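
# Example (sketch, assuming the optimized parameters all receive sparse
# GradientSlice gradients, e.g. embedding tables): rowWise=True keeps one
# accumulator per embedding row instead of one per element.
#
#     optimizer.build_adagrad(
#         model,
#         base_learning_rate=0.01,
#         epsilon=1e-4,
#         rowWise=True,
#     )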

class WngradOptimizer(Optimizer):
    def __init__(self, alpha=1.0, epsilon=1e-9, policy="fixed",
                 sparse_dedup_aggregator=None, engine='', moment_init=100.0,
                 lars=None, output_effective_lr=False,
                 output_effective_lr_and_update=False, **kwargs):
        super(WngradOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.moment_init = moment_init
        self.lars = lars
        self.output_effective_lr = output_effective_lr
        self.output_effective_lr_and_update = output_effective_lr_and_update
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        self._clear_local_lr_multiplier()

        if self.lars is not None and not isinstance(grad, core.GradientSlice):
            assert self.lars >= 0, (
                'Lars offset must be nonnegative, got {}'.format(self.lars))
            wd, trust, lr_max = self.create_lars_inputs(
                param_init_net, 0.0, 1.0, np.finfo(np.float32).max)
            lr_lars_multiplier = net.Lars(
                [param, grad, wd, trust, lr_max],
                self.make_unique_blob_name(str(param) + "_lars"),
                offset=self.lars,
                lr_min=0.0)
            current_scope = scope.CurrentDeviceScope()
            self._add_local_lr_multiplier(
                lr_lars_multiplier,
                is_gpu_blob=(current_scope is not None
                             and core.IsGPUDeviceType(current_scope.device_type)),
            )

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        moment = param_init_net.ConstantFill(
            [],
            str(param) + "_moment",
            shape=[1],
            value=self.moment_init
        )
        self._aux_params.local.append(moment)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseWngrad(
                [param, moment, grad.indices, grad.values, lr],
                [param, moment],
                epsilon=self.epsilon,
                engine=self.engine
            )
        else:
            output_args = [param, moment]
            if self.output_effective_lr_and_update:
                output_args.append(str(param) + '_effective_lr')
                output_args.append(str(param) + '_update')
            elif self.output_effective_lr:
                output_args.append(str(param) + '_effective_lr')

            net.Wngrad(
                [param, moment, grad, lr],
                output_args,
                epsilon=self.epsilon,
                engine=self.engine
            )

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return

class AdadeltaOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, decay=0.95, policy="fixed",
                 sparse_dedup_aggregator=None, engine='', **kwargs):
        """Constructor function to add Adadelta Optimizer

        Args:
            alpha: learning rate
            epsilon: attribute of Adadelta to avoid numerical issues
            decay: attribute of Adadelta to decay the squared gradient sum
            policy: specifies how learning rate should be applied, options are
                "fixed", "step", "exp", etc.
            sparse_dedup_aggregator: specifies deduplication strategy for
                gradient slices. Works while using sparse gradients. Options
                include "mean" and "sum".
            engine: the engine used, options include "", "CUDNN", etc.
        """
        super(AdadeltaOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.decay = decay
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        moment = param_init_net.ConstantFill(
            [param], str(param) + "_squared_moment", value=0.0)
        moment_update = param_init_net.ConstantFill(
            [param], str(param) + "_squared_moment_update", value=0.0)

        self._aux_params.local.append(moment)
        self._aux_params.local.append(moment_update)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdadelta(
                [
                    param, moment, moment_update, grad.indices,
                    grad.values, lr
                ], [param, moment, moment_update],
                epsilon=self.epsilon,
                decay=self.decay,
                engine=self.engine)
        else:
            net.Adadelta(
                [param, moment, moment_update, grad, lr],
                [param, moment, moment_update],
                epsilon=self.epsilon,
                decay=self.decay,
                engine=self.engine
            )

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return

class FtrlOptimizer(Optimizer):
    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        super(FtrlOptimizer, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_ftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        self._aux_params.local.append(nz)
        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseFtrl(
                [param, nz, grad.indices, grad.values],
                [param, nz],
                engine=self.engine, alpha=self.alpha, beta=self.beta,
                lambda1=self.lambda1, lambda2=self.lambda2)
        else:
            net.Ftrl(
                [param, nz, grad],
                [param, nz],
                engine=self.engine, alpha=self.alpha, beta=self.beta,
                lambda1=self.lambda1, lambda2=self.lambda2)

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return
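
# Example (sketch; the regularization strengths are illustrative): FTRL keeps
# an "n"/"z" accumulator pair per weight (the `nz` blob above). The
# build_ftrl() helper below defaults to the SIMD engine and asserts that the
# SIMD-specialized operators were compiled in.
#
#     optimizer.build_ftrl(
#         model, engine="SIMD",
#         alpha=0.01, beta=1e-4, lambda1=1e-3, lambda2=1e-3)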

class GFtrlOptimizer(Optimizer):
    """Group Lasso FTRL Optimizer."""

    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        super(GFtrlOptimizer, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_gftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        self._aux_params.local.append(nz)
        net.GFtrl(
            [param, nz, grad],
            [param, nz],
            engine=self.engine, alpha=self.alpha, beta=self.beta,
            lambda1=self.lambda1, lambda2=self.lambda2)

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return

class AdamOptimizer(Optimizer):
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 policy='fixed', use_lr_adaption=False, lr_alpha=0.01,
                 normalized_lr_adaption=True, sparse_dedup_aggregator=None,
                 rowWise=False, engine='', **kwargs):
        super(AdamOptimizer, self).__init__()
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.policy = policy
        self.use_lr_adaption = use_lr_adaption
        self.lr_alpha = lr_alpha
        self.normalized_lr_adaption = normalized_lr_adaption
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.rowWise = rowWise
        self.engine = engine
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        lr, iteration = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        m1 = param_init_net.ConstantFill(
            [param],
            param + "_first_moment",
            value=0.0
        )

        if self.rowWise:
            shapes, types = workspace.InferShapesAndTypes([param_init_net])
            m2 = param_init_net.ConstantFill(
                [],
                param + "_avg_second_moment",
                shape=[shapes[param][0]],
                value=0.0
            )
        else:
            m2 = param_init_net.ConstantFill(
                [param],
                param + "_second_moment",
                value=0.0
            )

        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(m1)
        self._aux_params.local.append(m2)

        if self.rowWise:
            assert isinstance(grad, core.GradientSlice),\
                'If SparseAdam with rowWise=True, gradient must be '\
                'a gradientslice. Please ensure that rowWise is not enabled '\
                'for the dense Adam optimizer, as it is not supported.'

        output_blobs = [param, m1, m2]
        if self.use_lr_adaption:
            effective_grad = str(param) + '_effective_grad'
            output_blobs.append(effective_grad)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.rowWise:
                op = 'RowWiseSparseAdam'
            else:
                op = 'SparseAdam'
            net.__getattr__(op)(
                [param, m1, m2, grad.indices, grad.values, lr, iteration],
                output_blobs,
                beta1=self.beta1, beta2=self.beta2, epsilon=self.epsilon)

            if self.use_lr_adaption:
                net.LearningRateAdaption(
                    [lr, grad.values, effective_grad],
                    [lr],
                    lr_alpha=self.lr_alpha,
                    normalized_lr_adaption=self.normalized_lr_adaption)
        else:
            net.Adam(
                [param, m1, m2, grad, lr, iteration],
                output_blobs,
                beta1=self.beta1, beta2=self.beta2, epsilon=self.epsilon)

            if self.use_lr_adaption:
                net.LearningRateAdaption(
                    [lr, grad, effective_grad],
                    [lr],
                    lr_alpha=self.lr_alpha,
                    normalized_lr_adaption=self.normalized_lr_adaption)

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return
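
# Example (sketch, assuming `model` gradients are already added): Adam shares
# a single iteration counter across parameters (aux_params.shared) and keeps
# two moment blobs per parameter (aux_params.local).
#
#     optimizer.build_adam(
#         model,
#         base_learning_rate=1e-3,
#         beta1=0.9, beta2=0.999, epsilon=1e-8,
#     )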

class YellowFinOptimizer(Optimizer):
    """YellowFin: An automatic tuner for momentum SGD

    See https://arxiv.org/abs/1706.03471 for more details. This implementation
    has a separate learning rate and momentum for each parameter."""

    def __init__(self,
                 alpha=0.1,
                 mu=0.0,
                 beta=0.999,
                 curv_win_width=20,
                 zero_debias=True,
                 epsilon=0.1**6,
                 policy='fixed',
                 sparse_dedup_aggregator=None,
                 **kwargs):
        super(YellowFinOptimizer, self).__init__()
        self.alpha = alpha
        self.mu = mu
        self.beta = beta
        self.curv_win_width = curv_win_width
        self.zero_debias = zero_debias
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):

        # Number of persistent scalars kept by the YellowFin operator.
        SCALARS_MEMORY_SIZE = 5

        param = param_info.blob
        grad = param_info.grad
        moment = param_init_net.ConstantFill(
            [param], param + "_moment", value=0.0)
        curv_win = param_init_net.ConstantFill(
            [], param + "_curv_win", shape=[self.curv_win_width], value=0.0)
        g_avg = param_init_net.ConstantFill(
            [param], param + "_g_avg", value=0.0)
        g2_avg = param_init_net.ConstantFill(
            [param], param + "_g2_avg", value=0.0)
        lr_avg = param_init_net.ConstantFill(
            [], param + "_lr_avg", shape=[1], value=self.alpha)
        mu_avg = param_init_net.ConstantFill(
            [], param + "_mu_avg", shape=[1], value=self.mu)
        scalars_memory = param_init_net.ConstantFill(
            [], param + "_scalars_memory", shape=[SCALARS_MEMORY_SIZE],
            value=0.0)

        assert self.alpha > 0
        assert not isinstance(grad, core.GradientSlice), \
            "YellowFin does not support sparse gradients"

        iteration = utils.BuildUniqueMutexIter(
            param_init_net, net, iter_val=0)

        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(moment)
        self._aux_params.local.append(lr_avg)
        self._aux_params.local.append(mu_avg)
        self._aux_params.local.append(curv_win)
        self._aux_params.local.append(g_avg)
        self._aux_params.local.append(g2_avg)
        self._aux_params.local.append(scalars_memory)

        yf_in_out_args = [
            param, moment, lr_avg, mu_avg, curv_win, g_avg, g2_avg,
            scalars_memory
        ]
        net.YellowFin(
            yf_in_out_args + [grad, iteration],
            yf_in_out_args,
            beta=self.beta,
            epsilon=self.epsilon,
            curv_win_width=self.curv_win_width,
            zero_debias=self.zero_debias)

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return

class RmsPropOptimizer(Optimizer):
    def __init__(self, alpha=0.01, decay=0.9, momentum=0.0, epsilon=1e-5,
                 policy='fixed', engine='', **kwargs):
        super(RmsPropOptimizer, self).__init__()
        self.alpha = alpha
        self.decay = decay
        self.momentum = momentum
        self.epsilon = epsilon
        self.policy = policy
        self.engine = engine
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        assert self.alpha > 0
        assert not isinstance(grad, core.GradientSlice), \
            "RmsPropOptimizer doesn't support sparse gradients"

        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        ONE = param_init_net.ConstantFill(
            [],
            "ONE_{}_{}".format(dev.device_type, dev.device_id),
            shape=[1],
            value=1.0
        )

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=-self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        grad_o = param_init_net.ConstantFill(
            [param], str(param) + "_grad_o", value=0.0)
        ms = param_init_net.ConstantFill(
            [param], str(param) + "_mean_squares", value=0.0)
        mom = param_init_net.ConstantFill(
            [param], str(param) + "_momentum", value=0.0)

        self._aux_params.local.append(ms)
        self._aux_params.local.append(mom)

        net.RmsProp(
            [grad, ms, mom, ONE],
            [grad_o, ms, mom],
            decay=self.decay,
            momentum=self.momentum,
            epsilon=self.epsilon,
            engine=self.engine,
        )
        net.MomentumSGDUpdate(
            [grad_o, mom, lr, param],
            [grad_o, mom, param],
        )

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return

def _get_param_to_device(model):
    param_to_device = core.InferBlobDevices(model.net)
    param_to_device.update(core.InferBlobDevices(model.param_init_net))
    return param_to_device

def get_param_device(param_name, grad, param_to_device=None, default_device=None):
    device = default_device
    param_to_device = param_to_device or {}
    # First check whether the parameter's device has been inferred; if not,
    # fall back to the gradient blobs.
    if param_name in param_to_device:
        device = param_to_device[param_name]
    else:
        if isinstance(grad, core.GradientSlice):
            if str(grad.values) in param_to_device:
                device = param_to_device[str(grad.values)]
            elif str(grad.indices) in param_to_device:
                device = param_to_device[str(grad.indices)]
        else:
            grad_name = str(grad)
            if grad_name in param_to_device:
                device = param_to_device[grad_name]

    assert device is not None,\
        "Cannot infer device for {}: no op creates it".format(param_name)
    return device

def get_lr_injection():
    """
    Gets current value for lr_injection, a multiplier for all base
    learning rates.
    Must set allow_lr_injection=True when building optimizer, as it
    relies on synchronization over CPU.
    """
    return workspace.FetchBlob(_LEARNING_RATE_INJECTION)


def set_lr_injection(lr_injection_value):
    """
    Sets lr_injection, a multiplier for all base learning rates.
    Must set allow_lr_injection=True when building optimizer, as it
    relies on synchronization over CPU.
    """
    workspace.FeedBlob(
        _LEARNING_RATE_INJECTION,
        np.array(
            [float(lr_injection_value)],
            dtype=np.float32,
        ),
    )
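
# Example (sketch): lr_injection lets a training loop rescale all learning
# rates without rebuilding the net, but only if the optimizer was built with
# allow_lr_injection=True.
#
#     optimizer.build_sgd(model, base_learning_rate=0.1, allow_lr_injection=True)
#     workspace.RunNetOnce(model.param_init_net)
#     ...
#     optimizer.set_lr_injection(0.1)   # scale every learning rate by 0.1
#     current = optimizer.get_lr_injection()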

def _calc_norm_ratio(
    model, params, name_scope, param_to_device, max_gradient_norm
):
    with core.NameScope(name_scope):
        grad_squared_sums = []
        for i, param in enumerate(params):
            device = get_param_device(
                str(param.blob), param.grad, param_to_device
            )

            with core.DeviceScope(device):
                grad = (
                    param.grad
                    if not isinstance(
                        param.grad,
                        core.GradientSlice,
                    ) else param.grad.values
                )

                grad_squared_sum_name = 'grad_{}_squared_sum'.format(i)
                grad_squared_sum = model.net.SumSqrElements(
                    grad,
                    grad_squared_sum_name,
                )
                grad_squared_sum_cpu = model.net.EnsureCPUOutput(
                    grad_squared_sum
                )
                grad_squared_sums.append(grad_squared_sum_cpu)

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            grad_squared_full_sum = model.net.Sum(
                grad_squared_sums,
                'grad_squared_full_sum',
            )
            global_norm = model.net.Pow(
                grad_squared_full_sum,
                'global_norm',
                exponent=0.5,
            )
            clip_norm = model.param_init_net.ConstantFill(
                [],
                'clip_norm',
                shape=[],
                value=float(max_gradient_norm),
            )
            max_norm = model.net.Max(
                [global_norm, clip_norm],
                'max_norm',
            )
            norm_ratio = model.net.Div(
                [clip_norm, max_norm],
                'norm_ratio',
            )
            return norm_ratio

def _build(
    model,
    optimizer,
    weights_only=False,
    use_param_info_optim=True,
    max_gradient_norm=None,
    allow_lr_injection=False,
):
    param_to_device = _get_param_to_device(model)

    params = []
    for param_info in model.GetOptimizationParamInfo():
        if weights_only and param_info.blob not in model.weights:
            continue
        params.append(param_info)

    lr_multiplier = None
    if max_gradient_norm is not None:
        lr_multiplier = _calc_norm_ratio(
            model,
            params,
            'norm_clipped_grad_update',
            param_to_device,
            max_gradient_norm,
        )

    if allow_lr_injection:
        if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION):
            lr_injection = model.param_init_net.ConstantFill(
                [],
                _LEARNING_RATE_INJECTION,
                shape=[1],
                value=1.0,
            )
        else:
            lr_injection = _LEARNING_RATE_INJECTION

        if lr_multiplier is None:
            lr_multiplier = lr_injection
        else:
            lr_multiplier = model.net.Mul(
                [lr_multiplier, lr_injection],
                'lr_multiplier',
                broadcast=1,
            )
    optimizer.add_lr_multiplier(lr_multiplier)

    for param_info in params:
        param_name = str(param_info.blob)
        device = get_param_device(param_name, param_info.grad, param_to_device)
        with core.DeviceScope(device):
            if param_info.optimizer and use_param_info_optim:
                param_info.optimizer(
                    model.net, model.param_init_net, param_info)
            else:
                optimizer(model.net, model.param_init_net, param_info)
    return optimizer
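
# Example (sketch): max_gradient_norm enables global gradient-norm clipping.
# _calc_norm_ratio() computes min(1, max_gradient_norm / ||grad||), which is
# then applied through the learning rate rather than by rewriting gradients.
#
#     optimizer.build_sgd(
#         model,
#         base_learning_rate=0.1,
#         max_gradient_norm=5.0,
#         allow_lr_injection=True,
#     )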

def add_weight_decay(model, weight_decay):
    """Adds a decay to weights in the model.

    This is a form of L2 regularization.

    Args:
        weight_decay: strength of the regularization
    """
    _build(
        model,
        WeightDecayBuilder(weight_decay=weight_decay),
        weights_only=True,
        use_param_info_optim=False,
    )


def build_sgd(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
    return _build(
        model,
        sgd_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )
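
# Example (sketch): weight decay is added as a separate pass over the weights
# (weights_only=True), so it composes with whichever optimizer is built
# afterwards.
#
#     optimizer.add_weight_decay(model, weight_decay=1e-4)
#     optimizer.build_sgd(model, base_learning_rate=0.1, momentum=0.9)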

def build_multi_precision_sgd(
    model, base_learning_rate,
    max_gradient_norm=None, allow_lr_injection=False,
    **kwargs
):
    multi_prec_sgd_optimizer = MultiPrecisionSgdOptimizer(
        base_learning_rate, **kwargs
    )
    return _build(
        model,
        multi_prec_sgd_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_fp16_sgd(model, base_learning_rate, **kwargs):
    fp16_sgd_optimizer = FP16SgdOptimizer(
        base_learning_rate, **kwargs
    )
    return _build(model, fp16_sgd_optimizer)


def build_ftrl(model, engine="SIMD", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator('Ftrl_ENGINE_SIMD')
        assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
    ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
    return _build(model, ftrl_optimizer)


def build_gftrl(model, engine="", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator('GFtrl_ENGINE_SIMD')
    gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs)
    return _build(model, gftrl_optimizer)

def build_adagrad(
    model, base_learning_rate,
    max_gradient_norm=None, allow_lr_injection=False,
    **kwargs
):
    adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adagrad_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_wngrad(
    model, base_learning_rate,
    max_gradient_norm=None, allow_lr_injection=False,
    **kwargs
):
    wngrad_optimizer = WngradOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        wngrad_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_adadelta(
    model, base_learning_rate,
    max_gradient_norm=None, allow_lr_injection=False,
    **kwargs
):
    adadelta_optimizer = AdadeltaOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adadelta_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_adam(
    model, base_learning_rate,
    max_gradient_norm=None, allow_lr_injection=False,
    **kwargs
):
    adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adam_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_yellowfin(model, base_learning_rate=0.1, **kwargs):
    yellowfin_optimizer = YellowFinOptimizer(
        alpha=base_learning_rate,
        **kwargs)
    return _build(model, yellowfin_optimizer)


def build_rms_prop(
    model, base_learning_rate,
    max_gradient_norm=None, allow_lr_injection=False,
    **kwargs
):
    rms_prop_optimizer = RmsPropOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        rms_prop_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )
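
# End-to-end example (sketch, assuming the usual caffe2 Python workflow; the
# net construction, blob names, and iteration count are illustrative):
#
#     from caffe2.python import brew, model_helper, optimizer, workspace
#
#     model = model_helper.ModelHelper(name="example")
#     # ... build the forward net with brew, producing a "loss" blob ...
#     model.AddGradientOperators(["loss"])
#     optimizer.build_adam(model, base_learning_rate=1e-3)
#
#     workspace.RunNetOnce(model.param_init_net)
#     workspace.CreateNet(model.net)
#     for _ in range(100):
#         workspace.RunNet(model.net)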