from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging

from collections import namedtuple, defaultdict
from past.builtins import basestring

import numpy as np

from caffe2.python import core, scope, utils, workspace
from caffe2.python.modeling import parameter_info
from caffe2.proto import caffe2_pb2
_LEARNING_RATE_INJECTION = "lr_injection"

AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"])
_optimizer_instance_count = defaultdict(int)

FP16_ENGINES = ["SIMD_Q_FP16", "SIMD_Q_STOC_FP16", "SIMD_Q_STOC_MKL_FP16"]

logger = logging.getLogger(__name__)

class Optimizer(object):
    def __init__(self):
        self._aux_params = AuxOptimizerParams(local=[], shared=[])
        self._instance_num = _optimizer_instance_count[self.__class__.__name__]
        _optimizer_instance_count[self.__class__.__name__] += 1
        self._lr_multiplier = None
        self._local_lr_multiplier = None
        self._local_lr_multiplier_on_gpu = False

    def __call__(self, net, param_init_net, param, grad=None):
        """
        Adds optimization operators to the net for the given parameter and its
        gradient. The parameter is specified either by 'param' being a
        ParameterInfo object (in which case param.grad has to be set), or by
        'param' being a BlobReference and 'grad' being a BlobReference for its
        gradient.
        """
        if grad is None:
            assert isinstance(param, parameter_info.ParameterInfo), (
                "Expected parameter to be of type ParameterInfo, got {}".format(
                    param))
            assert param.grad is not None
        else:
            if isinstance(param, basestring):
                param = core.BlobReference(param)
            param = parameter_info.ParameterInfo(
                param_id=None, param=param, grad=grad)

        self._run(net, param_init_net, param)

    def _run(self, net, param_init_net, param_info):
        raise NotImplementedError(
            "Optimizer subclasses must implement _run()")

    def get_cpu_blob_name(self, base_str, node_name=''):
        classname = self.__class__.__name__
        return '%s_%d_%s%s_cpu' % (
            classname, self._instance_num, base_str, node_name)

    def get_gpu_blob_name(self, base_str, gpu_id, node_name):
        classname = self.__class__.__name__
        return '%s_%d_%s%s_gpu%d' % (
            classname, self._instance_num, base_str, node_name, gpu_id)

    def make_unique_blob_name(self, base_str):
        """
        Returns a blob name that will be unique to the current device
        and optimizer instance.
        """
        current_scope = scope.CurrentDeviceScope()
        if current_scope is None:
            return self.get_cpu_blob_name(base_str)

        if core.IsGPUDeviceType(current_scope.device_type):
            return self.get_gpu_blob_name(
                base_str, current_scope.device_id, current_scope.node_name)
        else:
            return self.get_cpu_blob_name(base_str, current_scope.node_name)
    def build_lr(self, net, param_init_net, base_learning_rate,
                 learning_rate_blob=None, policy="fixed",
                 iter_val=0, **kwargs):
        if learning_rate_blob is None:
            learning_rate_blob = self.make_unique_blob_name('lr')

        iteration = utils.BuildUniqueMutexIter(
            param_init_net, net, iter_val=iter_val)

        if not net.BlobIsDefined(learning_rate_blob):
            # The training nets minimize, so the base learning rate is negated.
            lr = net.LearningRate(
                [iteration],
                learning_rate_blob,
                base_lr=-base_learning_rate,
                policy=policy,
                **kwargs
            )
        else:
            lr = net.GetBlobRef(learning_rate_blob)

        if self._lr_multiplier is not None:
            lr_multiplier = net.CopyFromCPUInput(
                self._lr_multiplier,
                self.make_unique_blob_name('lr_multiplier'))
            lr = net.Mul(
                [lr, lr_multiplier],
                self.make_unique_blob_name('scaled_lr'),
                broadcast=1)

        if self._local_lr_multiplier is not None:
            current_scope = scope.CurrentDeviceScope()
            if (current_scope is not None
                    and core.IsGPUDeviceType(current_scope.device_type)
                    and not self._local_lr_multiplier_on_gpu):
                local_lr_multiplier = net.CopyFromCPUInput(
                    self._local_lr_multiplier,
                    self.make_unique_blob_name('local_lr_multiplier'))
            else:
                local_lr_multiplier = self._local_lr_multiplier
            lr = net.Mul(
                [lr, local_lr_multiplier],
                self.make_unique_blob_name('local_scaled_lr'),
                broadcast=1)

        return lr, iteration
    def add_lr_multiplier(self, lr_multiplier):
        """
        Set the global learning rate multiplier. If a multiplier already
        existed, this will overwrite the existing multiplier. The multiplier is
        used for all future calls to _run(), unless it is overwritten.
        """
        self._lr_multiplier = lr_multiplier

    def _add_local_lr_multiplier(self, local_lr_multiplier, is_gpu_blob=False):
        """
        Set the local learning rate multiplier. This local multiplier is
        multiplied with the global learning rate multiplier if it exists. As
        with the global learning rate multiplier, this multiplier will be
        used for all future calls to _run(), so please call
        _clear_local_lr_multiplier() at the beginning of the optimizer's _run()
        before optionally calling this function.
        """
        self._local_lr_multiplier = local_lr_multiplier
        self._local_lr_multiplier_on_gpu = is_gpu_blob

    def _clear_local_lr_multiplier(self):
        self._local_lr_multiplier = None
        self._local_lr_multiplier_on_gpu = False
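
    # Illustrative sketch (not part of the original source): _build() below
    # uses the global multiplier hook for gradient-norm clipping, roughly:
    #
    #     ratio = _calc_norm_ratio(model, params, 'norm_clipped_grad_update',
    #                              param_to_device, max_gradient_norm)
    #     optimizer_instance.add_lr_multiplier(ratio)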

    @staticmethod
    def dedup(net, sparse_dedup_aggregator, grad):
        assert isinstance(grad, core.GradientSlice), (
            "Dedup only works for sparse gradient, got {}".format(grad))
        if sparse_dedup_aggregator:
            return net.DeduplicateGradientSlices(
                grad, aggregator=sparse_dedup_aggregator)
        else:
            return grad
    def get_auxiliary_parameters(self):
        """Returns a list of auxiliary parameters.

        Returns:
            aux_params: A namedtuple, AuxOptimizerParams.

            aux_params.local stores a list of blobs. Each blob is a local
            auxiliary parameter. A local auxiliary parameter is a parameter in
            parallel to a learning rate parameter. Take adagrad as an example:
            the local auxiliary parameter is the squared-sum parameter, because
            every learning rate has a squared sum associated with it.

            aux_params.shared also stores a list of blobs. Each blob is a shared
            auxiliary parameter. A shared auxiliary parameter is a parameter
            that is shared across all the learning rate parameters. Take adam as
            an example: the iteration parameter is a shared parameter, because
            all the learning rates share the same iteration parameter.
        """
        return self._aux_params

    def scale_learning_rate(self, *args, **kwargs):
        raise NotImplementedError(
            "Optimizer subclasses need to implement `scale_learning_rate`.")
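
    # Illustrative sketch (not part of the original source): the auxiliary
    # blobs returned above are what a checkpointing routine needs to save in
    # addition to the parameters themselves, e.g.:
    #
    #     aux = optimizer_instance.get_auxiliary_parameters()
    #     blobs_to_save = list(model.params) + aux.local + aux.shared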

    def create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max):
        wd = param_init_net.ConstantFill(
            [], "weight_decay", shape=[1], value=weight_decay)
        trust = param_init_net.ConstantFill(
            [], "trust", shape=[1], value=trust)
        lr_max = param_init_net.ConstantFill(
            [], "lr_max", shape=[1], value=lr_max)
        return wd, trust, lr_max

class SgdOptimizer(Optimizer):
    def __init__(self, base_learning_rate=0.01, policy='fixed',
                 momentum=0.0, nesterov=1, sparse_dedup_aggregator=None,
                 lars=None, **kwargs):
        super(SgdOptimizer, self).__init__()
        self.base_learning_rate = base_learning_rate
        self.policy = policy
        self.momentum = momentum
        self.nesterov = nesterov
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.lars = lars
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad
        if self.base_learning_rate == 0:
            return
        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        self._clear_local_lr_multiplier()

        if self.lars is not None and not isinstance(grad, core.GradientSlice):
            assert self.lars >= 0, (
                'Lars offset must be nonnegative, got {}'.format(self.lars))
            wd, trust, lr_max = self.create_lars_inputs(
                param_init_net, 0.0, 1.0, np.finfo(np.float32).max)
            lr_lars_multiplier = net.Lars(
                [param, grad, wd, trust, lr_max],
                self.make_unique_blob_name(str(param) + "_lars"),
                offset=self.lars,
                lr_min=0.0)
            current_scope = scope.CurrentDeviceScope()
            self._add_local_lr_multiplier(
                lr_lars_multiplier,
                is_gpu_blob=(current_scope is not None
                    and core.IsGPUDeviceType(current_scope.device_type)),
            )

        # We need a negative sign for the LR when it is used directly with
        # WeightedSum below; MomentumSGDUpdate expects a positive LR.
        lr_sign = -1 if self.momentum else 1
        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.base_learning_rate * lr_sign,
            policy=self.policy,
            **(self.init_kwargs)
        )

        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        # Each device needs its own ONE blob, so the name encodes the device.
        ONE = param_init_net.ConstantFill(
            [],
            "ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name),
            shape=[1],
            value=1.0
        )
        self._aux_params.shared.append(ONE)

        if self.momentum > 0:
            momentum_data = param_init_net.ConstantFill(
                param, str(param) + "_momentum", value=0.)
            self._aux_params.local.append(momentum_data)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.momentum > 0.:
                net.SparseMomentumSGDUpdate(
                    [grad.values, momentum_data, lr, param, grad.indices],
                    [grad.values, momentum_data, param],
                    momentum=self.momentum,
                    nesterov=self.nesterov)
            else:
                net.ScatterWeightedSum(
                    [param, ONE, grad.indices, grad.values, lr],
                    param)
        else:
            if self.momentum > 0.:
                net.MomentumSGDUpdate(
                    [grad, momentum_data, lr, param],
                    [grad, momentum_data, param],
                    momentum=self.momentum,
                    nesterov=self.nesterov)
            else:
                coeff = lr
                net.WeightedSum(
                    [param, ONE, grad, coeff],
                    param)

    def scale_learning_rate(self, scale):
        self.base_learning_rate *= scale

class MultiPrecisionSgdOptimizer(SgdOptimizer):
    def __init__(self, base_learning_rate=0.1, momentum=0.0,
                 policy="fixed", nesterov=1, sparse_dedup_aggregator=None,
                 **kwargs):
        super(MultiPrecisionSgdOptimizer, self).__init__(
            base_learning_rate=base_learning_rate,
            policy=policy,
            momentum=momentum,
            nesterov=nesterov,
            sparse_dedup_aggregator=sparse_dedup_aggregator,
            **kwargs
        )
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        param_fp32 = param_info.blob_copy[core.DataType.FLOAT] \
                if param_info.blob_copy is not None else None

        # If this is a plain fp32 parameter, run the base-class update.
        if param_fp32 is None:
            return SgdOptimizer._run(self, net, param_init_net, param_info)

        grad = param_info.grad
        if self.base_learning_rate == 0:
            return
        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=-self.base_learning_rate,
            policy=self.policy,
            **(self.init_kwargs)
        )

        momentum_data = param_init_net.ConstantFill(
            param_fp32, str(param) + "_momentum", value=0.)
        self._aux_params.local.append(momentum_data)

        assert not isinstance(grad, core.GradientSlice), (
            "MultiPrecisionSgd does not support sparse gradients")

        # Copy the fp16 gradient to fp32.
        grad_fp32 = net.HalfToFloat(grad, grad + "_fp32")

        # Run the (fused) update in fp32.
        net.MomentumSGDUpdate(
            [grad_fp32, momentum_data, lr, param_fp32],
            [grad_fp32, momentum_data, param_fp32],
            momentum=self.momentum,
            nesterov=self.nesterov)

        # Copy the updated param back to fp16.
        net.FloatToHalf(param_fp32, param)

class FP16SgdOptimizer(SgdOptimizer):
    def __init__(self, base_learning_rate=0.1, momentum=0.0,
                 policy="fixed", nesterov=1, weight_decay=0.0001,
                 sparse_dedup_aggregator=None, **kwargs):
        super(FP16SgdOptimizer, self).__init__(
            base_learning_rate=base_learning_rate,
            policy=policy,
            momentum=momentum,
            nesterov=nesterov,
            sparse_dedup_aggregator=sparse_dedup_aggregator,
            **kwargs
        )
        self.weight_decay = weight_decay
    def _run(self, net, param_init_net, param_info, fp32_update=False):
        fp32_update_flag = 0
        param_name = str(param_info.blob)

        # Should only be triggered in FP16 training by SpatialBN, which
        # requires FP32 params in cuDNN.
        if param_name.find("spatbn") != -1:
            fp32_update = True

        if fp32_update:
            # Doing a 32-bit update: assume param_info.blob is FP32.
            fp32_update_flag = 1
            param = param_info.blob
            param_fp32 = param_info.blob
        else:
            if param_info.blob_copy is None:
                # No copy exists; assume param_info.blob is FP32.
                fp32_update_flag = 1
                param = param_info.blob
                param_fp32 = param_info.blob
            else:
                if core.DataType.FLOAT in param_info.blob_copy:
                    param = param_info.blob
                    param_fp32 = param_info.blob_copy[core.DataType.FLOAT]
                elif core.DataType.FLOAT16 in param_info.blob_copy:
                    param = param_info.blob_copy[core.DataType.FLOAT16]
                    param_fp32 = param_info.blob
                else:
                    raise AssertionError(
                        "Unrecognized parameter format to be updated "
                        "by FP16 Optimizer. Parameter: {}".format(param_info.name)
                    )

        grad = param_info.grad

        assert self.base_learning_rate > 0, (
            "Expect positive base learning rate, got {}".format(
                self.base_learning_rate))

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=-self.base_learning_rate,
            policy=self.policy,
            **(self.init_kwargs)
        )

        momentum_data_fp32 = param_init_net.ConstantFill(
            param_fp32, str(param) + "_momentum_fp32", value=0.)
        momentum_data = param_init_net.FloatToHalf(
            momentum_data_fp32, str(param) + "_momentum")
        self._aux_params.local.append(momentum_data)

        assert not isinstance(grad, core.GradientSlice), (
            "FP16Sgd does not support sparse gradients")

        if fp32_update_flag == 0:
            net.FP16MomentumSGDUpdate(
                [grad, momentum_data, lr, param],
                [grad, momentum_data, param],
                momentum=self.momentum,
                nesterov=self.nesterov,
                weight_decay=self.weight_decay)
        else:
            # Flag set to 1, so do an FP32 update.
            net.FP32MomentumSGDUpdate(
                [grad, momentum_data_fp32, lr, param],
                [grad, momentum_data_fp32, param],
                momentum=self.momentum,
                nesterov=self.nesterov,
                weight_decay=self.weight_decay)

class WeightDecayBuilder(Optimizer):
    def __init__(self, weight_decay):
        self.weight_decay = weight_decay

    def _run(self, net, param_init_net, param_info):
        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        ONE = param_init_net.ConstantFill(
            [], "ONE_{}_{}".format(dev.device_type, dev.device_id),
            shape=[1], value=1.0
        )
        WD = param_init_net.ConstantFill(
            [], "wd_{}_{}".format(dev.device_type, dev.device_id),
            shape=[1], value=self.weight_decay
        )

        if isinstance(param_info.grad, core.GradientSlice):
            raise ValueError(
                "Weight decay does not yet support sparse gradients")
        else:
            net.WeightedSum(
                [param_info.grad, ONE, param_info.blob, WD],
                param_info.grad,
            )

class AdagradOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, decay=1, policy="fixed",
                 sparse_dedup_aggregator=None, rowWise=False, engine='',
                 lars=None, output_effective_lr=False,
                 output_effective_lr_and_update=False, **kwargs):
        super(AdagradOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.decay = decay
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.rowWise = rowWise
        self.engine = engine
        self.lars = lars
        self.output_effective_lr = output_effective_lr
        self.output_effective_lr_and_update = output_effective_lr_and_update
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        self._clear_local_lr_multiplier()

        if self.lars is not None and not isinstance(grad, core.GradientSlice):
            assert self.lars >= 0, (
                'Lars offset must be nonnegative, got {}'.format(self.lars))
            wd, trust, lr_max = self.create_lars_inputs(
                param_init_net, 0.0, 1.0, np.finfo(np.float32).max)
            lr_lars_multiplier = net.Lars(
                [param, grad, wd, trust, lr_max],
                self.make_unique_blob_name(str(param) + "_lars"),
                offset=self.lars,
                lr_min=0.0)
            current_scope = scope.CurrentDeviceScope()
            self._add_local_lr_multiplier(
                lr_lars_multiplier,
                is_gpu_blob=(current_scope is not None
                    and core.IsGPUDeviceType(current_scope.device_type)),
            )

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        if self.rowWise:
            logger.info("Using engine {} for rowWise Adagrad".format(self.engine))

            shapes, types = workspace.InferShapesAndTypes([param_init_net])
            if str(param) not in shapes:
                # Shape inference is not available for this param; fall back
                # on Shape/Slice logic.
                shape = param_init_net.Shape(param, str(param) + "_shape")
                num_rows = param_init_net.Slice(
                    [shape],
                    str(shape) + "_numrows",
                    starts=[0], ends=[1]
                )
                param_squared_sum = param_init_net.ConstantFill(
                    num_rows,
                    str(param) + "_avg_squared_sum",
                    input_as_shape=1,
                    value=0.0
                )
            else:
                param_squared_sum = param_init_net.ConstantFill(
                    [],
                    str(param) + "_avg_squared_sum",
                    shape=[shapes[str(param)][0]],
                    value=0.0
                )
        else:
            logger.info("Using engine {} for regular Adagrad".format(self.engine))

            if self.engine in FP16_ENGINES:
                shapes, types = workspace.InferShapesAndTypes([param_init_net])
                assert str(param) in shapes, shapes
                shape = shapes[str(param)]

                param_squared_sum = param_init_net.Float16ConstantFill(
                    [],
                    str(param) + "_squared_sum",
                    value=0.0,
                    shape=shape,
                )
            else:
                param_squared_sum = param_init_net.ConstantFill(
                    [param],
                    str(param) + "_squared_sum",
                    value=0.0
                )

        self._aux_params.local.append(param_squared_sum)

        if self.rowWise:
            assert isinstance(grad, core.GradientSlice), (
                'If SparseAdagrad with rowWise=True, gradient must be '
                'a GradientSlice. Please ensure that rowWise is not enabled '
                'for the dense Adagrad optimizer, as it is not supported.')
        if isinstance(grad, core.GradientSlice):
            assert self.decay == 1., (
                'Decay is not implemented for SparseAdagrad and must be set to 1')
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.rowWise:
                op = 'RowWiseSparseAdagrad'
            else:
                op = 'SparseAdagrad'
            net.__getattr__(op)(
                [param, param_squared_sum, grad.indices, grad.values, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon,
                engine=self.engine)
        else:
            output_args = [param, param_squared_sum]
            if self.output_effective_lr_and_update:
                output_args.append(str(param) + '_effective_lr')
                output_args.append(str(param) + '_update')
            elif self.output_effective_lr:
                output_args.append(str(param) + '_effective_lr')

            net.Adagrad(
                [param, param_squared_sum, grad, lr],
                output_args,
                epsilon=self.epsilon,
                decay=float(self.decay),
                engine=self.engine)

    def scale_learning_rate(self, scale):
        self.alpha *= scale
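
# Illustrative sketch (not part of the original source): row-wise Adagrad is
# intended for sparse (GradientSlice) parameters such as embedding tables,
# e.g.:
#
#     build_adagrad(model, base_learning_rate=0.01, epsilon=1e-4, rowWise=True)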

class WngradOptimizer(Optimizer):
    def __init__(self, alpha=1.0, epsilon=1e-9, policy="fixed",
                 sparse_dedup_aggregator=None, engine='', moment_init=100.0,
                 lars=None, output_effective_lr=False,
                 output_effective_lr_and_update=False, **kwargs):
        super(WngradOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.moment_init = moment_init
        self.lars = lars
        self.output_effective_lr = output_effective_lr
        self.output_effective_lr_and_update = output_effective_lr_and_update
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        self._clear_local_lr_multiplier()

        if self.lars is not None and not isinstance(grad, core.GradientSlice):
            assert self.lars >= 0, (
                'Lars offset must be nonnegative, got {}'.format(self.lars))
            wd, trust, lr_max = self.create_lars_inputs(
                param_init_net, 0.0, 1.0, np.finfo(np.float32).max)
            lr_lars_multiplier = net.Lars(
                [param, grad, wd, trust, lr_max],
                self.make_unique_blob_name(str(param) + "_lars"),
                offset=self.lars,
                lr_min=0.0)
            current_scope = scope.CurrentDeviceScope()
            self._add_local_lr_multiplier(
                lr_lars_multiplier,
                is_gpu_blob=(current_scope is not None
                    and core.IsGPUDeviceType(current_scope.device_type)),
            )

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        moment = param_init_net.ConstantFill(
            [],
            str(param) + "_moment",
            shape=[1],
            value=self.moment_init)

        self._aux_params.local.append(moment)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseWngrad(
                [param, moment, grad.indices, grad.values, lr],
                [param, moment],
                epsilon=self.epsilon,
                engine=self.engine)
        else:
            output_args = [param, moment]
            if self.output_effective_lr_and_update:
                output_args.append(str(param) + '_effective_lr')
                output_args.append(str(param) + '_update')
            elif self.output_effective_lr:
                output_args.append(str(param) + '_effective_lr')

            net.Wngrad(
                [param, moment, grad, lr],
                output_args,
                epsilon=self.epsilon,
                engine=self.engine)

    def scale_learning_rate(self, scale):
        self.alpha *= scale

class AdadeltaOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, decay=0.95, policy="fixed",
                 sparse_dedup_aggregator=None, engine='', **kwargs):
        """Constructor function to add Adadelta Optimizer

        Args:
            alpha: learning rate
            epsilon: attribute of Adadelta to avoid numerical issues
            decay: attribute of Adadelta to decay the squared gradient sum
            policy: specifies how learning rate should be applied, options are
              "fixed", "step", "exp", etc.
            sparse_dedup_aggregator: specifies deduplication strategy for
              gradient slices. Works while using sparse gradients. Options
              include "mean" and "sum".
            engine: the engine used, options include "", "CUDNN", etc.
        """
        super(AdadeltaOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.decay = decay
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        moment = param_init_net.ConstantFill(
            [param], str(param) + "_squared_moment", value=0.0)
        moment_update = param_init_net.ConstantFill(
            [param], str(param) + "_squared_moment_update", value=0.0)

        self._aux_params.local.append(moment)
        self._aux_params.local.append(moment_update)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdadelta(
                [
                    param, moment, moment_update, grad.indices,
                    grad.values, lr
                ], [param, moment, moment_update],
                epsilon=self.epsilon,
                decay=self.decay,
                engine=self.engine)
        else:
            net.Adadelta(
                [param, moment, moment_update, grad, lr],
                [param, moment, moment_update],
                epsilon=self.epsilon,
                decay=self.decay,
                engine=self.engine)

    def scale_learning_rate(self, scale):
        self.alpha *= scale

class FtrlOptimizer(Optimizer):
    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        super(FtrlOptimizer, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad
        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_ftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        self._aux_params.local.append(nz)
        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseFtrl(
                [param, nz, grad.indices, grad.values],
                [param, nz],
                engine=self.engine, alpha=self.alpha, beta=self.beta,
                lambda1=self.lambda1, lambda2=self.lambda2)
        else:
            net.Ftrl(
                [param, nz, grad],
                [param, nz],
                engine=self.engine, alpha=self.alpha, beta=self.beta,
                lambda1=self.lambda1, lambda2=self.lambda2)

    def scale_learning_rate(self, scale):
        self.alpha *= scale

class GFtrlOptimizer(Optimizer):
    """Group Lasso FTRL Optimizer."""

    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        super(GFtrlOptimizer, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad
        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_gftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        self._aux_params.local.append(nz)
        net.GFtrl(
            [param, nz, grad],
            [param, nz],
            engine=self.engine, alpha=self.alpha, beta=self.beta,
            lambda1=self.lambda1, lambda2=self.lambda2)

    def scale_learning_rate(self, scale):
        self.alpha *= scale

class AdamOptimizer(Optimizer):
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 policy='fixed', use_lr_adaption=False, lr_alpha=0.01,
                 normalized_lr_adaption=True, sparse_dedup_aggregator=None,
                 rowWise=False, engine='', **kwargs):
        super(AdamOptimizer, self).__init__()
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.policy = policy
        self.use_lr_adaption = use_lr_adaption
        self.lr_alpha = lr_alpha
        self.normalized_lr_adaption = normalized_lr_adaption
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.rowWise = rowWise
        self.engine = engine
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        if self.alpha <= 0:
            return

        lr, iteration = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        m1 = param_init_net.ConstantFill(
            [param],
            param + "_first_moment",
            value=0.0)

        if self.rowWise:
            shapes, types = workspace.InferShapesAndTypes([param_init_net])
            m2 = param_init_net.ConstantFill(
                [],
                param + "_avg_second_moment",
                shape=[shapes[param][0]],
                value=0.0)
        else:
            m2 = param_init_net.ConstantFill(
                [param],
                param + "_second_moment",
                value=0.0)

        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(m1)
        self._aux_params.local.append(m2)

        if self.rowWise:
            assert isinstance(grad, core.GradientSlice), (
                'If SparseAdam with rowWise=True, gradient must be '
                'a GradientSlice. Please ensure that rowWise is not enabled '
                'for the dense Adam optimizer, as it is not supported.')

        output_blobs = [param, m1, m2]
        if self.use_lr_adaption:
            effective_grad = str(param) + '_effective_grad'
            output_blobs.append(effective_grad)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            if self.rowWise:
                op = 'RowWiseSparseAdam'
            else:
                op = 'SparseAdam'
            net.__getattr__(op)(
                [param, m1, m2, grad.indices, grad.values, lr, iteration],
                output_blobs,
                beta1=self.beta1, beta2=self.beta2, epsilon=self.epsilon)
            if self.use_lr_adaption:
                net.LearningRateAdaption(
                    [lr, grad.values, effective_grad],
                    [lr],
                    lr_alpha=self.lr_alpha,
                    normalized_lr_adaption=self.normalized_lr_adaption)
        else:
            net.Adam(
                [param, m1, m2, grad, lr, iteration],
                output_blobs,
                beta1=self.beta1, beta2=self.beta2, epsilon=self.epsilon)
            if self.use_lr_adaption:
                net.LearningRateAdaption(
                    [lr, grad, effective_grad],
                    [lr],
                    lr_alpha=self.lr_alpha,
                    normalized_lr_adaption=self.normalized_lr_adaption)

    def scale_learning_rate(self, scale):
        self.alpha *= scale

class YellowFinOptimizer(Optimizer):
    """YellowFin: An automatic tuner for momentum SGD

    See https://arxiv.org/abs/1706.03471 for more details. This implementation
    has a separate learning rate and momentum for each parameter."""

    def __init__(self,
                 alpha=0.1,
                 mu=0.0,
                 beta=0.999,
                 curv_win_width=20,
                 zero_debias=True,
                 epsilon=0.1**6,
                 policy='fixed',
                 sparse_dedup_aggregator=None,
                 **kwargs):
        super(YellowFinOptimizer, self).__init__()
        self.alpha = alpha
        self.mu = mu
        self.beta = beta
        self.curv_win_width = curv_win_width
        self.zero_debias = zero_debias
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        # Number of persistent scalars kept by the YellowFin operator.
        SCALARS_MEMORY_SIZE = 5

        param = param_info.blob
        grad = param_info.grad
        moment = param_init_net.ConstantFill(
            [param], param + "_moment", value=0.0)
        curv_win = param_init_net.ConstantFill(
            [],
            param + "_curv_win",
            shape=[self.curv_win_width],
            value=0.0)
        g_avg = param_init_net.ConstantFill(
            [param], param + "_g_avg", value=0.0)
        g2_avg = param_init_net.ConstantFill(
            [param], param + "_g2_avg", value=0.0)
        lr_avg = param_init_net.ConstantFill(
            [], param + "_lr_avg", shape=[1], value=self.alpha)
        mu_avg = param_init_net.ConstantFill(
            [], param + "_mu_avg", shape=[1], value=self.mu)
        scalars_memory = param_init_net.ConstantFill(
            [],
            param + "_scalars_memory",
            shape=[SCALARS_MEMORY_SIZE],
            value=0.0)

        assert self.alpha > 0
        assert not isinstance(grad, core.GradientSlice), \
            "YellowFin does not support sparse gradients"

        iteration = utils.BuildUniqueMutexIter(
            param_init_net, net, iter_val=0)

        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(moment)
        self._aux_params.local.append(lr_avg)
        self._aux_params.local.append(mu_avg)
        self._aux_params.local.append(curv_win)
        self._aux_params.local.append(g_avg)
        self._aux_params.local.append(g2_avg)
        self._aux_params.local.append(scalars_memory)

        yf_in_out_args = [param, moment, lr_avg, mu_avg,
                          curv_win, g_avg, g2_avg, scalars_memory]
        net.YellowFin(
            yf_in_out_args + [grad, iteration],
            yf_in_out_args,
            beta=self.beta,
            epsilon=self.epsilon,
            curv_win_width=self.curv_win_width,
            zero_debias=self.zero_debias)

    def scale_learning_rate(self, scale):
        self.alpha *= scale

class RmsPropOptimizer(Optimizer):
    def __init__(self, alpha=0.01, decay=0.9, momentum=0.0, epsilon=1e-5,
                 policy='fixed', engine='', **kwargs):
        super(RmsPropOptimizer, self).__init__()
        self.alpha = alpha
        self.decay = decay
        self.momentum = momentum
        self.epsilon = epsilon
        self.policy = policy
        self.engine = engine
        self.init_kwargs = kwargs
    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        assert self.alpha > 0
        assert not isinstance(grad, core.GradientSlice), \
            "RmsPropOptimizer doesn't support sparse gradients"

        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        ONE = param_init_net.ConstantFill(
            [],
            "ONE_{}_{}".format(dev.device_type, dev.device_id),
            shape=[1],
            value=1.0)

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=-self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        grad_o = param_init_net.ConstantFill(
            [param],
            str(param) + "_grad_o",
            value=0.0)
        ms = param_init_net.ConstantFill(
            [param],
            str(param) + "_mean_squares",
            value=0.0)
        mom = param_init_net.ConstantFill(
            [param],
            str(param) + "_momentum",
            value=0.0)

        self._aux_params.local.append(ms)
        self._aux_params.local.append(mom)

        net.RmsProp(
            [grad, ms, mom, ONE],
            [grad_o, ms, mom],
            decay=self.decay,
            momentum=self.momentum,
            epsilon=self.epsilon,
            engine=self.engine)

        net.MomentumSGDUpdate(
            [grad_o, mom, lr, param],
            [grad_o, mom, param],
        )

    def scale_learning_rate(self, scale):
        self.alpha *= scale

def _get_param_to_device(model):
    # Infer blob devices by going through the net and param_init_net
    # ops and observing the device used to create or use each blob.
    param_to_device = core.InferBlobDevices(model.net)
    param_to_device.update(core.InferBlobDevices(model.param_init_net))
    return param_to_device

def get_param_device(param_name, grad, param_to_device=None, default_device=None):
    device = default_device
    param_to_device = param_to_device or {}
    # First check whether the parameter's device has been inferred. If not,
    # check the gradient; this can happen if the parameter is not the output
    # of any op but was created by a FeedBlob.
    if param_name in param_to_device:
        device = param_to_device[param_name]
    else:
        if isinstance(grad, core.GradientSlice):
            if str(grad.values) in param_to_device:
                device = param_to_device[str(grad.values)]
            elif str(grad.indices) in param_to_device:
                device = param_to_device[str(grad.indices)]
        else:
            grad_name = str(grad)
            if grad_name in param_to_device:
                device = param_to_device[grad_name]

    assert device is not None,\
        "Cannot infer device for {}: no op creates it".format(param_name)
    return device

def get_lr_injection():
    """
    Gets current value for lr_injection, a multiplier for all base
    learning rates.
    Must set allow_lr_injection=True when building optimizer, as it
    relies on synchronization over CPU.
    """
    return workspace.FetchBlob(_LEARNING_RATE_INJECTION)


def set_lr_injection(lr_injection_value):
    """
    Sets lr_injection, a multiplier for all base learning rates.
    Must set allow_lr_injection=True when building optimizer, as it
    relies on synchronization over CPU.
    """
    workspace.FeedBlob(
        _LEARNING_RATE_INJECTION,
        np.array(
            [float(lr_injection_value)],
            dtype=np.float32,
        ),
    )
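
# Illustrative sketch (not part of the original source): lr_injection lets a
# training loop rescale every learning rate on the fly, provided the optimizer
# was built with allow_lr_injection=True, e.g.:
#
#     build_sgd(model, base_learning_rate=0.1, allow_lr_injection=True)
#     ...
#     set_lr_injection(0.5)      # halve all learning rates
#     current = get_lr_injection()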

def _calc_norm_ratio(
    model, params, name_scope, param_to_device, max_gradient_norm
):
    with core.NameScope(name_scope):
        grad_squared_sums = []
        for i, param in enumerate(params):
            device = get_param_device(
                str(param.blob), param.grad, param_to_device
            )

            with core.DeviceScope(device):
                grad = (
                    param.grad
                    if not isinstance(
                        param.grad,
                        core.GradientSlice,
                    ) else param.grad.values
                )

                grad_squared_sum_name = 'grad_{}_squared_sum'.format(i)
                grad_squared_sum = model.net.SumSqrElements(
                    grad,
                    grad_squared_sum_name,
                )
                grad_squared_sum_cpu = model.net.EnsureCPUOutput(
                    grad_squared_sum
                )
                grad_squared_sums.append(grad_squared_sum_cpu)

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            grad_squared_full_sum = model.net.Sum(
                grad_squared_sums,
                'grad_squared_full_sum',
            )
            global_norm = model.net.Pow(
                grad_squared_full_sum,
                'global_norm',
                exponent=0.5,
            )
            clip_norm = model.param_init_net.ConstantFill(
                [],
                'clip_norm',
                shape=[],
                value=float(max_gradient_norm),
            )
            max_norm = model.net.Max(
                [global_norm, clip_norm],
                'max_norm',
            )
            norm_ratio = model.net.Div(
                [clip_norm, max_norm],
                'norm_ratio',
            )
            return norm_ratio

def _build(
    model,
    optimizer,
    weights_only=False,
    use_param_info_optim=True,
    max_gradient_norm=None,
    allow_lr_injection=False,
):
    param_to_device = _get_param_to_device(model)

    params = []
    for param_info in model.GetOptimizationParamInfo():
        if weights_only and param_info.blob not in model.weights:
            continue
        params.append(param_info)

    lr_multiplier = None
    if max_gradient_norm is not None:
        lr_multiplier = _calc_norm_ratio(
            model,
            params,
            'norm_clipped_grad_update',
            param_to_device,
            max_gradient_norm,
        )

    if allow_lr_injection:
        if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION):
            lr_injection = model.param_init_net.ConstantFill(
                [],
                _LEARNING_RATE_INJECTION,
                shape=[1],
                value=1.0,
            )
        else:
            lr_injection = _LEARNING_RATE_INJECTION

        if lr_multiplier is None:
            lr_multiplier = lr_injection
        else:
            lr_multiplier = model.net.Mul(
                [lr_multiplier, lr_injection],
                'lr_multiplier',
                broadcast=1,
            )
    optimizer.add_lr_multiplier(lr_multiplier)

    for param_info in params:
        param_name = str(param_info.blob)
        device = get_param_device(param_name, param_info.grad, param_to_device)
        with core.DeviceScope(device):
            if param_info.optimizer and use_param_info_optim:
                param_info.optimizer(
                    model.net, model.param_init_net, param_info)
            else:
                optimizer(model.net, model.param_init_net, param_info)
    return optimizer

def add_weight_decay(model, weight_decay):
    """Adds a decay to weights in the model.

    This is a form of L2 regularization.

    Args:
        weight_decay: strength of the regularization
    """
    _build(
        model,
        WeightDecayBuilder(weight_decay=weight_decay),
        weights_only=True,
        use_param_info_optim=False,
    )
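
# Illustrative sketch (not part of the original source): weight decay is
# typically attached alongside a primary optimizer, e.g.:
#
#     add_weight_decay(model, weight_decay=1e-4)
#     build_sgd(model, base_learning_rate=0.1, momentum=0.9)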

def build_sgd(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
    return _build(
        model,
        sgd_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )

def build_multi_precision_sgd(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    multi_prec_sgd_optimizer = MultiPrecisionSgdOptimizer(
        base_learning_rate, **kwargs
    )
    return _build(
        model,
        multi_prec_sgd_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )

def build_fp16_sgd(model, base_learning_rate, **kwargs):
    fp16_sgd_optimizer = FP16SgdOptimizer(
        base_learning_rate, **kwargs
    )
    return _build(model, fp16_sgd_optimizer)

def build_ftrl(model, engine="SIMD", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator('Ftrl_ENGINE_SIMD')
        assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
    ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
    return _build(model, ftrl_optimizer)

def build_gftrl(model, engine="", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator('GFtrl_ENGINE_SIMD')
    gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs)
    return _build(model, gftrl_optimizer)

def build_adagrad(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adagrad_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )

def build_wngrad(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    wngrad_optimizer = WngradOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        wngrad_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )

def build_adadelta(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    adadelta_optimizer = AdadeltaOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adadelta_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )

def build_adam(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adam_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )

def build_yellowfin(model, base_learning_rate=0.1, **kwargs):
    yellowfin_optimizer = YellowFinOptimizer(
        alpha=base_learning_rate,
        **kwargs)
    return _build(model, yellowfin_optimizer)

def build_rms_prop(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    rms_prop_optimizer = RmsPropOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        rms_prop_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )
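
# Illustrative end-to-end sketch (not part of the original module): the
# build_* helpers above are the usual entry point, applied to a
# model_helper.ModelHelper after its loss gradients have been added, e.g.:
#
#     from caffe2.python import model_helper, optimizer
#
#     model = model_helper.ModelHelper(name="train")
#     # ... forward pass producing a `loss` blob ...
#     model.AddGradientOperators([loss])
#     optimizer.build_adagrad(model, base_learning_rate=0.01, epsilon=1e-4)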
 