Caffe2 - Python API
A deep learning, cross-platform ML framework
optimizer.py
1 # @package optimizer
2 # Module caffe2.python.optimizer
3 from __future__ import absolute_import
4 from __future__ import division
5 from __future__ import print_function
6 from __future__ import unicode_literals
7 
8 from collections import namedtuple, defaultdict
9 from past.builtins import basestring
10 
11 import logging
12 
13 import numpy as np
14 
15 from caffe2.python import core, scope, utils, workspace
16 from caffe2.python.modeling import parameter_info
17 from caffe2.proto import caffe2_pb2
18 
19 
20 _LEARNING_RATE_INJECTION = "lr_injection"
21 
22 AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"])
23 _optimizer_instance_count = defaultdict(int)
24 
25 FP16_ENGINES = ["SIMD_Q_FP16", "SIMD_Q_STOC_FP16", "SIMD_Q_STOC_MKL_FP16"]
26 
27 logger = logging.getLogger(__name__)
28 
29 
30 class Optimizer(object):
31  def __init__(self):
32  self._aux_params = AuxOptimizerParams(local=[], shared=[])
33  self._instance_num = _optimizer_instance_count[self.__class__.__name__]
34  _optimizer_instance_count[self.__class__.__name__] += 1
35  self._lr_multiplier = None
36  self._local_lr_multiplier = None
37  self._local_lr_multiplier_on_gpu = False
38 
39  '''
40  Adds optimization operators to the net for the given parameter and its
41  gradient. The parameter is specified either by 'param' being a
42  ParameterInfo object (in which case param.grad must be set),
43 
44  or by 'param' being a BlobReference and 'grad' being a BlobReference
45  for its gradient.
46  '''
47  def __call__(self, net, param_init_net, param, grad=None):
48  if grad is None:
49  assert isinstance(param, parameter_info.ParameterInfo), (
50  "Expected parameter to be of type ParameterInfo, got {}".format(
51  param
52  ))
53  assert param.grad is not None
54  else:
55  if isinstance(param, basestring):
56  param = core.BlobReference(param)
57  param = parameter_info.ParameterInfo(
58  param_id=None, param=param, grad=grad)
59 
60  self._run(net, param_init_net, param)
61 
62  def _run(self, net, param_init_net, param_info):
63  raise Exception("Not Implemented")
64 
65  def get_cpu_blob_name(self, base_str, node_name=''):
66  classname = self.__class__.__name__
67  return '%s_%d_%s%s_cpu' % (classname, self._instance_num, base_str, node_name)
68 
69  def get_gpu_blob_name(self, base_str, gpu_id, node_name):
70  classname = self.__class__.__name__
71  return '%s_%d_%s%s_gpu%d' % (
72  classname, self._instance_num, base_str, node_name, gpu_id,
73  )
74 
75  def make_unique_blob_name(self, base_str):
76  """
77  Returns a blob name that will be unique to the current device
78  and optimizer instance.
79  """
80  current_scope = scope.CurrentDeviceScope()
81  if current_scope is None:
82  return self.get_cpu_blob_name(base_str)
83 
84  if core.IsGPUDeviceType(current_scope.device_type):
85  return self.get_gpu_blob_name(
86  base_str, current_scope.device_id, current_scope.node_name
87  )
88  else:
89  return self.get_cpu_blob_name(base_str, current_scope.node_name)
90 
91  def build_lr(self, net, param_init_net, base_learning_rate,
92  learning_rate_blob=None, policy="fixed",
93  iter_val=0, **kwargs):
94  if learning_rate_blob is None:
95  learning_rate_blob = self.make_unique_blob_name('lr')
96 
97  iteration = utils.BuildUniqueMutexIter(
98  param_init_net,
99  net,
100  iter_val=iter_val
101  )
102 
103  if not net.BlobIsDefined(learning_rate_blob):
104  # There is one interesting thing here: since we are minimizing, we are
105  # doing "descent" so the learning rate is set to be negative.
106  lr = net.LearningRate(
107  [iteration],
108  learning_rate_blob,
109  base_lr=-base_learning_rate,
110  policy=policy,
111  **kwargs
112  )
113  else:
114  lr = net.GetBlobRef(learning_rate_blob)
115 
116  if self._lr_multiplier is not None:
117  lr_multiplier = net.CopyFromCPUInput(
118  self._lr_multiplier, self.make_unique_blob_name('lr_multiplier')
119  )
120 
121  lr = net.Mul(
122  [lr, lr_multiplier],
123  self.make_unique_blob_name('scaled_lr'),
124  broadcast=1,
125  )
126 
127  if self._local_lr_multiplier is not None:
128  current_scope = scope.CurrentDeviceScope()
129  if (current_scope is not None
130  and core.IsGPUDeviceType(current_scope.device_type)
131  and not self._local_lr_multiplier_on_gpu):
132  local_lr_multiplier = net.CopyFromCPUInput(
133  self._local_lr_multiplier,
134  self.make_unique_blob_name('local_lr_multiplier')
135  )
136  else:
137  local_lr_multiplier = self._local_lr_multiplier
138 
139  lr = net.Mul(
140  [lr, local_lr_multiplier],
141  self.make_unique_blob_name('local_scaled_lr'),
142  broadcast=1,
143  )
144 
145  return lr, iteration
146 
147  def add_lr_multiplier(self, lr_multiplier):
148  """
149  Set the global learning rate multiplier. If a multiplier already
150  existed, this will overwrite the existing multiplier. The multiplier is
151  used for all future calls to _run(), unless it is overwritten.
152  """
153  self._lr_multiplier = lr_multiplier
154 
155  def _add_local_lr_multiplier(self, local_lr_multiplier, is_gpu_blob=False):
156  """
157  Set the local learning rate multiplier. This local multiplier is
158  multiplied with the global learning rate multiplier if it exists. As
159  with the global learning rate multiplier, this multiplier will be
160  used for all future calls to _run(), so please call
161  _clear_local_lr_multiplier() at the beginning of the optimizer's _run()
162  before optionally calling this function.
163  """
164  self._local_lr_multiplier = local_lr_multiplier
165  self._local_lr_multiplier_on_gpu = is_gpu_blob
166 
167  def _clear_local_lr_multiplier(self):
168  self._local_lr_multiplier = None
169  self._local_lr_multiplier_on_gpu = False
170 
171  @staticmethod
172  def dedup(net, sparse_dedup_aggregator, grad):
173  assert isinstance(grad, core.GradientSlice), (
174  "Dedup only works for sparse gradient, got {}".format(grad))
175  if sparse_dedup_aggregator:
176  return net.DeduplicateGradientSlices(
177  grad, aggregator=sparse_dedup_aggregator)
178  else:
179  return grad
180 
181  def get_auxiliary_parameters(self):
182  """Returns a list of auxiliary parameters.
183 
184  Returns:
185  aux_params: A namedtuple, AuxOptimizerParams.
186 
187  aux_params.local stores a list of blobs. Each blob is a local
188  auxiliary parameter. A local auxiliary parameter is a parameter in
189  parallel to a learning rate parameter. Take Adagrad as an example:
190  the local auxiliary parameter is the squared-sum parameter, because
191  every learning rate has a squared sum associated with it.
192 
193  aux_params.shared also stores a list of blobs. Each blob is a shared
194  auxiliary parameter. A shared auxiliary parameter is a parameter
195  that is shared across all the learning rate parameters. Take Adam as
196  an example: the iteration parameter is a shared parameter, because
197  all the learning rates share the same iteration parameter.
198  """
199  return self._aux_params
200 
201  # TODO(xlwang): In transfer learning, parameters initialized from a
202  # pretrained model might require a different learning rate than parameters
203  # initialized otherwise. To this end, we implement a Python solution here:
204  # `base_learning_rate` is scaled by `scale` via `scale_learning_rate`.
205  # Alternatively, the same effect could be achieved by rewriting the
206  # LearningRate operator in C++.
207  # Note that it is the responsibility of each specific optimizer to decide
208  # what logic should be used for `scale_learning_rate`.
209  def scale_learning_rate(self, *args, **kwargs):
210  raise NotImplementedError(
211  "Optimizers need to implement the `scale_learning_rate` method.")
212 
213  def create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max):
214  wd = param_init_net.ConstantFill([], "weight_decay",
215  shape=[1], value=weight_decay)
216  trust = param_init_net.ConstantFill([], "trust", shape=[1], value=trust)
217  lr_max = param_init_net.ConstantFill([], "lr_max", shape=[1],
218  value=lr_max)
219  return wd, trust, lr_max
220 
221 
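The __call__ contract documented above can be exercised directly: an Optimizer instance is applied to a (net, param_init_net) pair together with either a ParameterInfo object or a blob/gradient pair, after which its auxiliary blobs are available via get_auxiliary_parameters(). A minimal editorial sketch (not part of optimizer.py; the blob names are made up, and SgdOptimizer is the subclass defined just below):

    from caffe2.python import core
    from caffe2.python.optimizer import SgdOptimizer

    init_net = core.Net("init")
    train_net = core.Net("train")
    w = init_net.XavierFill([], "w", shape=[16, 16])
    # assumed to be produced elsewhere, e.g. by AddGradientOperators
    w_grad = core.BlobReference("w_grad")

    opt = SgdOptimizer(base_learning_rate=0.1, momentum=0.9)
    opt(train_net, init_net, w, w_grad)      # appends the update ops for w
    aux = opt.get_auxiliary_parameters()     # momentum blob ends up in aux.local
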
222 class SgdOptimizer(Optimizer):
223  def __init__(self, base_learning_rate=0.01, policy='fixed',
224  momentum=0.0, nesterov=1, sparse_dedup_aggregator=None,
225  lars=None, **kwargs):
226  super(SgdOptimizer, self).__init__()
227  self.base_learning_rate = base_learning_rate
228  self.policy = policy
229  self.momentum = momentum
230  self.nesterov = nesterov
231  self.sparse_dedup_aggregator = sparse_dedup_aggregator
232  self.lars = lars
233  self.init_kwargs = kwargs
234 
235  def _run(self, net, param_init_net, param_info):
236  param = param_info.blob
237  grad = param_info.grad
238  if self.base_learning_rate == 0:
239  return
240  assert self.base_learning_rate > 0, (
241  "Expect positive base learning rate, got {}".format(
242  self.base_learning_rate))
243 
244  self._clear_local_lr_multiplier()
245 
246  # TODO(zqq): support LARS for sparse parameters
247  if self.lars is not None and not isinstance(grad, core.GradientSlice):
248  assert self.lars >= 0, (
249  'Lars offset must be nonnegative, got {}'.format(self.lars))
250  wd, trust, lr_max = self.create_lars_inputs(
251  param_init_net, 0.0, 1.0, np.finfo(np.float32).max)
252  lr_lars_multiplier = net.Lars(
253  [param, grad, wd, trust, lr_max],
254  self.make_unique_blob_name(str(param) + "_lars"),
255  offset=self.lars,
256  lr_min=0.0)
257  current_scope = scope.CurrentDeviceScope()
258  self._add_local_lr_multiplier(
259  lr_lars_multiplier,
260  is_gpu_blob=(current_scope is not None
261  and core.IsGPUDeviceType(current_scope.device_type)),
262  )
263 
264  # We need a negative sign for the LR when it is used directly with
265  # WeightedSum below.
266  lr_sign = -1 if self.momentum else 1
267  lr, _ = self.build_lr(
268  net, param_init_net,
269  base_learning_rate=self.base_learning_rate * lr_sign,
270  policy=self.policy,
271  **(self.init_kwargs)
272  )
273 
274  dev = scope.CurrentDeviceScope()
275  if dev is None:
276  dev = core.DeviceOption(caffe2_pb2.CPU)
277 
278  # Each GPU/CPU must have its own ONE blob, thus modify the name
279  # to include device information.
280  ONE = param_init_net.ConstantFill(
281  [],
282  "ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name),
283  shape=[1],
284  value=1.0
285  )
286 
287  self._aux_params.shared.append(ONE)
288 
289  if self.momentum > 0:
290  momentum_data = param_init_net.ConstantFill(
291  param, str(param) + "_momentum", value=0.)
292  self._aux_params.local.append(momentum_data)
293 
294  if isinstance(grad, core.GradientSlice):
295  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
296  if self.momentum > 0.:
297  net.SparseMomentumSGDUpdate(
298  [grad.values, momentum_data, lr, param, grad.indices],
299  [grad.values, momentum_data, param],
300  momentum=self.momentum,
301  nesterov=self.nesterov)
302  else:
303  net.ScatterWeightedSum(
304  [param, ONE, grad.indices, grad.values, lr],
305  param
306  )
307  else:
308  if self.momentum > 0.:
309  net.MomentumSGDUpdate(
310  [grad, momentum_data, lr, param],
311  [grad, momentum_data, param],
312  momentum=self.momentum,
313  nesterov=self.nesterov)
314  else:
315  coeff = lr
316 
317  net.WeightedSum(
318  [param, ONE, grad, coeff],
319  param
320  )
321 
322  def scale_learning_rate(self, scale):
323  self.base_learning_rate *= scale
324  return
325 
326 
327 class MultiPrecisionSgdOptimizer(SgdOptimizer):
328  def __init__(self, base_learning_rate=0.1, momentum=0.0,
329  policy="fixed", nesterov=1, sparse_dedup_aggregator=None,
330  **kwargs):
331  super(MultiPrecisionSgdOptimizer, self).__init__(
332  base_learning_rate=base_learning_rate,
333  policy=policy,
334  momentum=momentum,
335  nesterov=nesterov,
336  sparse_dedup_aggregator=sparse_dedup_aggregator,
337  **kwargs
338  )
339 
340  def _run(self, net, param_init_net, param_info):
341  param = param_info.blob
342  param_fp32 = param_info.blob_copy[core.DataType.FLOAT] \
343  if param_info.blob_copy is not None else None
344 
345  # If we have a straight fp32 parameter, run the base class
346  if param_fp32 is None:
347  return SgdOptimizer._run(self, net, param_init_net, param_info)
348 
349  grad = param_info.grad
350  if self.base_learning_rate == 0:
351  return
352  assert self.base_learning_rate > 0, (
353  "Expect positive base learning rate, got {}".format(
354  self.base_learning_rate))
355 
356  lr, _ = self.build_lr(
357  net, param_init_net,
358  base_learning_rate=-self.base_learning_rate,
359  policy=self.policy,
360  **(self.init_kwargs)
361  )
362 
363  momentum_data = param_init_net.ConstantFill(
364  param_fp32, str(param) + "_momentum", value=0.)
365  self._aux_params.local.append(momentum_data)
366 
367  assert not isinstance(grad, core.GradientSlice), (
368  "MultiPrecisionSgd does not support sparse gradients")
369 
370  # Copy gradient to fp32
371  grad_fp32 = net.HalfToFloat(grad, grad + "_fp32")
372 
373  # update (fused) in fp32
374  net.MomentumSGDUpdate(
375  [grad_fp32, momentum_data, lr, param_fp32],
376  [grad_fp32, momentum_data, param_fp32],
377  momentum=self.momentum,
378  nesterov=self.nesterov)
379 
380  # Copy updated param back to fp16
381  net.FloatToHalf(param_fp32, param)
382 
383 
384 class FP16SgdOptimizer(SgdOptimizer):
385  def __init__(self, base_learning_rate=0.1, momentum=0.0,
386  policy="fixed", nesterov=1, weight_decay=0.0001,
387  sparse_dedup_aggregator=None,
388  **kwargs):
389  super(FP16SgdOptimizer, self).__init__(
390  base_learning_rate=base_learning_rate,
391  policy=policy,
392  momentum=momentum,
393  nesterov=nesterov,
394  sparse_dedup_aggregator=sparse_dedup_aggregator,
395  **kwargs
396  )
397  self.weight_decay = weight_decay
398 
399  def _run(self, net, param_init_net, param_info, fp32_update=False):
400 
401  fp32_update_flag = 0
402  param_name = str(param_info.blob)
403 
404  # should only be triggered in FP16 training by SpatialBN, which
405  # requires FP32 params in CuDNN.
406  if param_name.find("spatbn") != -1:
407  fp32_update = True
408 
409  if fp32_update:
410  # doing a 32bit update
411  # Have to assume param_info.blob is FP32 as there is no way
412  # (that I currently know of) to query a blob's type in Python
413  fp32_update_flag = 1
414  param = param_info.blob
415  param_fp32 = param_info.blob
416  else:
417  if param_info.blob_copy is None:
418  # doing a 32bit update
419  # Have to assume param_info.blob is FP32 as there is no way
420  # (that I currently know of) to query a blob's type in Python
421  fp32_update_flag = 1
422  param = param_info.blob
423  param_fp32 = param_info.blob
424  else:
425  if core.DataType.FLOAT in param_info.blob_copy:
426  param = param_info.blob
427  param_fp32 = param_info.blob_copy[core.DataType.FLOAT]
428  elif core.DataType.FLOAT16 in param_info.blob_copy:
429  param = param_info.blob_copy[core.DataType.FLOAT16]
430  param_fp32 = param_info.blob
431  else:
432  assert (False), (
433  "Unrecognized parameter format to be updated "
434  "by FP16 Optimizer. Parameter: {}".format(param_info.name)
435  )
436 
437  grad = param_info.grad
438 
439  if self.base_learning_rate == 0:
440  return
441  assert self.base_learning_rate > 0, (
442  "Expect positive base learning rate, got {}".format(
443  self.base_learning_rate))
444 
445  lr, _ = self.build_lr(
446  net, param_init_net,
447  base_learning_rate=-self.base_learning_rate,
448  policy=self.policy,
449  **(self.init_kwargs)
450  )
451 
452  momentum_data_fp32 = param_init_net.ConstantFill(
453  param_fp32, str(param) + "_momentum_fp32", value=0.)
454 
455  momentum_data = param_init_net.FloatToHalf(
456  momentum_data_fp32, str(param) + "_momentum")
457 
458  self._aux_params.local.append(momentum_data)
459 
460  assert not isinstance(grad, core.GradientSlice), (
461  "FP16Sgd does not support sparse gradients")
462 
463  if fp32_update_flag == 0:
464  net.FP16MomentumSGDUpdate(
465  [grad, momentum_data, lr, param],
466  [grad, momentum_data, param],
467  momentum=self.momentum,
468  nesterov=self.nesterov,
469  weight_decay=self.weight_decay)
470  else:
471  # flag set to 1, therefore doing FP32 update
472  net.FP32MomentumSGDUpdate(
473  [grad, momentum_data_fp32, lr, param],
474  [grad, momentum_data_fp32, param],
475  momentum=self.momentum,
476  nesterov=self.nesterov,
477  weight_decay=self.weight_decay)
478 
479 
480 class WeightDecayBuilder(Optimizer):
481  def __init__(self, weight_decay):
482  self.weight_decay = weight_decay
483 
484  def _run(self, net, param_init_net, param_info):
485  dev = scope.CurrentDeviceScope()
486  if dev is None:
487  dev = core.DeviceOption(caffe2_pb2.CPU)
488 
489  ONE = param_init_net.ConstantFill(
490  [],
491  "ONE_{}_{}".format(dev.device_type, dev.device_id),
492  shape=[1],
493  value=1.0
494  )
495  WD = param_init_net.ConstantFill(
496  [], "wd_{}_{}".format(dev.device_type, dev.device_id),
497  shape=[1], value=self.weight_decay
498  )
499 
500  if isinstance(param_info.grad, core.GradientSlice):
501  raise ValueError(
502  "Weight decay does not yet support sparse gradients")
503  else:
504  net.WeightedSum(
505  [param_info.grad, ONE, param_info.blob, WD],
506  param_info.grad,
507  )
508 
509 
510 class AdagradOptimizer(Optimizer):
511  def __init__(self, alpha=0.01, epsilon=1e-4, decay=1, policy="fixed",
512  sparse_dedup_aggregator=None, rowWise=False, engine='',
513  lars=None, output_effective_lr=False,
514  output_effective_lr_and_update=False, **kwargs):
515  super(AdagradOptimizer, self).__init__()
516  self.alpha = alpha
517  self.epsilon = epsilon
518  self.decay = decay
519  self.policy = policy
520  self.sparse_dedup_aggregator = sparse_dedup_aggregator
521  self.rowWise = rowWise
522  self.engine = engine
523  self.lars = lars
524  self.output_effective_lr = output_effective_lr
525  self.output_effective_lr_and_update = output_effective_lr_and_update
526  self.init_kwargs = kwargs
527 
528  def _run(self, net, param_init_net, param_info):
529  param = param_info.blob
530  grad = param_info.grad
531 
532  if self.alpha <= 0:
533  return
534 
535  self._clear_local_lr_multiplier()
536 
537  if self.lars is not None and not isinstance(grad, core.GradientSlice):
538  assert self.lars >= 0, (
539  'Lars offset must be nonnegative, got {}'.format(self.lars))
540  wd, trust, lr_max = self.create_lars_inputs(
541  param_init_net, 0.0, 1.0, np.finfo(np.float32).max)
542  lr_lars_multiplier = net.Lars(
543  [param, grad, wd, trust, lr_max],
544  self.make_unique_blob_name(str(param) + "_lars"),
545  offset=self.lars,
546  lr_min=0.0)
547 
548  current_scope = scope.CurrentDeviceScope()
549  self._add_local_lr_multiplier(
550  lr_lars_multiplier,
551  is_gpu_blob=(current_scope is not None
552  and core.IsGPUDeviceType(current_scope.device_type)),
553  )
554 
555  lr, _ = self.build_lr(
556  net, param_init_net,
557  base_learning_rate=self.alpha,
558  policy=self.policy,
559  **(self.init_kwargs)
560  )
561 
562  if self.rowWise:
563  logger.info("Using engine {} for rowWise Adagrad".format(self.engine))
564 
565  shapes, types = workspace.InferShapesAndTypes([param_init_net])
566  if str(param) not in shapes:
567  # Type/shape inference is not available for this param; fall back
568  # on Shape/Slice logic
569  shape = param_init_net.Shape(param, str(param) + "_shape")
570  num_rows = param_init_net.Slice(
571  [shape],
572  str(shape) + "_numrows",
573  starts=[0], ends=[1]
574  )
575  param_squared_sum = param_init_net.ConstantFill(
576  num_rows,
577  str(param) + "_avg_squared_sum",
578  input_as_shape=1,
579  value=0.0
580  )
581  else:
582  param_squared_sum = param_init_net.ConstantFill(
583  [],
584  str(param) + "_avg_squared_sum",
585  shape=[shapes[str(param)][0]],
586  value=0.0
587  )
588  else:
589  logger.info("Using engine {} for regular Adagrad".format(self.engine))
590 
591  if self.engine in FP16_ENGINES:
592  shapes, types = workspace.InferShapesAndTypes([param_init_net])
593  assert str(param) in shapes, shapes
594  shape = shapes[str(param)]
595 
596  param_squared_sum = param_init_net.Float16ConstantFill(
597  [],
598  str(param) + "_squared_sum",
599  value=0.0,
600  shape=shape,
601  )
602  else:
603  param_squared_sum = param_init_net.ConstantFill(
604  [param],
605  str(param) + "_squared_sum",
606  value=0.0
607  )
608 
609  self._aux_params.local.append(param_squared_sum)
610 
611  if self.rowWise:
612  assert isinstance(grad, core.GradientSlice),\
613  'For SparseAdagrad with rowWise=True, the gradient must be '\
614  'a GradientSlice. Please ensure that rowWise is not enabled '\
615  'for the dense Adagrad optimizer, as it is not supported.'
616  if isinstance(grad, core.GradientSlice):
617  assert self.decay == 1.,\
618  'Decay is not implemented for SparseAdagrad and must be set to 1'
619  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
620  if self.rowWise:
621  op = 'RowWiseSparseAdagrad'
622  else:
623  op = 'SparseAdagrad'
624  net.__getattr__(op)(
625  [param, param_squared_sum, grad.indices, grad.values, lr],
626  [param, param_squared_sum],
627  epsilon=self.epsilon,
628  engine=self.engine,
629  )
630  else:
631  output_args = [param, param_squared_sum]
632  if self.output_effective_lr_and_update:
633  output_args.append(str(param) + '_effective_lr')
634  output_args.append(str(param) + '_update')
635  elif self.output_effective_lr:
636  output_args.append(str(param) + '_effective_lr')
637 
638  net.Adagrad(
639  [param, param_squared_sum, grad, lr],
640  output_args,
641  epsilon=self.epsilon,
642  decay=float(self.decay),
643  engine=self.engine
644  )
645 
646  def scale_learning_rate(self, scale):
647  self.alpha *= scale
648  return
649 
650 
651 class WngradOptimizer(Optimizer):
652  def __init__(self, alpha=1.0, epsilon=1e-9, policy="fixed",
653  sparse_dedup_aggregator=None, engine='', moment_init=100.0,
654  lars=None, output_effective_lr=False,
655  output_effective_lr_and_update=False, **kwargs):
656  super(WngradOptimizer, self).__init__()
657  self.alpha = alpha
658  self.epsilon = epsilon
659  self.policy = policy
660  self.sparse_dedup_aggregator = sparse_dedup_aggregator
661  self.engine = engine
662  self.moment_init = moment_init
663  self.lars = lars
664  self.output_effective_lr = output_effective_lr
665  self.output_effective_lr_and_update = output_effective_lr_and_update
666  self.init_kwargs = kwargs
667 
668  def _run(self, net, param_init_net, param_info):
669  param = param_info.blob
670  grad = param_info.grad
671 
672  if self.alpha <= 0:
673  return
674 
675  self._clear_local_lr_multiplier()
676 
677  if self.lars is not None and not isinstance(grad, core.GradientSlice):
678  assert self.lars >= 0, (
679  'Lars offset must be nonnegative, got {}'.format(self.lars))
680  wd, trust, lr_max = self.create_lars_inputs(
681  param_init_net, 0.0, 1.0, np.finfo(np.float32).max)
682  lr_lars_multiplier = net.Lars(
683  [param, grad, wd, trust, lr_max],
684  self.make_unique_blob_name(str(param) + "_lars"),
685  offset=self.lars,
686  lr_min=0.0)
687  current_scope = scope.CurrentDeviceScope()
688  self._add_local_lr_multiplier(
689  lr_lars_multiplier,
690  is_gpu_blob=(current_scope is not None
691  and core.IsGPUDeviceType(current_scope.device_type)),
692  )
693 
694  lr, _ = self.build_lr(
695  net, param_init_net,
696  base_learning_rate=self.alpha,
697  policy=self.policy,
698  **(self.init_kwargs)
699  )
700 
701  moment = param_init_net.ConstantFill(
702  [],
703  str(param) + "_moment",
704  shape=[1],
705  value=self.moment_init
706  )
707 
708  self._aux_params.local.append(moment)
709 
710  if isinstance(grad, core.GradientSlice):
711  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
712  net.SparseWngrad(
713  [param, moment, grad.indices, grad.values, lr],
714  [param, moment],
715  epsilon=self.epsilon,
716  engine=self.engine
717  )
718  else:
719  output_args = [param, moment]
720  if self.output_effective_lr_and_update:
721  output_args.append(str(param) + '_effective_lr')
722  output_args.append(str(param) + '_update')
723  elif self.output_effective_lr:
724  output_args.append(str(param) + '_effective_lr')
725 
726  net.Wngrad(
727  [param, moment, grad, lr],
728  output_args,
729  epsilon=self.epsilon,
730  engine=self.engine
731  )
732 
733  def scale_learning_rate(self, scale):
734  self.alpha *= scale
735  return
736 
737 
738 class AdadeltaOptimizer(Optimizer):
739  def __init__(self, alpha=0.01, epsilon=1e-4, decay=0.95, policy="fixed",
740  sparse_dedup_aggregator=None, engine='', **kwargs):
741  """Constructor function to add Adadelta Optimizer
742 
743  Args:
744  alpha: learning rate
745  epsilon: attribute of Adadelta to avoid numerical issues
746  decay: attribute of Adadelta to decay the squared gradient sum
747  policy: specifies how learning rate should be applied, options are
748  "fixed", "step", "exp", etc.
749  sparse_dedup_aggregator: specifies deduplication strategy for
750  gradient slices. Works while using sparse gradients. Options
751  include "mean" and "sum".
752  engine: the engine used, options include "", "CUDNN", etc.
753  """
754  super(AdadeltaOptimizer, self).__init__()
755  self.alpha = alpha
756  self.epsilon = epsilon
757  self.decay = decay
758  self.policy = policy
759  self.sparse_dedup_aggregator = sparse_dedup_aggregator
760  self.engine = engine
761  self.init_kwargs = kwargs
762 
763  def _run(self, net, param_init_net, param_info):
764  param = param_info.blob
765  grad = param_info.grad
766 
767  if self.alpha <= 0:
768  return
769 
770  lr, _ = self.build_lr(
771  net, param_init_net,
772  base_learning_rate=self.alpha,
773  policy=self.policy,
774  **(self.init_kwargs)
775  )
776 
777  moment = param_init_net.ConstantFill(
778  [param], str(param) + "_squared_moment", value=0.0)
779 
780  moment_update = param_init_net.ConstantFill(
781  [param], str(param) + "_squared_moment_update", value=0.0)
782 
783  self._aux_params.local.append(moment)
784  self._aux_params.local.append(moment_update)
785 
786  if isinstance(grad, core.GradientSlice):
787  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
788  net.SparseAdadelta(
789  [
790  param, moment, moment_update, grad.indices,
791  grad.values, lr
792  ], [param, moment, moment_update],
793  epsilon=self.epsilon,
794  decay=self.decay,
795  engine=self.engine)
796  else:
797  net.Adadelta(
798  [param, moment, moment_update, grad, lr],
799  [param, moment, moment_update],
800  epsilon=self.epsilon,
801  decay=self.decay,
802  engine=self.engine
803  )
804 
805  def scale_learning_rate(self, scale):
806  self.alpha *= scale
807  return
808 
809 
810 class FtrlOptimizer(Optimizer):
811  def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
812  sparse_dedup_aggregator=None, engine=''):
813  super(FtrlOptimizer, self).__init__()
814  self.alpha = alpha
815  self.beta = beta
816  self.lambda1 = lambda1
817  self.lambda2 = lambda2
818  self.sparse_dedup_aggregator = sparse_dedup_aggregator
819  self.engine = engine
820 
821  def _run(self, net, param_init_net, param_info):
822  param = param_info.blob
823  grad = param_info.grad
824 
825  if self.alpha <= 0:
826  return
827 
828  nz = param_init_net.ConstantFill(
829  [param],
830  str(param) + "_ftrl_nz",
831  extra_shape=[2],
832  value=0.0
833  )
834  self._aux_params.local.append(nz)
835  if isinstance(grad, core.GradientSlice):
836  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
837  net.SparseFtrl(
838  [param, nz, grad.indices, grad.values],
839  [param, nz],
840  engine=self.engine,
841  alpha=self.alpha,
842  beta=self.beta,
843  lambda1=self.lambda1,
844  lambda2=self.lambda2
845  )
846  else:
847  net.Ftrl(
848  [param, nz, grad],
849  [param, nz],
850  engine=self.engine,
851  alpha=self.alpha,
852  beta=self.beta,
853  lambda1=self.lambda1,
854  lambda2=self.lambda2
855  )
856 
857  def scale_learning_rate(self, scale):
858  self.alpha *= scale
859  return
860 
861 
862 class GFtrlOptimizer(Optimizer):
863  """Group Lasso FTRL Optimizer."""
864 
865  def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
866  sparse_dedup_aggregator=None, engine=''):
867  super(GFtrlOptimizer, self).__init__()
868  self.alpha = alpha
869  self.beta = beta
870  self.lambda1 = lambda1
871  self.lambda2 = lambda2
872  self.sparse_dedup_aggregator = sparse_dedup_aggregator
873  self.engine = engine
874 
875  def _run(self, net, param_init_net, param_info):
876  param = param_info.blob
877  grad = param_info.grad
878 
879  if self.alpha <= 0:
880  return
881 
882  nz = param_init_net.ConstantFill(
883  [param],
884  str(param) + "_gftrl_nz",
885  extra_shape=[2],
886  value=0.0
887  )
888  self._aux_params.local.append(nz)
889  net.GFtrl(
890  [param, nz, grad],
891  [param, nz],
892  engine=self.engine,
893  alpha=self.alpha,
894  beta=self.beta,
895  lambda1=self.lambda1,
896  lambda2=self.lambda2
897  )
898 
899  def scale_learning_rate(self, scale):
900  self.alpha *= scale
901  return
902 
903 
904 class AdamOptimizer(Optimizer):
905  def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
906  policy='fixed', use_lr_adaption=False, lr_alpha=0.01,
907  normalized_lr_adaption=True, sparse_dedup_aggregator=None,
908  rowWise=False, engine='', **kwargs):
909  super(AdamOptimizer, self).__init__()
910  self.alpha = alpha
911  self.beta1 = beta1
912  self.beta2 = beta2
913  self.epsilon = epsilon
914  self.policy = policy
915  self.use_lr_adaption = use_lr_adaption
916  self.lr_alpha = lr_alpha
917  self.normalized_lr_adaption = normalized_lr_adaption
918  self.sparse_dedup_aggregator = sparse_dedup_aggregator
919  self.rowWise = rowWise
920  self.engine = engine
921  self.init_kwargs = kwargs
922 
923  def _run(self, net, param_init_net, param_info):
924  param = param_info.blob
925  grad = param_info.grad
926 
927  if self.alpha <= 0:
928  return
929 
930  lr, iteration = self.build_lr(
931  net, param_init_net,
932  base_learning_rate=self.alpha,
933  policy=self.policy,
934  **(self.init_kwargs)
935  )
936 
937  m1 = param_init_net.ConstantFill(
938  [param],
939  param + "_first_moment",
940  value=0.0
941  )
942 
943  if self.rowWise:
944  shapes, types = workspace.InferShapesAndTypes([param_init_net])
945  m2 = param_init_net.ConstantFill(
946  [],
947  param + "_avg_second_moment",
948  shape=[shapes[param][0]],
949  value=0.0
950  )
951  else:
952  m2 = param_init_net.ConstantFill(
953  [param],
954  param + "_second_moment",
955  value=0.0
956  )
957 
958  self._aux_params.shared.append(iteration)
959  self._aux_params.local.append(m1)
960  self._aux_params.local.append(m2)
961 
962  if self.rowWise:
963  assert isinstance(grad, core.GradientSlice),\
964  'For SparseAdam with rowWise=True, the gradient must be '\
965  'a GradientSlice. Please ensure that rowWise is not enabled '\
966  'for the dense Adam optimizer, as it is not supported.'
967 
968  output_blobs = [param, m1, m2]
969  if self.use_lr_adaption:
970  effective_grad = str(param) + '_effective_grad'
971  output_blobs.append(effective_grad)
972 
973  if isinstance(grad, core.GradientSlice):
974  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
975  if self.rowWise:
976  op = 'RowWiseSparseAdam'
977  else:
978  op = 'SparseAdam'
979 
980  net.__getattr__(op)(
981  [param, m1, m2, grad.indices, grad.values, lr, iteration],
982  output_blobs,
983  beta1=self.beta1,
984  beta2=self.beta2,
985  epsilon=self.epsilon)
986  if self.use_lr_adaption:
987  net.LearningRateAdaption(
988  [lr, grad.values, effective_grad],
989  [lr],
990  lr_alpha=self.lr_alpha,
991  normalized_lr_adaption=self.normalized_lr_adaption)
992 
993  else:
994  net.Adam(
995  [param, m1, m2, grad, lr, iteration],
996  output_blobs,
997  beta1=self.beta1,
998  beta2=self.beta2,
999  epsilon=self.epsilon)
1000  if self.use_lr_adaption:
1001  net.LearningRateAdaption(
1002  [lr, grad, effective_grad],
1003  [lr],
1004  lr_alpha=self.lr_alpha,
1005  normalized_lr_adaption=self.normalized_lr_adaption)
1006 
1007  def scale_learning_rate(self, scale):
1008  self.alpha *= scale
1009  return
1010 
1011 
1012 class YellowFinOptimizer(Optimizer):
1013  """YellowFin: An automatic tuner for momentum SGD
1014 
1015  See https://arxiv.org/abs/1706.03471 for more details. This implementation
1016  has a separate learning rate and momentum for each parameter."""
1017 
1018  def __init__(self,
1019  alpha=0.1,
1020  mu=0.0,
1021  beta=0.999,
1022  curv_win_width=20,
1023  zero_debias=True,
1024  epsilon=0.1**6,
1025  policy='fixed',
1026  sparse_dedup_aggregator=None,
1027  **kwargs):
1028  super(YellowFinOptimizer, self).__init__()
1029  self.alpha = alpha
1030  self.mu = mu
1031  self.beta = beta
1032  self.curv_win_width = curv_win_width
1033  self.zero_debias = zero_debias
1034  self.epsilon = epsilon
1035  self.policy = policy
1036  self.sparse_dedup_aggregator = sparse_dedup_aggregator
1037  self.init_kwargs = kwargs
1038 
1039  def _run(self, net, param_init_net, param_info):
1040 
1041  # Note: This is the number of persistent scalars used by the YellowFin
1042  # optimizer. It should always match the number of scalars used by the
1043  # YellowFin operator implementation.
1044  SCALARS_MEMORY_SIZE = 5
1045 
1046  param = param_info.blob
1047  grad = param_info.grad
1048  moment = param_init_net.ConstantFill(
1049  [param],
1050  param + "_moment",
1051  value=0.0
1052  )
1053  curv_win = param_init_net.ConstantFill(
1054  [],
1055  param + "_curv_win",
1056  shape=[self.curv_win_width],
1057  value=0.0
1058  )
1059  g_avg = param_init_net.ConstantFill(
1060  [param],
1061  param + "_g_avg",
1062  value=0.0
1063  )
1064  g2_avg = param_init_net.ConstantFill(
1065  [param],
1066  param + "_g2_avg",
1067  value=0.0
1068  )
1069  lr_avg = param_init_net.ConstantFill(
1070  [],
1071  param + "_lr_avg",
1072  shape=[1],
1073  value=self.alpha
1074  )
1075  mu_avg = param_init_net.ConstantFill(
1076  [],
1077  param + "_mu_avg",
1078  shape=[1],
1079  value=self.mu
1080  )
1081  scalars_memory = param_init_net.ConstantFill(
1082  [],
1083  param + "_scalars_memory",
1084  shape=[SCALARS_MEMORY_SIZE],
1085  value=0.0
1086  )
1087 
1088  assert self.alpha > 0
1089  assert not isinstance(grad, core.GradientSlice), \
1090  "YellowFin does not support sparse gradients"
1091 
1092  iteration = utils.BuildUniqueMutexIter(
1093  param_init_net,
1094  net,
1095  iter_val=0
1096  )
1097 
1098  self._aux_params.shared.append(iteration)
1099  self._aux_params.local.append(moment)
1100  self._aux_params.local.append(lr_avg)
1101  self._aux_params.local.append(mu_avg)
1102  self._aux_params.local.append(curv_win)
1103  self._aux_params.local.append(g_avg)
1104  self._aux_params.local.append(g2_avg)
1105  self._aux_params.local.append(scalars_memory)
1106 
1107  yf_in_out_args = [
1108  param,
1109  moment,
1110  lr_avg,
1111  mu_avg,
1112  curv_win,
1113  g_avg,
1114  g2_avg,
1115  scalars_memory
1116  ]
1117 
1118  net.YellowFin(
1119  yf_in_out_args + [grad, iteration],
1120  yf_in_out_args,
1121  beta=self.beta,
1122  epsilon=self.epsilon,
1123  curv_win_width=self.curv_win_width,
1124  zero_debias=self.zero_debias)
1125 
1126  def scale_learning_rate(self, scale):
1127  self.alpha *= scale
1128  return
1129 
1130 
1131 class RmsPropOptimizer(Optimizer):
1132  def __init__(
1133  self,
1134  alpha=0.01,
1135  decay=0.9,
1136  momentum=0.0,
1137  epsilon=1e-5,
1138  policy='fixed',
1139  engine='',
1140  **kwargs
1141  ):
1142  super(RmsPropOptimizer, self).__init__()
1143  self.alpha = alpha
1144  self.decay = decay
1145  self.momentum = momentum
1146  self.epsilon = epsilon
1147  self.policy = policy
1148  self.engine = engine
1149  self.init_kwargs = kwargs
1150 
1151  def _run(self, net, param_init_net, param_info):
1152  param = param_info.blob
1153  grad = param_info.grad
1154 
1155  assert self.alpha > 0
1156  assert not isinstance(grad, core.GradientSlice), \
1157  "RmsPropOptimizer doesn't support sparse gradients"
1158 
1159  dev = scope.CurrentDeviceScope()
1160  if dev is None:
1161  dev = core.DeviceOption(caffe2_pb2.CPU)
1162 
1163  ONE = param_init_net.ConstantFill(
1164  [],
1165  "ONE_{}_{}".format(dev.device_type, dev.device_id),
1166  shape=[1],
1167  value=1.0
1168  )
1169 
1170  lr, _ = self.build_lr(
1171  net,
1172  param_init_net,
1173  base_learning_rate=-self.alpha,
1174  policy=self.policy,
1175  **(self.init_kwargs)
1176  )
1177 
1178  grad_o = param_init_net.ConstantFill(
1179  [param],
1180  str(param) + "_grad_o",
1181  value=0.0,
1182  )
1183 
1184  ms = param_init_net.ConstantFill(
1185  [param],
1186  str(param) + "_mean_squares",
1187  value=0.0,
1188  )
1189 
1190  mom = param_init_net.ConstantFill(
1191  [param],
1192  str(param) + "_momentum",
1193  value=0.0,
1194  )
1195 
1196  self._aux_params.local.append(ms)
1197  self._aux_params.local.append(mom)
1198 
1199  net.RmsProp(
1200  [grad, ms, mom, ONE],
1201  [grad_o, ms, mom],
1202  decay=self.decay,
1203  momentum=self.momentum,
1204  epsilon=self.epsilon,
1205  engine=self.engine,
1206  )
1207 
1208  net.MomentumSGDUpdate(
1209  [grad_o, mom, lr, param],
1210  [grad_o, mom, param],
1211  )
1212 
1213  def scale_learning_rate(self, scale):
1214  self.alpha *= scale
1215  return
1216 
1217 
1218 def _get_param_to_device(model):
1219  # Infer blob devices by going through the net and param_init_net
1220  # ops and observing the device used to create or use the blob.
1221  param_to_device = core.InferBlobDevices(model.net)
1222  param_to_device.update(core.InferBlobDevices(model.param_init_net))
1223  return param_to_device
1224 
1225 
1226 def get_param_device(param_name, grad, param_to_device=None, default_device=None):
1227  device = default_device
1228  param_to_device = param_to_device or {}
1229  # We first check if the parameter's device has been inferred. If not,
1230  # we check the gradient. This can happen if the parameter is not the
1231  # output of any op but was created by a FetchBlob.
1232  if param_name in param_to_device:
1233  device = param_to_device[param_name]
1234  else:
1235  if isinstance(grad, core.GradientSlice):
1236  grad = grad
1237  if str(grad.values) in param_to_device:
1238  device = param_to_device[str(grad.values)]
1239  elif str(grad.indices) in param_to_device:
1240  device = param_to_device[str(grad.indices)]
1241  else:
1242  grad_name = str(grad)
1243  if grad_name in param_to_device:
1244  device = param_to_device[grad_name]
1245 
1246  assert device is not None,\
1247  "Cannot infer device for {}: no op creates it".format(param_name)
1248  return device
1249 
1250 
1251 def get_lr_injection():
1252  """
1253  Gets current value for lr_injection, a multiplier for all base
1254  learning rates.
1255  Must set allow_lr_injection=True when building optimizer, as it
1256  relies on synchronization over CPU.
1257  """
1258  return workspace.FetchBlob(_LEARNING_RATE_INJECTION)
1259 
1260 
1261 def set_lr_injection(lr_injection_value):
1262  """
1263  Sets lr_injection, a multiplier for all base learning rates.
1264  Must set allow_lr_injection=True when building optimizer, as it
1265  relies on synchronization over CPU.
1266  """
1267  workspace.FeedBlob(
1268  _LEARNING_RATE_INJECTION,
1269  np.array(
1270  [float(lr_injection_value)],
1271  dtype=np.float32,
1272  ),
1273  )
1274 
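A usage sketch for the two helpers above (editorial example, not in the source): it assumes `model` is a ModelHelper whose gradients have been added and whose optimizer was built with allow_lr_injection=True (see _build below).

    build_sgd(model, base_learning_rate=0.1, allow_lr_injection=True)
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    set_lr_injection(0.5)         # halve every base learning rate
    current = get_lr_injection()  # 1-element float32 array
    workspace.RunNet(model.net)
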
1275 
1276 def _calc_norm_ratio(
1277  model, params, name_scope, param_to_device, max_gradient_norm
1278 ):
1279  with core.NameScope(name_scope):
1280  grad_squared_sums = []
1281  for i, param in enumerate(params):
1282  device = get_param_device(
1283  str(param.blob), param.grad, param_to_device
1284  )
1285 
1286  with core.DeviceScope(device):
1287  grad = (
1288  param.grad
1289  if not isinstance(
1290  param.grad,
1291  core.GradientSlice,
1292  ) else param.grad.values
1293  )
1294 
1295  grad_squared_sum_name = 'grad_{}_squared_sum'.format(i)
1296  grad_squared_sum = model.net.SumSqrElements(
1297  grad,
1298  grad_squared_sum_name,
1299  )
1300  grad_squared_sum_cpu = model.net.EnsureCPUOutput(
1301  grad_squared_sum
1302  )
1303  grad_squared_sums.append(grad_squared_sum_cpu)
1304 
1305  with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
1306  grad_squared_full_sum = model.net.Sum(
1307  grad_squared_sums,
1308  'grad_squared_full_sum',
1309  )
1310  global_norm = model.net.Pow(
1311  grad_squared_full_sum,
1312  'global_norm',
1313  exponent=0.5,
1314  )
1315  clip_norm = model.param_init_net.ConstantFill(
1316  [],
1317  'clip_norm',
1318  shape=[],
1319  value=float(max_gradient_norm),
1320  )
1321  max_norm = model.net.Max(
1322  [global_norm, clip_norm],
1323  'max_norm',
1324  )
1325  norm_ratio = model.net.Div(
1326  [clip_norm, max_norm],
1327  'norm_ratio',
1328  )
1329  return norm_ratio
1330 
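_calc_norm_ratio implements standard global-norm gradient clipping: with g the concatenation of all gradients, the multiplier is max_gradient_norm / max(||g||_2, max_gradient_norm), so learning rates are scaled down only when the global gradient norm exceeds the threshold. A NumPy sketch of the same arithmetic (illustrative only, not part of the source):

    import numpy as np

    def norm_ratio(grads, max_gradient_norm):
        # grads: list of numpy arrays holding the gradient values
        global_norm = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
        return max_gradient_norm / max(global_norm, max_gradient_norm)
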
1331 
1332 def _build(
1333  model,
1334  optimizer,
1335  weights_only=False,
1336  use_param_info_optim=True,
1337  max_gradient_norm=None,
1338  allow_lr_injection=False,
1339 ):
1340  param_to_device = _get_param_to_device(model)
1341 
1342  # Validate there are no duplicate params
1343  model.Validate()
1344 
1345  params = []
1346  for param_info in model.GetOptimizationParamInfo():
1347  if weights_only and param_info.blob not in model.weights:
1348  continue
1349  params.append(param_info)
1350 
1351  lr_multiplier = None
1352  if max_gradient_norm is not None:
1353  lr_multiplier = _calc_norm_ratio(
1354  model,
1355  params,
1356  'norm_clipped_grad_update',
1357  param_to_device,
1358  max_gradient_norm,
1359  )
1360 
1361  if allow_lr_injection:
1362  if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION):
1363  lr_injection = model.param_init_net.ConstantFill(
1364  [],
1365  _LEARNING_RATE_INJECTION,
1366  shape=[1],
1367  value=1.0,
1368  )
1369  else:
1370  lr_injection = _LEARNING_RATE_INJECTION
1371 
1372  if lr_multiplier is None:
1373  lr_multiplier = lr_injection
1374  else:
1375  lr_multiplier = model.net.Mul(
1376  [lr_multiplier, lr_injection],
1377  'lr_multiplier',
1378  broadcast=1,
1379  )
1380  optimizer.add_lr_multiplier(lr_multiplier)
1381 
1382  for param_info in params:
1383  param_name = str(param_info.blob)
1384  device = get_param_device(param_name, param_info.grad, param_to_device)
1385  with core.DeviceScope(device):
1386  if param_info.optimizer and use_param_info_optim:
1387  param_info.optimizer(
1388  model.net, model.param_init_net, param_info)
1389  else:
1390  optimizer(model.net, model.param_init_net, param_info)
1391  return optimizer
1392 
1393 
1394 def add_weight_decay(model, weight_decay):
1395  """Adds a decay to weights in the model.
1396 
1397  This is a form of L2 regularization.
1398 
1399  Args:
1400  weight_decay: strength of the regularization
1401  """
1402  _build(
1403  model,
1404  WeightDecayBuilder(weight_decay=weight_decay),
1405  weights_only=True,
1406  use_param_info_optim=False,
1407  )
1408 
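In effect, WeightDecayBuilder rewrites each gradient as grad <- grad + weight_decay * param before the parameter update, which is the usual L2 penalty. A usage sketch (editorial example; `model` is assumed to be a ModelHelper with gradients already added):

    add_weight_decay(model, weight_decay=1e-4)
    build_sgd(model, base_learning_rate=0.01, momentum=0.9)
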
1409 
1410 def build_sgd(
1411  model,
1412  base_learning_rate,
1413  max_gradient_norm=None,
1414  allow_lr_injection=False,
1415  **kwargs
1416 ):
1417  sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
1418  return _build(
1419  model,
1420  sgd_optimizer,
1421  max_gradient_norm=max_gradient_norm,
1422  allow_lr_injection=allow_lr_injection,
1423  )
1424 
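Extra keyword arguments passed to build_sgd are stored in the optimizer's init_kwargs and forwarded to the LearningRate operator by build_lr, so policy-specific arguments can be supplied directly. A sketch (editorial example; `model` as above, with "step", stepsize and gamma being standard LearningRate arguments):

    build_sgd(
        model,
        base_learning_rate=0.1,
        momentum=0.9,
        nesterov=1,
        policy="step",
        stepsize=10000,
        gamma=0.5,
    )
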
1425 
1426 def build_multi_precision_sgd(
1427  model,
1428  base_learning_rate,
1429  max_gradient_norm=None,
1430  allow_lr_injection=False,
1431  **kwargs
1432 ):
1433  multi_prec_sgd_optimizer = MultiPrecisionSgdOptimizer(
1434  base_learning_rate, **kwargs
1435  )
1436  return _build(
1437  model,
1438  multi_prec_sgd_optimizer,
1439  max_gradient_norm=max_gradient_norm,
1440  allow_lr_injection=allow_lr_injection,
1441  )
1442 
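MultiPrecisionSgdOptimizer expects param_info.blob_copy to hold an fp32 master copy of each fp16 parameter (keyed by core.DataType.FLOAT); parameters without such a copy fall back to the plain SGD path. A usage sketch (editorial example; `model` is assumed to register those fp32 copies, e.g. when built with fp16 weights):

    build_multi_precision_sgd(model, base_learning_rate=0.1, momentum=0.9)
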
1443 
1444 def build_fp16_sgd(model, base_learning_rate, **kwargs):
1445  fp16_sgd_optimizer = FP16SgdOptimizer(
1446  base_learning_rate, **kwargs
1447  )
1448  return _build(model, fp16_sgd_optimizer)
1449 
1450 
1451 def build_ftrl(model, engine="SIMD", **kwargs):
1452  if engine == "SIMD":
1453  assert core.IsOperator('Ftrl_ENGINE_SIMD')
1454  assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
1455  ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
1456  return _build(model, ftrl_optimizer)
1457 
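build_ftrl defaults to the SIMD engine and asserts that the SIMD variants of the Ftrl operators are registered in the current build. A sketch (editorial example; `model` as above):

    build_ftrl(model, engine="SIMD", alpha=0.05, beta=1.0,
               lambda1=1e-3, lambda2=1e-3)
    # or, if the SIMD operators are unavailable in this build:
    build_ftrl(model, engine="", alpha=0.05)
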
1458 
1459 def build_gftrl(model, engine="", **kwargs):
1460  if engine == "SIMD":
1461  assert core.IsOperator('GFtrl_ENGINE_SIMD')
1462  gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs)
1463  return _build(model, gftrl_optimizer)
1464 
1465 
1466 def build_adagrad(
1467  model,
1468  base_learning_rate,
1469  parameters=None,
1470  max_gradient_norm=None,
1471  allow_lr_injection=False,
1472  **kwargs
1473 ):
1474  adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
1475  return _build(
1476  model,
1477  adagrad_optimizer,
1478  max_gradient_norm=max_gradient_norm,
1479  allow_lr_injection=allow_lr_injection,
1480  )
1481 
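With rowWise=True, AdagradOptimizer uses RowWiseSparseAdagrad, which keeps a single averaged squared-sum per parameter row rather than per element; this is only valid when every optimized parameter has sparse (GradientSlice) gradients, e.g. embedding lookups. A sketch (editorial example; `model` as above):

    build_adagrad(
        model,
        base_learning_rate=0.01,
        epsilon=1e-4,
        rowWise=True,
    )
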
1482 
1483 def build_wngrad(
1484  model,
1485  base_learning_rate,
1486  parameters=None,
1487  max_gradient_norm=None,
1488  allow_lr_injection=False,
1489  **kwargs
1490 ):
1491  wngrad_optimizer = WngradOptimizer(alpha=base_learning_rate, **kwargs)
1492  return _build(
1493  model,
1494  wngrad_optimizer,
1495  max_gradient_norm=max_gradient_norm,
1496  allow_lr_injection=allow_lr_injection,
1497  )
1498 
1499 
1500 def build_adadelta(
1501  model,
1502  base_learning_rate,
1503  parameters=None,
1504  max_gradient_norm=None,
1505  allow_lr_injection=False,
1506  **kwargs
1507 ):
1508  adadelta_optimizer = AdadeltaOptimizer(alpha=base_learning_rate, **kwargs)
1509  return _build(
1510  model,
1511  adadelta_optimizer,
1512  max_gradient_norm=max_gradient_norm,
1513  allow_lr_injection=allow_lr_injection,
1514  )
1515 
1516 
1517 def build_adam(
1518  model,
1519  base_learning_rate,
1520  max_gradient_norm=None,
1521  allow_lr_injection=False,
1522  **kwargs
1523 ):
1524  adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
1525  return _build(
1526  model,
1527  adam_optimizer,
1528  max_gradient_norm=max_gradient_norm,
1529  allow_lr_injection=allow_lr_injection,
1530  )
1531 
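max_gradient_norm is handled by _build via _calc_norm_ratio: the resulting norm ratio (optionally combined with the lr_injection blob) becomes the optimizer's global learning rate multiplier. A sketch (editorial example; `model` as above):

    build_adam(
        model,
        base_learning_rate=1e-3,
        max_gradient_norm=5.0,
        allow_lr_injection=True,
    )
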
1532 
1533 def build_yellowfin(model, base_learning_rate=0.1, **kwargs):
1534  yellowfin_optimizer = YellowFinOptimizer(
1535  alpha=base_learning_rate,
1536  **kwargs)
1537  return _build(model, yellowfin_optimizer)
1538 
1539 
1540 def build_rms_prop(
1541  model,
1542  base_learning_rate,
1543  max_gradient_norm=None,
1544  allow_lr_injection=False,
1545  **kwargs
1546 ):
1547  rms_prop_optimizer = RmsPropOptimizer(alpha=base_learning_rate, **kwargs)
1548  return _build(
1549  model,
1550  rms_prop_optimizer,
1551  max_gradient_norm=max_gradient_norm,
1552  allow_lr_injection=allow_lr_injection,
1553  )