Caffe2 - Python API
A deep learning, cross-platform ML framework
optimizer.py
1 # Copyright (c) 2016-present, Facebook, Inc.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 ##############################################################################
15 
16 ## @package optimizer
17 # Module caffe2.python.optimizer
18 from __future__ import absolute_import
19 from __future__ import division
20 from __future__ import print_function
21 from __future__ import unicode_literals
22 
23 from collections import namedtuple, defaultdict
24 from past.builtins import basestring
25 
26 import numpy as np
27 
28 from caffe2.python import core, scope, workspace
29 from caffe2.python.modeling import parameter_info
30 from caffe2.proto import caffe2_pb2
31 
32 
33 _OPTIMIZER_ITERATION_NAME = "optimizer_iteration"
34 _LEARNING_RATE_INJECTION = "lr_injection"
35 
36 AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"])
37 _optimizer_instance_count = defaultdict(int)
38 
39 
40 class Optimizer(object):
41  def __init__(self):
42  self._aux_params = AuxOptimizerParams(local=[], shared=[])
43  self._instance_num = _optimizer_instance_count[self.__class__.__name__]
44  _optimizer_instance_count[self.__class__.__name__] += 1
45  self._lr_multiplier = None
46 
47  '''
48  Adds optimization operators to the net for the given parameter and its
49  gradient.
50 
51  The parameter is specified either by 'param' being a ParameterInfo object
52  (in which case param.grad must be set), or by 'param' being a
53  BlobReference and 'grad' being a BlobReference to its gradient.
54  '''
55  def __call__(self, net, param_init_net, param, grad=None):
56  if grad is None:
57  assert isinstance(param, parameter_info.ParameterInfo)
58  assert param.grad is not None
59  else:
60  if isinstance(param, basestring):
61  param = core.BlobReference(param)
62  param = parameter_info.ParameterInfo(
63  param_id=None, param=param, grad=grad)
64 
65  self._run(net, param_init_net, param)
66 
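For illustration, a minimal sketch of the BlobReference form described above; the nets and blob names here are hypothetical and not part of this module:

    from caffe2.python import core, optimizer

    param_init_net = core.Net("init")
    net = core.Net("train")
    w = param_init_net.XavierFill([], "w", shape=[4, 4])
    w_grad = core.BlobReference("w_grad")  # assumed to be produced by a gradient pass

    sgd = optimizer.SgdOptimizer(base_learning_rate=0.01)
    # 'param' given as a BlobReference together with an explicit 'grad' blob
    sgd(net, param_init_net, w, grad=w_grad)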
67  def _run(self, net, param_init_net, param_info):
68  raise Exception("Not Implemented")
69 
70  def get_cpu_blob_name(self, base_str, node_name=''):
71  classname = self.__class__.__name__
72  return '%s_%d_%s%s_cpu' % (classname, self._instance_num, base_str, node_name)
73 
74  def get_gpu_blob_name(self, base_str, gpu_id, node_name):
75  classname = self.__class__.__name__
76  return '%s_%d_%s%s_gpu%d' % (
77  classname, self._instance_num, base_str, node_name, gpu_id,
78  )
79 
80  def make_unique_blob_name(self, base_str):
81  """
82  Returns a blob name that will be unique to the current device
83  and optimizer instance.
84  """
85  current_scope = scope.CurrentDeviceScope()
86  if current_scope is None:
87  return self.get_cpu_blob_name(base_str)
88 
89  if current_scope.device_type == caffe2_pb2.CUDA:
90  return self.get_gpu_blob_name(
91  base_str, current_scope.cuda_gpu_id, current_scope.node_name
92  )
93  else:
94  return self.get_cpu_blob_name(base_str, current_scope.node_name)
95 
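As a concrete illustration of the naming scheme above (the instance number and base_str are hypothetical), the first SgdOptimizer instance would produce names such as:

    # no device scope, or a CPU scope with an empty node name:
    #   "SgdOptimizer_0_lr_cpu"
    # a CUDA device scope on GPU 0 with an empty node name:
    #   "SgdOptimizer_0_lr_gpu0"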
96  def build_lr(self, net, param_init_net, base_learning_rate,
97  learning_rate_blob=None, policy="fixed",
98  iter_val=0, **kwargs):
99  if learning_rate_blob is None:
100  learning_rate_blob = self.make_unique_blob_name('lr')
101 
102  optimization_iter_blob = _OPTIMIZER_ITERATION_NAME
103  if not param_init_net.BlobIsDefined(optimization_iter_blob):
104  # Add training operators.
105  with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
106  iteration = param_init_net.ConstantFill(
107  [], optimization_iter_blob, shape=[1],
108  value=iter_val,
109  dtype=core.DataType.INT64)
110  iter_mutex = param_init_net.CreateMutex(
111  [], ["iteration_mutex"]
112  )
113  net.AtomicIter([iter_mutex, iteration], [iteration])
114  else:
115  iteration = param_init_net.GetBlobRef(optimization_iter_blob)
116 
117  if not net.BlobIsDefined(learning_rate_blob):
118  # Note: since we are minimizing ("descent"), the learning rate passed to
119  # the LearningRate operator is negated.
120  lr = net.LearningRate(
121  [iteration],
122  learning_rate_blob,
123  base_lr=-base_learning_rate,
124  policy=policy,
125  **kwargs
126  )
127  else:
128  lr = net.GetBlobRef(learning_rate_blob)
129 
130  if self._lr_multiplier is not None:
131  lr_multiplier = net.CopyFromCPUInput(
132  self._lr_multiplier, self.make_unique_blob_name('lr_multiplier')
133  )
134  scaled_lr = net.Mul(
135  [lr, lr_multiplier],
136  self.make_unique_blob_name('scaled_lr'),
137  broadcast=1,
138  )
139  lr = scaled_lr
140 
141  return lr, iteration
142 
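Since **kwargs is forwarded to the LearningRate operator, policy-specific arguments can be passed straight through build_lr. A hedged sketch from inside a hypothetical _run implementation (the "step" policy arguments below belong to the LearningRate operator, not to this module):

    lr, iteration = self.build_lr(
        net, param_init_net,
        base_learning_rate=0.1,
        policy="step",    # decay by `gamma` every `stepsize` iterations
        stepsize=1000,
        gamma=0.999,
    )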
143  def add_lr_multiplier(self, lr_multiplier):
144  self._lr_multiplier = lr_multiplier
145 
146  @staticmethod
147  def dedup(net, sparse_dedup_aggregator, grad):
148  assert (isinstance(grad, core.GradientSlice))
149  if sparse_dedup_aggregator:
150  return net.DeduplicateGradientSlices(
151  grad, aggregator=sparse_dedup_aggregator)
152  else:
153  return grad
154 
155  def get_auxiliary_parameters(self):
156  """Returns a list of auxiliary parameters.
157 
158  Returns:
159  aux_params: A namedtuple, AuxOptimizerParams.
160 
161  aux_params.local stores a list of blobs. Each blob is a local
162  auxiliary parameter. A local auxiliary parameter is a parameter in
163  parallel to a learning rate parameter. Taking Adagrad as an example,
164  the local auxiliary parameter is the squared sum parameter, because
165  every learning rate has a squared sum associated with it.
166 
167  aux_params.shared also stores a list of blobs. Each blob is a shared
168  auxiliary parameter. A shared auxiliary parameter is a parameter
169  that is shared across all the learning rate parameters. Taking Adam as
170  an example, the iteration parameter is a shared parameter, because
171  all the learning rates share the same iteration parameter.
172  """
173  return self._aux_params
174 
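For example (the blobs, nets, and the parameter name "w" here are assumed to exist and are hypothetical), after an AdagradOptimizer has been applied to a parameter the local list holds its per-parameter state, while an AdamOptimizer also contributes the shared iteration counter:

    adagrad = AdagradOptimizer(alpha=0.01)
    adagrad(net, param_init_net, w, grad=w_grad)
    adagrad.get_auxiliary_parameters().local   # e.g. ["w_squared_sum"]
    adagrad.get_auxiliary_parameters().shared  # []

    adam = AdamOptimizer(alpha=0.001)
    adam(net, param_init_net, w, grad=w_grad)
    adam.get_auxiliary_parameters().shared     # e.g. ["optimizer_iteration"]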
175  # TODO(xlwang): In transfer learning, parameters initialized from a
176  # pretrained model might require a different learning rate than parameters
177  # initialized otherwise. To this end, we provide a Python solution here in
178  # which `base_learning_rate` is scaled by `scale` via a call to
179  # `scale_learning_rate`; alternatively, the same effect could be achieved by
180  # rewriting the LearningRate operator in C++.
181  # Note that it is the responsibility of each specific optimizer to decide
182  # what logic should be used for `scale_learning_rate`.
183  def scale_learning_rate(self, *args, **kwargs):
184  raise NotImplementedError(
185  "Optimizer must implement the `scale_learning_rate` method.")
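A brief sketch of the intended use with the SgdOptimizer defined below (values are illustrative): the scaling is applied to the optimizer object before the training net is built, so it composes with the base rate the optimizer was constructed with.

    sgd = SgdOptimizer(base_learning_rate=0.1)
    sgd.scale_learning_rate(0.1)   # base_learning_rate is now 0.01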
186 
187 
188 class SgdOptimizer(Optimizer):
189  def __init__(self, base_learning_rate=0.01, policy='fixed',
190  momentum=0.0, nesterov=1, sparse_dedup_aggregator=None,
191  **kwargs):
192  super(SgdOptimizer, self).__init__()
193  self.base_learning_rate = base_learning_rate
194  self.policy = policy
195  self.momentum = momentum
196  self.nesterov = nesterov
197  self.sparse_dedup_aggregator = sparse_dedup_aggregator
198  self.init_kwargs = kwargs
199 
200  def _run(self, net, param_init_net, param_info):
201  param = param_info.blob
202  grad = param_info.grad
203  if self.base_learning_rate == 0:
204  return
205  assert self.base_learning_rate > 0
206 
207  # We need negative sign for LR when used directly with WeightedSum
208  # below.
209  lr_sign = -1 if self.momentum else 1
210  lr, _ = self.build_lr(
211  net, param_init_net,
212  base_learning_rate=self.base_learning_rate * lr_sign,
213  policy=self.policy,
214  **(self.init_kwargs)
215  )
216 
217  dev = scope.CurrentDeviceScope()
218  if dev is None:
219  dev = core.DeviceOption(caffe2_pb2.CPU)
220 
221  # Each GPU/CPU must have its own ONE blob, thus modify the name
222  # to include device information.
223  ONE = param_init_net.ConstantFill(
224  [],
225  "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name),
226  shape=[1],
227  value=1.0
228  )
229 
230  self._aux_params.shared.append(ONE)
231 
232  if self.momentum > 0:
233  momentum_data = param_init_net.ConstantFill(
234  param, str(param) + "_momentum", value=0.)
235  self._aux_params.local.append(momentum_data)
236 
237  if isinstance(grad, core.GradientSlice):
238  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
239  if self.momentum > 0.:
240  net.SparseMomentumSGDUpdate(
241  [grad.values, momentum_data, lr, param, grad.indices],
242  [grad.values, momentum_data, param],
243  momentum=self.momentum,
244  nesterov=self.nesterov)
245  else:
246  net.ScatterWeightedSum(
247  [param, ONE, grad.indices, grad.values, lr],
248  param
249  )
250  else:
251  if self.momentum > 0.:
252  net.MomentumSGDUpdate(
253  [grad, momentum_data, lr, param],
254  [grad, momentum_data, param],
255  momentum=self.momentum,
256  nesterov=self.nesterov)
257  else:
258  coeff = lr
259 
260  net.WeightedSum(
261  [param, ONE, grad, coeff],
262  param
263  )
264 
265  def scale_learning_rate(self, scale):
266  self.base_learning_rate *= scale
267  return
268 
269 
270 class MultiPrecisionSgdOptimizer(SgdOptimizer):
271  def __init__(self, base_learning_rate=0.1, momentum=0.0,
272  policy="fixed", nesterov=1, sparse_dedup_aggregator=None,
273  **kwargs):
274  super(SgdOptimizer, self).__init__()
275  self.base_learning_rate = base_learning_rate
276  self.momentum = momentum
277  self.policy = policy
278  self.nesterov = nesterov
279  self.sparse_dedup_aggregator = sparse_dedup_aggregator
280  self.init_kwargs = kwargs
281 
282  def _run(self, net, param_init_net, param_info):
283  param = param_info.blob
284  param_fp32 = param_info.blob_copy[core.DataType.FLOAT] \
285  if param_info.blob_copy is not None else None
286 
287  # If we have a straight fp32 parameter, run the base class
288  if param_fp32 is None:
289  return SgdOptimizer._run(self, net, param_init_net, param_info)
290 
291  grad = param_info.grad
292  if self.base_learning_rate == 0:
293  return
294  assert self.base_learning_rate > 0
295 
296  lr, _ = self.build_lr(
297  net, param_init_net,
298  base_learning_rate=-self.base_learning_rate,
299  policy=self.policy,
300  **(self.init_kwargs)
301  )
302 
303  momentum_data = param_init_net.ConstantFill(
304  param_fp32, str(param) + "_momentum", value=0.)
305  self._aux_params.local.append(momentum_data)
306 
307  assert not isinstance(grad, core.GradientSlice), \
308  "Doesn't support sparse gradients"
309 
310  # Copy gradient to fp32
311  grad_fp32 = net.HalfToFloat(grad, grad + "_fp32")
312 
313  # update (fused) in fp32
314  net.MomentumSGDUpdate(
315  [grad_fp32, momentum_data, lr, param_fp32],
316  [grad_fp32, momentum_data, param_fp32],
317  momentum=self.momentum,
318  nesterov=self.nesterov)
319 
320  # Copy updated param back to fp16
321  net.FloatToHalf(param_fp32, param)
322 
323 
325  def __init__(self, base_learning_rate=0.1, momentum=0.0,
326  policy="fixed", nesterov=1, weight_decay=0.0001,
327  sparse_dedup_aggregator=None,
328  **kwargs):
329  super(SgdOptimizer, self).__init__()
330  self.base_learning_rate = base_learning_rate
331  self.momentum = momentum
332  self.policy = policy
333  self.nesterov = nesterov
334  self.sparse_dedup_aggregator = sparse_dedup_aggregator
335  self.init_kwargs = kwargs
336  self.weight_decay = weight_decay
337 
338  def _run(self, net, param_init_net, param_info, fp32_update=False):
339 
340  fp32_update_flag = 0
341  param_name = str(param_info.blob)
342 
343  # This should only be triggered during FP16 training by SpatialBN, which
344  # requires FP32 parameters in cuDNN.
345  if param_name.find("spatbn") != -1:
346  fp32_update = True
347 
348  if fp32_update:
349  # doing a 32-bit update
350  # Have to assume param_info.blob is FP32, as there is no way
351  # (that I currently know of) to query a blob's type in Python
352  fp32_update_flag = 1
353  param = param_info.blob
354  param_fp32 = param_info.blob
355  else:
356  if param_info.blob_copy is None:
357  # doing a 32-bit update
358  # Have to assume param_info.blob is FP32, as there is no way
359  # (that I currently know of) to query a blob's type in Python
360  fp32_update_flag = 1
361  param = param_info.blob
362  param_fp32 = param_info.blob
363  else:
364  if core.DataType.FLOAT in param_info.blob_copy:
365  param = param_info.blob
366  param_fp32 = param_info.blob_copy[core.DataType.FLOAT]
367  elif core.DataType.FLOAT16 in param_info.blob_copy:
368  param = param_info.blob_copy[core.DataType.FLOAT16]
369  param_fp32 = param_info.blob
370  else:
371  assert (False), (
372  "Unrecognized parameter format to be updated "
373  "by FP16 Optimizer. Parameter: {}".format(param_info.name)
374  )
375 
376  grad = param_info.grad
377 
378  if self.base_learning_rate == 0:
379  return
380  assert self.base_learning_rate > 0
381 
382  lr, _ = self.build_lr(
383  net, param_init_net,
384  base_learning_rate=-self.base_learning_rate,
385  policy=self.policy,
386  **(self.init_kwargs)
387  )
388 
389  momentum_data_fp32 = param_init_net.ConstantFill(
390  param_fp32, str(param) + "_momentum_fp32", value=0.)
391 
392  momentum_data = param_init_net.FloatToHalf(
393  momentum_data_fp32, str(param) + "_momentum")
394 
395  self._aux_params.local.append(momentum_data)
396 
397  assert not isinstance(grad, core.GradientSlice), \
398  "Doesn't support sparse gradients"
399 
400  if fp32_update_flag == 0:
401  net.FP16MomentumSGDUpdate(
402  [grad, momentum_data, lr, param],
403  [grad, momentum_data, param],
404  momentum=self.momentum,
405  nesterov=self.nesterov,
406  weight_decay=self.weight_decay)
407  else:
408  # flag set to 1, therefore doing FP32 update
409  net.FP32MomentumSGDUpdate(
410  [grad, momentum_data_fp32, lr, param],
411  [grad, momentum_data_fp32, param],
412  momentum=self.momentum,
413  nesterov=self.nesterov,
414  weight_decay=self.weight_decay)
415 
416 
417 class WeightDecayBuilder(Optimizer):
418  def __init__(self, weight_decay):
419  self.weight_decay = weight_decay
420 
421  def _run(self, net, param_init_net, param_info):
422  dev = scope.CurrentDeviceScope()
423  if dev is None:
424  dev = core.DeviceOption(caffe2_pb2.CPU)
425 
426  ONE = param_init_net.ConstantFill(
427  [],
428  "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
429  shape=[1],
430  value=1.0
431  )
432  WD = param_init_net.ConstantFill(
433  [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
434  shape=[1], value=self.weight_decay
435  )
436 
437  if isinstance(param_info.grad, core.GradientSlice):
438  assert False, "Weight decay does not yet support sparse gradients"
439  else:
440  net.WeightedSum(
441  [param_info.grad, ONE, param_info.blob, WD],
442  param_info.grad,
443  )
444 
445 
446 class AdagradOptimizer(Optimizer):
447  def __init__(self, alpha=0.01, epsilon=1e-4, decay=1, policy="fixed",
448  sparse_dedup_aggregator=None, rowWise=False,
449  engine='', **kwargs):
450  super(AdagradOptimizer, self).__init__()
451  self.alpha = alpha
452  self.epsilon = epsilon
453  self.decay = decay
454  self.policy = policy
455  self.sparse_dedup_aggregator = sparse_dedup_aggregator
456  self.engine = engine
457  self.init_kwargs = kwargs
458  self.rowWise = rowWise
459 
460  def _run(self, net, param_init_net, param_info):
461  param = param_info.blob
462  grad = param_info.grad
463 
464  if self.alpha <= 0:
465  return
466 
467  lr, _ = self.build_lr(
468  net, param_init_net,
469  base_learning_rate=self.alpha,
470  policy=self.policy,
471  **(self.init_kwargs)
472  )
473 
474  if self.rowWise:
475  shapes, types = workspace.InferShapesAndTypes([param_init_net])
476  if str(param) not in shapes:
477  # Type/shape inference is not available for this param; fall back
478  # on Shape/Slice logic
479  shape = param_init_net.Shape(param, str(param) + "_shape")
480  num_rows = param_init_net.Slice(
481  [shape],
482  str(shape) + "_numrows",
483  starts=[0], ends=[1]
484  )
485  param_squared_sum = param_init_net.ConstantFill(
486  num_rows,
487  str(param) + "_avg_squared_sum",
488  input_as_shape=1,
489  value=0.0
490  )
491  else:
492  param_squared_sum = param_init_net.ConstantFill(
493  [],
494  str(param) + "_avg_squared_sum",
495  shape=[shapes[str(param)][0]],
496  value=0.0
497  )
498 
499  else:
500  param_squared_sum = param_init_net.ConstantFill(
501  [param],
502  str(param) + "_squared_sum",
503  value=0.0
504  )
505 
506  self._aux_params.local.append(param_squared_sum)
507 
508  if self.rowWise:
509  assert isinstance(grad, core.GradientSlice),\
510  'If SparseAdagrad is used with rowWise=True, the gradient must be '\
511  'a GradientSlice. Please ensure that rowWise is not enabled '\
512  'for the dense Adagrad optimizer, as it is not supported.'
513  if isinstance(grad, core.GradientSlice):
514  assert self.decay == 1.,\
515  'Decay is not implemented for SparseAdagrad and must be set to 1'
516  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
517  if self.rowWise:
518  op = 'RowWiseSparseAdagrad'
519  else:
520  op = 'SparseAdagrad'
521  net.__getattr__(op)(
522  [param, param_squared_sum, grad.indices, grad.values, lr],
523  [param, param_squared_sum],
524  epsilon=self.epsilon,
525  engine=self.engine
526  )
527  else:
528  net.Adagrad(
529  [param, param_squared_sum, grad, lr],
530  [param, param_squared_sum],
531  epsilon=self.epsilon,
532  decay=float(self.decay),
533  engine=self.engine
534  )
535 
536  def scale_learning_rate(self, scale):
537  self.alpha *= scale
538  return
539 
540 
541 class FtrlOptimizer(Optimizer):
542  def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
543  sparse_dedup_aggregator=None, engine=''):
544  super(FtrlOptimizer, self).__init__()
545  self.alpha = alpha
546  self.beta = beta
547  self.lambda1 = lambda1
548  self.lambda2 = lambda2
549  self.sparse_dedup_aggregator = sparse_dedup_aggregator
550  self.engine = engine
551 
552  def _run(self, net, param_init_net, param_info):
553  param = param_info.blob
554  grad = param_info.grad
555 
556  if self.alpha <= 0:
557  return
558 
559  nz = param_init_net.ConstantFill(
560  [param],
561  str(param) + "_ftrl_nz",
562  extra_shape=[2],
563  value=0.0
564  )
565  self._aux_params.local.append(nz)
566  if isinstance(grad, core.GradientSlice):
567  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
568  net.SparseFtrl(
569  [param, nz, grad.indices, grad.values],
570  [param, nz],
571  engine=self.engine,
572  alpha=self.alpha,
573  beta=self.beta,
574  lambda1=self.lambda1,
575  lambda2=self.lambda2
576  )
577  else:
578  net.Ftrl(
579  [param, nz, grad],
580  [param, nz],
581  engine=self.engine,
582  alpha=self.alpha,
583  beta=self.beta,
584  lambda1=self.lambda1,
585  lambda2=self.lambda2
586  )
587 
588  def scale_learning_rate(self, scale):
589  self.alpha *= scale
590  return
591 
592 
593 class AdamOptimizer(Optimizer):
594  def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
595  policy='fixed', sparse_dedup_aggregator=None, rowWise=False,
596  engine='', **kwargs):
597  super(AdamOptimizer, self).__init__()
598  self.alpha = alpha
599  self.beta1 = beta1
600  self.beta2 = beta2
601  self.epsilon = epsilon
602  self.policy = policy
603  self.sparse_dedup_aggregator = sparse_dedup_aggregator
604  self.rowWise = rowWise
605  self.engine = engine
606  self.init_kwargs = kwargs
607 
608  def _run(self, net, param_init_net, param_info):
609  param = param_info.blob
610  grad = param_info.grad
611 
612  if self.alpha <= 0:
613  return
614 
615  lr, iteration = self.build_lr(
616  net, param_init_net,
617  base_learning_rate=self.alpha,
618  policy=self.policy,
619  **(self.init_kwargs)
620  )
621 
622  m1 = param_init_net.ConstantFill(
623  [param],
624  param + "_first_moment",
625  value=0.0
626  )
627  if self.rowWise:
628  shapes, types = workspace.InferShapesAndTypes([param_init_net])
629  m2 = param_init_net.ConstantFill(
630  [],
631  param + "_avg_second_moment",
632  shape=[shapes[param][0]],
633  value=0.0
634  )
635 
636  else:
637 
638  m2 = param_init_net.ConstantFill(
639  [param],
640  param + "_second_moment",
641  value=0.0
642  )
643 
644  self._aux_params.shared.append(iteration)
645  self._aux_params.local.append(m1)
646  self._aux_params.local.append(m2)
647 
648  if self.rowWise:
649  assert isinstance(grad, core.GradientSlice),\
650  'If SparseAdam is used with rowWise=True, the gradient must be '\
651  'a GradientSlice. Please ensure that rowWise is not enabled '\
652  'for the dense Adam optimizer, as it is not supported.'
653  if isinstance(grad, core.GradientSlice):
654  grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
655  if self.rowWise:
656  op = 'RowWiseSparseAdam'
657  else:
658  op = 'SparseAdam'
659  net.__getattr__(op)(
660  [param, m1, m2, grad.indices, grad.values, lr, iteration],
661  [param, m1, m2],
662  beta1=self.beta1,
663  beta2=self.beta2,
664  epsilon=self.epsilon
665  )
666 
667  else:
668  net.Adam(
669  [param, m1, m2, grad, lr, iteration],
670  [param, m1, m2],
671  beta1=self.beta1,
672  beta2=self.beta2,
673  epsilon=self.epsilon)
674 
675  def scale_learning_rate(self, scale):
676  self.alpha *= scale
677  return
678 
679 
680 class YellowFinOptimizer(Optimizer):
681  """YellowFin: An automatic tuner for momentum SGD
682 
683  See https://arxiv.org/abs/1706.03471 for more details. This implementation
684  has separate learning rate and momentum per each parameter."""
685 
686  def __init__(self,
687  alpha=0.1,
688  mu=0.0,
689  beta=0.999,
690  curv_win_width=20,
691  zero_debias=True,
692  epsilon=0.1**6,
693  policy='fixed',
694  sparse_dedup_aggregator=None,
695  **kwargs):
696  super(YellowFinOptimizer, self).__init__()
697  self.alpha = alpha
698  self.mu = mu
699  self.beta = beta
700  self.curv_win_width = curv_win_width
701  self.zero_debias = zero_debias
702  self.epsilon = epsilon
703  self.policy = policy
704  self.sparse_dedup_aggregator = sparse_dedup_aggregator
705  self.init_kwargs = kwargs
706 
707  def _run(self, net, param_init_net, param_info):
708 
709  # Note: this is the number of persistent scalars kept by the YellowFin
710  # optimizer. It must always match the number of scalars actually used,
711  # and the same number must be used by the YellowFin operator.
712  SCALARS_MEMORY_SIZE = 5
713 
714  param = param_info.blob
715  grad = param_info.grad
716  moment = param_init_net.ConstantFill(
717  [param],
718  param + "_moment",
719  value=0.0
720  )
721  curv_win = param_init_net.ConstantFill(
722  [],
723  param + "_curv_win",
724  shape=[self.curv_win_width],
725  value=0.0
726  )
727  g_avg = param_init_net.ConstantFill(
728  [param],
729  param + "_g_avg",
730  value=0.0
731  )
732  g2_avg = param_init_net.ConstantFill(
733  [param],
734  param + "_g2_avg",
735  value=0.0
736  )
737  lr_avg = param_init_net.ConstantFill(
738  [],
739  param + "_lr_avg",
740  shape=[1],
741  value=self.alpha
742  )
743  mu_avg = param_init_net.ConstantFill(
744  [],
745  param + "_mu_avg",
746  shape=[1],
747  value=self.mu
748  )
749  scalars_memory = param_init_net.ConstantFill(
750  [],
751  param + "_scalars_memory",
752  shape=[SCALARS_MEMORY_SIZE],
753  value=0.0
754  )
755 
756  assert self.alpha > 0
757  assert not isinstance(grad, core.GradientSlice), \
758  "Doesn't support sparse gradients"
759 
760  if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
761  # Add training operators.
762  with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
763  iteration = param_init_net.ConstantFill(
764  [],
765  _OPTIMIZER_ITERATION_NAME,
766  shape=[1],
767  value=0,
768  dtype=core.DataType.INT64)
769  iter_mutex = param_init_net.CreateMutex([],
770  ["iteration_mutex"])
771  net.AtomicIter([iter_mutex, iteration], [iteration])
772  else:
773  iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)
774 
775  self._aux_params.shared.append(iteration)
776  self._aux_params.local.append(moment)
777  self._aux_params.local.append(lr_avg)
778  self._aux_params.local.append(mu_avg)
779  self._aux_params.local.append(curv_win)
780  self._aux_params.local.append(g_avg)
781  self._aux_params.local.append(g2_avg)
782  self._aux_params.local.append(scalars_memory)
783 
784  yf_in_out_args = [
785  param,
786  moment,
787  lr_avg,
788  mu_avg,
789  curv_win,
790  g_avg,
791  g2_avg,
792  scalars_memory
793  ]
794 
795  net.YellowFin(
796  yf_in_out_args + [grad, iteration],
797  yf_in_out_args,
798  beta=self.beta,
799  epsilon=self.epsilon,
800  curv_win_width=self.curv_win_width,
801  zero_debias=self.zero_debias)
802 
803  def scale_learning_rate(self, scale):
804  self.alpha *= scale
805  return
806 
807 
808 class RmsPropOptimizer(Optimizer):
809  def __init__(
810  self,
811  alpha=0.01,
812  decay=0.9,
813  momentum=0.0,
814  epsilon=1e-5,
815  policy='fixed',
816  engine='',
817  **kwargs
818  ):
819  super(RmsPropOptimizer, self).__init__()
820  self.alpha = alpha
821  self.decay = decay
822  self.momentum = momentum
823  self.epsilon = epsilon
824  self.policy = policy
825  self.engine = engine
826  self.init_kwargs = kwargs
827 
828  def _run(self, net, param_init_net, param_info):
829  param = param_info.blob
830  grad = param_info.grad
831 
832  assert self.alpha > 0
833  assert not isinstance(grad, core.GradientSlice), \
834  "RmsPropOptimizer doesn't support sparse gradients"
835 
836  dev = scope.CurrentDeviceScope()
837  if dev is None:
838  dev = core.DeviceOption(caffe2_pb2.CPU)
839 
840  ONE = param_init_net.ConstantFill(
841  [],
842  "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
843  shape=[1],
844  value=1.0
845  )
846 
847  lr, _ = self.build_lr(
848  net,
849  param_init_net,
850  base_learning_rate=-self.alpha,
851  policy=self.policy,
852  **(self.init_kwargs)
853  )
854 
855  grad_o = param_init_net.ConstantFill(
856  [param],
857  str(param) + "_grad_o",
858  value=0.0,
859  )
860 
861  ms = param_init_net.ConstantFill(
862  [param],
863  str(param) + "_mean_squares",
864  value=0.0,
865  )
866 
867  mom = param_init_net.ConstantFill(
868  [param],
869  str(param) + "_momentum",
870  value=0.0,
871  )
872 
873  self._aux_params.local.append(ms)
874  self._aux_params.local.append(mom)
875 
876  net.RmsProp(
877  [grad, ms, mom, ONE],
878  [grad_o, ms, mom],
879  decay=self.decay,
880  momentum=self.momentum,
881  epsilon=self.epsilon,
882  engine=self.engine,
883  )
884 
885  net.MomentumSGDUpdate(
886  [grad_o, mom, lr, param],
887  [grad_o, mom, param],
888  )
889 
890  def scale_learning_rate(self, scale):
891  self.alpha *= scale
892  return
893 
894 
895 def _get_param_to_device(model):
896  # Infer blob devices by going through the net and param_init_net
897  # ops and observing the device used to create or use the blob.
898  param_to_device = core.InferBlobDevices(model.net)
899  param_to_device.update(core.InferBlobDevices(model.param_init_net))
900  return param_to_device
901 
902 
903 def get_param_device(param_name, grad, param_to_device=None, default_device=None):
904  device = default_device
905  param_to_device = param_to_device or {}
906  # We first check whether the parameter's device has been inferred. If not,
907  # we check the gradient. This can happen if the parameter is not the output
908  # of any op but was created by a FetchBlob.
909  if param_name in param_to_device:
910  device = param_to_device[param_name]
911  else:
912  if isinstance(grad, core.GradientSlice):
913  grad = grad
914  if str(grad.values) in param_to_device:
915  device = param_to_device[str(grad.values)]
916  elif str(grad.indices) in param_to_device:
917  device = param_to_device[str(grad.indices)]
918  else:
919  grad_name = str(grad)
920  if grad_name in param_to_device:
921  device = param_to_device[grad_name]
922 
923  assert device is not None,\
924  "Cannot infer device for {}: no op creates it".format(param_name)
925  return device
926 
927 
928 def get_lr_injection():
929  """
930  Gets current value for lr_injection, a multiplier for all base
931  learning rates.
932  Must set allow_lr_injection=True when building optimizer, as it
933  relies on synchronization over CPU.
934  """
935  return workspace.FetchBlob(_LEARNING_RATE_INJECTION)
936 
937 
938 def set_lr_injection(lr_injection_value):
939  """
940  Sets lr_injection, a multiplier for all base learning rates.
941  Must set allow_lr_injection=True when building optimizer, as it
942  relies on synchronization over CPU.
943  """
944  workspace.FeedBlob(
945  _LEARNING_RATE_INJECTION,
946  np.array(
947  [float(lr_injection_value)],
948  dtype=np.float32,
949  ),
950  )
951 
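A usage sketch, assuming the optimizer was built with allow_lr_injection=True so that the "lr_injection" blob already exists in the workspace:

    from caffe2.python import optimizer

    optimizer.set_lr_injection(0.5)     # scale every base learning rate by 0.5
    optimizer.get_lr_injection()        # -> array([0.5], dtype=float32)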
952 
953 def _calc_norm_ratio(
954  model, params, name_scope, param_to_device, max_gradient_norm
955 ):
956  with core.NameScope(name_scope):
957  grad_squared_sums = []
958  for i, param in enumerate(params):
959  device = get_param_device(
960  str(param.blob), param.grad, param_to_device
961  )
962 
963  with core.DeviceScope(device):
964  grad = (
965  param.grad
966  if not isinstance(
967  param.grad,
968  core.GradientSlice,
969  ) else param.grad.values
970  )
971 
972  grad_squared_sum_name = 'grad_{}_squared_sum'.format(i)
973  grad_squared_sum = model.net.SumSqrElements(
974  grad,
975  grad_squared_sum_name,
976  )
977  grad_squared_sum_cpu = model.net.EnsureCPUOutput(
978  grad_squared_sum
979  )
980  grad_squared_sums.append(grad_squared_sum_cpu)
981 
982  with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
983  grad_squared_full_sum = model.net.Sum(
984  grad_squared_sums,
985  'grad_squared_full_sum',
986  )
987  global_norm = model.net.Pow(
988  grad_squared_full_sum,
989  'global_norm',
990  exponent=0.5,
991  )
992  clip_norm = model.param_init_net.ConstantFill(
993  [],
994  'clip_norm',
995  shape=[],
996  value=float(max_gradient_norm),
997  )
998  max_norm = model.net.Max(
999  [global_norm, clip_norm],
1000  'max_norm',
1001  )
1002  norm_ratio = model.net.Div(
1003  [clip_norm, max_norm],
1004  'norm_ratio',
1005  )
1006  return norm_ratio
1007 
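The operators above compute the standard global-norm clipping ratio. An equivalent NumPy sketch (illustrative only, not used by this module):

    import numpy as np

    def norm_ratio(grads, max_gradient_norm):
        # grads: list of gradient arrays; mirrors SumSqrElements -> Sum -> Pow(0.5)
        global_norm = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
        return max_gradient_norm / max(global_norm, max_gradient_norm)

The resulting ratio is at most 1 and is attached as an lr multiplier in _build, so updates are scaled down only when the global gradient norm exceeds max_gradient_norm.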
1008 
1009 def _build(
1010  model,
1011  optimizer,
1012  weights_only=False,
1013  use_param_info_optim=True,
1014  max_gradient_norm=None,
1015  allow_lr_injection=False,
1016 ):
1017  param_to_device = _get_param_to_device(model)
1018 
1019  # Validate there are no duplicate params
1020  model.Validate()
1021 
1022  params = []
1023  for param_info in model.GetOptimizationParamInfo():
1024  if weights_only and param_info.blob not in model.weights:
1025  continue
1026  params.append(param_info)
1027 
1028  lr_multiplier = None
1029  if max_gradient_norm is not None:
1030  lr_multiplier = _calc_norm_ratio(
1031  model,
1032  params,
1033  'norm_clipped_grad_update',
1034  param_to_device,
1035  max_gradient_norm,
1036  )
1037 
1038  if allow_lr_injection:
1039  if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION):
1040  lr_injection = model.param_init_net.ConstantFill(
1041  [],
1042  _LEARNING_RATE_INJECTION,
1043  shape=[1],
1044  value=1.0,
1045  )
1046  else:
1047  lr_injection = _LEARNING_RATE_INJECTION
1048 
1049  if lr_multiplier is None:
1050  lr_multiplier = lr_injection
1051  else:
1052  lr_multiplier = model.net.Mul(
1053  [lr_multiplier, lr_injection],
1054  'lr_multiplier',
1055  broadcast=1,
1056  )
1057  optimizer.add_lr_multiplier(lr_multiplier)
1058 
1059  for param_info in params:
1060  param_name = str(param_info.blob)
1061 
1062  device = get_param_device(param_name, param_info.grad, param_to_device)
1063 
1064  with core.DeviceScope(device):
1065  if param_info.optimizer and use_param_info_optim:
1066  param_info.optimizer(model.net, model.param_init_net, param_info)
1067  else:
1068  optimizer(model.net, model.param_init_net, param_info)
1069  return optimizer
1070 
1071 
1072 def add_weight_decay(model, weight_decay):
1073  """Adds a decay to weights in the model.
1074 
1075  This is a form of L2 regularization.
1076 
1077  Args:
1078  weight_decay: strength of the regularization
1079  """
1080  _build(
1081  model,
1082  WeightDecayBuilder(weight_decay=weight_decay),
1083  weights_only=True,
1084  use_param_info_optim=False,
1085  )
1086 
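A usage sketch, assuming `model` is a ModelHelper whose gradient operators have already been added:

    add_weight_decay(model, weight_decay=1e-4)
    build_sgd(model, base_learning_rate=0.1, momentum=0.9)

Note that add_weight_decay only touches blobs listed in model.weights, so biases and other non-weight parameters are left unregularized.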
1087 
1088 def build_sgd(
1089  model,
1090  base_learning_rate,
1091  max_gradient_norm=None,
1092  allow_lr_injection=False,
1093  **kwargs
1094 ):
1095  sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
1096  return _build(
1097  model,
1098  sgd_optimizer,
1099  max_gradient_norm=max_gradient_norm,
1100  allow_lr_injection=allow_lr_injection,
1101  )
1102 
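An end-to-end sketch of attaching SGD to a small model; the layer names, sizes and LearningRate policy arguments below are illustrative:

    from caffe2.python import brew, model_helper

    model = model_helper.ModelHelper(name="example")
    fc = brew.fc(model, "data", "fc", dim_in=16, dim_out=2)
    softmax, loss = model.net.SoftmaxWithLoss([fc, "label"], ["softmax", "loss"])
    model.AddGradientOperators([loss])

    build_sgd(model, base_learning_rate=0.1, policy="step",
              stepsize=1000, gamma=0.999, momentum=0.9)

Extra keyword arguments are forwarded through SgdOptimizer to the LearningRate operator.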
1103 
1104 def build_multi_precision_sgd(
1105  model,
1106  base_learning_rate,
1107  max_gradient_norm=None,
1108  allow_lr_injection=False,
1109  **kwargs
1110 ):
1111  multi_prec_sgd_optimizer = MultiPrecisionSgdOptimizer(
1112  base_learning_rate, **kwargs
1113  )
1114  return _build(
1115  model,
1116  multi_prec_sgd_optimizer,
1117  max_gradient_norm=max_gradient_norm,
1118  allow_lr_injection=allow_lr_injection,
1119  )
1120 
1121 
1122 def build_fp16_sgd(model, base_learning_rate, **kwargs):
1123  fp16_sgd_optimizer = FP16SgdOptimizer(
1124  base_learning_rate, **kwargs
1125  )
1126  return _build(model, fp16_sgd_optimizer)
1127 
1128 
1129 def build_ftrl(model, engine="SIMD", **kwargs):
1130  if engine == "SIMD":
1131  assert core.IsOperator('Ftrl_ENGINE_SIMD')
1132  assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
1133  ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
1134  return _build(model, ftrl_optimizer)
1135 
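If the SIMD-specialized operators are not available in the current build, the engine can be overridden (a sketch; `model` is assumed to exist):

    build_ftrl(model, engine="", alpha=0.01, beta=1e-4, lambda1=0, lambda2=0)

With engine="" the plain Ftrl/SparseFtrl operators are used and the SIMD asserts above are skipped.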
1136 
1137 def build_adagrad(
1138  model,
1139  base_learning_rate,
1140  parameters=None,
1141  max_gradient_norm=None,
1142  allow_lr_injection=False,
1143  **kwargs
1144 ):
1145  adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
1146  return _build(
1147  model,
1148  adagrad_optimizer,
1149  max_gradient_norm=max_gradient_norm,
1150  allow_lr_injection=allow_lr_injection,
1151  )
1152 
1153 
1154 def build_adam(
1155  model,
1156  base_learning_rate,
1157  max_gradient_norm=None,
1158  allow_lr_injection=False,
1159  **kwargs
1160 ):
1161  adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
1162  return _build(
1163  model,
1164  adam_optimizer,
1165  max_gradient_norm=max_gradient_norm,
1166  allow_lr_injection=allow_lr_injection,
1167  )
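A sketch combining Adam with global gradient-norm clipping and run-time learning-rate control (values illustrative; `model` is assumed to be a prepared ModelHelper):

    build_adam(model, base_learning_rate=1e-3,
               max_gradient_norm=1.0, allow_lr_injection=True)

max_gradient_norm routes through _calc_norm_ratio, and allow_lr_injection exposes the "lr_injection" blob to set_lr_injection / get_lr_injection.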
1168 
1169 
1170 def build_yellowfin(model, base_learning_rate=0.1, **kwargs):
1171  yellowfin_optimizer = YellowFinOptimizer(
1172  alpha=base_learning_rate,
1173  **kwargs)
1174  return _build(model, yellowfin_optimizer)
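Since YellowFin tunes its own per-parameter learning rate and momentum (see the class docstring above), usually only the initial values need to be supplied; a sketch with `model` assumed to exist:

    build_yellowfin(model, base_learning_rate=0.1, mu=0.0, zero_debias=True)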
1175 
1176 
1177 def build_rms_prop(
1178  model,
1179  base_learning_rate,
1180  max_gradient_norm=None,
1181  allow_lr_injection=False,
1182  **kwargs
1183 ):
1184  rms_prop_optimizer = RmsPropOptimizer(alpha=base_learning_rate, **kwargs)
1185  return _build(
1186  model,
1187  rms_prop_optimizer,
1188  max_gradient_norm=max_gradient_norm,
1189  allow_lr_injection=allow_lr_injection,
1190  )