Caffe2 - Python API
A deep learning, cross-platform ML framework
regularizer.py
# @package regularizer
# Module caffe2.python.regularizer
from __future__ import absolute_import, division, print_function, unicode_literals

from caffe2.python import core, utils
import numpy as np


class RegularizationBy(object):
    AFTER_OPTIMIZER = "after_optimizer"
    ON_LOSS = "on_loss"


class Regularizer(object):
    """
    Adds regularization to train_net for a given parameter. The regularization
    coefficient is supplied at initialization. The param should be a
    BlobReference.
    """

    def __init__(self):
        self.kEpsilon = 1e-9

    def __call__(self, net, param_init_net, param, grad=None, by=None):
        assert isinstance(param, core.BlobReference)
        by_enum = utils.EnumClassKeyVals(RegularizationBy)
        assert by in by_enum.values(), (
            "Regularizer of type {} is called with invalid by={}, "
            "not in {}".format(self.__class__, by, by_enum.values())
        )
        run_func = "_run_" + by
        assert hasattr(
            self, run_func
        ), "Regularizer of type {} does not implement function {}".format(
            self.__class__, run_func
        )
        return getattr(self, run_func)(net, param_init_net, param, grad)

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        return None

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        return None

    def _ensure_clipped(
        self,
        net,
        param,
        grad=None,
        min=None,
        max=None,
        open_range=False,
        left_open=False,
        right_open=False,
    ):
        # Shift each open end of the interval inward by kEpsilon so that
        # clipping keeps the values strictly inside the open bound.
        min = (
            min + self.kEpsilon
            if min is not None and (open_range or left_open)
            else min
        )
        max = (
            max - self.kEpsilon
            if max is not None and (open_range or right_open)
            else max
        )
        input_blobs = (
            [param, grad.indices, grad.values]
            if isinstance(grad, core.GradientSlice)
            else [param]
        )
        net.EnsureClipped(input_blobs, [param], min=min, max=max)


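__call__ dispatches purely by name: by="on_loss" resolves to _run_on_loss and by="after_optimizer" to _run_after_optimizer, so a subclass only needs to implement the hooks it supports. Below is a minimal sketch of a custom subclass under that convention; the ZeroPenalty name and the blob suffix are illustrative, not part of this module.

from caffe2.python.regularizer import Regularizer

class ZeroPenalty(Regularizer):
    """Illustrative no-op regularizer: contributes a constant zero to the loss."""

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_zero_penalty")
        # ConstantFill with no inputs creates a new blob with the given shape and value.
        net.ConstantFill([], [output_blob], shape=[1], value=0.0)
        return output_blob
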
class L1Norm(Regularizer):
    def __init__(self, reg_lambda):
        super(L1Norm, self).__init__()
        assert reg_lambda >= 0, "regularization weight should be 0 or positive"

        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l1_regularization")
        net.LpNorm([param], [output_blob], p=1)
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob


class L2Norm(Regularizer):
    def __init__(self, reg_lambda):
        super(L2Norm, self).__init__()
        assert reg_lambda >= 0, "regularization weight should be 0 or positive"

        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l2_regularization")
        net.LpNorm([param], [output_blob], p=2)
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob


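Both L1Norm and L2Norm return a scalar penalty blob that the training script is expected to add to its loss. A minimal sketch under that assumption; the net, blob, and loss names here are hypothetical.

from caffe2.python import core
from caffe2.python.regularizer import L2Norm, RegularizationBy

net = core.Net("example_train_net")           # hypothetical training net
param_init_net = core.Net("example_init_net")
fc_w = core.BlobReference("fc_w")             # hypothetical weight blob

reg = L2Norm(reg_lambda=1e-4)
penalty = reg(net, param_init_net, fc_w, by=RegularizationBy.ON_LOSS)
# Add the scalar penalty to an existing scalar loss blob named "loss".
total_loss = net.Add(["loss", penalty], "total_loss")
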
class MaxNorm(Regularizer):
    def __init__(self, norm=1.0):
        super(MaxNorm, self).__init__()
        self.norm = norm

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        assert self.norm > 0, "norm should be bigger than 0."
        if isinstance(grad, core.GradientSlice):
            net.SparseNormalize(
                [param, grad.indices, grad.values],
                [param],
                use_max_norm=True,
                norm=self.norm,
            )
        else:
            raise NotImplementedError("MaxNorm is not supported for dense parameters")


class ConstantNorm(Regularizer):
    def __init__(self, norm=1.0):
        super(ConstantNorm, self).__init__()
        self.norm = norm

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        assert self.norm > 0, "norm should be bigger than 0."
        if isinstance(grad, core.GradientSlice):
            net.SparseNormalize(
                [param, grad.indices, grad.values],
                [param],
                use_max_norm=False,
                norm=self.norm,
            )
        else:
            raise NotImplementedError(
                "ConstantNorm is not supported for dense parameters"
            )


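MaxNorm and ConstantNorm both rewrite only the rows touched by a sparse gradient, via SparseNormalize: with use_max_norm=True a row is rescaled only when its L2 norm exceeds norm, while use_max_norm=False rescales every touched row to exactly norm. A sketch of wiring MaxNorm in after the optimizer step; the nets and blob names are hypothetical.

from caffe2.python import core
from caffe2.python.regularizer import MaxNorm, RegularizationBy

net = core.Net("example_train_net")           # hypothetical nets and blobs
param_init_net = core.Net("example_init_net")
emb_w = core.BlobReference("emb_w")
sparse_grad = core.GradientSlice(
    indices=core.BlobReference("emb_w_grad_indices"),
    values=core.BlobReference("emb_w_grad_values"),
)

reg = MaxNorm(norm=1.0)
# Rows of emb_w whose L2 norm exceeds 1.0 are rescaled back to 1.0; passing a
# dense gradient would raise NotImplementedError.
reg(net, param_init_net, emb_w, grad=sparse_grad, by=RegularizationBy.AFTER_OPTIMIZER)
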
class LogBarrier(Regularizer):
    """
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 19
    """

    def __init__(self, reg_lambda, discount_policy="inv", discount_options=None):
        """
        discount is a positive weight that decreases over time; it is implemented
        like a learning rate and is specified by a learning-rate policy and the
        corresponding options.
        """
        super(LogBarrier, self).__init__()
        assert reg_lambda > 0, "regularization weight should be positive"
        self.reg_lambda = reg_lambda
        self.discount_policy = discount_policy
        self.discount_options = discount_options or {"gamma": 1.0, "power": 1.0}

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        iteration = utils.BuildUniqueMutexIter(param_init_net, net)
        # Since we are most likely doing a minimization, the discount is built
        # with a negative base_lr, so the returned term acts as
        # -lambda_t * sum(log(param)).
        discount = net.NextScopedBlob(param + "_log_barrier_discount")
        net.LearningRate(
            [iteration],
            [discount],
            base_lr=-self.reg_lambda,
            policy=self.discount_policy,
            **self.discount_options
        )
        # TODO(xlwang): param might still be negative at initialization time or
        # slightly negative due to distributed training. Enforce its
        # non-negativity for now (at least above machine epsilon).
        param_non_neg = net.NextScopedBlob(param + "_non_neg")
        net.Clip([param], [param_non_neg], min=self.kEpsilon)
        param_log = net.NextScopedBlob(param + "_log")
        net.Log([param_non_neg], [param_log])
        param_log_sum = net.NextScopedBlob(param + "_log_sum")
        net.SumElements([param_log], [param_log_sum])
        output_blob = net.NextScopedBlob(param + "_log_barrier")
        net.Mul([param_log_sum, discount], [output_blob], broadcast=1)
        return output_blob

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        self._ensure_clipped(net, param, grad, min=0, open_range=True)


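The on-loss term built above is effectively -lambda_t * sum_i log(w_i): LearningRate is given base_lr=-reg_lambda, so the discount blob is negative and its product with the summed log becomes a penalty that keeps every entry of param strictly positive, with the magnitude lambda_t decaying according to the chosen policy (for the default "inv" policy, roughly reg_lambda / (1 + gamma*t)^power). A construction sketch; the numeric values are illustrative.

from caffe2.python.regularizer import LogBarrier

# The barrier weight decays over iterations following Caffe2's "inv"
# learning-rate policy with the options below.
reg = LogBarrier(
    reg_lambda=0.01,
    discount_policy="inv",
    discount_options={"gamma": 1.0, "power": 1.0},
)
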
class BoundedGradientProjection(Regularizer):
    """
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 16
    """

    def __init__(
        self, lb=None, ub=None, left_open=False, right_open=False, epsilon=None
    ):
        super(BoundedGradientProjection, self).__init__()
        lb = float(lb) if lb is not None else None
        ub = float(ub) if ub is not None else None
        epsilon = float(epsilon) if epsilon is not None else self.kEpsilon
        assert epsilon > 0, "Bounded Gradient Projection with invalid eps={eps}".format(
            eps=epsilon
        )
        assert (
            (lb is None)
            or (ub is None)
            or (
                lb + (epsilon if left_open else 0.)
                <= ub - (epsilon if right_open else 0.)
            )
        ), (
            "Bounded Gradient Projection with invalid "
            "{lp}lb={lb}, ub={ub}{rp}, eps={eps}".format(
                lb=lb,
                ub=ub,
                lp="(" if left_open else "[",
                rp=")" if right_open else "]",
                eps=epsilon,
            )
        )
        self.left_open = left_open
        self.right_open = right_open
        self.kEpsilon = epsilon
        self.lb = lb
        self.ub = ub

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        self._ensure_clipped(
            net,
            param,
            grad,
            min=self.lb,
            max=self.ub,
            left_open=self.left_open,
            right_open=self.right_open,
        )


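BoundedGradientProjection simply projects the parameter back into the configured interval after every optimizer step, shrinking any open end of the interval by epsilon before clipping. A construction sketch; the bounds are illustrative.

from caffe2.python.regularizer import BoundedGradientProjection

# Keep the parameter in the half-open interval (0, 1]: after each optimizer
# step the values are clipped into [epsilon, 1].
reg = BoundedGradientProjection(lb=0.0, ub=1.0, left_open=True)
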
class GroupL1Norm(Regularizer):
    """
    Scardapane, Simone, et al. "Group sparse regularization for deep neural networks."
    Neurocomputing 241 (2017): 81-89.

    This regularizer computes the l1 norm of a weight matrix based on groups.
    There are essentially three stages in the computation:
    1. Compute the l2 norm on all the members of each group
    2. Scale each l2 norm by the size of its group
    3. Compute the l1 norm of the scaled l2 norms
    """

    def __init__(self, reg_lambda, groups, stabilizing_val=0):
        """
        Args:
            reg_lambda: The weight of the regularization term.
            groups: A list of integers describing the size of each group.
                The length of the list is the number of groups.

        Optional Args:
            stabilizing_val: The computation of GroupL1Norm involves the Sqrt
                operator. When values are small, its gradient can be numerically
                unstable and cause gradient explosion. Adding this term stabilizes
                the gradient calculation. The recommended value is 1e-8, but it
                depends on the specific scenario. If the implementation of the
                gradient operator of Sqrt already takes stability into
                consideration, this term is not necessary.
        """
        super(GroupL1Norm, self).__init__()
        assert reg_lambda >= 0, "regularization weight should be 0 or positive"
        assert isinstance(groups, list), "groups needs to be a list"

        self.reg_lambda = reg_lambda
        self.groups = groups
        self.stabilizing_val = stabilizing_val

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        """
        Args:
            param: The input blob to regularize. It should be a weight matrix
                blob with shape (output_dim, input_dim). input_dim should be
                equal to the sum of self.groups.

        Returns:
            group_l1_norm: The output blob after applying regularization.

        These are the steps of computation:
        1. square all elements
        2. sum by row
        3. sum by group (LengthsSum)
        4. take the square root of all elements
        5. normalize each group based on group size
        6. compute the l1 norm of the scaled group norms
        7. scale the result with the regularization lambda
        """
        squared = net.Sqr(param)
        reduced_sum = net.ReduceSum(squared, axes=[0], keepdims=0)
        lengths_sum = net.LengthsSum(
            [
                reduced_sum,
                net.GivenTensorIntFill(
                    [], 1, shape=[len(self.groups)], values=self.groups
                ),
            ]
        )

        if self.stabilizing_val:
            net.Add(
                [lengths_sum, net.ConstantFill([], 1, value=self.stabilizing_val)],
                [lengths_sum],
                broadcast=1,
            )

        sqrt = net.Sqrt(lengths_sum)

        # Here we combine step 5 and step 7 into one operator call to
        # improve efficiency: values = np.sqrt(self.groups) * self.reg_lambda
        l2_scaled = net.Mul(
            [
                sqrt,
                net.GivenTensorFill(
                    [],
                    shape=[len(self.groups)],
                    values=np.sqrt(self.groups) * self.reg_lambda,
                ),
            ],
            ["normalized_l2_norm_scaled"],
        )

        group_l1_norm = net.LpNorm(l2_scaled, ["group_l1_norm"], p=1)

        return group_l1_norm
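
A usage sketch for GroupL1Norm; the nets, blob name, and group sizes are hypothetical. The resulting penalty is reg_lambda times the sum over groups of sqrt(group_size) times the L2 norm of that group's columns.

from caffe2.python import core
from caffe2.python.regularizer import GroupL1Norm, RegularizationBy

net = core.Net("example_train_net")           # hypothetical nets and blobs
param_init_net = core.Net("example_init_net")
fc_w = core.BlobReference("fc_w")             # weight matrix of shape (output_dim, 10)

# Two column groups of sizes 4 and 6; the sizes must sum to input_dim.
reg = GroupL1Norm(reg_lambda=1e-4, groups=[4, 6], stabilizing_val=1e-8)
penalty = reg(net, param_init_net, fc_w, by=RegularizationBy.ON_LOSS)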