from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np

from caffe2.python import core, utils


class RegularizationBy(object):
    AFTER_OPTIMIZER = "after_optimizer"
    ON_LOSS = "on_loss"


class Regularizer(object):
    """
    Adds regularization to train_net for a given parameter. The factor ahead
    of the regularization term is given at initialization.
    The param should be a BlobReference.
    """

    def __init__(self):
        self.kEpsilon = 1e-9

    def __call__(self, net, param_init_net, param, grad=None, by=None):
        by_enum = utils.EnumClassKeyVals(RegularizationBy)
        assert by in by_enum.values(), (
            "Regularizer of type {} is called with invalid by={}, "
            "not in {}".format(self.__class__, by, by_enum.values())
        )
        run_func = "_run_" + by
        assert hasattr(
            self, run_func
        ), "Regularizer of type {} does not implement function {}".format(
            self.__class__, run_func
        )
        return getattr(self, run_func)(net, param_init_net, param, grad)

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        return None

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        return None

    def _ensure_clipped(
        self, net, param, grad=None, min=None, max=None,
        open_range=False, left_open=False, right_open=False,
    ):
        # Shift a bound by kEpsilon when the corresponding end is open, so the
        # clipped value stays strictly inside the interval.
        min = (
            min + self.kEpsilon
            if min is not None and (open_range or left_open)
            else min
        )
        max = (
            max - self.kEpsilon
            if max is not None and (open_range or right_open)
            else max
        )
        input_blobs = (
            [param, grad.indices, grad.values]
            if isinstance(grad, core.GradientSlice)
            else [param]
        )
        net.EnsureClipped(input_blobs, [param], min=min, max=max)
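

# Usage sketch (illustrative, not part of the original module): a concrete
# Regularizer subclass is normally invoked through __call__, which dispatches
# to _run_on_loss or _run_after_optimizer based on `by`. The model and blob
# names below are hypothetical.
#
#   from caffe2.python import model_helper
#
#   model = model_helper.ModelHelper(name="reg_example")
#   fc_w = model.param_init_net.XavierFill([], "fc_w", shape=[10, 5])
#   reg = L2Norm(reg_lambda=1e-4)
#   penalty = reg(
#       model.net,
#       model.param_init_net,
#       fc_w,
#       by=RegularizationBy.ON_LOSS,  # routes to L2Norm._run_on_loss
#   )
#   # `penalty` is a scalar blob that the caller can fold into the loss.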


class L1Norm(Regularizer):
    def __init__(self, reg_lambda):
        super(L1Norm, self).__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"

        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l1_regularization")
        net.LpNorm([param], [output_blob], p=1)
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob
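

# What L1Norm contributes to the loss, as a plain-NumPy sketch (assuming LpNorm
# with p=1 computes the sum of absolute values; the weights below are made up):
#
#   import numpy as np
#   w = np.array([[0.5, -1.0], [2.0, 0.0]])
#   reg_lambda = 0.01
#   l1_penalty = reg_lambda * np.abs(w).sum()  # 0.01 * 3.5 = 0.035
#
# The graph version above computes the same scalar with the LpNorm(p=1) and
# Scale operators, written into "<param>_l1_regularization".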


class L2Norm(Regularizer):
    def __init__(self, reg_lambda):
        super(L2Norm, self).__init__()
        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"

        self.reg_lambda = reg_lambda

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        output_blob = net.NextScopedBlob(param + "_l2_regularization")
        net.LpNorm([param], [output_blob], p=2)
        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
        return output_blob
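

# Sketch of how an on-loss penalty blob might be folded into the training
# objective; the loss blob and the use of the Add operator are assumptions
# about the caller, not something this module does itself:
#
#   penalty = L2Norm(reg_lambda=1e-4)(
#       model.net, model.param_init_net, fc_w, by=RegularizationBy.ON_LOSS
#   )
#   total_loss = model.net.Add([task_loss, penalty], "total_loss")
#   # Gradients built from total_loss then include the regularization term.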


class MaxNorm(Regularizer):
    def __init__(self, norm=1.0):
        super(MaxNorm, self).__init__()
        self.norm = norm

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        assert self.norm > 0, "norm should be bigger than 0."
        if isinstance(grad, core.GradientSlice):
            net.SparseNormalize(
                [param, grad.indices, grad.values],
                [param],
                use_max_norm=True,
                norm=self.norm,
            )
        else:
            raise NotImplementedError("MaxNorm is not supported for dense parameters")


class ConstantNorm(Regularizer):
    def __init__(self, norm=1.0):
        super(ConstantNorm, self).__init__()
        self.norm = norm

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        assert self.norm > 0, "norm should be bigger than 0."
        if isinstance(grad, core.GradientSlice):
            net.SparseNormalize(
                [param, grad.indices, grad.values],
                [param],
                use_max_norm=False,
                norm=self.norm,
            )
        else:
            raise NotImplementedError(
                "ConstantNorm is not supported for dense parameters"
            )


class LogBarrier(Regularizer):
    """
    Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science,
    35(67-68), 7. Chapter 19
    """

    def __init__(self, reg_lambda, discount_policy="inv", discount_options=None):
        """
        discount is a positive, decreasing weight; it is implemented similarly
        to the learning rate and is specified by a learning rate policy with
        the corresponding options.
        """
        super(LogBarrier, self).__init__()
        assert reg_lambda > 0, "factor ahead of regularization should be positive"
        self.reg_lambda = reg_lambda
        self.discount_policy = discount_policy
        self.discount_options = discount_options or {"gamma": 1.0, "power": 1.0}

    def _run_on_loss(self, net, param_init_net, param, grad=None):
        iteration = utils.BuildUniqueMutexIter(param_init_net, net)
        # The discount follows a learning-rate style schedule; the sign is
        # negated so that the barrier is subtracted from the minimized loss.
        discount = net.NextScopedBlob(param + "_log_barrier_discount")
        net.LearningRate(
            [iteration],
            [discount],
            base_lr=-self.reg_lambda,
            policy=self.discount_policy,
            **self.discount_options
        )
        # param might still be negative at initialization time, or slightly
        # negative due to distributed training; enforce non-negativity (at
        # least machine epsilon) before taking the log.
        param_non_neg = net.NextScopedBlob(param + "_non_neg")
        net.Clip([param], [param_non_neg], min=self.kEpsilon)
        param_log = net.NextScopedBlob(param + "_log")
        net.Log([param_non_neg], [param_log])
        param_log_sum = net.NextScopedBlob(param + "_log_sum")
        net.SumElements([param_log], [param_log_sum])
        output_blob = net.NextScopedBlob(param + "_log_barrier")
        net.Mul([param_log_sum, discount], [output_blob], broadcast=1)
        return output_blob

    def _run_after_optimizer(self, net, param_init_net, param, grad):
        self._ensure_clipped(net, param, grad, min=0, open_range=True)
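

# Rough NumPy picture of the barrier term this adds to the loss (a sketch; the
# numbers are made up, and in the graph above the sign is folded into the
# scheduled discount blob):
#
#   import numpy as np
#   w = np.array([0.5, 1.0, 2.0])            # parameters meant to stay > 0
#   discount = 0.1                           # decays toward 0 over iterations
#   barrier = -discount * np.sum(np.log(w))  # grows sharply as any w_i -> 0+
#
# As the discount decays, the barrier's influence fades while it still pushes
# parameters away from the boundary; _run_after_optimizer additionally clips
# them back into the open interval (0, inf).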
188 Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science, 189 35(67-68), 7. Chapter 16 193 self, lb=
None, ub=
None, left_open=
False, right_open=
False, epsilon=
None 195 super(BoundedGradientProjection, self).__init__()
196 lb = float(lb)
if lb
is not None else None 197 ub = float(ub)
if ub
is not None else None 198 epsilon = float(epsilon)
if epsilon
is not None else self.
kEpsilon 199 assert epsilon > 0,
"Bounded Gradient Projection with invalid eps={eps}".format(
206 lb + (epsilon
if left_open
else 0.)
207 <= ub - (epsilon
if right_open
else 0.)
210 "Bounded Gradient Projection with invalid " 211 "{lp}ub={ub}, lb={lb}{rp}, eps={eps}".format(
214 lp=
"(" if left_open
else "[",
215 rp=
")" if right_open
else "]",
225 def _run_after_optimizer(self, net, param_init_net, param, grad):
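

# Dense NumPy analogue of the projection step (np.clip stands in for the
# EnsureClipped operator; the epsilon offsets model the open interval ends):
#
#   import numpy as np
#   def project(w, lb=None, ub=None, left_open=False, right_open=False, eps=1e-9):
#       lo = (lb + eps if left_open else lb) if lb is not None else -np.inf
#       hi = (ub - eps if right_open else ub) if ub is not None else np.inf
#       return np.clip(w, lo, hi)
#
# After every optimizer step the parameters are projected back into [lb, ub]
# (or the corresponding half-open/open interval).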


class GroupL1Norm(Regularizer):
    """
    Scardapane, Simone, et al. "Group sparse regularization for deep neural
    networks." Neurocomputing 241 (2017): 81-89.

    This regularizer computes the l1 norm of a weight matrix based on groups.
    There are essentially three stages in the computation:
    1. Compute the l2 norm on all the members of each group
    2. Scale each l2 norm by the size of each group
    3. Compute the l1 norm of the scaled l2 norms
    """

    def __init__(self, reg_lambda, groups, stabilizing_val=0):
        """
        Args:
            reg_lambda: The weight of the regularization term.
            groups: A list of integers describing the size of each group.
                The length of the list is the number of groups.

        Optional Args:
            stabilizing_val: The computation of GroupL1Norm involves the Sqrt
                operator. When values are small, its gradient can be
                numerically unstable and cause gradient explosion. Adding this
                term stabilizes the gradient calculation. The recommended value
                is 1e-8, but it depends on the specific scenario. If the
                implementation of the gradient operator of Sqrt already takes
                stability into consideration, this term is not necessary.
        """
        super(GroupL1Norm, self).__init__()
        assert reg_lambda >= 0, "regularization weight should be 0 or positive"
        assert isinstance(groups, list), "groups needs to be a list"

        self.reg_lambda = reg_lambda
        self.groups = groups
        self.stabilizing_val = stabilizing_val
277 param: The input blob to regularize. It should be a weight matrix 278 blob with shape (output_dim, input_dim). input_dim should be 279 equal to the sum of self.groups. 282 group_l1_norm: The output blob after applying regularization. 284 These are the steps of computation: 285 1. square all elements 287 3. lengthssum by group 288 4. square_root all elements 289 5. normalize each group based on group size 290 6. compute l1 norm of each group 291 7. scale the result with the regularization lambda 293 squared = net.Sqr(param)
294 reduced_sum = net.ReduceSum(squared, axes=[0], keepdims=0)
295 lengths_sum = net.LengthsSum(
298 net.GivenTensorIntFill(
311 sqrt = net.Sqrt(lengths_sum)
324 [
'normalized_l2_norm_scaled']
327 group_l1_norm = net.LpNorm(l2_scaled, [
'group_l1_nrom'], p=1)
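

# Plain-NumPy walkthrough of the seven steps above (a sketch for checking the
# intent, not a drop-in replacement for the operator graph; the shapes and
# values are made up):
#
#   import numpy as np
#   w = np.random.randn(4, 5)          # (output_dim=4, input_dim=5)
#   groups = [2, 3]                    # sum(groups) == input_dim
#   reg_lambda = 0.01
#
#   col_sq_sum = np.square(w).sum(axis=0)              # steps 1-2
#   offsets = np.cumsum([0] + groups)
#   per_group = np.array([
#       col_sq_sum[offsets[i]:offsets[i + 1]].sum()    # step 3
#       for i in range(len(groups))
#   ])
#   group_l2 = np.sqrt(per_group)                      # step 4
#   penalty = reg_lambda * np.sum(np.sqrt(groups) * group_l2)  # steps 5-7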