from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging

from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python.optimizer import get_param_device
from caffe2.python.modeling.net_modifier import NetModifier


logger = logging.getLogger(__name__)


class GradientClipping(NetModifier):

    L1_NORM = 'l1_norm'
    L2_NORM = 'l2_norm'

    BY_NORM = 'by_norm'
    BY_VALUE = 'by_value'

    GRAD_CLIP_METHODS = [BY_NORM, BY_VALUE]
    CLIP_GRADIENT_NORM_TYPES = [L2_NORM, L1_NORM]

    def __init__(self, grad_clip_method, clip_norm_type='l2_norm',
                 clip_threshold=0.1, use_parameter_norm=False,
                 compute_norm_ratio=False, clip_max=1, clip_min=-1,
                 blobs_to_include=None, blobs_to_exclude=None):
        """
        Clips gradients to avoid gradient magnitude explosion or vanishing gradients.

        Args:
        grad_clip_method: ways to clip the gradients
        clip_norm_type: type of norm used in the necessary computation
        clip_threshold: threshold used to determine whether to clip
        use_parameter_norm: a boolean to indicate whether to incorporate
            the norm of the parameter
        compute_norm_ratio: a boolean to compute the ratio between gradient norm
            and parameter norm explicitly for debugging purposes
        clip_max: when clipping by_value, any value that is greater than
            clip_max will be clipped to clip_max
        clip_min: when clipping by_value, any value that is smaller than
            clip_min will be clipped to clip_min
        blobs_to_include: names of blobs whose gradient is to be clipped. If set
            to None, the gradients of all params in grad_map will be clipped.
        blobs_to_exclude: names of blobs whose gradient is not to be clipped.
        """

        assert grad_clip_method in self.GRAD_CLIP_METHODS, (
            "This method of clipping, {}, has not been implemented.".format(
                grad_clip_method))

        if clip_norm_type is not None:
            assert clip_norm_type in self.CLIP_GRADIENT_NORM_TYPES, (
                "This method of clipping, {}, has not been implemented.".format(
                    clip_norm_type))

        self.grad_clip_method = grad_clip_method
        self.clip_norm_type = clip_norm_type
        self.clip_threshold = float(clip_threshold)
        self.use_parameter_norm = use_parameter_norm
        self.compute_norm_ratio = compute_norm_ratio
        self.clip_max = float(clip_max)
        self.clip_min = float(clip_min)
        self.blobs_to_include = blobs_to_include
        self.blobs_to_exclude = blobs_to_exclude

    def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None,
                   modify_output_record=False):

        assert grad_map is not None

        CPU = core.DeviceOption(caffe2_pb2.CPU)

        final_param_map = {}
        if self.blobs_to_include is None:
            final_param_map = grad_map
        else:
            for blob in self.blobs_to_include:
                param = core.BlobReference(blob)
                if not net.BlobIsDefined(param):
                    raise Exception('param {0} is not defined in net {1}'.format(
                        param, net.Name()))
                final_param_map[param] = grad_map[param]

        if self.blobs_to_exclude is not None:
            for blob in self.blobs_to_exclude:
                final_param_map.pop(blob, None)

        for param, grad in final_param_map.items():
            # Sparse gradients are currently not clipped; further implementation
            # is needed to enable that.
            if isinstance(grad, core.GradientSlice):
                continue

            device = get_param_device(
                param,
                grad_map[str(param)],
                param_to_device=blob_to_device,
                default_device=CPU,
            )

            with core.DeviceScope(device):
                if self.grad_clip_method == self.BY_NORM:
                    if self.clip_norm_type == self.L2_NORM:
                        p = 2
                    elif self.clip_norm_type == self.L1_NORM:
                        p = 1

                    # LpNorm with p=2 returns the squared L2 norm, so take the
                    # square root afterwards to recover the actual norm.
                    grad_norm = net.LpNorm(
                        [grad],
                        net.NextScopedBlob(prefix=str(grad) + '_l{}_norm'.format(p)),
                        p=p,
                    )

                    if p == 2:
                        grad_norm = net.Pow([grad_norm], exponent=0.5)

                    op_inputs = [grad, grad_norm]

                    if self.use_parameter_norm:
                        param_norm = net.LpNorm(
                            [param],
                            net.NextScopedBlob(
                                prefix=str(param) + '_l{}_norm'.format(p)),
                            p=p,
                        )

                        if p == 2:
                            param_norm = net.Pow([param_norm], exponent=0.5)

                        op_inputs.append(param_norm)

                        if self.compute_norm_ratio:
                            # Expose grad_norm / param_norm as its own blob for
                            # debugging.
                            net.Div(
                                [grad_norm, param_norm],
                                [net.NextScopedBlob(
                                    prefix=str(param) + '_norm_ratio')]
                            )

                    # Rescale the gradient in place when its norm exceeds the
                    # threshold (scaled by the parameter norm when provided).
                    net.ClipTensorByScaling(
                        op_inputs,
                        [grad],
                        threshold=self.clip_threshold,
                    )
                elif self.grad_clip_method == self.BY_VALUE:
                    net.Clip(
                        [grad],
                        [grad],
                        max=self.clip_max,
                        min=self.clip_min,
                    )
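

# Illustrative usage sketch (an assumption, not part of the original module):
# it builds a tiny ModelHelper net, adds gradient operators, and then applies
# GradientClipping as a callable NetModifier so that the dense gradients of the
# hypothetical 'fc_w' / 'fc_b' parameters are clipped by their L2 norm.
if __name__ == '__main__':
    from caffe2.python import brew, model_helper

    model = model_helper.ModelHelper(name="gradient_clipping_example")
    fc = brew.fc(model, 'data', 'fc', dim_in=4, dim_out=2)
    dist = model.net.SquaredL2Distance([fc, 'label'], 'dist')
    loss = model.net.SumElements(dist, 'loss')

    # Map from blob name to its gradient blob; restrict it to the parameters
    # whose gradients we actually want clipped.
    grad_map = model.AddGradientOperators([loss])
    param_grad_map = {name: grad_map[name] for name in ['fc_w', 'fc_b']}

    clipper = GradientClipping(
        grad_clip_method='by_norm',
        clip_norm_type='l2_norm',
        clip_threshold=0.1,
    )
    # NetModifier instances are callable; this rewrites model.net in place so
    # that each parameter gradient is rescaled when its L2 norm exceeds 0.1.
    clipper(model.net, grad_map=param_grad_map)

    logger.info("Net after clipping:\n%s", model.net.Proto())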