Caffe2 - Python API
A deep learning, cross platform ML framework
gradient_clipping.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core
from caffe2.proto import caffe2_pb2
from caffe2.python.optimizer import get_param_device
from caffe2.python.modeling.net_modifier import NetModifier

import logging

logger = logging.getLogger(__name__)


class GradientClipping(NetModifier):

    L1_NORM = 'l1_norm'
    L2_NORM = 'l2_norm'

    BY_NORM = 'by_norm'
    BY_VALUE = 'by_value'

    GRAD_CLIP_METHODS = [BY_NORM, BY_VALUE]
    CLIP_GRADIENT_NORM_TYPES = [L2_NORM, L1_NORM]

    def __init__(self, grad_clip_method, clip_norm_type='l2_norm',
                 clip_threshold=0.1, use_parameter_norm=False,
                 compute_norm_ratio=False, clip_max=1, clip_min=-1,
                 blobs_to_include=None, blobs_to_exclude=None):
        """
        Clips gradients to avoid gradient magnitude explosion or vanishing gradients.

        Args:
            grad_clip_method: way to clip the gradients
            clip_norm_type: type of norm used in the necessary computation
            clip_threshold: threshold used to determine whether to clip
            use_parameter_norm: a boolean to indicate whether to incorporate
                the norm of the parameter
            compute_norm_ratio: a boolean to compute the ratio between gradient
                norm and parameter norm explicitly, for debugging purposes
            clip_max: when clipping by_value, any value greater than clip_max
                will be clipped to clip_max
            clip_min: when clipping by_value, any value smaller than clip_min
                will be clipped to clip_min
            blobs_to_include: names of blobs whose gradients are to be clipped.
                If set to None, the gradients of all params in grad_map will
                be clipped.
            blobs_to_exclude: names of blobs whose gradients are not to be clipped.
        """

        assert grad_clip_method in self.GRAD_CLIP_METHODS, (
            "This method of clipping, {}, has not been implemented.".format(
                grad_clip_method))
        if clip_norm_type is not None:
            assert clip_norm_type in self.CLIP_GRADIENT_NORM_TYPES, (
                "This type of norm, {}, has not been implemented.".format(
                    clip_norm_type))

        self.grad_clip_method = grad_clip_method
        self.clip_norm_type = clip_norm_type
        self.clip_threshold = float(clip_threshold)
        self.use_parameter_norm = use_parameter_norm
        self.compute_norm_ratio = compute_norm_ratio
        self.clip_max = float(clip_max)
        self.clip_min = float(clip_min)
        self.blobs_to_include = blobs_to_include
        self.blobs_to_exclude = blobs_to_exclude

    def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None,
                   modify_output_record=False):

        assert grad_map is not None

        CPU = core.DeviceOption(caffe2_pb2.CPU)

        final_param_map = {}
        if self.blobs_to_include is None:
            final_param_map = grad_map
        else:
            for blob in self.blobs_to_include:
                param = core.BlobReference(blob)
                if not net.BlobIsDefined(param):
                    raise Exception('param {0} is not defined in net {1}'.format(
                        param, net.Name()))
                final_param_map[param] = grad_map[param]

        if self.blobs_to_exclude is not None:
            for blob in self.blobs_to_exclude:
                final_param_map.pop(blob, None)

        for param, grad in final_param_map.items():
            # currently sparse gradients won't be clipped;
            # further implementation is needed to enable it
            if isinstance(grad, core.GradientSlice):
                continue

            device = get_param_device(
                param,
                grad_map[str(param)],
                param_to_device=blob_to_device,
                default_device=CPU,
            )

            with core.DeviceScope(device):
                if self.grad_clip_method == self.BY_NORM:
                    if self.clip_norm_type == self.L2_NORM:
                        p = 2
                    elif self.clip_norm_type == self.L1_NORM:
                        p = 1

                    grad_norm = net.LpNorm(
                        [grad],
                        net.NextScopedBlob(prefix=str(grad) + '_l{}_norm'.format(p)),
                        p=p,
                    )

                    # LpNorm returns sum(|x|^p); for p=2 that is the squared
                    # L2 norm, so take the square root to get the actual norm
                    if p == 2:
                        grad_norm = net.Pow([grad_norm], exponent=0.5)

                    op_inputs = [grad, grad_norm]

                    if self.use_parameter_norm:
                        param_norm = net.LpNorm(
                            [param],
                            net.NextScopedBlob(
                                prefix=str(param) + '_l{}_norm'.format(p)),
                            p=p,
                        )

                        if p == 2:
                            param_norm = net.Pow([param_norm], exponent=0.5)

                        op_inputs.append(param_norm)

                        if self.compute_norm_ratio:
                            net.Div(
                                [grad_norm, param_norm],
                                [net.NextScopedBlob(
                                    prefix=str(param) + '_norm_ratio')]
                            )

                    # rescale grad in place whenever grad_norm exceeds the
                    # threshold (scaled by param_norm when that extra input
                    # is provided)
                    net.ClipTensorByScaling(
                        op_inputs,
                        [grad],
                        threshold=self.clip_threshold,
                    )
                elif self.grad_clip_method == self.BY_VALUE:
                    net.Clip(
                        [grad],
                        [grad],
                        max=self.clip_max,
                        min=self.clip_min,
                    )
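The modifier above is meant to be applied to a training net after gradient operators have been added. Below is a minimal usage sketch, not part of gradient_clipping.py itself: it assumes a small ModelHelper model, and the blob names ('data', 'label', 'fc', 'dist', 'loss') are placeholders chosen for illustration.

import numpy as np

from caffe2.python import brew, model_helper, workspace
from caffe2.python.modeling.gradient_clipping import GradientClipping

# toy regression model whose gradients we want to clip
model = model_helper.ModelHelper(name="clip_example")
fc = brew.fc(model, 'data', 'fc', dim_in=4, dim_out=1)
dist = model.net.SquaredL2Distance([fc, 'label'], 'dist')
loss = model.net.AveragedLoss(dist, 'loss')
grad_map = model.AddGradientOperators([loss])

# rewrite the training net in place: every dense gradient is rescaled
# whenever its L2 norm exceeds 0.1
clipper = GradientClipping(grad_clip_method='by_norm',
                           clip_norm_type='l2_norm',
                           clip_threshold=0.1)
clipper.modify_net(model.net, grad_map=grad_map)

workspace.FeedBlob('data', np.random.randn(8, 4).astype(np.float32))
workspace.FeedBlob('label', np.random.randn(8, 1).astype(np.float32))
workspace.RunNetOnce(model.param_init_net)
workspace.RunNetOnce(model.net)

To clip by value instead, one would pass grad_clip_method='by_value' together with clip_max and clip_min, which maps to the net.Clip call at the end of modify_net.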