Caffe2 - Python API
A deep learning, cross platform ML framework
memonger.py
1 # Copyright (c) 2016-present, Facebook, Inc.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 ##############################################################################
15 
16 ## @package memonger
17 # Module caffe2.python.memonger
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import collections
import copy
import enum
import logging
import time

import networkx as nx
from future.utils import viewitems, viewvalues

from caffe2.python import workspace, core
from caffe2.proto import caffe2_pb2
import caffe2.python._import_c_extension as C
33 
34 log = logging.getLogger("memonger")
35 log.setLevel(logging.INFO)
36 LiveRange = collections.namedtuple('LiveRange', ["defined", "used", "size"])
37 
38 
def share_grad_blobs(
    net,
    losses,
    param_grads,
    namescope,
    dont_share_blobs=None,
    share_activations=False,
    blob_shapes=None,
):
    '''
    Implements similar optimization as Torch's shareGradInput():
    for the gradients that are passed between layers, share blobs between
    operators when possible. This yields significant memory savings with
    deep networks.

    Args:
        net: network whose Proto() is optimized (the net object itself is
            not mutated; a new NetDef is returned).
        losses: loss blobs, roots of the gradient computation.
        param_grads: parameter-gradient blob names that must never be shared.
        namescope: namescope prefix limiting which blobs are considered.
        dont_share_blobs: optional set of blob names excluded from sharing.
        share_activations: if True, also recycle (non-final) activations.
        blob_shapes: optional dict of blob shapes guiding the recycling.

    Returns an optimized protobuf (assign to net._net)
    '''
    def is_grad_blob(b):
        name = str(b)
        # Note: need to look at _{namescope} pattern as it matches
        # to handle the auto-split gradients
        return name.endswith("_grad") and (name.startswith(namescope) or
            name.startswith("_" + namescope)) and name not in param_grads

    def is_grad_op(op):
        # TODO: something smarter
        for b in list(op.input) + list(op.output):
            if is_grad_blob(b):
                return True
        return False

    log.warn("NOTE: Executing memonger to optimize gradient memory")

    # Collect ops that have something to do with gradients
    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"

    netproto = copy.deepcopy(net.Proto())
    activations = []
    external_output = set(net.Proto().external_output)

    # Hacky way to get activations, think of a better way
    # (an output 'b' whose weight blob 'b_w' is also an input marks a layer
    # activation; external outputs are excluded).
    for op in net.Proto().op:
        for b in op.output:
            if b + "_w" in op.input and b not in external_output:
                activations.append(b)

    # Remove last activations, as they are usually accessed externally
    activations = set(activations[:-2])

    # Gradient ops
    grad_op_indices = []
    for idx, op in enumerate(netproto.op):
        if (is_grad_op(op)):
            grad_op_indices.append(idx)

    shared_blobs = set()
    for op in net.Proto().op:
        for b in list(op.input) + list(op.output):
            if is_grad_blob(b) or (share_activations and b in activations):
                shared_blobs.add(b)
    start_time = time.time()
    # Heavy lifting is delegated to the C++ memonger implementation.
    optim_str = C.memonger_compute_blob_recycling_for_dag(
        netproto.SerializeToString(),
        [str(s).encode('utf-8') for s in losses],
        grad_op_indices,
        set(str(s).encode('utf-8') for s in shared_blobs),
        namescope.encode('utf-8'),
        set() if dont_share_blobs is None else dont_share_blobs,
        {} if blob_shapes is None else blob_shapes
    )

    log.info("Memonger memory optimization took {} secs".format(
        time.time() - start_time),
    )

    optim = caffe2_pb2.NetDef()
    optim.ParseFromString(optim_str)
    # Safety checks: the optimized net must be semantically identical.
    assert verify_graph_equality(net.Proto(), optim), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), optim), \
        "Inplace assignments differ in memonger net."
    return optim
122 
123 
def optimize_inference_for_dag(net, input_blobs, namescope=""):
    ''' Optimize memory usage of an inference-only net by recycling
    activation blobs via the C++ memonger DAG pass.

    Args:
        net: network to optimize (not mutated; a new NetDef is returned).
        input_blobs: blobs fed externally, treated as the roots of the DAG.
        namescope: optional namescope prefix passed to the memonger.

    Returns an optimized protobuf (assign to net._net).
    Raises AssertionError if the net contains gradient ops or reads a
    non-external blob before it was produced.
    '''
    netproto = copy.deepcopy(net.Proto())
    external_input = set(net.Proto().external_input)
    external_output = set(net.Proto().external_output)

    def is_activation_blob(b):
        # Anything that is neither an external input nor output is an
        # intermediate activation and is eligible for recycling.
        return b not in external_input and b not in external_output

    activation_blobs = set()
    seen_as_output = set()
    ops = list(net.Proto().op)
    op_indices = [index for index, op in enumerate(net.Proto().op)]

    # Sanity check: check that all external inputs are properly accounted
    # and that no gradient ops are included in 'net'
    for op in ops:
        for b in op.input:
            if is_activation_blob(b):
                activation_blobs.add(b)
                if b not in seen_as_output:
                    assert False, "{} not in external input".format(b)
        for b in op.output:
            if is_activation_blob(b):
                activation_blobs.add(b)
        seen_as_output = seen_as_output.union(set(op.output))
        assert not op.is_gradient_op, \
            "You can only pass inference-only nets to optimize_inference_for_dag"
    start_time = time.time()
    optim_str = C.memonger_compute_blob_recycling_for_dag(
        netproto.SerializeToString(),
        [str(s).encode('utf-8') for s in input_blobs],
        op_indices,
        set(str(s).encode('utf-8') for s in activation_blobs),
        namescope.encode('utf-8'),
        set(),
        {}
    )

    log.info("Memonger memory optimization took {} secs".format(
        time.time() - start_time),
    )

    optim = caffe2_pb2.NetDef()
    optim.ParseFromString(optim_str)

    # Safety checks: the optimized net must be semantically identical.
    assert verify_graph_equality(net.Proto(), optim), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), optim), \
        "Inplace assignments differ in memonger net."
    return optim
174 
175 
def estimate_memory_usage(protos, shapes, types, devicescope):
    '''
    Estimate memory usage of a model. This is an estimate because
    we assume a single threaded execution and miss some internal
    memory usage of operators. Only estimates the memory for a given
    device scope.

    Also, currently it does not handle correctly if blob sizes vary
    during execution, as it uses only the final blob size.

    Args:
        protos: list of NetDef protos to evaluate (modified in place by
            filtering out ops outside 'devicescope').
        shapes: dict blob name -> shape (iterable of dims).
        types: dict blob name -> caffe2_pb2.TensorProto data type.
        devicescope: DeviceOption; only ops on this device are counted.

    Returns (total, highwater, by op type) memory allocation in bytes.
    '''
    # Fix: this docstring previously appeared *after* the numpy import,
    # making it a dead string expression instead of the function docstring.
    import numpy as np

    # Bytes per element for each supported tensor data type.
    sizeofs = {
        caffe2_pb2.TensorProto.DOUBLE: 8,
        caffe2_pb2.TensorProto.FLOAT: 4,
        caffe2_pb2.TensorProto.FLOAT16: 2,
        caffe2_pb2.TensorProto.INT32: 4,
        caffe2_pb2.TensorProto.INT8: 1,
        caffe2_pb2.TensorProto.UINT8: 1,
        caffe2_pb2.TensorProto.UINT16: 2,
        caffe2_pb2.TensorProto.INT16: 2,
        caffe2_pb2.TensorProto.BOOL: 1,
        caffe2_pb2.TensorProto.INT64: 8,
    }

    def split_net(proto):
        # Keep only ops in the requested device scope; Free/Alias are always
        # kept since they affect the allocation bookkeeping below.
        ops = [op for op in proto.op if
               op.device_option == devicescope or op.type in {"Free", "Alias"}]
        del proto.op[:]
        proto.op.extend(ops)
        return proto

    def num_bytes(blob):
        if blob not in shapes or blob not in types:
            log.warning("Unknown blob encountered: {}".format(blob))
            return 0
        sizeof = sizeofs[types[blob]]
        return sizeof * np.prod(shapes[blob])

    protos = [split_net(proto) for proto in protos]
    allocs_by_ops = collections.defaultdict(lambda: 0)

    # Evaluate: simulate a single-threaded run, tracking the currently
    # allocated set and the high-water mark.
    current_allocated = 0
    max_allocated = 0
    total_allocated = 0
    allocated = set()
    for proto in protos:
        for op in proto.op:
            if op.type == "Free" or op.type == "Alias":
                for o in op.output:
                    if o in allocated:
                        current_allocated -= num_bytes(o)
                        allocated.remove(o)
            else:
                for output in op.output:
                    if output not in allocated:
                        nbytes = num_bytes(output)
                        total_allocated += nbytes
                        current_allocated += nbytes
                        max_allocated = max(max_allocated, current_allocated)
                        allocated.add(output)
                        allocs_by_ops[op.type] += nbytes

    return (total_allocated, max_allocated, allocs_by_ops)
242 
243 
def release_blobs_when_used(netproto, dont_free_blobs, selector_fun=None):
    '''
    Insert Free-ops after a blob has been used the last time, so that its
    memory can be reclaimed. Use this only with efficient caching memory
    managers (such as CUB, --caffe2_cuda_memory_pool=cub).

    Blobs used with Alias op won't be freed.

    @dont_free_blobs: is a set of blobs that should not be freed
    @selector_fun: optional lambda that return True if blob name
    can be released. Use for easy special filtering, like
    excluding blobs with "loss" in the name.

    Returns a new protobuffer. To use with a model, use:
    model.net._net = memonger.release_blobs_when_used(..)
    '''
    input_blobs = set()
    can_release = set()
    alias_blobs = set()
    netproto = copy.deepcopy(netproto)

    for op in netproto.op:
        if op.type == 'Alias':
            # Aliased blobs share storage with their source; freeing them
            # would be unsafe, so remember and exclude them below.
            alias_blobs.add(op.input[0])
            continue
        for inp in op.input:
            input_blobs.add(inp)
        for outp in op.output:
            if outp not in input_blobs:
                if selector_fun is None or selector_fun(outp):
                    can_release.add(outp)

    # Remove such blobs that are not input at all and external outputs
    can_release = can_release - set(netproto.external_output)
    can_release = can_release.intersection(input_blobs)
    can_release = can_release - dont_free_blobs
    can_release = can_release - alias_blobs

    ops = list(netproto.op)

    # .. then find last use of each can-release blob, and insert a Free op
    # (scanning in reverse so the first occurrence found is the last use).
    for j in reversed(range(0, len(netproto.op))):
        op = netproto.op[j]
        for inp in op.input:
            if inp in can_release:
                can_release.remove(inp)
                ops.insert(j + 1, core.CreateOperator("Free", [inp], [inp]))

    del netproto.op[:]
    netproto.op.extend(ops)
    return netproto
295 
296 
297 def _find_source_nodes(g):
298  ''' Return nodes without predecessors '''
299  ret = []
300  for cn in g:
301  cur_pred = list(g.predecessors(cn))
302  if not cur_pred:
303  ret.append(cn)
304  return ret
305 
306 
307 def _find_target_nodes(g):
308  ''' Return nodes without successors '''
309  ret = []
310  for cn in g:
311  cur_succ = list(g.successors(cn))
312  if not cur_succ:
313  ret.append(cn)
314  return ret
315 
316 
def _add_single_target_ifneeded(g):
    ''' Ensure 'g' has a single target (sink) node.
    If there are multiple targets, return a deep copy with one synthetic
    node added that all former targets point to; otherwise return 'g'
    unchanged. Assumes node ids are comparable (integer) values.
    '''
    targets = _find_target_nodes(g)
    assert len(targets) >= 1
    if len(targets) == 1:
        return g
    ret = copy.deepcopy(g)

    def _next_available_idx(g):
        # Largest existing node id + 1 (node ids assumed to be ints).
        ret = -1
        for cn in g:
            if cn > ret:
                ret = cn
        ret += 1
        return ret

    target_node_idx = _next_available_idx(g)
    ret.add_node(target_node_idx)
    for cn in targets:
        ret.add_edge(cn, target_node_idx)

    return ret
338 
339 
340 def _get_path(pred_list, dist_list):
341  ''' Get the path from nx.bellman_ford()'s output '''
342 
343  # distances are negative
344  assert all(dist_list[x] <= 0 for x in dist_list)
345  # node with longest distance to source is the target
346  target = min(dist_list, key=lambda x: dist_list[x])
347 
348  ret = []
349  cur = target
350 
351 
352  while cur is not None:
353  ret.append(cur)
354  # Hack to get networkx 2.0 happy: it uses list in pred.
355  # TODO(tulloch): are there cases with multiple predecessors?
356  try:
357  cur = pred_list[cur][0]
358  except TypeError:
359  cur = pred_list[cur]
360 
361  return list(reversed(ret))
362 
363 
def _get_longest_paths(g, source_nodes):
    ''' Get the longest path for nodes in 'source_nodes'
    Find with bellman_ford() by setting weight = -1

    Returns a dict source node -> longest path (list of nodes).
    '''
    # NOTE(review): nx.bellman_ford() was removed in networkx 2.x
    # (replaced by bellman_ford_predecessor_and_distance), so this appears
    # to require networkx 1.x -- TODO confirm pinned version.

    ng = copy.deepcopy(g)
    for u, v in ng.edges():
        # Negative unit weights turn shortest-path search into longest-path.
        ng[u][v]["weight"] = -1

    ret = {}
    for cn in source_nodes:
        pred, dist = nx.bellman_ford(ng, cn, weight="weight")
        path = _get_path(pred, dist)
        assert path[0] == cn
        # Path length must equal the negated shortest distance to its end.
        assert len(path) - 1 == -dist[path[-1]]
        ret[cn] = path

    return ret
382 
383 
def _build_tree(paths):
    ''' Build a tree for given paths based on common elements.
    Last elements of all paths are the same, which is the root of the tree.
    '''
    root = paths[0][-1]
    assert all(cp[-1] == root for cp in paths)
    g = nx.DiGraph()
    g.add_nodes_from({node for path in paths for node in path})
    # Edges point from a path element to its predecessor, i.e. away from
    # the shared root and toward the sources (the tree leaves).
    for path in paths:
        for parent, child in zip(path[1:], path[0:-1]):
            g.add_edge(parent, child)

    _compute_tree_height(g, root)

    return (g, root)
400 
401 
def _compute_tree_height(g, root):
    ''' Compute the heights of the tree for all nodes
    Height of leaves are 0

    Stores each node's height in g.node[<node>]["height"] as a side effect.
    '''
    def _get_height(root):
        # Recursive post-order walk: a node's height is 1 + max child height.
        children = list(g.successors(root))
        height = 0
        if children:
            child_heights = [_get_height(x) for x in children]
            height = max(child_heights) + 1
        # NOTE(review): 'g.node' was removed in networkx 2.4 (now 'g.nodes');
        # this targets older networkx -- TODO confirm pinned version.
        g.node[root]["height"] = height
        return height

    _get_height(root)
416 
417 
418 def _sort_tree_leaves(g, root):
419  ''' For each node, sort its child nodes based on the height of the nodes.
420  Return the leaf nodes of the tree after sorting.
421  '''
422  def _get_height(root):
423  return g.node[root]["height"]
424 
425  def _get_sorted_leaves(root):
426  children = list(g.successors(root))
427  if not children:
428  return [root]
429  child_heights = [_get_height(x) for x in children]
430  order = sorted(range(len(children)), key=lambda x: child_heights[x])
431  ret = []
432  for co in order:
433  cr = children[co]
434  ret += _get_sorted_leaves(cr)
435 
436  return ret
437 
438  return _get_sorted_leaves(root)
439 
440 
def topological_sort_traversal_longest_path(g):
    ''' The graph 'g' may contain several source nodes (nodes without incoming
    edge), which could be in any order and still be a valid
    topological sorting result. We would like to arrange these source nodes
    so that the average live spans of the computed blobs are shorter.
    The idea is to sort the source nodes based on the length of their path to
    the target node so that the one with longer path is used first.
    This is done by:
    - Add a single target node if there are multiple target nodes in 'g'.
    - Find the longest path between each source and the target node.
    - Convert the longest paths to a tree with the target node being the root
      and source nodes being the leaves.
    - Sort the nodes of the tree based on the height of the tree.
    '''
    gt = _add_single_target_ifneeded(g)
    source_nodes = _find_source_nodes(gt)
    lpaths = _get_longest_paths(gt, source_nodes)
    tree, root = _build_tree(list(viewvalues(lpaths)))
    sorted_sources = _sort_tree_leaves(tree, root)
    assert(sorted(sorted_sources) == sorted(source_nodes))

    if nx.__version__ < '2.0':
        # networkx 1.x accepts a preferred starting order directly.
        ret = nx.topological_sort(g, sorted_sources)
    else:
        # Manually making a sorted descendent list
        dependency_order = list(sorted_sources)
        seen_nodes = set(sorted_sources)
        for s in sorted_sources:
            desc = nx.descendants(g, s)
            for d in desc:
                if d not in seen_nodes:
                    seen_nodes.add(d)
                    dependency_order.append(d)
        # Higher key = earlier in dependency_order, steering the
        # lexicographical topological sort toward the preferred order.
        sort_key = dict((v, len(dependency_order) - i) for i, v in enumerate(dependency_order))
        ret = nx.algorithms.dag.lexicographical_topological_sort(
            g, key=lambda x: sort_key[x])
        ret = list(ret)
    assert(len(ret) == len(g.node))
    return ret
480 
481 
def topological_sort_traversal(g):
    ''' Return the nodes of 'g' in a topological order. '''
    order = nx.topological_sort(g)
    return list(order)
484 
485 
def compute_ranges(linearized_ops, blob_sizes=None):
    ''' Compute the live range of every blob in the linearized op list:
    'defined' is the first op index producing it, 'used' the last op index
    consuming it, 'size' its byte size from 'blob_sizes' (None if absent).
    '''
    if not blob_sizes:
        log.warning('Provide blob sizes to get more accurate assignments.')

    blobs = collections.defaultdict(
        lambda: LiveRange(defined=None, used=None, size=None))
    for i, op in enumerate(linearized_ops):
        for blob in op.input:
            prev_used = blobs[blob].used
            new_used = i if prev_used is None else max(prev_used, i)
            blob_size = blob_sizes[blob] if blob_sizes else None
            assert not blob_sizes or blob_size is not None
            blobs[blob] = blobs[blob]._replace(used=new_used, size=blob_size)
        for blob in op.output:
            prev_defined = blobs[blob].defined
            new_defined = i if prev_defined is None else min(prev_defined, i)
            blob_size = blob_sizes[blob] if blob_sizes else None
            assert not blob_sizes or blob_size is not None
            blobs[blob] = blobs[blob]._replace(
                defined=new_defined, size=blob_size)

    return blobs
515 
516 
def is_compatible(candidate_range, assignment, static_blobs):
    ''' True if 'candidate_range' can be appended to 'assignment', i.e. the
    candidate is first defined strictly after the assignment's last blob is
    last used, and that blob is not static. '''
    name, last_range = assignment[-1]
    if name in static_blobs:
        return False
    missing_bound = (candidate_range.defined is None or
                     last_range.defined is None or
                     last_range.used is None)
    if missing_bound:
        return False
    return candidate_range.defined > last_range.used
525 
526 
def compute_blob_assignments(assignments):
    ''' Map every blob in a multi-blob assignment to its canonical name,
    which is the last blob of that assignment. Singleton assignments are
    left out of the mapping. '''
    blob_assignments = {}
    for assignment in assignments:
        if len(assignment) == 1:
            continue
        canonical = assignment[-1][0]
        blob_assignments.update((blob, canonical) for blob, _ in assignment)
    return blob_assignments
536 
537 
538 def _get_max_size(assignment):
539  if not assignment:
540  return 0
541  ret = max([x[1].size for x in assignment])
542  ret = 0 if ret is None else ret
543  return ret
544 
545 
def get_memory_usage(assignments):
    ''' Total memory usage of 'assignments': each shared assignment costs
    its largest member blob. '''
    return sum(_get_max_size(assignment) for assignment in assignments)
551 
552 
def compute_assignments_greedy(ranges_sorted, init_assignments=None):
    ''' Greedily pack blob live ranges into shared assignments.
    Each blob joins the compatible assignment whose current max size is
    closest to the blob's own size; if none is compatible, a new
    assignment is opened. Blobs already present in 'init_assignments'
    are skipped. '''
    assignments = init_assignments or []
    visited = {name for assignment in assignments for name, _ in assignment}

    for name, range_ in ranges_sorted:
        if name in visited:
            continue
        best_idx = None
        min_dist = float("inf")
        candidate_size = range_.size or 0
        for idx, assignment in enumerate(assignments):
            if not is_compatible(range_, assignment, []):
                continue
            # Prefer the assignment with the closest size to reduce waste.
            dist = abs(_get_max_size(assignment) - candidate_size)
            if dist < min_dist:
                min_dist = dist
                best_idx = idx
        if best_idx is not None:
            assignments[best_idx].append((name, range_))
        else:
            assignments.append([(name, range_)])
    return assignments
577 
578 
579 def _get_count(assignments):
580  ''' Return number of blobs in assignments '''
581  if assignments:
582  return sum([len(x) for x in assignments])
583  return 0
584 
585 
def compute_assignments_dp(ranges_sorted, init_assignment, counter=None):
    ''' Compute assignment for blobs in 'ranges_sorted' on top of 'init_assignment'
    using dynamic programming + recursion.

    ranges_sorted: blobs sorted by 'used'
    init_assignment: assignment to start with, blobs in 'ranges_sorted' should
    not be used in 'init_assignment'
    counter: optional single-element list used as a shared call counter for
    progress logging.

    Using f(b, k, init) to represent the best assignment for blobs b[0:k]
    given initial assignment 'init', we have
        f(b, k, init) = f(b, j, init) +
                        find_best(b[j:k], f(b, j, init))
    where j is the index of the last best assignment that is independent of
    blob b[k - 1] (b[k - 1] is compatible with all assignments in
    f(b, j, init)), and find_best(b1, init1) gives the best assignment
    for blobs in 'b1' based on the initial assignment 'init1', and blobs
    b1[0:-1] should be incompatible with b1[-1]. f(b, len(b), []) gives
    the best assignment for blobs 'b'.

    For find_best(b, init), since b[0:-1] are not compatible with b[-1], we
    could reduce it to a smaller problem to find best assignment for b[0:-1]
    as
        find_best(b, init) = min {
            f(b[0:-1], len(b) - 1, init - x) + [x, b[-1]] for x in init, or
            f(b[0:-1], len(b) - 1, init) + [b[-1]]
        }
    where min{} gives the assignment with minimum memory usage.
    '''

    def _get_compatible_prev(candidate_range, best_assignments, cur_idx):
        ''' Find closest position k of best_assignments that is independent of
        candidate_range, i.e. candidate_range is compatible with all
        assignments in best_assignments[k].
        Return -1 if not found.
        '''
        def is_compatible_all(candidate_range, assignments):
            ''' Return True if compatible with every assignment in 'assignments'. '''
            return all([is_compatible(candidate_range[1], x, []) for x in assignments])

        ii = cur_idx - 1
        while ii >= 0:
            cba = best_assignments[ii]
            if is_compatible_all(candidate_range, cba):
                return ii
            ii -= 1
        return -1

    def _find_best(ranges, init_assignment, prev_best_assignment, counter):
        ''' Find the best assignment for blobs 'ranges' given an initialized
        assignment 'init_assignment'.

        Blobs in ranges[0:-1] should be incompatible with blob range[-1].
        'prev_best_assignment': best assignment for blobs in ranges[:-1]

        By assigning ranges[-1] to each assignment k in 'init_assignment' or
        in a new assignment, the problem becomes a smaller problem to find
        the best assignment for ranges[0:-1] given the initial assignment
        init_assignment[0:k, (k+1):-1].
        '''
        # Blob to check
        find_range = ranges[-1]
        # Blobs in ranges[0:-1] are incompatible with ranges[-1] so that we can
        # reduce it to a smaller problem.
        assert all(not is_compatible(x[1], [find_range], []) for x in ranges[0:-1])

        sz = len(init_assignment)
        best_candidates = []
        # Try to assign 'find_range' to each assignment in init_assignment
        for ii in range(sz):
            if not is_compatible(find_range[1], init_assignment[ii], []):
                continue
            cur_best = copy.deepcopy(init_assignment)
            cur_best[ii].append(find_range)
            if len(ranges) > 1:
                cur_best_tmp = [x for i, x in enumerate(cur_best) if i != ii]
                # reduce to a smaller dp problem
                cur_best_tmp = compute_assignments_dp(
                    ranges[:-1], cur_best_tmp, counter)
                cur_best = cur_best_tmp + [cur_best[ii]]
            best_candidates.append(cur_best)
        # Try to put 'find_range' in a new assignment
        best_candidates.append(prev_best_assignment + [[find_range]])

        # The winner is the candidate with the smallest total memory usage.
        ret = min(best_candidates, key=lambda x: get_memory_usage(x))
        return ret

    if not counter:
        counter = [0]
    counter[0] += 1

    # Periodic progress logging for very large nets.
    if counter and counter[0] % 5000 == 0:
        rs = [ranges_sorted[0][1].defined, ranges_sorted[-1][1].used]
        log.info('Finding assignments {} ({} -> {})...'.format(
            counter[0], rs[0], rs[1]))

    init_assignment = init_assignment or []
    # best_assignments[k]: best assignments for first k blobs ranges_sorted[0:(k+1)]
    best_assignments = []
    # Find best assignment for blobs ranges_sorted[0:ii]
    for ii, cur_range in enumerate(ranges_sorted):
        # closest best_assignment that is independent of ranges_sorted[ii]
        prev_idx = _get_compatible_prev(cur_range, best_assignments, ii)
        prev_best = copy.deepcopy(init_assignment) if prev_idx < 0 else \
            copy.deepcopy(best_assignments[prev_idx])
        # Need to find best assignment for blobs in 'ranges_part'
        ranges_part = ranges_sorted[(prev_idx + 1):(ii + 1)]
        cur_best = _find_best(
            ranges_part, prev_best,
            best_assignments[-1] if best_assignments else init_assignment,
            counter)
        assert _get_count(cur_best) == _get_count(prev_best) + len(ranges_part)
        best_assignments.append(copy.deepcopy(cur_best))

    assert len(best_assignments) == len(ranges_sorted)

    best = best_assignments[-1]

    return best
704 
705 
def get_updated_ranges(ranges, max_live=None):
    ''' Normalize None fields of (name, LiveRange) pairs:
    Set LiveRange.defined = -1 if it is None
    Set LiveRange.used = max_live if it is None
    Set LiveRange.size = 1 if it is None

    When 'max_live' is not given, it is one past the largest known
    last-use index in 'ranges'.
    '''

    def _get_max_live(ranges):
        # Bug fix: previously filtered on truthiness ('if x[1].used'), which
        # wrongly dropped a valid last use at op index 0 and could raise
        # ValueError (empty max) when index 0 was the only known use.
        max_live = max(x[1].used for x in ranges if x[1].used is not None) + 1
        return max_live

    def _update_range(x, max_live, size):
        cx = x
        if x[1].defined is None:
            cx = (cx[0], cx[1]._replace(defined=-1))
        if x[1].used is None:
            cx = (cx[0], cx[1]._replace(used=max_live))
        if x[1].size is None:
            cx = (cx[0], cx[1]._replace(size=size))
        return cx

    if max_live is None:
        max_live = _get_max_live(ranges)
    ranges = [_update_range(x, max_live, 1) for x in ranges]

    return ranges
731 
732 
def compute_assignments(ranges, static_blobs, algo):
    '''
    Compute blob-sharing assignments for the live ranges in 'ranges'.

    ranges: dict of blob name -> LiveRange.
    static_blobs: blobs that must not be shared; each gets its own
    singleton assignment appended to the result.
    algo: Method used to find assignments (AssignmentAlgorithm.GREEDY or
    AssignmentAlgorithm.DYNAMIC_PROGRAMMING).
    AssignmentAlgorithm.DYNAMIC_PROGRAMMING gives optimal solution at the
    cost of more computation.
    AssignmentAlgorithm.GREEDY may be better in the case 'blob_sizes' is
    not provided.

    Raises ValueError for an unrecognized 'algo'.
    '''

    # Sort the ranges based on when they are last used.
    # If LiveRange.used is None, then the blob is never used and could
    # be consumed externally. Sort these to the end of the list as opposed
    # to the beginning so that they can be shared as well.
    ranges = sorted(
        viewitems(ranges),
        key=lambda p: (p[1].used is None, p[1].used),
    )
    # Update None values
    ranges = get_updated_ranges(ranges)

    # Sharable blobs
    ranges_sharable = [x for x in ranges if x[0] not in static_blobs]
    # Static blobs, not sharable
    ranges_static = [x for x in ranges if x[0] in static_blobs]

    log.info("Total sharable blobs {}".format(len(ranges_sharable)))

    if algo == AssignmentAlgorithm.DYNAMIC_PROGRAMMING:
        best_assignment = compute_assignments_dp(ranges_sharable, [])
    elif algo == AssignmentAlgorithm.GREEDY:
        best_assignment = compute_assignments_greedy(ranges_sharable, [])
    else:
        # Bug fix: this used to be `assert "Invalid algo name {}"...`, which
        # asserts a non-empty (truthy) string and therefore never failed.
        raise ValueError("Invalid algo name {}".format(algo))
    best_assignment += [[x] for x in ranges_static]

    # verify_assignments(best_assignment)

    return best_assignment
773 
774 
def verify_assignments(assignments):
    ''' Check that blobs sharing an assignment have strictly ordered,
    non-overlapping live ranges; raises AssertionError otherwise. '''
    for shared in assignments:
        for earlier, later in zip(shared, shared[1:]):
            assert earlier[1].used < later[1].defined
779 
780 
def compute_interference_graph(ops):
    ''' Build a DAG over op indices: edge i -> j (with i < j) exists when an
    output of ops[i] is consumed as an input of ops[j]. The connecting blob
    names are stored in the edge attribute 'deps'.
    '''
    g = nx.DiGraph()
    for i, op in enumerate(ops):
        g.add_node(i, op=op)
    # O(n^2) pairwise scan over op pairs; acceptable for typical net sizes.
    for i, parent_op in enumerate(ops):
        for j, child_op in enumerate(ops):
            if i >= j:
                continue
            if any(output in child_op.input for output in parent_op.output):
                deps = set(child_op.input).intersection(parent_op.output)
                g.add_edge(i, j, deps=deps)
    # Edges always go from lower to higher index, so this must hold.
    assert nx.is_directed_acyclic_graph(g), child_op
    return g
794 
795 
# Result bundle of optimize_interference(): the rewritten net plus the
# blob-sharing decisions ('assignments') and renaming map that produced it.
Optimization = collections.namedtuple(
    'Optimization', ['net', 'assignments', 'blob_assignments'])
798 
799 
def apply_assignments(net, blob_assignments):
    ''' Rename blobs of 'net' in place according to 'blob_assignments',
    descending into the step nets of RecurrentNetwork-style ops. '''
    def canonical_name(blob):
        return blob_assignments.get(blob, blob)

    for op in net.op:
        # Descend into subnets of the recurrent network
        if op.type.startswith('RecurrentNetwork'):
            apply_recurrent_blob_assignments(op, blob_assignments, canonical_name)

        for idx, blob in enumerate(op.input):
            op.input[idx] = canonical_name(blob)
        for idx, blob in enumerate(op.output):
            op.output[idx] = canonical_name(blob)
815 
816 
817 
def apply_recurrent_blob_assignments(op, blob_assignments, canonical_name):
    ''' Apply blob renamings to the step nets embedded in a recurrent op,
    and record each applied renaming as a "<blob>.rename" string argument
    on the op so the runtime can map outer blobs to renamed inner ones. '''
    log.debug("Applying assignments to recurrent op: {}".format(op.type))
    step_args = [a for a in op.arg if a.name.endswith("step_net")]
    for step_arg in step_args:
        # Recursively rename inside the step net itself.
        apply_assignments(step_arg.n, blob_assignments)
        for i, einp in enumerate(step_arg.n.external_input):
            if einp in blob_assignments:
                step_arg.n.external_input[i] = canonical_name(einp)
    # Store renamings
    for blob, renamed in viewitems(blob_assignments):
        if blob in list(op.input) + list(op.output):
            a = caffe2_pb2.Argument()
            a.name = blob + ".rename"
            a.s = str(renamed).encode("ascii")
            op.arg.extend([a])
833 
834 
class AssignmentAlgorithm(enum.Enum):
    # Fast heuristic packing; see compute_assignments_greedy().
    GREEDY = 0
    # Optimal but more expensive packing; see compute_assignments_dp().
    DYNAMIC_PROGRAMMING = 1
838 
839 
def optimize_inference_fast(net, static_blobs):
    ''' Optimize an inference NetDef via the C++ memonger; blobs listed in
    'static_blobs' are never shared. Returns a new NetDef. '''
    optim = caffe2_pb2.NetDef()
    optim_str = C.memonger_optimize_inference_net(
        net.SerializeToString(),
        [str(s).encode('utf-8') for s in static_blobs]
    )
    optim.ParseFromString(optim_str)
    return optim
848 
849 
def optimize_interference(net, static_blobs,
                          ordering_function=topological_sort_traversal,
                          blob_sizes=None,
                          algo=AssignmentAlgorithm.GREEDY):
    """
    ordering_function: topological_sort_traversal or
    topological_sort_traversal_longest_path.
    topological_sort_traversal_longest_path gives better
    results but needs a bit more computation.
    algo: Method used to find assignments (AssignmentAlgorithm.GREEDY or
    AssignmentAlgorithm.DYNAMIC_PROGRAMMING).
    AssignmentAlgorithm.DYNAMIC_PROGRAMMING gives optimal solution at the
    cost of more computation.
    AssignmentAlgorithm.GREEDY may be better in the case 'blob_sizes' is
    not provided.

    Returns an Optimization namedtuple (net, assignments, blob_assignments).
    """

    """
    1) Use a BFS traversal of the execution graph to generate an
       ordering of the node executions.
    2) Generate use-def ranges for each `blob` in the BFS traversal
       order.
    3) Assign blobs to `canonical blobs`
    4) Rename blobs to canonical blobs
    """

    # Work on a copy; the caller's NetDef is left untouched.
    net = copy.deepcopy(net)
    g = compute_interference_graph(net.op)
    ordering = ordering_function(g)
    linearized_ops = [net.op[i] for i in ordering]

    # Reorder ops in net based on the computed linearlized order.
    # If the graph has multiple topological orderings and if the NetDef's
    # ordering differs from the order used to compute ranges, then the
    # runtime might end up overwriting blobs before they are used.
    del net.op[:]
    net.op.extend(linearized_ops)

    ranges = compute_ranges(linearized_ops, blob_sizes)
    assignments = compute_assignments(ranges, static_blobs, algo)
    blob_assignments = compute_blob_assignments(assignments)
    apply_assignments(net, blob_assignments)
    return Optimization(
        net=net,
        blob_assignments=blob_assignments,
        assignments=assignments)
896 
897 
def verify_inplace_blobs(net_a, net_b):
    """
    Verifies that net_a and net_b have the same in-place blob assignments.
    Particularly, that memonger did not add an in-place assignment when that
    did not exist before.
    """
    def get_inplaces(op):
        # Pairs [input index, output index] where an input blob is also
        # written as an output of the same op.
        out = list(op.output)
        return [[j, out.index(inp)]
                for j, inp in enumerate(op.input) if inp in out]

    for op_a, op_b in zip(net_a.op, net_b.op):
        if op_a.type != op_b.type:
            return False
        if get_inplaces(op_a) != get_inplaces(op_b):
            return False
    return True
918 
919 
def verify_graph_equality(net_a, net_b):
    """
    Determines if the execution of two graphs are identical.
    That is, all inputs blobs are mapped to the same output blobs
    for each operator in their respective positions.

    This is meant to check the output of memonger with the original graph.
    It assumes that the nets have same external input and output.

    O(E) runtime + O(1) amortized cost to hash for python dict
    """

    def parent_list(ops):
        # parents[i] holds the index of the op that most recently produced
        # each input blob of ops[i]; this captures dataflow independent of
        # the actual blob names.
        parents = [[] for _ in ops]
        edge_owner = {}
        for i, op in enumerate(ops):
            for blob in op.input:
                owner = edge_owner.get(blob)
                if owner is not None:
                    parents[i].append(owner)
            for blob in op.output:
                edge_owner[blob] = i
        return parents

    # Operator wise equality checks
    if len(net_a.op) != len(net_b.op):
        return False
    for op_a, op_b in zip(net_a.op, net_b.op):
        same_op = (op_a.type == op_b.type and
                   op_a.device_option == op_b.device_option and
                   op_a.engine == op_b.engine)
        if not same_op:
            return False

    # Print debug info
    parent_list_a = parent_list(net_a.op)
    parent_list_b = parent_list(net_b.op)
    if parent_list_a != parent_list_b:
        for j, (a, b) in enumerate(zip(parent_list_a, parent_list_b)):
            if a != b:
                print("Difference {} vs {} \n {}".format(
                    j, net_a.op[j], net_b.op[j]))
                print("Parents: {} vs {}".format(a, b))

    # Net wise equality check
    return parent_list_a == parent_list_b
969 
970 
# Memory totals reported by compute_statistics(): bytes before and after
# blob sharing.
Statistics = collections.namedtuple(
    'Statistics', ['baseline_nbytes', 'optimized_nbytes'])
973 
974 
def blob_nbytes(blob):
    ''' Size in bytes of 'blob' in the current workspace, or 0 when the
    blob cannot be fetched. '''
    try:
        return workspace.FetchBlob(blob).nbytes
    except Exception:
        log.warning('Error when fetching blob {}'.format(blob))
        return 0
982 
983 
def compute_statistics(assignments):
    ''' Compute memory Statistics for 'assignments': baseline is the sum of
    all blob sizes (fetched from the workspace), optimized is the sum of
    the largest blob of each shared assignment. '''
    blob_bytes = {
        blob: blob_nbytes(blob) for assignment in assignments
        for (blob, _) in assignment}
    baseline_nbytes = sum(viewvalues(blob_bytes))
    optimized_nbytes = sum(
        max(blob_bytes[blob] for (blob, _) in assignment)
        for assignment in assignments)
    return Statistics(
        baseline_nbytes=baseline_nbytes,
        optimized_nbytes=optimized_nbytes)
995 
996 
def collect_blob_sizes(net):
    ''' Return a dict mapping every blob referenced by 'net' to its size
    in bytes as fetched from the current workspace. '''
    sizes = {}
    for op in net.op:
        for blob in list(op.input) + list(op.output):
            sizes[blob] = blob_nbytes(blob)

    return sizes