from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import collections
import copy
import enum
import logging
import time

import networkx as nx
import numpy as np

from caffe2.proto import caffe2_pb2
from caffe2.python import workspace, core
from future.utils import viewitems, viewvalues

# Memonger's graph-recycling routines are implemented in the C extension
import caffe2.python._import_c_extension as C

log = logging.getLogger("memonger")
log.setLevel(logging.INFO)

LiveRange = collections.namedtuple('LiveRange', ["defined", "used", "size"])


def share_grad_blobs(
    net,
    losses,
    param_grads,
    namescope,
    dont_share_blobs=None,
    share_activations=False,
    blob_shapes=None,
):
    '''
    Implements similar optimization as Torch's shareGradInput():
    for the gradients that are passed between layers, share blobs between
    operators when possible. This yields significant memory savings with
    deep networks.

    Returns an optimized protobuf (assign to net._net)
    '''
    def is_grad_blob(b):
        name = str(b)
        return name.endswith("_grad") and (name.startswith(namescope) or
            name.startswith("_" + namescope)) and name not in param_grads

    def is_grad_op(op):
        for b in list(op.input) + list(op.output):
            if is_grad_blob(b):
                return True
        return False

    log.warn("NOTE: Executing memonger to optimize gradient memory")

    if namescope != "" and not namescope.endswith("/"):
        namescope += "/"

    netproto = copy.deepcopy(net.Proto())
    activations = []
    external_output = set(net.Proto().external_output)

    # Collect activation blobs: outputs whose matching weight blob
    # ("<name>_w") also feeds the op, excluding external outputs
    for op in net.Proto().op:
        for b in op.output:
            if b + "_w" in op.input and b not in external_output:
                activations.append(b)

    # Drop the last two activations, as they are typically accessed externally
    activations = set(activations[:-2])

    # Indices of ops that touch gradient blobs
    grad_op_indices = []
    for idx, op in enumerate(netproto.op):
        if is_grad_op(op):
            grad_op_indices.append(idx)

    # Blobs that are candidates for sharing
    shared_blobs = set()
    for op in net.Proto().op:
        for b in list(op.input) + list(op.output):
            if is_grad_blob(b) or (share_activations and b in activations):
                shared_blobs.add(b)

    start_time = time.time()
    optim_str = C.memonger_compute_blob_recycling_for_dag(
        netproto.SerializeToString(),
        [str(s).encode('utf-8') for s in losses],
        grad_op_indices,
        set(str(s).encode('utf-8') for s in shared_blobs),
        namescope.encode('utf-8'),
        set() if dont_share_blobs is None else dont_share_blobs,
        {} if blob_shapes is None else blob_shapes
    )

    log.info("Memonger memory optimization took {} secs".format(
        time.time() - start_time),
    )

    optim = caffe2_pb2.NetDef()
    optim.ParseFromString(optim_str)

    assert verify_graph_equality(net.Proto(), optim), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), optim), \
        "Inplace assignments differ in memonger net."
    return optim

def optimize_inference_for_dag(net, input_blobs, namescope=""):
    netproto = copy.deepcopy(net.Proto())
    external_input = set(net.Proto().external_input)
    external_output = set(net.Proto().external_output)

    def is_activation_blob(b):
        return b not in external_input and b not in external_output

    activation_blobs = set()
    seen_as_output = set()
    ops = list(net.Proto().op)
    op_indices = [index for index, op in enumerate(net.Proto().op)]

    # Sanity check: every activation used as an input must have been produced
    # by an earlier op, and gradient ops are not allowed in an inference net.
    for op in ops:
        for b in op.input:
            if is_activation_blob(b):
                activation_blobs.add(b)
                if b not in seen_as_output:
                    assert False, "{} not in external input".format(b)
        for b in op.output:
            if is_activation_blob(b):
                activation_blobs.add(b)
        seen_as_output = seen_as_output.union(set(op.output))
        assert not op.is_gradient_op, \
            "You can only pass inference-only nets to optimize_inference_for_dag"

    start_time = time.time()
    optim_str = C.memonger_compute_blob_recycling_for_dag(
        netproto.SerializeToString(),
        [str(s).encode('utf-8') for s in input_blobs],
        op_indices,
        set(str(s).encode('utf-8') for s in activation_blobs),
        namescope.encode('utf-8'),
        set(),
        {}
    )

    log.info("Memonger memory optimization took {} secs".format(
        time.time() - start_time),
    )

    optim = caffe2_pb2.NetDef()
    optim.ParseFromString(optim_str)

    assert verify_graph_equality(net.Proto(), optim), \
        "Memonger graph is not equal to original."
    assert verify_inplace_blobs(net.Proto(), optim), \
        "Inplace assignments differ in memonger net."
    return optim


def estimate_memory_usage(protos, shapes, types, devicescope):
    '''
    Estimate memory usage of a model. This is an estimate because
    we assume a single threaded execution and miss some internal
    memory usage of operators. Only estimates the memory for a given
    device scope.

    Also, currently it does not handle correctly if blob sizes vary
    during execution, as it uses only the final blob size.

    Returns (total, highwater, by op type) memory allocation in bytes.
    '''
    sizeofs = {
        caffe2_pb2.TensorProto.DOUBLE: 8,
        caffe2_pb2.TensorProto.FLOAT: 4,
        caffe2_pb2.TensorProto.FLOAT16: 2,
        caffe2_pb2.TensorProto.INT32: 4,
        caffe2_pb2.TensorProto.INT8: 1,
        caffe2_pb2.TensorProto.UINT8: 1,
        caffe2_pb2.TensorProto.UINT16: 2,
        caffe2_pb2.TensorProto.INT16: 2,
        caffe2_pb2.TensorProto.BOOL: 1,
        caffe2_pb2.TensorProto.INT64: 8,
    }

    def split_net(proto):
        # Keep only ops in the requested device scope (plus Free/Alias ops)
        ops = [op for op in proto.op if
               op.device_option == devicescope or op.type in {"Free", "Alias"}]
        del proto.op[:]
        proto.op.extend(ops)
        return proto

    def num_bytes(blob):
        if blob not in shapes or blob not in types:
            log.warning("Unknown blob encountered: {}".format(blob))
            return 0
        sizeof = sizeofs[types[blob]]
        return sizeof * np.prod(shapes[blob])

    protos = [split_net(proto) for proto in protos]
    allocs_by_ops = collections.defaultdict(lambda: 0)

    # Simulate a single-threaded execution and track allocations
    current_allocated = 0
    max_allocated = 0
    total_allocated = 0
    allocated = set()
    for proto in protos:
        for op in proto.op:
            if op.type == "Free" or op.type == "Alias":
                for o in op.output:
                    if o in allocated:
                        current_allocated -= num_bytes(o)
                        allocated.remove(o)
            else:
                for output in op.output:
                    if output not in allocated:
                        nbytes = num_bytes(output)
                        total_allocated += nbytes
                        current_allocated += nbytes
                        max_allocated = max(max_allocated, current_allocated)
                        allocated.add(output)
                        allocs_by_ops[op.type] += nbytes

    return (total_allocated, max_allocated, allocs_by_ops)
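
# Worked example of the accounting above (hypothetical blobs): a FLOAT blob of
# shape (64, 128) contributes 4 * 64 * 128 = 32768 bytes and an INT8 blob of
# shape (1024,) contributes 1024 bytes, so together they add 33792 bytes to
# total_allocated; the highwater mark (max_allocated) counts them together
# only if both are live at the same time.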


def release_blobs_when_used(netproto, dont_free_blobs, selector_fun=None):
    '''
    Insert Free-ops after a blob has been used the last time, so that its
    memory can be reclaimed. Use this only with efficient caching memory
    managers (such as CUB, --caffe2_cuda_memory_pool=cub).

    Blobs used with Alias op won't be freed.

    @dont_free_blobs: a set of blobs that should not be freed
    @selector_fun: optional lambda that returns True if a blob name
    can be released. Use for easy special filtering, like
    excluding blobs with "loss" in the name.

    Returns a new protobuf. To use with a model, use:
        model.net._net = memonger.release_blobs_when_used(..)
    '''
    input_blobs = set()
    can_release = set()
    alias_blobs = set()
    netproto = copy.deepcopy(netproto)

    for op in netproto.op:
        if op.type == 'Alias':
            alias_blobs.add(op.input[0])
        for inp in op.input:
            input_blobs.add(inp)
        for outp in op.output:
            if outp not in input_blobs:
                if selector_fun is None or selector_fun(outp):
                    can_release.add(outp)

    # Remove blobs that are never consumed, external outputs, explicitly
    # protected blobs and blobs used by Alias ops
    can_release = can_release - set(netproto.external_output)
    can_release = can_release.intersection(input_blobs)
    can_release = can_release - dont_free_blobs
    can_release = can_release - alias_blobs

    ops = list(netproto.op)

    # Walk from the last op to the first and insert a Free op right after
    # each blob's last use
    for j in reversed(range(0, len(netproto.op))):
        op = netproto.op[j]
        for inp in op.input:
            if inp in can_release:
                can_release.remove(inp)
                ops.insert(j + 1, core.CreateOperator("Free", [inp], [inp]))

    del netproto.op[:]
    netproto.op.extend(ops)
    return netproto
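

def _example_release_blobs_when_used():
    # Illustrative sketch only (hypothetical helper, not called by this
    # module): "y" is last consumed by the second Relu, so a Free op for "y"
    # is inserted right after it; "z" is never consumed inside the net and is
    # therefore left alone.
    toy = core.Net("memonger_toy")
    toy.Relu(["x"], ["y"])
    toy.Relu(["y"], ["z"])
    return release_blobs_when_used(toy.Proto(), dont_free_blobs=set())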


def _find_source_nodes(g):
    ''' Return nodes without predecessors '''
    ret = []
    for cn in g:
        cur_pred = list(g.predecessors(cn))
        if not cur_pred:
            ret.append(cn)
    return ret


def _find_target_nodes(g):
    ''' Return nodes without successors '''
    ret = []
    for cn in g:
        cur_succ = list(g.successors(cn))
        if not cur_succ:
            ret.append(cn)
    return ret


def _add_single_target_ifneeded(g):
    targets = _find_target_nodes(g)
    assert len(targets) >= 1
    if len(targets) == 1:
        return g
    ret = copy.deepcopy(g)

    def _next_available_idx(g):
        # Smallest integer id not yet used by any node
        ret = -1
        for cn in g:
            if cn > ret:
                ret = cn
        ret += 1
        return ret

    target_node_idx = _next_available_idx(g)
    ret.add_node(target_node_idx)
    for cn in targets:
        ret.add_edge(cn, target_node_idx)

    return ret


def _get_path(pred_list, dist_list):
    ''' Get the path from nx.bellman_ford()'s output '''

    # Distances are negative (weight = -1), so the node farthest from the
    # source has the smallest distance and is the target of the longest path
    assert all(dist_list[x] <= 0 for x in dist_list)
    target = min(dist_list, key=lambda x: dist_list[x])

    ret = []
    cur = target
    while cur is not None:
        ret.append(cur)
        # pred can be a plain node (networkx 1.x) or a list (networkx 2.x)
        try:
            cur = pred_list[cur][0]
        except TypeError:
            cur = pred_list[cur]

    return list(reversed(ret))


def _get_longest_paths(g, source_nodes):
    ''' Get the longest path for nodes in 'source_nodes'.
        Found with bellman_ford() by setting weight = -1
    '''
    ng = copy.deepcopy(g)
    for u, v in ng.edges():
        ng[u][v]["weight"] = -1

    ret = {}
    for cn in source_nodes:
        pred, dist = nx.bellman_ford(ng, cn, weight="weight")
        path = _get_path(pred, dist)
        assert path[0] == cn
        assert len(path) - 1 == -dist[path[-1]]
        ret[cn] = path

    return ret


def _build_tree(paths):
    ''' Build a tree for given paths based on common elements.
        Last elements of all paths are the same, which is the root of the tree.
    '''
    assert all(cp[-1] == paths[0][-1] for cp in paths)
    g = nx.DiGraph()
    node_set = {y for x in paths for y in x}
    g.add_nodes_from(node_set)
    for cp in paths:
        for ce in zip(cp[0:-1], cp[1:]):
            g.add_edge(ce[1], ce[0])

    root = paths[0][-1]
    _compute_tree_height(g, root)

    return (g, root)


def _compute_tree_height(g, root):
    ''' Compute the heights of the tree for all nodes
        Height of leaves are 0
    '''
    def _get_height(root):
        children = list(g.successors(root))
        height = 0
        if children:
            child_heights = [_get_height(x) for x in children]
            height = max(child_heights) + 1
        g.node[root]["height"] = height
        return height

    _get_height(root)


def _sort_tree_leaves(g, root):
    ''' For each node, sort its child nodes based on the height of the nodes.
        Return the leaf nodes of the tree after sorting.
    '''
    def _get_height(root):
        return g.node[root]["height"]

    def _get_sorted_leaves(root):
        children = list(g.successors(root))
        if len(children) == 0:
            return [root]
        child_heights = [_get_height(x) for x in children]
        order = sorted(range(len(children)), key=lambda x: child_heights[x])
        ret = []
        for co in order:
            cr = children[co]
            ret += _get_sorted_leaves(cr)

        return ret

    return _get_sorted_leaves(root)


def topological_sort_traversal_longest_path(g):
    ''' The graph 'g' may contain several source nodes (nodes without incoming
        edge), which could be in any order and still be a valid
        topological sorting result. We would like to arrange these source nodes
        so that the average live spans of the computed blobs are shorter.
        The idea is to sort the source nodes based on the length of their path to
        the target node so that the one with longer path is used first.
        This is done by:
        - Add a single target node if there are multiple target nodes in 'g'.
        - Find the longest path between each source and the target node.
        - Convert the longest paths to a tree with the target node being the root
          and source nodes being the leaves.
        - Sort the nodes of the tree based on the height of the tree.
    '''
    gt = _add_single_target_ifneeded(g)
    source_nodes = _find_source_nodes(gt)
    lpaths = _get_longest_paths(gt, source_nodes)
    tree, root = _build_tree(list(viewvalues(lpaths)))
    sorted_sources = _sort_tree_leaves(tree, root)
    assert(sorted(sorted_sources) == sorted(source_nodes))

    if nx.__version__ < '2.0':
        ret = nx.topological_sort(g, sorted_sources)
    else:
        # Manually build a sorted descendant order for networkx >= 2.0
        dependency_order = list(sorted_sources)
        seen_nodes = set(sorted_sources)
        for s in sorted_sources:
            desc = nx.descendants(g, s)
            for d in desc:
                if d not in seen_nodes:
                    seen_nodes.add(d)
                    dependency_order.append(d)
        sort_key = dict((v, len(dependency_order) - i)
                        for i, v in enumerate(dependency_order))
        ret = nx.algorithms.dag.lexicographical_topological_sort(
            g, key=lambda x: sort_key[x])
        ret = list(ret)

    assert(len(ret) == len(g.node))
    return ret
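

def _example_longest_path_traversal():
    # Illustrative sketch only (hypothetical helper, not called by this
    # module; assumes a networkx version that still provides nx.bellman_ford,
    # which the helpers above rely on): a small DAG with two source nodes
    # (0 and 1) and one sink (3). The result is a topological order of all
    # four nodes in which the source ordering reflects each source's longest
    # path to the sink.
    g = nx.DiGraph()
    g.add_edges_from([(0, 2), (2, 3), (1, 3)])
    order = list(topological_sort_traversal_longest_path(g))
    assert sorted(order) == [0, 1, 2, 3]
    return order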


def topological_sort_traversal(g):
    return list(nx.topological_sort(g))


def compute_ranges(linearized_ops, blob_sizes=None):
    if not blob_sizes:
        log.warning('Provide blob sizes to get more accurate assignments.')

    blobs = collections.defaultdict(
        lambda: LiveRange(defined=None, used=None, size=None))
    for i, op in enumerate(linearized_ops):
        for blob in op.input:
            used = blobs[blob].used
            if used is None:
                used = i
            else:
                used = max(used, i)
            blobs[blob] = blobs[blob]._replace(used=used)
            blob_size = blob_sizes[blob] if blob_sizes else None
            assert not blob_sizes or blob_size is not None
            blobs[blob] = blobs[blob]._replace(size=blob_size)
        for blob in op.output:
            defined = blobs[blob].defined
            if defined is None:
                defined = i
            else:
                defined = min(defined, i)
            blobs[blob] = blobs[blob]._replace(defined=defined)
            blob_size = blob_sizes[blob] if blob_sizes else None
            assert not blob_sizes or blob_size is not None
            blobs[blob] = blobs[blob]._replace(size=blob_size)

    return blobs


def is_compatible(candidate_range, assignment, static_blobs):
    (name, range_) = assignment[-1]
    if name in static_blobs:
        return False
    if candidate_range.defined is None or range_.defined is None \
            or range_.used is None:
        return False
    return candidate_range.defined > range_.used


def compute_blob_assignments(assignments):
    blob_assignments = {}
    for assignment in assignments:
        if len(assignment) == 1:
            continue
        last_blob, _ = assignment[-1]
        for (blob, _) in assignment:
            blob_assignments[blob] = last_blob
    return blob_assignments
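

def _example_compute_blob_assignments():
    # Illustrative sketch only (hypothetical helper, not called by this
    # module): every blob in a shared assignment is renamed to the group's
    # last (canonical) blob, here "c".
    shared = [
        ("a", LiveRange(defined=0, used=1, size=None)),
        ("c", LiveRange(defined=2, used=3, size=None)),
    ]
    assert compute_blob_assignments([shared]) == {"a": "c", "c": "c"}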


def _get_max_size(assignment):
    if not assignment:
        return 0
    ret = max([x[1].size for x in assignment])
    ret = 0 if ret is None else ret
    return ret


def get_memory_usage(assignments):
    ret = 0
    for cur in assignments:
        ret += _get_max_size(cur)
    return ret


def compute_assignments_greedy(ranges_sorted, init_assignments=None):
    assignments = init_assignments or []
    visited = {y[0] for x in assignments for y in x}

    for (name, range_) in ranges_sorted:
        if name in visited:
            continue
        assigned = False
        best_assignment = 0
        min_dist = float("inf")
        candidate_size = range_.size or 0
        for idx, assignment in enumerate(assignments):
            if is_compatible(range_, assignment, []):
                assigned = True
                dist = abs(_get_max_size(assignment) - candidate_size)
                if dist < min_dist:
                    min_dist = dist
                    best_assignment = idx
        if assigned:
            assignment = assignments[best_assignment]
            assignment.append((name, range_))
        else:
            assignments.append([(name, range_)])
    return assignments
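

def _example_compute_assignments_greedy():
    # Illustrative sketch only (hypothetical helper, not called by this
    # module): "b" becomes live only after "a" is dead, so the greedy pass
    # appends it to the same assignment and the two blobs share memory.
    ranges_sorted = [
        ("a", LiveRange(defined=0, used=1, size=4)),
        ("b", LiveRange(defined=2, used=3, size=4)),
    ]
    assignments = compute_assignments_greedy(ranges_sorted)
    assert len(assignments) == 1 and len(assignments[0]) == 2
    return assignments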


def _get_count(assignments):
    ''' Return number of blobs in assignments '''
    if assignments:
        return sum([len(x) for x in assignments])
    return 0


def compute_assignments_dp(ranges_sorted, init_assignment, counter=None):
    ''' Compute assignment for blobs in 'ranges_sorted' on top of 'init_assignment'
        using dynamic programming + recursion.

        ranges_sorted: blobs sorted by 'used'
        init_assignment: assignment to start with, blobs in 'ranges_sorted' should
            not be used in 'init_assignment'

        Using f(b, k, init) to represent the best assignment for blobs b[0:k]
        given initial assignment 'init', we have
            f(b, k, init) = f(b, j, init) +
                            find_best(b[j:k], f(b, j, init))
        where j is the index of the last best assignment that is independent of
        blob b[k - 1] (b[k - 1] is compatible with all assignments in
        f(b, j, init)), and find_best(b1, init1) gives the best assignment
        for blobs in 'b1' based on the initial assignment 'init1', and blobs
        b1[0:-1] should be incompatible with b1[-1]. f(b, len(b), []) gives
        the best assignment for blobs 'b'.

        For find_best(b, init), since b[0:-1] are not compatible with b[-1], we
        could reduce it to a smaller problem to find the best assignment for b[0:-1]:
            find_best(b, init) = min {
                f(b[0:-1], len(b) - 1, init - x) + [x, b[-1]] for x in init, or
                f(b[0:-1], len(b) - 1, init) + [b[-1]]
            }
        where min{} gives the assignment with minimum memory usage.
    '''

    def _get_compatible_prev(candidate_range, best_assignments, cur_idx):
        ''' Find the closest position k in best_assignments such that
            candidate_range is compatible with all assignments in
            best_assignments[k] (i.e. best_assignments[k] is independent of
            candidate_range). Return -1 if no such position exists.
        '''
        def is_compatible_all(candidate_range, assignments):
            ''' Return True if compatible with all assignments in 'assignments' '''
            return all([is_compatible(candidate_range[1], x, [])
                        for x in assignments])

        ii = cur_idx - 1
        while ii >= 0:
            cba = best_assignments[ii]
            if is_compatible_all(candidate_range, cba):
                return ii
            ii -= 1
        return -1

    def _find_best(ranges, init_assignment, prev_best_assignment, counter):
        ''' Find the best assignment for blobs 'ranges' given an initialized
            assignment 'init_assignment'.

            Blobs in ranges[0:-1] should be incompatible with blob range[-1].
            'prev_best_assignment': best assignment for blobs in ranges[:-1]

            By assigning ranges[-1] to each assignment k in 'init_assignment' or
            in a new assignment, the problem becomes a smaller problem to find
            the best assignment for ranges[0:-1] given the initial assignment
            init_assignment[0:k, (k+1):-1].
        '''
        # The blob to place
        find_range = ranges[-1]
        # Blobs in ranges[0:-1] are incompatible with find_range, which is
        # what allows the reduction to a smaller problem
        assert all(not is_compatible(x[1], [find_range], [])
                   for x in ranges[0:-1])

        sz = len(init_assignment)
        best_candidates = []
        # Try to append 'find_range' to each compatible assignment in
        # init_assignment
        for ii in range(sz):
            if not is_compatible(find_range[1], init_assignment[ii], []):
                continue
            cur_best = copy.deepcopy(init_assignment)
            cur_best[ii].append(find_range)
            if len(ranges) > 1:
                cur_best_tmp = [x for i, x in enumerate(cur_best) if i != ii]
                # Reduce to a smaller dp problem for the remaining blobs
                cur_best_tmp = compute_assignments_dp(
                    ranges[:-1], cur_best_tmp, counter)
                cur_best = cur_best_tmp + [cur_best[ii]]
            best_candidates.append(cur_best)
        # Also try placing 'find_range' in a new assignment of its own
        best_candidates.append(prev_best_assignment + [[find_range]])

        ret = min(best_candidates, key=lambda x: get_memory_usage(x))
        return ret

    if counter is None:
        counter = [0]
    counter[0] += 1
    if counter and counter[0] % 5000 == 0:
        rs = [ranges_sorted[0][1].defined, ranges_sorted[-1][1].used]
        log.info('Finding assignments {} ({} -> {})...'.format(
            counter[0], rs[0], rs[1]))

    init_assignment = init_assignment or []
    # best_assignments[k]: best assignment for blobs ranges_sorted[0:(k + 1)]
    best_assignments = []
    for ii, cur_range in enumerate(ranges_sorted):
        # Closest previous best assignment that is independent of
        # ranges_sorted[ii]
        prev_idx = _get_compatible_prev(cur_range, best_assignments, ii)
        prev_best = copy.deepcopy(init_assignment) if prev_idx < 0 else \
            copy.deepcopy(best_assignments[prev_idx])
        # Find the best assignment for the blobs in 'ranges_part'
        ranges_part = ranges_sorted[(prev_idx + 1):(ii + 1)]
        cur_best = _find_best(
            ranges_part, prev_best,
            best_assignments[-1] if best_assignments else init_assignment,
            counter)
        assert _get_count(cur_best) == _get_count(prev_best) + len(ranges_part)
        best_assignments.append(copy.deepcopy(cur_best))

    assert len(best_assignments) == len(ranges_sorted)
    best = best_assignments[-1]

    return best


def get_updated_ranges(ranges, max_live=None):
    ''' Set LiveRange.defined = -1 if it is None
        Set LiveRange.used = max_live if it is None
        Set LiveRange.size = 1 if it is None
    '''

    def _get_max_live(ranges):
        max_live = max(x[1].used for x in ranges if x[1].used) + 1
        return max_live

    def _update_range(x, max_live, size):
        cx = x
        if x[1].defined is None:
            cx = (cx[0], cx[1]._replace(defined=-1))
        if x[1].used is None:
            cx = (cx[0], cx[1]._replace(used=max_live))
        if x[1].size is None:
            cx = (cx[0], cx[1]._replace(size=size))
        return cx

    if max_live is None:
        max_live = _get_max_live(ranges)
    ranges = [_update_range(x, max_live, 1) for x in ranges]

    return ranges


def compute_assignments(ranges, static_blobs, algo):
    '''
    algo: Method used to find assignments (AssignmentAlgorithm.GREEDY or
          AssignmentAlgorithm.DYNAMIC_PROGRAMMING).
          AssignmentAlgorithm.DYNAMIC_PROGRAMMING gives optimal solution at the
          cost of more computation.
          AssignmentAlgorithm.GREEDY may be better in the case 'blob_sizes' is
          not provided.
    '''

    # Sort the ranges by when they are last used. If LiveRange.used is None,
    # the blob is never consumed inside the net; sort those to the end so
    # that they can still be shared.
    ranges = sorted(
        viewitems(ranges),
        key=lambda p: (p[1].used is None, p[1].used),
    )
    # Update None values
    ranges = get_updated_ranges(ranges)

    # Sharable blobs
    ranges_sharable = [x for x in ranges if x[0] not in static_blobs]
    # Static blobs are not sharable
    ranges_static = [x for x in ranges if x[0] in static_blobs]

    log.info("Total sharable blobs {}".format(len(ranges_sharable)))
    best_assignment = []
    if algo == AssignmentAlgorithm.DYNAMIC_PROGRAMMING:
        best_assignment = compute_assignments_dp(ranges_sharable, [])
    elif algo == AssignmentAlgorithm.GREEDY:
        best_assignment = compute_assignments_greedy(ranges_sharable, [])
    else:
        assert False, "Invalid algo name {}".format(algo)
    best_assignment += [[x] for x in ranges_static]

    return best_assignment
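
# Hedged usage sketch: given `ranges` from compute_ranges() and the blobs that
# must keep their own storage (hypothetical names):
#
#     assignments = compute_assignments(
#         ranges, static_blobs=["data", "prediction"],
#         algo=AssignmentAlgorithm.GREEDY)
#
# DYNAMIC_PROGRAMMING instead searches for the memory-optimal grouping, at the
# cost of considerably more computation.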


def verify_assignments(assignments):
    for cur in assignments:
        for x, y in zip(cur[0:-1], cur[1:]):
            assert x[1].used < y[1].defined


def compute_interference_graph(ops):
    g = nx.DiGraph()
    for i, op in enumerate(ops):
        g.add_node(i, op=op)
    for i, parent_op in enumerate(ops):
        for j, child_op in enumerate(ops):
            if j <= i:
                continue
            if any(output in child_op.input for output in parent_op.output):
                deps = set(child_op.input).intersection(parent_op.output)
                g.add_edge(i, j, deps=deps)
    assert nx.is_directed_acyclic_graph(g), child_op
    return g


Optimization = collections.namedtuple(
    'Optimization', ['net', 'assignments', 'blob_assignments'])


def apply_assignments(net, blob_assignments):
    def canonical_name(blob):
        if blob not in blob_assignments:
            return blob
        return blob_assignments[blob]

    for op in net.op:
        # Descend into the subnets of recurrent networks
        if op.type.startswith('RecurrentNetwork'):
            apply_recurrent_blob_assignments(op, blob_assignments, canonical_name)

        for i, input_ in enumerate(op.input):
            op.input[i] = canonical_name(input_)
        for i, output in enumerate(op.output):
            op.output[i] = canonical_name(output)


def apply_recurrent_blob_assignments(op, blob_assignments, canonical_name):
    log.debug("Applying assignments to recurrent op: {}".format(op.type))
    step_args = [a for a in op.arg if a.name.endswith("step_net")]
    for step_arg in step_args:
        apply_assignments(step_arg.n, blob_assignments)
        for i, einp in enumerate(step_arg.n.external_input):
            if einp in blob_assignments:
                step_arg.n.external_input[i] = canonical_name(einp)
    # Store renamings as arguments on the recurrent op
    for blob, renamed in viewitems(blob_assignments):
        if blob in list(op.input) + list(op.output):
            a = caffe2_pb2.Argument()
            a.name = blob + ".rename"
            a.s = str(renamed).encode("ascii")
            op.arg.extend([a])


class AssignmentAlgorithm(enum.Enum):
    GREEDY = 0
    DYNAMIC_PROGRAMMING = 1


def optimize_inference_fast(net, static_blobs):
    optim = caffe2_pb2.NetDef()
    optim_str = C.memonger_optimize_inference_net(
        net.SerializeToString(),
        [str(s).encode('utf-8') for s in static_blobs]
    )
    optim.ParseFromString(optim_str)
    return optim


def optimize_interference(net, static_blobs,
                          ordering_function=topological_sort_traversal,
                          blob_sizes=None,
                          algo=AssignmentAlgorithm.GREEDY):
    """
    ordering_function: topological_sort_traversal or
                       topological_sort_traversal_longest_path.
                       topological_sort_traversal_longest_path gives better
                       results but needs a bit more computation.
    algo: Method used to find assignments (AssignmentAlgorithm.GREEDY or
          AssignmentAlgorithm.DYNAMIC_PROGRAMMING).
          AssignmentAlgorithm.DYNAMIC_PROGRAMMING gives optimal solution at the
          cost of more computation.
          AssignmentAlgorithm.GREEDY may be better in the case 'blob_sizes' is
          not provided.

    Implementation:
    1) Use a BFS traversal of the execution graph to generate an
       ordering of the node executions.
    2) Generate use-def ranges for each `blob` in the BFS traversal
       order.
    3) Assign blobs to `canonical blobs`
    4) Rename blobs to canonical blobs
    """
    net = copy.deepcopy(net)
    g = compute_interference_graph(net.op)
    ordering = ordering_function(g)
    linearized_ops = [net.op[i] for i in ordering]

    # Reinitialize the net with the ops in the linearized order
    del net.op[:]
    net.op.extend(linearized_ops)

    ranges = compute_ranges(linearized_ops, blob_sizes)
    assignments = compute_assignments(ranges, static_blobs, algo)
    blob_assignments = compute_blob_assignments(assignments)
    apply_assignments(net, blob_assignments)
    return Optimization(
        net=net,
        blob_assignments=blob_assignments,
        assignments=assignments)
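
# Hedged usage sketch for optimize_interference(); `predict_net` is a
# hypothetical inference NetDef whose external inputs/outputs are kept static:
#
#     static = list(predict_net.external_input) + \
#         list(predict_net.external_output)
#     result = optimize_interference(predict_net, static)
#     optimized_proto = result.net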


def verify_inplace_blobs(net_a, net_b):
    """
    Verifies that net_a and net_b have the same in-place blob assignments.
    Particularly, that memonger did not add an in-place assignment when that
    did not exist before.
    """
    def get_inplaces(op):
        out = list(op.output)
        inplaces = []
        for j, inp in enumerate(op.input):
            if inp in out:
                inplaces.append([j, out.index(inp)])
        return inplaces

    for op_a, op_b in zip(net_a.op, net_b.op):
        if op_a.type != op_b.type:
            return False
        if get_inplaces(op_a) != get_inplaces(op_b):
            return False
    return True


def verify_graph_equality(net_a, net_b):
    """
    Determines if the execution of two graphs are identical.
    That is, all input blobs are mapped to the same output blobs
    for each operator in their respective positions.

    This is meant to check the output of memonger with the original graph.
    It assumes that the nets have the same external input and output.

    O(E) runtime + O(1) amortized cost to hash for python dict
    """

    def parent_list(ops):
        parent_list = [[] for _ in ops]
        edge_owner = {}
        for i, op in enumerate(ops):
            for blob in op.input:
                parent_id = edge_owner.get(blob)
                if parent_id is not None:
                    parent_list[i].append(parent_id)
            for blob in op.output:
                edge_owner[blob] = i

        return parent_list

    # Operator-wise equality checks
    if (len(net_a.op) != len(net_b.op)):
        return False
    for op_a, op_b in zip(net_a.op, net_b.op):
        if (op_a.type != op_b.type or
                op_a.device_option != op_b.device_option or
                op_a.engine != op_b.engine):
            return False

    # Print debug info when the data dependencies differ
    parent_list_a = parent_list(net_a.op)
    parent_list_b = parent_list(net_b.op)
    if parent_list_a != parent_list_b:
        j = 0
        for a, b in zip(parent_list_a, parent_list_b):
            if a != b:
                print("Difference {} vs {} \n {}".format(
                    j, net_a.op[j], net_b.op[j]))
                print("Parents: {} vs {}".format(a, b))
            j += 1

    # Net-wise equality check
    return parent_list_a == parent_list_b


Statistics = collections.namedtuple(
    'Statistics', ['baseline_nbytes', 'optimized_nbytes'])


def blob_nbytes(blob):
    sz = 0
    try:
        sz = workspace.FetchBlob(blob).nbytes
    except Exception:
        log.warning('Error when fetching blob {}'.format(blob))
    return sz


def compute_statistics(assignments):
    blob_bytes = {
        blob: blob_nbytes(blob) for assignment in assignments
        for (blob, _) in assignment}
    baseline_nbytes = sum(viewvalues(blob_bytes))
    optimized_nbytes = sum(
        max(blob_bytes[blob] for (blob, _) in assignment)
        for assignment in assignments)
    return Statistics(
        baseline_nbytes=baseline_nbytes,
        optimized_nbytes=optimized_nbytes)


def collect_blob_sizes(net):
    blobs = {}
    for op in net.op:
        for blob in op.input:
            blobs[blob] = blob_nbytes(blob)
        for blob in op.output:
            blobs[blob] = blob_nbytes(blob)

    return blobs