Caffe2 - Python API
A deep-learning, cross-platform ML framework
rewrite_graph.py
1 from __future__ import absolute_import
2 from __future__ import division
3 from __future__ import print_function
4 from __future__ import unicode_literals
5 
6 import copy
7 from caffe2.proto import caffe2_pb2
8 from caffe2.python import core
10 
11 
def rewrite_init_net_simple(net):
    """Pin every op of an init net to the IDEEP (MKL) device."""
    ideep_device = caffe2_pb2.IDEEP
    for operator in net.op:
        operator.device_option.device_type = ideep_device
15 
def last_producer(ops, blob):
    """Return the index of the last op in ``ops`` that outputs ``blob``.

    Args:
        ops: sequence of operator protos (anything with an ``output`` list).
        blob: blob name to look for.

    Returns:
        int: index of the last producer of ``blob``.

    Raises:
        ValueError: if no op in ``ops`` produces ``blob``.
    """
    # Scan from the back so the first match is the last producer.
    for (i, op) in reversed(list(enumerate(ops))):
        if blob in op.output:
            return i
    # Bug fix: the original passed ("... %s", blob) as two constructor
    # arguments to ValueError, so the placeholder was never formatted.
    raise ValueError("Failed to find last producer of blob, {}".format(blob))
21 
22 
def fix_BoxWithNMSLimit(net):
    """Downgrade CopyIDEEPToCPU ops that consume BoxWithNMSLimit outputs.

    BoxWithNMSLimit's first three outputs are already CPU blobs, so a
    following CopyIDEEPToCPU would be copying from the wrong device;
    turn those copies into plain Copy ops pinned to CPU.
    Mutates ``net`` in place.
    """
    outputs = set()
    for op in net.op:
        if op.type == 'BoxWithNMSLimit':
            # Collect the first three outputs (scores/boxes/classes).
            outputs.add(op.output[0])
            outputs.add(op.output[1])
            outputs.add(op.output[2])
    for op in net.op:
        if op.type == 'CopyIDEEPToCPU':
            if op.input[0] in outputs:
                # Bug fix: log message typo "Chaning" -> "Changing".
                print("Changing CopyIDEEPToCPU to Copy for {}".format(op.input[0]))
                op.type = 'Copy'
                op.device_option.device_type = caffe2_pb2.CPU
36 
37 
def rewrite_run_net_simple(net):
    """Rewrite a predict net in place to run entirely on IDEEP/MKL.

    Simple rewrite for now - assume the entire graph can be executed
    with MKL, so just insert copy ops for external_input[0] and every
    external output: a CopyCPUToIDEEP op in front of the first op, and
    CopyIDEEPToCPU ops at the end, with intermediate blobs renamed to
    ``__MKL__`` temporaries so callers still see the original blob names.

    Raises:
        Exception: if external_input[0] is not the first input of the
            first op (the single-entry assumption does not hold).
    """
    def mkl_tmp(name):
        # Name of the IDEEP-resident copy of a CPU blob.
        return "{}__MKL__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # Build the trailing copy ops BEFORE renaming, so they read the
    # __MKL__ temporaries and write the original external output names.
    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in net.external_output]

    for output_blob in net.external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs
        # Rename any subsequent consumers of an output blob.
        for op in net.op[last_producer_idx + 1:]:
            renamed_input = [blob if blob != output_blob else mkl_tmp(blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    # Rebuild the op list: input copy, original (renamed) ops, output copies.
    ops = [copy_input_op] + net.op[:] + copy_output_ops
    del net.op[:]
    net.op.extend(ops)
    device = caffe2_pb2.IDEEP
    for op in net.op:
        # MergeFrom preserves any other device_option fields already set.
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #    C.transform_optimizeForIDEEP(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)
85 
86 
def rewrite_run_net_simple_xrayocr_lstm(net):
    """Rewrite the xrayocr LSTM net in place: MKL front, CPU LSTM tail.

    For the xrayocr model with lstm, only rewrite the non-lstm part of
    the net to enable mkl, then copy the temporary output blob at the
    break point and all external inputs for the lstm part to cpu, and
    execute the rest of the net (two lstm) on cpu.

    This only works for the xrayocr lstm model which uses the first
    'Shape' op to decide the break point, and after two lstm it's
    external_output directly so there's no need to copy back to
    ideep/mkl.
    """
    def mkl_tmp(name):
        # IDEEP-resident copy of a CPU blob (front, MKL part).
        return "{}__MKL__".format(name)

    def cpu_tmp(name):
        # CPU-resident copy of an IDEEP blob (tail, LSTM part).
        return "{}__CPU__".format(name)

    input_blob = net.external_input[0]
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # the net may contain some external_inputs falsely added during
    # ONNX->Caffe2. This should be taken care of in early steps during
    # pytorch_to_caffe2, but if not it can cause issues in follow-up
    # steps, so check here to confirm.
    for input_blob in net.external_input:
        for op in net.op:
            # an external_input blob must not be the output of any op
            assert input_blob not in op.output

    external_output = None
    external_inputs_to_cpu = set()
    find_first_shape_op = False
    cpu_op_start_idx = -1
    for op_idx, op in enumerate(net.op):
        # the first Shape op marks the starting point of the LSTM chunk
        if not find_first_shape_op:
            if op.type == 'Shape':
                external_output = op.input
                find_first_shape_op = True
                cpu_op_start_idx = op_idx
        else:
            # any external input used in the LSTM part must be copied to CPU
            for in_blob in op.input:
                if in_blob in net.external_input:
                    external_inputs_to_cpu.add(in_blob)

    # make sure we found the expected break point of the net
    assert external_output is not None

    # create ops to copy external input blobs used in the LSTM part
    # from IDEEP to CPU
    copy_extra_input_ops = []
    for in_blob in external_inputs_to_cpu:
        copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob,
                                                        cpu_tmp(in_blob)))
        # rename input blobs in the LSTM part to use the CPU copy
        for op in net.op[cpu_op_start_idx:]:
            renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in external_output]

    for output_blob in external_output:
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs

    # rearrange all ops in correct order: MKL part, copies to CPU at the
    # break point, then the CPU (LSTM) part
    ops = [copy_input_op] + net.op[:cpu_op_start_idx] \
        + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:]
    del net.op[:]
    net.op.extend(ops)

    device = caffe2_pb2.IDEEP
    for op in net.op:
        # the first Shape op marks the starting point of the LSTM chunk;
        # from there on, all ops should run on CPU
        if op.type == 'Shape':
            device = caffe2_pb2.CPU
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

        # RecurrentNetwork has a nested step_net that needs special treatment
        if op.type == 'RecurrentNetwork':
            for arg in op.arg:
                if arg.name == 'step_net':
                    for nested_op in arg.n.op:
                        # set device to CPU
                        nested_op.device_option.MergeFrom(
                            core.DeviceOption(device_type=device))
                        nested_op.engine = ""

                        # rename inputs in ops of the nested net
                        renamed_input = []
                        for blob in nested_op.input:
                            renamed_input.append(blob
                                if blob not in external_inputs_to_cpu
                                else cpu_tmp(blob))
                        nested_op.input[:] = renamed_input

                    # rename external inputs of the nested net
                    new_external_input = []
                    for blob in arg.n.external_input:
                        new_external_input.append(blob
                            if blob not in external_inputs_to_cpu
                            else cpu_tmp(blob))
                    arg.n.external_input[:] = new_external_input

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #    C.transform_optimizeForIDEEP(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)
209 
210 
def rewrite_model_helper_simple(model):
    """Return a deep copy of *model* rewritten to run on IDEEP/MKL.

    The input model is left untouched; both the param init net and the
    run net of the copy are rewritten.
    """
    rewritten = copy.deepcopy(model)
    # All parameter initialization should run on MKL
    rewrite_init_net_simple(rewritten.param_init_net.Proto())
    rewrite_run_net_simple(rewritten.net.Proto())
    return rewritten