Caffe2 - Python API
A deep learning, cross-platform ML framework
lstm_benchmark.py
## @package lstm_benchmark
# Module caffe2.python.lstm_benchmark
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.proto import caffe2_pb2
from caffe2.python import workspace, core, utils, rnn_cell, model_helper
from caffe2.python import recurrent

import argparse
import numpy as np
import time

import logging

logging.basicConfig()
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)


def generate_data(T, shape, num_labels, fixed_shape):
    '''
    Fill a pair of BlobsQueues with T batches of random input data and labels.
    '''
    log.info("Generating T={} sequence batches".format(T))

    generate_input_init_net = core.Net('generate_input_init')
    queue = generate_input_init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = generate_input_init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )

    workspace.RunNetOnce(generate_input_init_net)
    generate_input_net = core.Net('generate_input')

    generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])
    np.random.seed(2603)

    entry_counts = []
    for t in range(T):
        if t % max(10, T // 10) == 0:
            print("Generating data {}/{}".format(t, T))
        # Randomize the sequence length (except for the first batch)
        random_shape = (
            [np.random.randint(1, shape[0])] + shape[1:]
            if t > 0 and not fixed_shape else shape
        )
        X = np.random.rand(*random_shape).astype(np.float32)
        batch_size = random_shape[1]
        L = num_labels * batch_size
        labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)
        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(generate_input_net.Proto())
        entry_counts.append(random_shape[0] * random_shape[1])

    log.info("Finished data generation")

    return queue, label_queue, entry_counts
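
# The loop above is the producer side of a BlobsQueue pipeline; create_model()
# below is the consumer, dequeuing one batch per net execution. A minimal
# round-trip sketch of the same ops (blob and queue names are illustrative):
#
#   init_net = core.Net('demo_queue_init')
#   q = init_net.CreateBlobsQueue([], 'demo_queue', num_blobs=1, capacity=4)
#   workspace.RunNetOnce(init_net)
#
#   enqueue_net = core.Net('demo_enqueue')
#   enqueue_net.EnqueueBlobs([q, 'item'], ['item'])
#   workspace.FeedBlob('item', np.zeros((2, 3), dtype=np.float32))
#   workspace.RunNetOnce(enqueue_net.Proto())
#
#   dequeue_net = core.Net('demo_dequeue')
#   dequeue_net.DequeueBlobs(q, 'item_out')
#   workspace.RunNetOnce(dequeue_net.Proto())
#   workspace.FetchBlob('item_out')  # -> the (2, 3) array fed above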


def create_model(args, queue, label_queue, input_shape):
    model = model_helper.ModelHelper(name="LSTM_bench")
    seq_lengths, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'target',
        )

    input_blob = model.net.DequeueBlobs(queue, "input_data")
    labels = model.net.DequeueBlobs(label_queue, "label")

    init_blobs = []
    if args.implementation in ["own", "static", "static_dag"]:
        T = None
        if "static" in args.implementation:
            assert args.fixed_shape, \
                "Random input length is not static RNN compatible"
            T = args.seq_length
            print("Using static RNN of size {}".format(T))

        for i in range(args.num_layers):
            hidden_init, cell_init = model.net.AddExternalInputs(
                "hidden_init_{}".format(i),
                "cell_init_{}".format(i)
            )
            init_blobs.extend([hidden_init, cell_init])

        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=[args.hidden_dim] * args.num_layers,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
            forward_only=args.forward_only,
            drop_states=True,
            return_last_layer_only=True,
            static_rnn_unroll_size=T,
        )

        if "dag" in args.implementation:
            print("Using DAG net type")
            model.net.Proto().type = 'dag'
            model.net.Proto().num_workers = 4

    elif args.implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        init_blobs = model.net.AddExternalInputs("hidden_init", "cell_init")
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=args.num_layers,
        )

    else:
        assert False, "Unknown implementation"

    weights = model.net.UniformFill(labels, "weights")
    softmax, loss = model.net.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    if not args.forward_only:
        model.AddGradientOperators([loss])

    # Carry states over from one iteration to the next
    for init_blob in init_blobs:
        model.net.Copy(last_hidden, init_blob)

        sz = args.hidden_dim
        if args.implementation == "cudnn":
            sz *= args.num_layers
        workspace.FeedBlob(init_blob, np.zeros(
            [1, args.batch_size, sz], dtype=np.float32
        ))

    if args.rnn_executor:
        for op in model.net.Proto().op:
            if op.type.startswith('RecurrentNetwork'):
                recurrent.set_rnn_executor_config(
                    op,
                    num_threads=args.rnn_executor_num_threads,
                    max_cuda_streams=args.rnn_executor_max_cuda_streams,
                )
    return model, output
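
# Shape note (see Caffe2LSTM below): the dequeued input is time-major,
# [seq_length, batch_size, input_dim], and the returned LSTM output is
# [seq_length, batch_size, hidden_dim] before being flattened for the loss.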


def Caffe2LSTM(args):
    T = args.data_size // args.batch_size

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue, label_queue, entry_counts = generate_data(T // args.seq_length,
                                                     input_blob_shape,
                                                     args.hidden_dim,
                                                     args.fixed_shape)

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    start_time = time.time()
    num_iters = T // args.seq_length
    total_iters = 0

    # Run the benchmark
    log.info("------ Warming up ------")
    workspace.RunNet(model.net.Proto().name)

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))

    log.info("------ Starting benchmark ------")
    start_time = time.time()
    last_time = time.time()
    for iteration in range(1, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        total_iters += iters_once
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info(
            "Iter: {} / {}. Entries Per Second: {}k.".format(
                iteration,
                num_iters,
                # Entries/sec, reported in thousands with one decimal place
                np.sum(entry_counts[iteration:iteration + iters_once]) /
                (new_time - last_time) // 100 / 10,
            )
        )
        last_time = new_time

    log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
        np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
        " (with RNN executor)" if args.rnn_executor else "",
    ))

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if stats['max_total'] != stats['total']:
            log.warning(
                "Max usage differs from current total usage: {} > {}".
                format(stats['max_total'], stats['total'])
            )
            log.warning("This means that costly deallocations occurred.")

    return time.time() - start_time


@utils.debug
def Benchmark(args):
    return Caffe2LSTM(args)


def GetArgumentParser():
    parser = argparse.ArgumentParser(description="LSTM benchmark.")

    parser.add_argument(
        "--hidden_dim",
        type=int,
        default=800,
        help="Hidden dimension",
    )
    parser.add_argument(
        "--input_dim",
        type=int,
        default=40,
        help="Input dimension",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=128,
        help="The batch size.",
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=20,
        help="Max sequence length",
    )
    parser.add_argument(
        "--data_size",
        type=int,
        default=1000000,
        help="Number of data points to generate",
    )
    parser.add_argument(
        "--iters_to_report",
        type=int,
        default=20,
        help="Number of iterations between progress reports",
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )
    parser.add_argument(
        "--implementation",
        type=str,
        default="own",
        help="'cudnn', 'own', 'static' or 'static_dag'",
    )
    parser.add_argument(
        "--fixed_shape",
        action="store_true",
        help=("Keep the shape of input batches fixed instead of randomizing "
              "it. Static RNN requires a fixed shape"),
    )
    parser.add_argument(
        "--memory_optimization",
        action="store_true",
        help="Whether to use memory optimized LSTM or not",
    )
    parser.add_argument(
        "--forward_only",
        action="store_true",
        help="Whether to run only forward pass",
    )
    parser.add_argument(
        "--num_layers",
        type=int,
        default=1,
        help="Number of LSTM layers. All output dimensions are going to be "
             "of hidden_dim size",
    )
    parser.add_argument(
        "--rnn_executor",
        action="store_true",
        help="Whether to use RNN executor",
    )
    parser.add_argument(
        "--rnn_executor_num_threads",
        type=int,
        default=None,
        help="Number of threads used by CPU RNN executor",
    )
    parser.add_argument(
        "--rnn_executor_max_cuda_streams",
        type=int,
        default=None,
        help="Maximum number of CUDA streams used by RNN executor on GPU",
    )
    return parser


if __name__ == '__main__':
    args, extra_args = GetArgumentParser().parse_known_args()

    rnn_executor_opt = 1 if args.rnn_executor else 0

    workspace.GlobalInit([
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_rnn_executor={}'.format(rnn_executor_opt),
        '--caffe2_gpu_memory_tracking=1'] + extra_args)

    # Note: when --gpu is set, the benchmark is pinned to device id 4
    device = core.DeviceOption(
        workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 4)

    with core.DeviceScope(device):
        Benchmark(args)
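
The benchmark can also be driven in-process rather than from the command line. A minimal sketch, assuming the module is importable under the path given in its header; the flag values here are illustrative only:

    from caffe2.proto import caffe2_pb2
    from caffe2.python import core, workspace
    from caffe2.python.lstm_benchmark import Benchmark, GetArgumentParser

    # Parse a small, CPU-friendly configuration (values are illustrative).
    args = GetArgumentParser().parse_args([
        '--data_size', '100000',
        '--seq_length', '10',
        '--batch_size', '32',
        '--hidden_dim', '64',
        '--forward_only',
    ])
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
        elapsed = Benchmark(args)  # returns elapsed wall-clock seconds
    print('Benchmark finished in {:.1f}s'.format(elapsed))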