Caffe2 - Python API
A deep learning, cross platform ML framework
lstm_benchmark.py
1 # Copyright (c) 2016-present, Facebook, Inc.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 ##############################################################################
15 
16 ## @package lstm_benchmark
17 # Module caffe2.python.lstm_benchmark
18 from __future__ import absolute_import
19 from __future__ import division
20 from __future__ import print_function
21 from __future__ import unicode_literals
22 
23 from caffe2.proto import caffe2_pb2
24 from caffe2.python import workspace, core, utils, rnn_cell, model_helper
25 from caffe2.python import recurrent
26 
27 import argparse
28 import numpy as np
29 import time
30 
31 import logging
32 
# Configure a dedicated logger for the benchmark; DEBUG level so the
# per-iteration throughput reports below are always emitted.
logging.basicConfig()
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)
36 
37 
def generate_data(T, shape, num_labels, fixed_shape):
    '''
    Fill a queue with input data.

    Creates two blob queues ("inputqueue" for the input tensors,
    "labelqueue" for the label tensors), then enqueues T batches into them.
    Returns (input_queue, label_queue, entry_counts) where entry_counts[t]
    is seq_len * batch_size of batch t, used later for throughput reporting.
    '''
    log.info("Generating T={} sequence batches".format(T))

    init_net = core.Net('generate_input_init')
    queue = init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )
    workspace.RunNetOnce(init_net)

    # A tiny net that pushes the current contents of the scratch blobs
    # into the queues; re-run once per generated batch.
    enqueue_net = core.Net('generate_input')
    enqueue_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    enqueue_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])
    np.random.seed(2603)

    report_interval = max(10, T // 10)
    entry_counts = []
    for step in range(T):
        if step % report_interval == 0:
            print("Generating data {}/{}".format(step, T))
        # Randomize the seqlength (first batch always uses the full shape).
        if step > 0 and not fixed_shape:
            cur_shape = [np.random.randint(1, shape[0])] + shape[1:]
        else:
            cur_shape = shape
        X = np.random.rand(*cur_shape).astype(np.float32)
        cur_batch = cur_shape[1]
        label_range = num_labels * cur_batch
        labels = (np.random.rand(cur_shape[0]) * label_range).astype(np.int32)
        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(enqueue_net.Proto())
        entry_counts.append(cur_shape[0] * cur_shape[1])

    log.info("Finished data generation")

    return queue, label_queue, entry_counts
80 
81 
def create_model(args, queue, label_queue, input_shape):
    """Build the benchmark net: LSTM stack + softmax loss over dequeued data.

    Args:
        args: parsed command-line namespace (see GetArgumentParser).
        queue / label_queue: blob queues produced by generate_data; each
            training iteration dequeues one input batch and one label batch.
        input_shape: [seq_length, batch_size, input_dim]; only used by the
            cudnn path to size the placeholder input.

    Returns:
        (model, output) — the ModelHelper and the LSTM output blob.
    """
    model = model_helper.ModelHelper(name="LSTM_bench")
    # 'target' is declared but never used below — kept as an external input.
    seq_lengths, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'target',
        )

    # Each net run pulls the next batch directly from the queues.
    input_blob = model.net.DequeueBlobs(queue, "input_data")
    labels = model.net.DequeueBlobs(label_queue, "label")

    init_blobs = []
    if args.implementation in ["own", "static", "static_dag"]:
        # Caffe2's own RNN implementation; optionally statically unrolled.
        T = None
        if "static" in args.implementation:
            assert args.fixed_shape, \
                "Random input length is not static RNN compatible"
            T = args.seq_length
            print("Using static RNN of size {}".format(T))

        # One (hidden, cell) initial-state pair per layer.
        for i in range(args.num_layers):
            hidden_init, cell_init = model.net.AddExternalInputs(
                "hidden_init_{}".format(i),
                "cell_init_{}".format(i)
            )
            init_blobs.extend([hidden_init, cell_init])

        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=[args.hidden_dim] * args.num_layers,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
            forward_only=args.forward_only,
            drop_states=True,
            return_last_layer_only=True,
            static_rnn_unroll_size=T,
        )

        if "dag" in args.implementation:
            print("Using DAG net type")
            model.net.Proto().type = 'dag'
            model.net.Proto().num_workers = 4

    elif args.implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        init_blobs = model.net.AddExternalInputs("hidden_init", "cell_init")
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=args.num_layers,
        )

    else:
        assert False, "Unknown implementation"

    # Uniformly random per-example weights feed into the weighted softmax loss.
    weights = model.net.UniformFill(labels, "weights")
    softmax, loss = model.net.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    if not args.forward_only:
        model.AddGradientOperators([loss])

    # carry states over
    for init_blob in init_blobs:
        model.net.Copy(last_hidden, init_blob)

        sz = args.hidden_dim
        if args.implementation == "cudnn":
            # cudnn packs all layers' states into one blob of num_layers * dim.
            sz *= args.num_layers
        workspace.FeedBlob(init_blob, np.zeros(
            [1, args.batch_size, sz], dtype=np.float32
        ))

    if args.rnn_executor:
        # Apply executor tuning to every RecurrentNetwork(Gradient) op.
        for op in model.net.Proto().op:
            if op.type.startswith('RecurrentNetwork'):
                recurrent.set_rnn_executor_config(
                    op,
                    num_threads=args.rnn_executor_num_threads,
                    max_cuda_streams=args.rnn_executor_max_cuda_streams,
                )
    return model, output
176 
177 
def Caffe2LSTM(args):
    """Run the LSTM benchmark end-to-end and return the benchmark wall time.

    Generates the input/label queues, builds the model, warms up with one
    net run, then runs the remaining iterations in chunks of
    args.iters_to_report, logging entries-per-second as it goes.

    Fixes vs. the previous revision: removed a dead first assignment of
    `start_time` (it was unconditionally reassigned before first use) and
    the never-read `total_iters` accumulator; fixed the "occured" typo in
    the deallocation warning.

    Returns:
        Elapsed seconds measured from the start of the benchmark loop
        (excludes data generation and warm-up).
    """
    # One "data point" is one row of a batch; T batches cover data_size rows.
    T = args.data_size // args.batch_size

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    # num_labels is hidden_dim so the flattened (N * hidden_dim)-wide softmax
    # output matches the label range produced by generate_data.
    queue, label_queue, entry_counts = generate_data(T // args.seq_length,
                                                     input_blob_shape,
                                                     args.hidden_dim,
                                                     args.fixed_shape)

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    num_iters = T // args.seq_length

    # Run the Benchmark
    log.info("------ Warming up ------")
    workspace.RunNet(model.net.Proto().name)

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))

    log.info("------ Starting benchmark ------")
    start_time = time.time()
    last_time = time.time()
    # Start from 1: the warm-up consumed batch 0 and is excluded from timing.
    for iteration in range(1, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info(
            "Iter: {} / {}. Entries Per Second: {}k.".format(
                iteration,
                num_iters,
                # `// 100 / 10` truncates to one decimal place in thousands.
                np.sum(entry_counts[iteration:iteration + iters_once]) /
                (new_time - last_time) // 100 / 10,
            )
        )
        last_time = new_time

    log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
        np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
        " (with RNN executor)" if args.rnn_executor else "",
    ))

    if args.gpu:
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if (stats['max_total'] != stats['total']):
            log.warning(
                "Max usage differs from current total usage: {} > {}".
                format(stats['max_total'], stats['total'])
            )
            log.warning("This means that costly deallocations occurred.")

    return time.time() - start_time
246 
247 
@utils.debug
def Benchmark(args):
    # Thin entry point: utils.debug wraps the call for nicer failure
    # diagnostics; all real work happens in Caffe2LSTM. Returns its
    # elapsed benchmark time in seconds.
    return Caffe2LSTM(args)
251 
252 
def GetArgumentParser():
    """Build and return the argparse parser for the LSTM benchmark.

    Fixes vs. the previous revision: the --num_layers help text was two
    adjacent string literals concatenated without a space ("...going to
    beof hidden_dim size"), and --iters_to_report said "iteration" where
    "iterations" was meant.
    """
    parser = argparse.ArgumentParser(description="LSTM benchmark.")

    parser.add_argument(
        "--hidden_dim",
        type=int,
        default=800,
        help="Hidden dimension",
    )
    parser.add_argument(
        "--input_dim",
        type=int,
        default=40,
        help="Input dimension",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=128,
        help="The batch size."
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=20,
        help="Max sequence length"
    )
    parser.add_argument(
        "--data_size",
        type=int,
        default=1000000,
        help="Number of data points to generate"
    )
    parser.add_argument(
        "--iters_to_report",
        type=int,
        default=20,
        help="Number of iterations to report progress"
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )
    parser.add_argument(
        "--implementation",
        type=str,
        default="own",
        help="'cudnn', 'own', 'static' or 'static_dag'",
    )
    parser.add_argument(
        "--fixed_shape",
        action="store_true",
        help=("Whether to randomize shape of input batches. "
              "Static RNN requires fixed shape"),
    )
    parser.add_argument(
        "--memory_optimization",
        action="store_true",
        help="Whether to use memory optimized LSTM or not",
    )
    parser.add_argument(
        "--forward_only",
        action="store_true",
        help="Whether to run only forward pass"
    )
    parser.add_argument(
        "--num_layers",
        type=int,
        default=1,
        help="Number of LSTM layers. All output dimensions are going to be "
             "of hidden_dim size",
    )
    parser.add_argument(
        "--rnn_executor",
        action="store_true",
        help="Whether to use RNN executor"
    )
    parser.add_argument(
        "--rnn_executor_num_threads",
        type=int,
        default=None,
        help="Number of threads used by CPU RNN Executor"
    )
    parser.add_argument(
        "--rnn_executor_max_cuda_streams",
        type=int,
        default=None,
        help="Maximum number of CUDA streams used by RNN executor on GPU"
    )
    return parser
344 
345 
if __name__ == '__main__':
    # parse_known_args lets unrecognized flags fall through to GlobalInit
    # as extra Caffe2 runtime options.
    args, extra_args = GetArgumentParser().parse_known_args()

    # Forwarded to the C++ runtime as --caffe2_rnn_executor=0/1.
    rnn_executor_opt = 1 if args.rnn_executor else 0

    workspace.GlobalInit([
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_rnn_executor={}'.format(rnn_executor_opt),
        '--caffe2_gpu_memory_tracking=1'] + extra_args)

    # NOTE(review): device id 4 is hardcoded here (it is ignored for CPU but
    # selects GPU #4 when --gpu is set) — confirm this is intentional.
    device = core.DeviceOption(
        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 4)

    # Run the whole benchmark with every op placed on the chosen device.
    with core.DeviceScope(device):
        Benchmark(args)