from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import logging
import time

import numpy as np

from caffe2.proto import caffe2_pb2
from caffe2.python import workspace, core, utils, rnn_cell, model_helper
from caffe2.python import recurrent
# Module-level logger for the benchmark; DEBUG so per-iteration progress
# (log.info calls below) is always emitted.
log = logging.getLogger("lstm_bench")
log.setLevel(logging.DEBUG)
def generate_data(T, shape, num_labels, fixed_shape):
    '''
    Fill a pair of blob queues with T batches of random input data and
    random integer labels.

    Args:
        T: number of sequence batches to generate/enqueue.
        shape: base input shape [seq_length, batch_size, input_dim].
        num_labels: size of the label space per batch element.
        fixed_shape: when True, every batch uses `shape` unchanged;
            otherwise the sequence length is randomized per batch.

    Returns:
        (queue, label_queue, entry_counts) where entry_counts[i] is the
        number of (timestep, batch) entries in the i-th generated batch.
    '''
    log.info("Generating T={} sequence batches".format(T))

    # One-off init net that only creates the two queues.
    generate_input_init_net = core.Net('generate_input_init')
    queue = generate_input_init_net.CreateBlobsQueue(
        [], "inputqueue", num_blobs=1, capacity=T,
    )
    label_queue = generate_input_init_net.CreateBlobsQueue(
        [], "labelqueue", num_blobs=1, capacity=T,
    )
    workspace.RunNetOnce(generate_input_init_net)

    # Net run once per batch: moves the fed scratch blobs into the queues.
    generate_input_net = core.Net('generate_input')
    generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
    generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])

    entry_counts = []
    for t in range(T):
        # Progress report roughly every 10% (at least every 10 batches).
        if (t % (max(10, T // 10)) == 0):
            print("Generating data {}/{}".format(t, T))
        # Randomize the sequence length; the first batch (t == 0) and
        # fixed_shape mode keep the base shape.
        random_shape = (
            [np.random.randint(1, shape[0])] + shape[1:]
            if t > 0 and not fixed_shape
            else shape
        )
        X = np.random.rand(*random_shape).astype(np.float32)
        batch_size = random_shape[1]
        L = num_labels * batch_size
        # One int32 label per timestep, drawn from [0, num_labels * batch).
        labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)
        workspace.FeedBlob("scratch", X)
        workspace.FeedBlob("label_scr", labels)
        workspace.RunNetOnce(generate_input_net.Proto())
        entry_counts.append(random_shape[0] * random_shape[1])

    log.info("Finished data generation")

    return queue, label_queue, entry_counts
def create_model(args, queue, label_queue, input_shape):
    '''
    Build the LSTM benchmark model.

    Args:
        args: parsed benchmark arguments (see GetArgumentParser).
        queue / label_queue: blob queues produced by generate_data.
        input_shape: [seq_length, batch_size, input_dim]; used by the
            cudnn path to pre-fill a placeholder input blob.

    Returns:
        (model, output): the ModelHelper and the LSTM output blob.
    '''
    model = model_helper.ModelHelper(name="LSTM_bench")
    seq_lengths, target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'target',
        )

    input_blob = model.net.DequeueBlobs(queue, "input_data")
    labels = model.net.DequeueBlobs(label_queue, "label")

    init_blobs = []
    if args.implementation in ["own", "static", "static_dag"]:
        T = None
        if "static" in args.implementation:
            # Static unrolling bakes the sequence length into the net.
            assert args.fixed_shape, \
                "Random input length is not static RNN compatible"
            T = args.seq_length
            print("Using static RNN of size {}".format(T))

        for i in range(args.num_layers):
            hidden_init, cell_init = model.net.AddExternalInputs(
                "hidden_init_{}".format(i),
                "cell_init_{}".format(i)
            )
            init_blobs.extend([hidden_init, cell_init])

        output, last_hidden, _, last_state = rnn_cell.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=[args.hidden_dim] * args.num_layers,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
            forward_only=args.forward_only,
            drop_states=True,
            return_last_layer_only=True,
            static_rnn_unroll_size=T,
        )

        if "dag" in args.implementation:
            print("Using DAG net type")
            model.net.Proto().type = 'dag'
            model.net.Proto().num_workers = 4

    elif args.implementation == "cudnn":
        # cudnn needs a placeholder input of the right shape so parameter
        # initialization can infer dimensions.
        init_blobs = model.net.AddExternalInputs("hidden_init", "cell_init")
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=init_blobs,
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
            num_layers=args.num_layers,
        )

    else:
        assert False, "Unknown implementation"

    weights = model.net.UniformFill(labels, "weights")
    softmax, loss = model.net.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )

    if not args.forward_only:
        model.AddGradientOperators([loss])

    # Carry hidden state across iterations by copying the last hidden state
    # back into the init blobs, and zero-initialize them for the first run.
    for init_blob in init_blobs:
        model.net.Copy(last_hidden, init_blob)

        sz = args.hidden_dim
        if args.implementation == "cudnn":
            # cudnn stacks all layers into a single state blob.
            sz *= args.num_layers
        workspace.FeedBlob(init_blob, np.zeros(
            [1, args.batch_size, sz], dtype=np.float32
        ))

    if args.rnn_executor:
        for op in model.net.Proto().op:
            if op.type.startswith('RecurrentNetwork'):
                recurrent.set_rnn_executor_config(
                    op,
                    num_threads=args.rnn_executor_num_threads,
                    max_cuda_streams=args.rnn_executor_max_cuda_streams,
                )

    return model, output
def Caffe2LSTM(args):
    '''
    Run the LSTM benchmark: generate data, build the model, warm up, and
    time the net, logging entries-per-second and (on GPU) memory stats.

    Returns:
        Elapsed wall-clock seconds of the timed benchmark section.
    '''
    T = args.data_size // args.batch_size

    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
    queue, label_queue, entry_counts = generate_data(T // args.seq_length,
                                                     input_blob_shape,
                                                     args.hidden_dim,
                                                     args.fixed_shape)

    workspace.FeedBlob(
        "seq_lengths",
        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
    )

    model, output = create_model(args, queue, label_queue, input_blob_shape)

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)

    start_time = time.time()
    num_iters = T // args.seq_length
    total_iters = 0

    # First run is excluded from timing (net instantiation, allocations).
    log.info("------ Warming up ------")
    workspace.RunNet(model.net.Proto().name)

    if (args.gpu):
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))

    log.info("------ Starting benchmark ------")
    start_time = time.time()
    last_time = time.time()
    for iteration in range(1, num_iters, args.iters_to_report):
        iters_once = min(args.iters_to_report, num_iters - iteration)
        total_iters += iters_once
        workspace.RunNet(model.net.Proto().name, iters_once)

        new_time = time.time()
        log.info(
            "Iter: {} / {}. Entries Per Second: {}k.".format(
                iteration,
                num_iters,
                # floor to 0.1k resolution
                np.sum(entry_counts[iteration:iteration + iters_once]) /
                (new_time - last_time) // 100 / 10,
            )
        )
        last_time = new_time

    log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
        np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
        " (with RNN executor)" if args.rnn_executor else "",
    ))

    if (args.gpu):
        log.info("Memory stats:")
        stats = utils.GetGPUMemoryUsageStats()
        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
        if (stats['max_total'] != stats['total']):
            log.warning(
                "Max usage differs from current total usage: {} > {}".
                format(stats['max_total'], stats['total'])
            )
            log.warning("This means that costly deallocations occured.")

    return time.time() - start_time
def Benchmark(args):
    # Thin dispatch wrapper; currently only the Caffe2 implementation
    # exists. NOTE(review): the `def` line was lost in extraction —
    # reconstructed; confirm the name against the original file.
    return Caffe2LSTM(args)
def GetArgumentParser():
    '''
    Build the benchmark's command-line parser.

    NOTE(review): most add_argument scaffolding (types/defaults) was lost
    in extraction; only the option help strings survived. Defaults below
    are reconstructed — confirm against the original file.
    '''
    parser = argparse.ArgumentParser(description="LSTM benchmark.")
    parser.add_argument(
        "--hidden_dim",
        type=int,
        default=800,
        help="Hidden dimension",
    )
    parser.add_argument(
        "--input_dim",
        type=int,
        default=40,
        help="Input dimension",
    )
    parser.add_argument(
        "--batch_size",
        type=int,
        default=128,
        help="The batch size."
    )
    parser.add_argument(
        "--seq_length",
        type=int,
        default=20,
        help="Max sequence length"
    )
    parser.add_argument(
        "--data_size",
        type=int,
        default=1000000,
        help="Number of data points to generate"
    )
    parser.add_argument(
        "--iters_to_report",
        type=int,
        default=20,
        help="Number of iteration to report progress"
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Run all on GPU",
    )
    parser.add_argument(
        "--implementation",
        type=str,
        default="own",
        help="'cudnn', 'own', 'static' or 'static_dag'",
    )
    parser.add_argument(
        "--fixed_shape",
        action="store_true",
        help=("Whether to randomize shape of input batches. "
              "Static RNN requires fixed shape"),
    )
    parser.add_argument(
        "--memory_optimization",
        action="store_true",
        help="Whether to use memory optimized LSTM or not",
    )
    parser.add_argument(
        "--forward_only",
        action="store_true",
        help="Whether to run only forward pass"
    )
    parser.add_argument(
        "--num_layers",
        type=int,
        default=1,
        help="Number of LSTM layers. All output dimensions are going to be"
             "of hidden_dim size",
    )
    parser.add_argument(
        "--rnn_executor",
        action="store_true",
        help="Whether to use RNN executor"
    )
    parser.add_argument(
        "--rnn_executor_num_threads",
        type=int,
        default=None,
        help="Number of threads used by CPU RNN Executor"
    )
    parser.add_argument(
        "--rnn_executor_max_cuda_streams",
        type=int,
        default=None,
        help="Maximum number of CUDA streams used by RNN executor on GPU"
    )
    return parser
if __name__ == '__main__':
    args, extra_args = GetArgumentParser().parse_known_args()

    rnn_executor_opt = 1 if args.rnn_executor else 0

    # Unparsed extras are forwarded as caffe2 global flags.
    workspace.GlobalInit([
        'caffe2',
        '--caffe2_log_level=0',
        '--caffe2_print_blob_sizes_at_exit=0',
        '--caffe2_rnn_executor={}'.format(rnn_executor_opt),
        '--caffe2_gpu_memory_tracking=1'] + extra_args)

    device = core.DeviceOption(
        workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 4)

    with core.DeviceScope(device):
        # NOTE(review): the body of this `with` was lost in extraction;
        # Benchmark(args) is the reconstructed entry call — confirm.
        Benchmark(args)