Caffe2 - Python API
A deep learning, cross platform ML framework
train.py
## @package train
# Module caffe2.python.models.seq2seq.train
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import collections
import logging
import math
import os
import random
import sys
import time

import numpy as np

import caffe2.proto.caffe2_pb2 as caffe2_pb2
from caffe2.python import core, workspace, data_parallel_model
import caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util
from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

Batch = collections.namedtuple('Batch', [
    'encoder_inputs',
    'encoder_lengths',
    'decoder_inputs',
    'decoder_lengths',
    'targets',
    'target_weights',
])


def prepare_batch(batch):
    # batch is a list of (source_seq, target_seq) pairs of token ids.
    encoder_lengths = [len(entry[0]) for entry in batch]
    max_encoder_length = max(encoder_lengths)
    decoder_lengths = []
    max_decoder_length = max([len(entry[1]) for entry in batch])

    batch_encoder_inputs = []
    batch_decoder_inputs = []
    batch_targets = []
    batch_target_weights = []

    for source_seq, target_seq in batch:
        # Sources are reversed and right-padded to the longest source
        # in the batch.
        encoder_pads = (
            [seq2seq_util.PAD_ID] * (max_encoder_length - len(source_seq))
        )
        batch_encoder_inputs.append(
            list(reversed(source_seq)) + encoder_pads
        )

        decoder_pads = (
            [seq2seq_util.PAD_ID] * (max_decoder_length - len(target_seq))
        )
        # Decoder inputs start with GO; targets are the same sequence
        # shifted by one and terminated with EOS.
        target_seq_with_go_token = [seq2seq_util.GO_ID] + target_seq
        decoder_lengths.append(len(target_seq_with_go_token))
        batch_decoder_inputs.append(target_seq_with_go_token + decoder_pads)

        target_seq_with_eos = target_seq + [seq2seq_util.EOS_ID]
        targets = target_seq_with_eos + decoder_pads
        batch_targets.append(targets)

        # Pad positions get zero weight so they do not contribute to
        # the loss.
        if len(source_seq) + len(target_seq) == 0:
            target_weights = [0] * len(targets)
        else:
            target_weights = [
                1 if target != seq2seq_util.PAD_ID else 0
                for target in targets
            ]
        batch_target_weights.append(target_weights)

    # All arrays are transposed to time-major layout (time x batch),
    # which is what the recurrent nets expect.
    return Batch(
        encoder_inputs=np.array(
            batch_encoder_inputs,
            dtype=np.int32,
        ).transpose(),
        encoder_lengths=np.array(encoder_lengths, dtype=np.int32),
        decoder_inputs=np.array(
            batch_decoder_inputs,
            dtype=np.int32,
        ).transpose(),
        decoder_lengths=np.array(decoder_lengths, dtype=np.int32),
        targets=np.array(
            batch_targets,
            dtype=np.int32,
        ).transpose(),
        target_weights=np.array(
            batch_target_weights,
            dtype=np.float32,
        ).transpose(),
    )
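
# For concreteness, a worked example of what prepare_batch returns for a
# tiny hypothetical batch, assuming the usual special-token ids in
# seq2seq_util (PAD_ID=0, GO_ID=1, EOS_ID=2):
#
#   batch = [([11, 12, 13], [21, 22]), ([14], [23])]
#   b = prepare_batch(batch)
#   b.encoder_inputs      # sources reversed + padded, time-major:
#   # [[13 14]
#   #  [12  0]
#   #  [11  0]]
#   b.decoder_inputs      # GO-prefixed:
#   # [[ 1  1]
#   #  [21 23]
#   #  [22  0]]
#   b.targets             # EOS-terminated:
#   # [[21 23]
#   #  [22  2]
#   #  [ 2  0]]
#   b.target_weights      # 1.0 everywhere except pad positions of targets
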
class Seq2SeqModelCaffe2(object):

    def _build_model(
        self,
        init_params,
    ):
        model = Seq2SeqModelHelper(init_params=init_params)
        self._build_shared(model)
        self._build_embeddings(model)

        forward_model = Seq2SeqModelHelper(init_params=init_params)
        self._build_shared(forward_model)
        self._build_embeddings(forward_model)

        if self.num_gpus == 0:
            loss_blobs = self.model_build_fun(model)
            model.AddGradientOperators(loss_blobs)
            self.norm_clipped_grad_update(
                model,
                scope='norm_clipped_grad_update'
            )
            self.forward_model_build_fun(forward_model)
        else:
            assert (self.batch_size % self.num_gpus) == 0

            data_parallel_model.Parallelize_GPU(
                forward_model,
                input_builder_fun=lambda m: None,
                forward_pass_builder_fun=self.forward_model_build_fun,
                param_update_builder_fun=None,
                devices=list(range(self.num_gpus)),
            )

            def clipped_grad_update_bound(model):
                self.norm_clipped_grad_update(
                    model,
                    scope='norm_clipped_grad_update',
                )

            data_parallel_model.Parallelize_GPU(
                model,
                input_builder_fun=lambda m: None,
                forward_pass_builder_fun=self.model_build_fun,
                param_update_builder_fun=clipped_grad_update_bound,
                devices=list(range(self.num_gpus)),
            )
        # Sparse (GradientSlice) gradients, e.g. for the embedding tables,
        # are not covered by the dense update above and get their own
        # clipped update in both the CPU and GPU paths.
        self.norm_clipped_sparse_grad_update(
            model,
            scope='norm_clipped_sparse_grad_update',
        )
        self.model = model
        self.forward_net = forward_model.net
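
    # _build_model assembles two graphs over one set of parameters: the
    # training net, which receives gradient operators and the clipped
    # parameter updates, and a gradient-free forward net that step() runs
    # for evaluation. With num_gpus > 0, Parallelize_GPU replicates each
    # net across devices and installs the dense update through its
    # param_update_builder_fun hook.
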
    def _build_shared(self, model):
        optimizer_params = self.model_params['optimizer_params']
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.learning_rate = model.AddParam(
                name='learning_rate',
                init_value=float(optimizer_params['learning_rate']),
                trainable=False,
            )
            self.global_step = model.AddParam(
                name='global_step',
                init_value=0,
                trainable=False,
            )
            self.start_time = model.AddParam(
                name='start_time',
                init_value=time.time(),
                trainable=False,
            )

    def _build_embeddings(self, model):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            # Uniform(-sqrt(3), sqrt(3)) initialization gives the
            # embeddings unit variance.
            sqrt3 = math.sqrt(3)
            self.encoder_embeddings = model.param_init_net.UniformFill(
                [],
                'encoder_embeddings',
                shape=[
                    self.source_vocab_size,
                    self.model_params['encoder_embedding_size'],
                ],
                min=-sqrt3,
                max=sqrt3,
            )
            model.params.append(self.encoder_embeddings)
            self.decoder_embeddings = model.param_init_net.UniformFill(
                [],
                'decoder_embeddings',
                shape=[
                    self.target_vocab_size,
                    self.model_params['decoder_embedding_size'],
                ],
                min=-sqrt3,
                max=sqrt3,
            )
            model.params.append(self.decoder_embeddings)

    def model_build_fun(self, model, forward_only=False, loss_scale=None):
        encoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_inputs',
        )
        encoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_lengths',
        )
        decoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_inputs',
        )
        decoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_lengths',
        )
        targets = model.net.AddExternalInput(
            workspace.GetNameScope() + 'targets',
        )
        target_weights = model.net.AddExternalInput(
            workspace.GetNameScope() + 'target_weights',
        )
        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular', 'dot']

        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_states,
            final_encoder_cell_states,
            encoder_units_per_layer,
        ) = seq2seq_util.build_embedding_encoder(
            model=model,
            encoder_params=self.encoder_params,
            num_decoder_layers=len(self.model_params['decoder_layer_configs']),
            inputs=encoder_inputs,
            input_lengths=encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=self.encoder_embeddings,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
            num_gpus=self.num_gpus,
        )

        (
            decoder_outputs,
            decoder_output_size,
        ) = seq2seq_util.build_embedding_decoder(
            model,
            decoder_layer_configs=self.model_params['decoder_layer_configs'],
            inputs=decoder_inputs,
            input_lengths=decoder_lengths,
            encoder_lengths=encoder_lengths,
            encoder_outputs=encoder_outputs,
            weighted_encoder_outputs=weighted_encoder_outputs,
            final_encoder_hidden_states=final_encoder_hidden_states,
            final_encoder_cell_states=final_encoder_cell_states,
            encoder_units_per_layer=encoder_units_per_layer,
            vocab_size=self.target_vocab_size,
            embeddings=self.decoder_embeddings,
            embedding_size=self.model_params['decoder_embedding_size'],
            attention_type=attention_type,
            forward_only=False,
            num_gpus=self.num_gpus,
        )

        output_logits = seq2seq_util.output_projection(
            model=model,
            decoder_outputs=decoder_outputs,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        # Flatten targets and weights to 1-D so they line up with the
        # (time * batch) rows of output_logits.
        targets, _ = model.net.Reshape(
            [targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )
        _, loss_per_word = model.net.SoftmaxWithLoss(
            [output_logits, targets, target_weights],
            ['OutputProbs_INVALID', 'loss_per_word'],
            only_loss=True,
        )

        # loss_per_word is averaged over the non-pad tokens; rescale it
        # to a per-sentence loss for reporting and gradients.
        num_words = model.net.SumElements(
            [target_weights],
            'num_words',
        )
        total_loss_scalar = model.net.Mul(
            [loss_per_word, num_words],
            'total_loss_scalar',
        )
        total_loss_scalar_weighted = model.net.Scale(
            [total_loss_scalar],
            'total_loss_scalar_weighted',
            scale=1.0 / self.batch_size,
        )
        return [total_loss_scalar_weighted]
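
    # The loss bookkeeping above is easier to see with numbers (all
    # values hypothetical):
    #
    #   loss_per_word = 2.5        # average cross-entropy per non-pad token
    #   num_words = 40.0           # sum of target_weights
    #   batch_size = 8
    #   total_loss_scalar = 2.5 * 40.0 = 100.0    # summed loss
    #   reported = 100.0 / 8 = 12.5               # loss per sentence
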
    def forward_model_build_fun(self, model, loss_scale=None):
        return self.model_build_fun(
            model=model,
            forward_only=True,
            loss_scale=loss_scale
        )

    def _calc_norm_ratio(self, model, params, scope, ONE):
        with core.NameScope(scope):
            grad_squared_sums = []
            for i, param in enumerate(params):
                logger.info(param)
                # For sparse gradients, only the .values blob is a tensor.
                grad = (
                    model.param_to_grad[param]
                    if not isinstance(
                        model.param_to_grad[param],
                        core.GradientSlice,
                    ) else model.param_to_grad[param].values
                )
                grad_squared = model.net.Sqr(
                    [grad],
                    'grad_{}_squared'.format(i),
                )
                grad_squared_sum = model.net.SumElements(
                    grad_squared,
                    'grad_{}_squared_sum'.format(i),
                )
                grad_squared_sums.append(grad_squared_sum)

            # global_norm = sqrt(sum_i ||grad_i||^2), and
            # norm_ratio = clip_norm / max(global_norm, clip_norm),
            # i.e. 1.0 unless the global norm exceeds the bound.
            grad_squared_full_sum = model.net.Sum(
                grad_squared_sums,
                'grad_squared_full_sum',
            )
            global_norm = model.net.Pow(
                grad_squared_full_sum,
                'global_norm',
                exponent=0.5,
            )
            clip_norm = model.param_init_net.ConstantFill(
                [],
                'clip_norm',
                shape=[],
                value=float(self.model_params['max_gradient_norm']),
            )
            max_norm = model.net.Max(
                [global_norm, clip_norm],
                'max_norm',
            )
            norm_ratio = model.net.Div(
                [clip_norm, max_norm],
                'norm_ratio',
            )
            return norm_ratio
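
    # The same ratio in plain NumPy, as a sketch outside the training
    # graph:
    #
    #   def norm_ratio(grads, max_gradient_norm):
    #       global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    #       return max_gradient_norm / max(global_norm, max_gradient_norm)
    #
    #   norm_ratio([np.full(4, 0.1)], 1.0)    # 1.0, norm 0.2 is under the bound
    #   norm_ratio([np.full(4, 10.0)], 1.0)   # 0.05, i.e. 1.0 / 20.0
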
    def _apply_norm_ratio(
        self, norm_ratio, model, params, learning_rate, scope, ONE
    ):
        for param in params:
            param_grad = model.param_to_grad[param]
            nlr = model.net.Negative(
                [learning_rate],
                'negative_learning_rate',
            )
            with core.NameScope(scope):
                update_coeff = model.net.Mul(
                    [nlr, norm_ratio],
                    'update_coeff',
                    broadcast=1,
                )
                if isinstance(param_grad, core.GradientSlice):
                    param_grad_values = param_grad.values

                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad_values,
                            update_coeff,
                        ],
                        param,
                    )
                else:
                    model.net.WeightedSum(
                        [
                            param,
                            ONE,
                            param_grad,
                            update_coeff,
                        ],
                        param,
                    )
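
    # Both branches above implement the same update,
    #
    #   param += (-learning_rate * norm_ratio) * grad
    #
    # ScatterWeightedSum applies it only to the rows named by
    # param_grad.indices, which keeps updates to the large embedding
    # tables cheap when a batch touches few vocabulary entries.
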
    def norm_clipped_grad_update(self, model, scope):
        if self.num_gpus == 0:
            learning_rate = self.learning_rate
        else:
            learning_rate = model.CopyCPUToGPU(self.learning_rate, 'LR')

        # Dense parameters only; sparse (GradientSlice) parameters are
        # handled by norm_clipped_sparse_grad_update.
        params = []
        for param in model.GetParams(top_scope=True):
            if param in model.param_to_grad:
                if not isinstance(
                    model.param_to_grad[param],
                    core.GradientSlice,
                ):
                    params.append(param)

        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('Dense trainable variables: ')
        norm_ratio = self._calc_norm_ratio(model, params, scope, ONE)
        self._apply_norm_ratio(
            norm_ratio, model, params, learning_rate, scope, ONE
        )

    def norm_clipped_sparse_grad_update(self, model, scope):
        learning_rate = self.learning_rate

        params = []
        for param in model.GetParams(top_scope=True):
            if param in model.param_to_grad:
                if isinstance(
                    model.param_to_grad[param],
                    core.GradientSlice,
                ):
                    params.append(param)

        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('Sparse trainable variables: ')
        norm_ratio = self._calc_norm_ratio(model, params, scope, ONE)
        self._apply_norm_ratio(
            norm_ratio, model, params, learning_rate, scope, ONE
        )

    def total_loss_scalar(self):
        if self.num_gpus == 0:
            return workspace.FetchBlob('total_loss_scalar')
        else:
            total_loss = 0
            for i in range(self.num_gpus):
                name = 'gpu_{}/total_loss_scalar'.format(i)
                gpu_loss = workspace.FetchBlob(name)
                total_loss += gpu_loss
            return total_loss

    def _init_model(self):
        workspace.RunNetOnce(self.model.param_init_net)

        def create_net(net):
            workspace.CreateNet(
                net,
                input_blobs=[str(i) for i in net.external_inputs],
            )

        create_net(self.model.net)
        create_net(self.forward_net)

    def __init__(
        self,
        model_params,
        source_vocab_size,
        target_vocab_size,
        num_gpus=1,
        num_cpus=1,
    ):
        self.model_params = model_params
        self.encoder_type = 'rnn'
        self.encoder_params = model_params['encoder_type']
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.num_gpus = num_gpus
        self.num_cpus = num_cpus
        self.batch_size = model_params['batch_size']

        workspace.GlobalInit([
            'caffe2',
            # NOTE: modify log level for debugging purposes
            '--caffe2_log_level=0',
            # NOTE: modify log level for debugging purposes
            '--v=0',
            # Fail gracefully if one of the threads fails
            '--caffe2_handle_executor_threads_exceptions=1',
            '--caffe2_mkl_num_threads=' + str(self.num_cpus),
        ])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        workspace.ResetWorkspace()

    def initialize_from_scratch(self):
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Start')
        self._build_model(init_params=True)
        self._init_model()
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Finish')

    def get_current_step(self):
        return workspace.FetchBlob(self.global_step)[0]

    def inc_current_step(self):
        workspace.FeedBlob(
            self.global_step,
            np.array([self.get_current_step() + 1]),
        )

    def step(
        self,
        batch,
        forward_only
    ):
        if self.num_gpus < 1:
            batch_obj = prepare_batch(batch)
            for batch_obj_name, batch_obj_value in zip(
                Batch._fields,
                batch_obj,
            ):
                workspace.FeedBlob(batch_obj_name, batch_obj_value)
        else:
            # Round-robin split of the batch across GPUs; encoder and
            # decoder inputs are fed to CPU, where the embedding tables
            # live.
            for i in range(self.num_gpus):
                gpu_batch = batch[i::self.num_gpus]
                batch_obj = prepare_batch(gpu_batch)
                for batch_obj_name, batch_obj_value in zip(
                    Batch._fields,
                    batch_obj,
                ):
                    name = 'gpu_{}/{}'.format(i, batch_obj_name)
                    if batch_obj_name in ['encoder_inputs', 'decoder_inputs']:
                        dev = core.DeviceOption(caffe2_pb2.CPU)
                    else:
                        dev = core.DeviceOption(workspace.GpuDeviceType, i)
                    workspace.FeedBlob(name, batch_obj_value, device_option=dev)

        if forward_only:
            workspace.RunNet(self.forward_net)
        else:
            workspace.RunNet(self.model.net)
            self.inc_current_step()

        return self.total_loss_scalar()

    def save(self, checkpoint_path_prefix, current_step):
        checkpoint_path = '{0}-{1}'.format(
            checkpoint_path_prefix,
            current_step,
        )

        assert workspace.RunOperatorOnce(core.CreateOperator(
            'Save',
            self.model.GetAllParams(),
            [],
            absolute_path=True,
            db=checkpoint_path,
            db_type='minidb',
        ))

        checkpoint_config_path = os.path.join(
            os.path.dirname(checkpoint_path_prefix),
            'checkpoint',
        )
        with open(checkpoint_config_path, 'w') as checkpoint_config_file:
            checkpoint_config_file.write(
                'model_checkpoint_path: "' + checkpoint_path + '"\n'
                'all_model_checkpoint_paths: "' + checkpoint_path + '"\n'
            )
        logger.info('Saved checkpoint file to ' + checkpoint_path)

        return checkpoint_path

def gen_batches(source_corpus, target_corpus, source_vocab, target_vocab,
                batch_size, max_length):
    with open(source_corpus) as source, open(target_corpus) as target:
        parallel_sentences = []
        for source_sentence, target_sentence in zip(source, target):
            numerized_source_sentence = seq2seq_util.get_numberized_sentence(
                source_sentence,
                source_vocab,
            )
            numerized_target_sentence = seq2seq_util.get_numberized_sentence(
                target_sentence,
                target_vocab,
            )
            # Keep only non-empty pairs within the length limit.
            if (
                len(numerized_source_sentence) > 0 and
                len(numerized_target_sentence) > 0 and
                (
                    max_length is None or (
                        len(numerized_source_sentence) <= max_length and
                        len(numerized_target_sentence) <= max_length
                    )
                )
            ):
                parallel_sentences.append((
                    numerized_source_sentence,
                    numerized_target_sentence,
                ))
    # Sort by length so each batch holds similarly sized sentences,
    # minimizing padding.
    parallel_sentences.sort(key=lambda s_t: (len(s_t[0]), len(s_t[1])))

    batches, batch = [], []
    for sentence_pair in parallel_sentences:
        batch.append(sentence_pair)
        if len(batch) >= batch_size:
            batches.append(batch)
            batch = []
    # Pad out the final batch by repeating its last pair so every batch
    # has exactly batch_size entries.
    if len(batch) > 0:
        while len(batch) < batch_size:
            batch.append(batch[-1])
        assert len(batch) == batch_size
        batches.append(batch)
    random.shuffle(batches)
    return batches

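
# A quick illustration of the bucketing with hypothetical token-id pairs:
#
#   pairs = [([5, 6, 7], [8]), ([5], [8, 9]), ([5, 6], [8])]
#   # sorted by (source length, target length):
#   #   ([5], [8, 9]), ([5, 6], [8]), ([5, 6, 7], [8])
#   # with batch_size=2, the trailing short batch repeats its last pair:
#   #   [([5], [8, 9]), ([5, 6], [8])]
#   #   [([5, 6, 7], [8]), ([5, 6, 7], [8])]
#
# Only the order of whole batches is shuffled, so each batch keeps
# sentences of similar length and thus minimal padding.
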

def run_seq2seq_model(args, model_params=None):
    source_vocab = seq2seq_util.gen_vocab(
        args.source_corpus,
        args.unk_threshold,
    )
    target_vocab = seq2seq_util.gen_vocab(
        args.target_corpus,
        args.unk_threshold,
    )
    logger.info('Source vocab size {}'.format(len(source_vocab)))
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    batches = gen_batches(args.source_corpus, args.target_corpus, source_vocab,
                          target_vocab, model_params['batch_size'],
                          args.max_length)
    logger.info('Number of training batches {}'.format(len(batches)))

    batches_eval = gen_batches(args.source_corpus_eval, args.target_corpus_eval,
                               source_vocab, target_vocab,
                               model_params['batch_size'], args.max_length)
    logger.info('Number of eval batches {}'.format(len(batches_eval)))

    with Seq2SeqModelCaffe2(
        model_params=model_params,
        source_vocab_size=len(source_vocab),
        target_vocab_size=len(target_vocab),
        num_gpus=args.num_gpus,
        num_cpus=20,
    ) as model_obj:
        model_obj.initialize_from_scratch()
        for i in range(args.epochs):
            logger.info('Epoch {}'.format(i))
            total_loss = 0
            for batch in batches:
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=False,
                )
            logger.info('\ttraining loss {}'.format(total_loss))
            total_loss = 0
            for batch in batches_eval:
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=True,
                )
            logger.info('\teval loss {}'.format(total_loss))
            if args.checkpoint is not None:
                model_obj.save(args.checkpoint, i)


def main():
    random.seed(31415)
    parser = argparse.ArgumentParser(
        description='Caffe2: Seq2Seq Training'
    )
    parser.add_argument('--source-corpus', type=str, default=None,
                        help='Path to source corpus in a text file format. '
                        'Each line in the file should contain a single '
                        'sentence', required=True)
    parser.add_argument('--target-corpus', type=str, default=None,
                        help='Path to target corpus in a text file format',
                        required=True)
    parser.add_argument('--max-length', type=int, default=None,
                        help='Maximum length of train and eval sentences')
    parser.add_argument('--unk-threshold', type=int, default=50,
                        help='Threshold frequency below which a token is '
                        'replaced by the unknown token')

    parser.add_argument('--batch-size', type=int, default=32,
                        help='Training batch size')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of iterations over training data')
    parser.add_argument('--learning-rate', type=float, default=0.5,
                        help='Learning rate')
    parser.add_argument('--max-gradient-norm', type=float, default=1.0,
                        help='Max global norm of gradients at the end of '
                        'each backward pass; gradients are clipped to '
                        'match this norm')
    parser.add_argument('--num-gpus', type=int, default=0,
                        help='Number of GPUs for data parallel model')

    parser.add_argument('--use-bidirectional-encoder', action='store_true',
                        help='Set flag to use bidirectional recurrent network '
                        'for first layer of encoder')
    parser.add_argument('--use-attention', action='store_true',
                        help='Set flag to use seq2seq with attention model')
    parser.add_argument('--source-corpus-eval', type=str, default=None,
                        help='Path to source corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--target-corpus-eval', type=str, default=None,
                        help='Path to target corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--encoder-cell-num-units', type=int, default=512,
                        help='Number of cell units per encoder layer')
    parser.add_argument('--encoder-num-layers', type=int, default=2,
                        help='Number of encoder layers')
    parser.add_argument('--decoder-cell-num-units', type=int, default=512,
                        help='Number of cell units in the decoder layer')
    parser.add_argument('--decoder-num-layers', type=int, default=2,
                        help='Number of decoder layers')
    parser.add_argument('--encoder-embedding-size', type=int, default=256,
                        help='Size of embedding in the encoder layer')
    parser.add_argument('--decoder-embedding-size', type=int, default=512,
                        help='Size of embedding in the decoder layer')
    parser.add_argument('--decoder-softmax-size', type=int, default=None,
                        help='Size of softmax layer in the decoder')

    parser.add_argument('--checkpoint', type=str, default=None,
                        help='Path to checkpoint')

    args = parser.parse_args()

    # Build one config dict per layer; a single shared dict (as in
    # `[dict(...)] * n`) would alias, so halving layer 0 below would
    # halve every layer.
    encoder_layer_configs = [
        dict(num_units=args.encoder_cell_num_units)
        for _ in range(args.encoder_num_layers)
    ]

    if args.use_bidirectional_encoder:
        assert args.encoder_cell_num_units % 2 == 0
        # Each direction of the bidirectional first layer gets half the
        # units; integer division keeps num_units an int under
        # `from __future__ import division`.
        encoder_layer_configs[0]['num_units'] //= 2

    decoder_layer_configs = [
        dict(num_units=args.decoder_cell_num_units)
        for _ in range(args.decoder_num_layers)
    ]

    run_seq2seq_model(args, model_params=dict(
        attention=('regular' if args.use_attention else 'none'),
        decoder_layer_configs=decoder_layer_configs,
        encoder_type=dict(
            encoder_layer_configs=encoder_layer_configs,
            use_bidirectional_encoder=args.use_bidirectional_encoder,
        ),
        batch_size=args.batch_size,
        optimizer_params=dict(
            learning_rate=args.learning_rate,
        ),
        encoder_embedding_size=args.encoder_embedding_size,
        decoder_embedding_size=args.decoder_embedding_size,
        decoder_softmax_size=args.decoder_softmax_size,
        max_gradient_norm=args.max_gradient_norm,
    ))


if __name__ == '__main__':
    main()
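
# A typical invocation, assuming caffe2 is importable and using
# placeholder corpus paths:
#
#   python -m caffe2.python.models.seq2seq.train \
#       --source-corpus train.src --target-corpus train.tgt \
#       --source-corpus-eval valid.src --target-corpus-eval valid.tgt \
#       --use-attention --batch-size 64 --epochs 10 \
#       --checkpoint /tmp/seq2seq/model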