Caffe2 - Python API
A deep learning, cross-platform ML framework
train.py
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################

## @package train
# Module caffe2.python.models.seq2seq.train
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import collections
import logging
import math
import os
import random
import sys
import time

import numpy as np

import caffe2.proto.caffe2_pb2 as caffe2_pb2
from caffe2.python import core, workspace, data_parallel_model
import caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util
from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

Batch = collections.namedtuple('Batch', [
    'encoder_inputs',
    'encoder_lengths',
    'decoder_inputs',
    'decoder_lengths',
    'targets',
    'target_weights',
])
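# All Batch fields hold numpy arrays. The sequence tensors (encoder_inputs,
# decoder_inputs, targets, target_weights) are time-major, i.e. shaped
# [max_sequence_length, batch_size], as Caffe2's recurrent nets expect;
# encoder_lengths and decoder_lengths are 1-D arrays of size [batch_size].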


def prepare_batch(batch):
    encoder_lengths = [len(entry[0]) for entry in batch]
    max_encoder_length = max(encoder_lengths)
    decoder_lengths = []
    max_decoder_length = max([len(entry[1]) for entry in batch])

    batch_encoder_inputs = []
    batch_decoder_inputs = []
    batch_targets = []
    batch_target_weights = []

    for source_seq, target_seq in batch:
        encoder_pads = (
            [seq2seq_util.PAD_ID] * (max_encoder_length - len(source_seq))
        )
        batch_encoder_inputs.append(
            list(reversed(source_seq)) + encoder_pads
        )

        decoder_pads = (
            [seq2seq_util.PAD_ID] * (max_decoder_length - len(target_seq))
        )
        target_seq_with_go_token = [seq2seq_util.GO_ID] + target_seq
        decoder_lengths.append(len(target_seq_with_go_token))
        batch_decoder_inputs.append(target_seq_with_go_token + decoder_pads)

        target_seq_with_eos = target_seq + [seq2seq_util.EOS_ID]
        targets = target_seq_with_eos + decoder_pads
        batch_targets.append(targets)

        if len(source_seq) + len(target_seq) == 0:
            target_weights = [0] * len(targets)
        else:
            target_weights = [
                1 if target != seq2seq_util.PAD_ID else 0
                for target in targets
            ]
        batch_target_weights.append(target_weights)

    return Batch(
        encoder_inputs=np.array(
            batch_encoder_inputs,
            dtype=np.int32,
        ).transpose(),
        encoder_lengths=np.array(encoder_lengths, dtype=np.int32),
        decoder_inputs=np.array(
            batch_decoder_inputs,
            dtype=np.int32,
        ).transpose(),
        decoder_lengths=np.array(decoder_lengths, dtype=np.int32),
        targets=np.array(
            batch_targets,
            dtype=np.int32,
        ).transpose(),
        target_weights=np.array(
            batch_target_weights,
            dtype=np.float32,
        ).transpose(),
    )

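# A minimal sketch of prepare_batch on toy data (the token IDs here are made
# up for illustration; only PAD_ID, GO_ID and EOS_ID come from seq2seq_util):
#
#   batch = [
#       ([4, 5, 6], [7, 8]),   # (source token IDs, target token IDs)
#       ([4, 5], [7, 8, 9]),
#   ]
#   b = prepare_batch(batch)
#   # b.encoder_inputs.shape == (3, 2): sources reversed, then padded
#   # b.decoder_inputs.shape == (4, 2): [GO] + target, then padded
#   # b.targets.shape        == (4, 2): target + [EOS], then padded
#   # b.target_weights is 1.0 at non-PAD positions (including EOS), else 0.0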

class Seq2SeqModelCaffe2(object):

    def _build_model(
        self,
        init_params,
    ):
        model = Seq2SeqModelHelper(init_params=init_params)
        self._build_shared(model)
        self._build_embeddings(model)

        forward_model = Seq2SeqModelHelper(init_params=init_params)
        self._build_shared(forward_model)
        self._build_embeddings(forward_model)

        if self.num_gpus == 0:
            loss_blobs = self.model_build_fun(model)
            model.AddGradientOperators(loss_blobs)
            self.norm_clipped_grad_update(
                model,
                scope='norm_clipped_grad_update',
            )
            self.forward_model_build_fun(forward_model)
        else:
            assert (self.batch_size % self.num_gpus) == 0

            data_parallel_model.Parallelize_GPU(
                forward_model,
                input_builder_fun=lambda m: None,
                forward_pass_builder_fun=self.forward_model_build_fun,
                param_update_builder_fun=None,
                devices=list(range(self.num_gpus)),
            )

            def clipped_grad_update_bound(model):
                self.norm_clipped_grad_update(
                    model,
                    scope='norm_clipped_grad_update',
                )

            data_parallel_model.Parallelize_GPU(
                model,
                input_builder_fun=lambda m: None,
                forward_pass_builder_fun=self.model_build_fun,
                param_update_builder_fun=clipped_grad_update_bound,
                devices=list(range(self.num_gpus)),
            )
        self.norm_clipped_sparse_grad_update(
            model,
            scope='norm_clipped_sparse_grad_update',
        )
        self.model = model
        self.forward_net = forward_model.net

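    # Note on _build_model above: the training net and the forward (eval) net
    # are built as two separate Seq2SeqModelHelper instances, but their
    # parameter blobs share names in the workspace, so forward_net always
    # evaluates with the parameters currently being trained.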
    def _build_shared(self, model):
        optimizer_params = self.model_params['optimizer_params']
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.learning_rate = model.AddParam(
                name='learning_rate',
                init_value=float(optimizer_params['learning_rate']),
                trainable=False,
            )
            self.global_step = model.AddParam(
                name='global_step',
                init_value=0,
                trainable=False,
            )
            self.start_time = model.AddParam(
                name='start_time',
                init_value=time.time(),
                trainable=False,
            )

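    # The embedding tables below are initialized uniformly on
    # [-sqrt(3), sqrt(3)]: a uniform distribution on [-a, a] has variance
    # a^2 / 3, so each embedding entry starts out with unit variance.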
    def _build_embeddings(self, model):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            sqrt3 = math.sqrt(3)
            self.encoder_embeddings = model.param_init_net.UniformFill(
                [],
                'encoder_embeddings',
                shape=[
                    self.source_vocab_size,
                    self.model_params['encoder_embedding_size'],
                ],
                min=-sqrt3,
                max=sqrt3,
            )
            model.params.append(self.encoder_embeddings)
            self.decoder_embeddings = model.param_init_net.UniformFill(
                [],
                'decoder_embeddings',
                shape=[
                    self.target_vocab_size,
                    self.model_params['decoder_embedding_size'],
                ],
                min=-sqrt3,
                max=sqrt3,
            )
            model.params.append(self.decoder_embeddings)

    def model_build_fun(self, model, forward_only=False, loss_scale=None):
        encoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_inputs',
        )
        encoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_lengths',
        )
        decoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_inputs',
        )
        decoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_lengths',
        )
        targets = model.net.AddExternalInput(
            workspace.GetNameScope() + 'targets',
        )
        target_weights = model.net.AddExternalInput(
            workspace.GetNameScope() + 'target_weights',
        )
        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular', 'dot']

        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_states,
            final_encoder_cell_states,
            encoder_units_per_layer,
        ) = seq2seq_util.build_embedding_encoder(
            model=model,
            encoder_params=self.encoder_params,
            num_decoder_layers=len(self.model_params['decoder_layer_configs']),
            inputs=encoder_inputs,
            input_lengths=encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=self.encoder_embeddings,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
            num_gpus=self.num_gpus,
        )

        (
            decoder_outputs,
            decoder_output_size,
        ) = seq2seq_util.build_embedding_decoder(
            model,
            decoder_layer_configs=self.model_params['decoder_layer_configs'],
            inputs=decoder_inputs,
            input_lengths=decoder_lengths,
            encoder_lengths=encoder_lengths,
            encoder_outputs=encoder_outputs,
            weighted_encoder_outputs=weighted_encoder_outputs,
            final_encoder_hidden_states=final_encoder_hidden_states,
            final_encoder_cell_states=final_encoder_cell_states,
            encoder_units_per_layer=encoder_units_per_layer,
            vocab_size=self.target_vocab_size,
            embeddings=self.decoder_embeddings,
            embedding_size=self.model_params['decoder_embedding_size'],
            attention_type=attention_type,
            forward_only=False,
            num_gpus=self.num_gpus,
        )

        output_logits = seq2seq_util.output_projection(
            model=model,
            decoder_outputs=decoder_outputs,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        targets, _ = model.net.Reshape(
            [targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )
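        # Loss bookkeeping below: 'loss_per_word' is the weight-averaged
        # cross-entropy per target token, so multiplying it by 'num_words'
        # (the summed token weights) recovers the total loss, and scaling by
        # 1 / batch_size gives the per-sentence average that the gradients
        # are computed against.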
        _, loss_per_word = model.net.SoftmaxWithLoss(
            [output_logits, targets, target_weights],
            ['OutputProbs_INVALID', 'loss_per_word'],
            only_loss=True,
        )

        num_words = model.net.SumElements(
            [target_weights],
            'num_words',
        )
        total_loss_scalar = model.net.Mul(
            [loss_per_word, num_words],
            'total_loss_scalar',
        )
        total_loss_scalar_weighted = model.net.Scale(
            [total_loss_scalar],
            'total_loss_scalar_weighted',
            scale=1.0 / self.batch_size,
        )
        return [total_loss_scalar_weighted]

    def forward_model_build_fun(self, model, loss_scale=None):
        return self.model_build_fun(
            model=model,
            forward_only=True,
            loss_scale=loss_scale,
        )

    def _calc_norm_ratio(self, model, params, scope, ONE):
        with core.NameScope(scope):
            grad_squared_sums = []
            for i, param in enumerate(params):
                logger.info(param)
                grad = (
                    model.param_to_grad[param]
                    if not isinstance(
                        model.param_to_grad[param],
                        core.GradientSlice,
                    ) else model.param_to_grad[param].values
                )
                grad_squared = model.net.Sqr(
                    [grad],
                    'grad_{}_squared'.format(i),
                )
                grad_squared_sum = model.net.SumElements(
                    grad_squared,
                    'grad_{}_squared_sum'.format(i),
                )
                grad_squared_sums.append(grad_squared_sum)

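            # Standard global-norm clipping: the global norm is
            # sqrt(sum_i ||grad_i||^2), and the resulting coefficient is
            #     norm_ratio = clip_norm / max(global_norm, clip_norm)
            # which equals 1 while the gradients are within bounds and
            # scales the update down proportionally otherwise.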
            grad_squared_full_sum = model.net.Sum(
                grad_squared_sums,
                'grad_squared_full_sum',
            )
            global_norm = model.net.Pow(
                grad_squared_full_sum,
                'global_norm',
                exponent=0.5,
            )
            clip_norm = model.param_init_net.ConstantFill(
                [],
                'clip_norm',
                shape=[],
                value=float(self.model_params['max_gradient_norm']),
            )
            max_norm = model.net.Max(
                [global_norm, clip_norm],
                'max_norm',
            )
            norm_ratio = model.net.Div(
                [clip_norm, max_norm],
                'norm_ratio',
            )
            return norm_ratio

    def _apply_norm_ratio(
        self, norm_ratio, model, params, learning_rate, scope, ONE
    ):
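        # Each parameter is updated in place as
        #     param <- param + (-learning_rate * norm_ratio) * grad
        # using WeightedSum for dense gradients, or ScatterWeightedSum on
        # just the touched rows when the gradient is a sparse
        # core.GradientSlice (as embedding gradients are).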
        for param in params:
            param_grad = model.param_to_grad[param]
            nlr = model.net.Negative(
                [learning_rate],
                'negative_learning_rate',
            )
            with core.NameScope(scope):
                update_coeff = model.net.Mul(
                    [nlr, norm_ratio],
                    'update_coeff',
                    broadcast=1,
                )
            if isinstance(param_grad, core.GradientSlice):
                param_grad_values = param_grad.values

                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad_values,
                        update_coeff,
                    ],
                    param,
                )
            else:
                model.net.WeightedSum(
                    [
                        param,
                        ONE,
                        param_grad,
                        update_coeff,
                    ],
                    param,
                )

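    # The two update builders below split parameters by gradient type: dense
    # gradients go through norm_clipped_grad_update, while sparse
    # GradientSlice gradients (the embedding tables) go through
    # norm_clipped_sparse_grad_update, which uses the CPU copy of the
    # learning rate since the embeddings live on CPU.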
    def norm_clipped_grad_update(self, model, scope):
        if self.num_gpus == 0:
            learning_rate = self.learning_rate
        else:
            learning_rate = model.CopyCPUToGPU(self.learning_rate, 'LR')

        params = []
        for param in model.GetParams(top_scope=True):
            if param in model.param_to_grad:
                if not isinstance(
                    model.param_to_grad[param],
                    core.GradientSlice,
                ):
                    params.append(param)

        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('Dense trainable variables: ')
        norm_ratio = self._calc_norm_ratio(model, params, scope, ONE)
        self._apply_norm_ratio(
            norm_ratio, model, params, learning_rate, scope, ONE
        )

    def norm_clipped_sparse_grad_update(self, model, scope):
        learning_rate = self.learning_rate

        params = []
        for param in model.GetParams(top_scope=True):
            if param in model.param_to_grad:
                if isinstance(
                    model.param_to_grad[param],
                    core.GradientSlice,
                ):
                    params.append(param)

        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('Sparse trainable variables: ')
        norm_ratio = self._calc_norm_ratio(model, params, scope, ONE)
        self._apply_norm_ratio(
            norm_ratio, model, params, learning_rate, scope, ONE
        )

    def total_loss_scalar(self):
        if self.num_gpus == 0:
            return workspace.FetchBlob('total_loss_scalar')
        else:
            total_loss = 0
            for i in range(self.num_gpus):
                name = 'gpu_{}/total_loss_scalar'.format(i)
                gpu_loss = workspace.FetchBlob(name)
                total_loss += gpu_loss
            return total_loss

    def _init_model(self):
        workspace.RunNetOnce(self.model.param_init_net)

        def create_net(net):
            workspace.CreateNet(
                net,
                input_blobs=[str(i) for i in net.external_inputs],
            )

        create_net(self.model.net)
        create_net(self.forward_net)

    def __init__(
        self,
        model_params,
        source_vocab_size,
        target_vocab_size,
        num_gpus=1,
        num_cpus=1,
    ):
        self.model_params = model_params
        self.encoder_type = 'rnn'
        self.encoder_params = model_params['encoder_type']
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.num_gpus = num_gpus
        self.num_cpus = num_cpus
        self.batch_size = model_params['batch_size']

        workspace.GlobalInit([
            'caffe2',
            # NOTE: modify log level for debugging purposes
            '--caffe2_log_level=0',
            # NOTE: modify log level for debugging purposes
            '--v=0',
            # Fail gracefully if one of the threads fails
            '--caffe2_handle_executor_threads_exceptions=1',
            '--caffe2_mkl_num_threads=' + str(self.num_cpus),
        ])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        workspace.ResetWorkspace()

    def initialize_from_scratch(self):
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Start')
        self._build_model(init_params=True)
        self._init_model()
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Finish')

    def get_current_step(self):
        return workspace.FetchBlob(self.global_step)[0]

    def inc_current_step(self):
        workspace.FeedBlob(
            self.global_step,
            np.array([self.get_current_step() + 1]),
        )

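    # step() feeds one prepared batch into the workspace (sharded round-robin
    # across GPUs when num_gpus > 0, with the input token blobs kept on CPU
    # for the embedding lookups), runs either the forward-only net or the
    # training net, and returns the loss summed over devices.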
    def step(
        self,
        batch,
        forward_only,
    ):
        if self.num_gpus < 1:
            batch_obj = prepare_batch(batch)
            for batch_obj_name, batch_obj_value in zip(
                Batch._fields,
                batch_obj,
            ):
                workspace.FeedBlob(batch_obj_name, batch_obj_value)
        else:
            for i in range(self.num_gpus):
                gpu_batch = batch[i::self.num_gpus]
                batch_obj = prepare_batch(gpu_batch)
                for batch_obj_name, batch_obj_value in zip(
                    Batch._fields,
                    batch_obj,
                ):
                    name = 'gpu_{}/{}'.format(i, batch_obj_name)
                    if batch_obj_name in ['encoder_inputs', 'decoder_inputs']:
                        dev = core.DeviceOption(caffe2_pb2.CPU)
                    else:
                        dev = core.DeviceOption(caffe2_pb2.CUDA, i)
                    workspace.FeedBlob(name, batch_obj_value, device_option=dev)

        if forward_only:
            workspace.RunNet(self.forward_net)
        else:
            workspace.RunNet(self.model.net)
            self.inc_current_step()

        return self.total_loss_scalar()

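    # save() serializes every parameter blob into a Caffe2 minidb at
    # '<prefix>-<step>' and writes a 'checkpoint' sidecar file that mirrors
    # the TensorFlow-style checkpoint-state format
    # (model_checkpoint_path / all_model_checkpoint_paths).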
    def save(self, checkpoint_path_prefix, current_step):
        checkpoint_path = '{0}-{1}'.format(
            checkpoint_path_prefix,
            current_step,
        )

        assert workspace.RunOperatorOnce(core.CreateOperator(
            'Save',
            self.model.GetAllParams(),
            [],
            absolute_path=True,
            db=checkpoint_path,
            db_type='minidb',
        ))

        checkpoint_config_path = os.path.join(
            os.path.dirname(checkpoint_path_prefix),
            'checkpoint',
        )
        with open(checkpoint_config_path, 'w') as checkpoint_config_file:
            checkpoint_config_file.write(
                'model_checkpoint_path: "' + checkpoint_path + '"\n'
                'all_model_checkpoint_paths: "' + checkpoint_path + '"\n'
            )
        logger.info('Saved checkpoint file to ' + checkpoint_path)

        return checkpoint_path

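# gen_batches sorts sentence pairs by (source, target) length so that each
# batch holds similarly sized sequences and needs little padding, pads the
# final short batch by repeating its last pair, and then shuffles whole
# batches so training still visits them in random order.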
def gen_batches(source_corpus, target_corpus, source_vocab, target_vocab,
                batch_size, max_length):
    with open(source_corpus) as source, open(target_corpus) as target:
        parallel_sentences = []
        for source_sentence, target_sentence in zip(source, target):
            numerized_source_sentence = seq2seq_util.get_numberized_sentence(
                source_sentence,
                source_vocab,
            )
            numerized_target_sentence = seq2seq_util.get_numberized_sentence(
                target_sentence,
                target_vocab,
            )
            if (
                len(numerized_source_sentence) > 0 and
                len(numerized_target_sentence) > 0 and
                (
                    max_length is None or (
                        len(numerized_source_sentence) <= max_length and
                        len(numerized_target_sentence) <= max_length
                    )
                )
            ):
                parallel_sentences.append((
                    numerized_source_sentence,
                    numerized_target_sentence,
                ))
        parallel_sentences.sort(key=lambda s_t: (len(s_t[0]), len(s_t[1])))

    batches, batch = [], []
    for sentence_pair in parallel_sentences:
        batch.append(sentence_pair)
        if len(batch) >= batch_size:
            batches.append(batch)
            batch = []
    if len(batch) > 0:
        while len(batch) < batch_size:
            batch.append(batch[-1])
        assert len(batch) == batch_size
        batches.append(batch)
    random.shuffle(batches)
    return batches


def run_seq2seq_model(args, model_params=None):
    source_vocab = seq2seq_util.gen_vocab(
        args.source_corpus,
        args.unk_threshold,
    )
    target_vocab = seq2seq_util.gen_vocab(
        args.target_corpus,
        args.unk_threshold,
    )
    logger.info('Source vocab size {}'.format(len(source_vocab)))
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    batches = gen_batches(args.source_corpus, args.target_corpus, source_vocab,
                          target_vocab, model_params['batch_size'],
                          args.max_length)
    logger.info('Number of training batches {}'.format(len(batches)))

    batches_eval = gen_batches(args.source_corpus_eval, args.target_corpus_eval,
                               source_vocab, target_vocab,
                               model_params['batch_size'], args.max_length)
    logger.info('Number of eval batches {}'.format(len(batches_eval)))

    with Seq2SeqModelCaffe2(
        model_params=model_params,
        source_vocab_size=len(source_vocab),
        target_vocab_size=len(target_vocab),
        num_gpus=args.num_gpus,
        num_cpus=20,
    ) as model_obj:
        model_obj.initialize_from_scratch()
        for i in range(args.epochs):
            logger.info('Epoch {}'.format(i))
            total_loss = 0
            for batch in batches:
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=False,
                )
            logger.info('\ttraining loss {}'.format(total_loss))
            total_loss = 0
            for batch in batches_eval:
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=True,
                )
            logger.info('\teval loss {}'.format(total_loss))
            if args.checkpoint is not None:
                model_obj.save(args.checkpoint, i)


def main():
    random.seed(31415)
    parser = argparse.ArgumentParser(
        description='Caffe2: Seq2Seq Training',
    )
    parser.add_argument('--source-corpus', type=str, default=None,
                        help='Path to the source corpus in text-file format. '
                        'Each line in the file should contain a single '
                        'sentence', required=True)
    parser.add_argument('--target-corpus', type=str, default=None,
                        help='Path to the target corpus in text-file format',
                        required=True)
    parser.add_argument('--max-length', type=int, default=None,
                        help='Maximum length of train and eval sentences')
    parser.add_argument('--unk-threshold', type=int, default=50,
                        help='Threshold frequency under which a token is '
                        'labeled as the unknown token')

    parser.add_argument('--batch-size', type=int, default=32,
                        help='Training batch size')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of iterations over the training data')
    parser.add_argument('--learning-rate', type=float, default=0.5,
                        help='Learning rate')
    parser.add_argument('--max-gradient-norm', type=float, default=1.0,
                        help='Max global norm of the gradients at the end of '
                        'each backward pass; gradients are clipped to match '
                        'this number')
    parser.add_argument('--num-gpus', type=int, default=0,
                        help='Number of GPUs for the data-parallel model')

    parser.add_argument('--use-bidirectional-encoder', action='store_true',
                        help='Set this flag to use a bidirectional recurrent '
                        'network for the first layer of the encoder')
    parser.add_argument('--use-attention', action='store_true',
                        help='Set this flag to use a seq2seq model with '
                        'attention')
    parser.add_argument('--source-corpus-eval', type=str, default=None,
                        help='Path to the source corpus for evaluation in '
                        'text-file format', required=True)
    parser.add_argument('--target-corpus-eval', type=str, default=None,
                        help='Path to the target corpus for evaluation in '
                        'text-file format', required=True)
    parser.add_argument('--encoder-cell-num-units', type=int, default=512,
                        help='Number of cell units per encoder layer')
    parser.add_argument('--encoder-num-layers', type=int, default=2,
                        help='Number of encoder layers')
    parser.add_argument('--decoder-cell-num-units', type=int, default=512,
                        help='Number of cell units in the decoder layer')
    parser.add_argument('--decoder-num-layers', type=int, default=2,
                        help='Number of decoder layers')
    parser.add_argument('--encoder-embedding-size', type=int, default=256,
                        help='Size of the embedding in the encoder layer')
    parser.add_argument('--decoder-embedding-size', type=int, default=512,
                        help='Size of the embedding in the decoder layer')
    parser.add_argument('--decoder-softmax-size', type=int, default=None,
                        help='Size of the softmax layer in the decoder')

    parser.add_argument('--checkpoint', type=str, default=None,
                        help='Path prefix for checkpoint files')

    args = parser.parse_args()

    # Build one dict per layer; a `[dict(...)] * n` literal would alias a
    # single shared dict, so halving the first layer's units below would
    # silently halve every layer.
    encoder_layer_configs = [
        dict(num_units=args.encoder_cell_num_units)
        for _ in range(args.encoder_num_layers)
    ]

    if args.use_bidirectional_encoder:
        assert args.encoder_cell_num_units % 2 == 0
        # Integer division: with `from __future__ import division`, a plain
        # /= would turn num_units into a float.
        encoder_layer_configs[0]['num_units'] //= 2

    decoder_layer_configs = [
        dict(num_units=args.decoder_cell_num_units)
        for _ in range(args.decoder_num_layers)
    ]

    run_seq2seq_model(args, model_params=dict(
        attention=('regular' if args.use_attention else 'none'),
        decoder_layer_configs=decoder_layer_configs,
        encoder_type=dict(
            encoder_layer_configs=encoder_layer_configs,
            use_bidirectional_encoder=args.use_bidirectional_encoder,
        ),
        batch_size=args.batch_size,
        optimizer_params=dict(
            learning_rate=args.learning_rate,
        ),
        encoder_embedding_size=args.encoder_embedding_size,
        decoder_embedding_size=args.decoder_embedding_size,
        decoder_softmax_size=args.decoder_softmax_size,
        max_gradient_norm=args.max_gradient_norm,
    ))


if __name__ == '__main__':
    main()
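
# Example invocation (a sketch: the corpus paths and checkpoint prefix are
# placeholders, everything else is a real flag defined in main() above):
#
#   python -m caffe2.python.models.seq2seq.train \
#       --source-corpus train.src --target-corpus train.tgt \
#       --source-corpus-eval valid.src --target-corpus-eval valid.tgt \
#       --batch-size 32 --epochs 10 --learning-rate 0.5 \
#       --use-attention --use-bidirectional-encoder \
#       --num-gpus 0 --checkpoint /tmp/seq2seq/model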