3 from __future__
import absolute_import
4 from __future__
import division
5 from __future__
import print_function
6 from __future__
import unicode_literals
18 import caffe2.proto.caffe2_pb2
as caffe2_pb2
19 from caffe2.python import core, workspace, data_parallel_model
# Module-level logger: INFO level, mirrored to stderr.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

# One prepared training batch, as produced by prepare_batch() below.
# Field names match the blob names fed into the workspace by step():
# int32 token-id matrices (time-major), int32 length vectors, and a
# float32 per-token loss-weight matrix.
# NOTE(review): the field list was truncated in this extraction; it is
# reconstructed from prepare_batch()'s return statement.
Batch = collections.namedtuple(
    'Batch', [
        'encoder_inputs',
        'encoder_lengths',
        'decoder_inputs',
        'decoder_lengths',
        'targets',
        'target_weights',
    ],
)
def prepare_batch(batch):
    """Pad a list of (source_seq, target_seq) token-id pairs into a Batch.

    Sources are reversed (standard seq2seq input trick) and right-padded
    with PAD_ID to the longest source in the batch.  Decoder inputs are the
    targets prefixed with GO_ID; targets are the targets suffixed with
    EOS_ID; both are right-padded to the longest target.  target_weights is
    1.0 for real tokens and 0.0 for padding (or all zeros for an empty
    sentence pair, to avoid a degenerate all-PAD row).

    Returns a Batch of numpy arrays; the 2-D arrays are transposed to
    time-major layout (max_length x batch_size).

    NOTE(review): reconstructed from a corrupted extraction — verify
    against the upstream caffe2 seq2seq example.
    """
    encoder_lengths = [len(entry[0]) for entry in batch]
    max_encoder_length = max(encoder_lengths)
    decoder_lengths = []
    max_decoder_length = max([len(entry[1]) for entry in batch])

    batch_encoder_inputs = []
    batch_decoder_inputs = []
    batch_targets = []
    batch_target_weights = []

    for source_seq, target_seq in batch:
        encoder_pads = (
            [seq2seq_util.PAD_ID] * (max_encoder_length - len(source_seq))
        )
        # Feed the source reversed; padding goes at the (time-wise) end.
        batch_encoder_inputs.append(
            list(reversed(source_seq)) + encoder_pads
        )

        decoder_pads = (
            [seq2seq_util.PAD_ID] * (max_decoder_length - len(target_seq))
        )

        target_seq_with_go_token = [seq2seq_util.GO_ID] + target_seq
        decoder_lengths.append(len(target_seq_with_go_token))
        batch_decoder_inputs.append(target_seq_with_go_token + decoder_pads)

        target_seq_with_eos = target_seq + [seq2seq_util.EOS_ID]
        targets = target_seq_with_eos + decoder_pads
        batch_targets.append(targets)

        if len(source_seq) + len(target_seq) == 0:
            # Degenerate empty pair: zero out the loss entirely.
            target_weights = [0] * len(targets)
        else:
            target_weights = [
                1 if target != seq2seq_util.PAD_ID else 0
                for target in targets
            ]
        batch_target_weights.append(target_weights)

    return Batch(
        encoder_inputs=np.array(
            batch_encoder_inputs,
            dtype=np.int32,
        ).transpose(),
        encoder_lengths=np.array(encoder_lengths, dtype=np.int32),
        decoder_inputs=np.array(
            batch_decoder_inputs,
            dtype=np.int32,
        ).transpose(),
        decoder_lengths=np.array(decoder_lengths, dtype=np.int32),
        targets=np.array(
            batch_targets,
            dtype=np.int32,
        ).transpose(),
        target_weights=np.array(
            batch_target_weights,
            dtype=np.float32,
        ).transpose(),
    )
# Fragment of Seq2SeqModelCaffe2._build_model (its def line and most of the
# body were lost in this extraction; surviving lines still carry embedded
# source line numbers).  Restore from the upstream caffe2 seq2seq training
# example before changing behavior.  What the surviving fragments show:
#  - gradients are added for the returned loss blobs (AddGradientOperators);
#  - a single-device path updates dense params under the
#    'norm_clipped_grad_update' name scope;
#  - a multi-GPU path wraps the model via
#    data_parallel_model.Parallelize_GPU with a no-op input_builder_fun and
#    either param_update_builder_fun=None (forward-only) or the
#    clipped_grad_update_bound callback below;
#  - sparse (embedding) gradients are updated under the
#    'norm_clipped_sparse_grad_update' name scope.
115 model.AddGradientOperators(loss_blobs)
118 scope=
'norm_clipped_grad_update' 125 data_parallel_model.Parallelize_GPU(
127 input_builder_fun=
lambda m:
None,
129 param_update_builder_fun=
None,
# Per-device parameter-update callback handed to Parallelize_GPU; applies
# the norm-clipped dense gradient update on each GPU replica.
133 def clipped_grad_update_bound(model):
136 scope=
'norm_clipped_grad_update',
139 data_parallel_model.Parallelize_GPU(
141 input_builder_fun=
lambda m:
None,
143 param_update_builder_fun=clipped_grad_update_bound,
148 scope=
'norm_clipped_sparse_grad_update',
def _build_shared(self, model):
    """Create CPU-resident scalar bookkeeping blobs shared by all devices:
    the learning rate (from optimizer_params), the global step counter,
    and the training start time.

    NOTE(review): reconstructed from a corrupted extraction — the visible
    fragments pin 'learning_rate', float(optimizer_params['learning_rate'])
    and time.time(); the AddParam keyword shapes and the 'global_step'
    blob (read back by get_current_step) should be verified against the
    upstream caffe2 seq2seq example.
    """
    optimizer_params = self.model_params['optimizer_params']
    # These scalars live on CPU regardless of where the model runs.
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
        self.learning_rate = model.AddParam(
            name='learning_rate',
            init_value=float(optimizer_params['learning_rate']),
            trainable=False,
        )
        self.global_step = model.AddParam(
            name='global_step',
            init_value=0,
            trainable=False,
        )
        self.start_time = model.AddParam(
            name='start_time',
            init_value=time.time(),
            trainable=False,
        )
# _build_embeddings: creates the 'encoder_embeddings' and
# 'decoder_embeddings' parameter blobs inside a CPU device scope (this
# matches step(), which feeds the token-id input blobs with a CPU device
# option even in multi-GPU mode).
# NOTE(review): corrupted extraction — the initializer calls, shapes, and
# param registration are missing; restore from the upstream caffe2 seq2seq
# example before editing.
172 def _build_embeddings(self, model):
173 with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
177 'encoder_embeddings',
188 'decoder_embeddings',
# model_build_fun: assembles the per-device training net — external input
# declaration, embedding encoder, embedding decoder (with optional
# attention), output projection, weighted softmax loss, and the scaled
# total-loss blob returned for gradient construction.
# NOTE(review): this span is a corrupted extraction (embedded line numbers,
# missing lines).  Restore from the upstream caffe2 seq2seq example before
# editing; the comments below describe only what the surviving lines show.
198 def model_build_fun(self, model, forward_only=False, loss_scale=None):
# Declare the six batch blobs produced by prepare_batch() as external
# inputs, name-scoped per device via workspace.GetNameScope().
199 encoder_inputs = model.net.AddExternalInput(
200 workspace.GetNameScope() +
'encoder_inputs',
202 encoder_lengths = model.net.AddExternalInput(
203 workspace.GetNameScope() +
'encoder_lengths',
205 decoder_inputs = model.net.AddExternalInput(
206 workspace.GetNameScope() +
'decoder_inputs',
208 decoder_lengths = model.net.AddExternalInput(
209 workspace.GetNameScope() +
'decoder_lengths',
211 targets = model.net.AddExternalInput(
212 workspace.GetNameScope() +
'targets',
214 target_weights = model.net.AddExternalInput(
215 workspace.GetNameScope() +
'target_weights',
# Only these attention modes are supported.
218 assert attention_type
in [
'none',
'regular',
'dot']
# Encoder: embedding lookup plus recurrent stack; also returns the
# per-layer unit counts and final states needed by the decoder.
222 weighted_encoder_outputs,
223 final_encoder_hidden_states,
224 final_encoder_cell_states,
225 encoder_units_per_layer,
226 ) = seq2seq_util.build_embedding_encoder(
229 num_decoder_layers=len(self.
model_params[
'decoder_layer_configs']),
230 inputs=encoder_inputs,
231 input_lengths=encoder_lengths,
234 embedding_size=self.
model_params[
'encoder_embedding_size'],
235 use_attention=(attention_type !=
'none'),
# Decoder: embedding lookup plus recurrent stack, consuming the encoder
# outputs/states (attention over them when enabled).
242 ) = seq2seq_util.build_embedding_decoder(
244 decoder_layer_configs=self.
model_params[
'decoder_layer_configs'],
245 inputs=decoder_inputs,
246 input_lengths=decoder_lengths,
247 encoder_lengths=encoder_lengths,
248 encoder_outputs=encoder_outputs,
249 weighted_encoder_outputs=weighted_encoder_outputs,
250 final_encoder_hidden_states=final_encoder_hidden_states,
251 final_encoder_cell_states=final_encoder_cell_states,
252 encoder_units_per_layer=encoder_units_per_layer,
255 embedding_size=self.
model_params[
'decoder_embedding_size'],
256 attention_type=attention_type,
# Project decoder outputs to vocabulary logits (optionally through a
# reduced softmax layer when 'decoder_softmax_size' is set).
261 output_logits = seq2seq_util.output_projection(
263 decoder_outputs=decoder_outputs,
264 decoder_output_size=decoder_output_size,
266 decoder_softmax_size=self.
model_params[
'decoder_softmax_size'],
# Flatten targets / target_weights so they line up with the flattened
# logits; old shapes are stashed in '*_old_shape' blobs.
268 targets, _ = model.net.Reshape(
270 [
'targets',
'targets_old_shape'],
273 target_weights, _ = model.net.Reshape(
275 [
'target_weights',
'target_weights_old_shape'],
# Weighted softmax cross-entropy per word; the probabilities output is
# deliberately unused (named 'OutputProbs_INVALID').
278 _, loss_per_word = model.net.SoftmaxWithLoss(
279 [output_logits, targets, target_weights],
280 [
'OutputProbs_INVALID',
'loss_per_word'],
# total loss = per-word loss * word count, then scaled (loss_scale is
# used by the data-parallel wrapper for gradient averaging).
284 num_words = model.net.SumElements(
288 total_loss_scalar = model.net.Mul(
289 [loss_per_word, num_words],
292 total_loss_scalar_weighted = model.net.Scale(
294 'total_loss_scalar_weighted',
297 return [total_loss_scalar_weighted]
def forward_model_build_fun(self, model, loss_scale=None):
    """Build the evaluation (forward-only) net.

    Thin wrapper: delegates to model_build_fun with forward_only=True so
    no gradient/update operators are added.
    """
    return self.model_build_fun(
        model=model,
        forward_only=True,
        loss_scale=loss_scale
    )
# _calc_norm_ratio: emits net ops computing the gradient-clipping ratio
# clip_norm / max(global_norm, clip_norm) under the given name scope.
# Per-parameter squared gradients are summed (sparse GradientSlice grads
# contribute their .values), accumulated into grad_squared_full_sum, and
# raised via Pow (square root) into global_norm; Max with the constant
# clip_norm then Div yields the ratio returned to the caller.
# NOTE(review): corrupted extraction — many lines missing; verify against
# the upstream caffe2 seq2seq example before editing.
306 def _calc_norm_ratio(self, model, params, scope, ONE):
307 with core.NameScope(scope):
308 grad_squared_sums = []
309 for i, param
in enumerate(params):
# Dense grads are used directly; GradientSlice grads use .values.
312 model.param_to_grad[param]
314 model.param_to_grad[param],
316 )
else model.param_to_grad[param].values
318 grad_squared = model.net.Sqr(
320 'grad_{}_squared'.format(i),
322 grad_squared_sum = model.net.SumElements(
324 'grad_{}_squared_sum'.format(i),
326 grad_squared_sums.append(grad_squared_sum)
328 grad_squared_full_sum = model.net.Sum(
330 'grad_squared_full_sum',
332 global_norm = model.net.Pow(
333 grad_squared_full_sum,
337 clip_norm = model.param_init_net.ConstantFill(
343 max_norm = model.net.Max(
344 [global_norm, clip_norm],
347 norm_ratio = model.net.Div(
348 [clip_norm, max_norm],
# _apply_norm_ratio: applies the clipped SGD update to every param.
# update coefficient = norm_ratio * (-learning_rate); sparse
# GradientSlice gradients go through ScatterWeightedSum (updating only the
# touched rows), dense gradients through WeightedSum.
# NOTE(review): corrupted extraction — lines missing; verify against the
# upstream caffe2 seq2seq example before editing.
353 def _apply_norm_ratio(
354 self, norm_ratio, model, params, learning_rate, scope, ONE
357 param_grad = model.param_to_grad[param]
358 nlr = model.net.Negative(
360 'negative_learning_rate',
362 with core.NameScope(scope):
363 update_coeff = model.net.Mul(
368 if isinstance(param_grad, core.GradientSlice):
369 param_grad_values = param_grad.values
371 model.net.ScatterWeightedSum(
382 model.net.WeightedSum(
# norm_clipped_grad_update: collects the top-scope params that have dense
# gradients, logs them, computes the global-norm clip ratio via
# _calc_norm_ratio, and applies the update via _apply_norm_ratio.
# ONE is a constant-1 blob created in param_init_net for the WeightedSum
# update.  NOTE(review): corrupted extraction — lines missing; verify
# against the upstream caffe2 seq2seq example before editing.
392 def norm_clipped_grad_update(self, model, scope):
400 for param
in model.GetParams(top_scope=
True):
401 if param
in model.param_to_grad:
403 model.param_to_grad[param],
408 ONE = model.param_init_net.ConstantFill(
414 logger.info(
'Dense trainable variables: ')
417 norm_ratio, model, params, learning_rate, scope, ONE
# norm_clipped_sparse_grad_update: same shape as norm_clipped_grad_update
# but for params whose gradients are sparse GradientSlices (the embedding
# tables): collect, log, compute clip ratio, apply scatter update.
# NOTE(review): corrupted extraction — lines missing; verify against the
# upstream caffe2 seq2seq example before editing.
420 def norm_clipped_sparse_grad_update(self, model, scope):
424 for param
in model.GetParams(top_scope=
True):
425 if param
in model.param_to_grad:
427 model.param_to_grad[param],
432 ONE = model.param_init_net.ConstantFill(
438 logger.info(
'Sparse trainable variables: ')
441 norm_ratio, model, params, learning_rate, scope, ONE
def total_loss_scalar(self):
    """Fetch the total loss of the last executed step.

    Single-device runs read the global 'total_loss_scalar' blob;
    data-parallel runs sum the per-device 'gpu_{i}/total_loss_scalar'
    blobs across all GPUs.

    NOTE(review): the num_gpus branch lines were lost in this extraction
    and are reconstructed from the surviving fragments.
    """
    if self.num_gpus == 0:
        return workspace.FetchBlob('total_loss_scalar')
    else:
        total_loss = 0
        for i in range(self.num_gpus):
            name = 'gpu_{}/total_loss_scalar'.format(i)
            gpu_loss = workspace.FetchBlob(name)
            total_loss += gpu_loss
        return total_loss
# _init_model: runs the parameter-initialization net once, then creates
# the (training) net in the workspace.  The surviving fragment also shows
# the net's external inputs being stringified — presumably to pre-create
# the input blobs before CreateNet (TODO confirm against upstream).
# NOTE(review): corrupted extraction — lines missing.
455 def _init_model(self):
456 workspace.RunNetOnce(self.model.param_init_net)
461 input_blobs=[str(i)
for i
in net.external_inputs],
464 create_net(self.model.net)
# Fragment of Seq2SeqModelCaffe2.__init__ (its def line was lost in this
# extraction): initializes the global caffe2 workspace with logging and
# executor-exception flags, and an MKL thread count taken from
# self.num_cpus.  NOTE(review): corrupted extraction — surrounding
# constructor lines are missing.
484 workspace.GlobalInit([
487 '--caffe2_log_level=0',
491 '--caffe2_handle_executor_threads_exceptions=1',
492 '--caffe2_mkl_num_threads=' + str(self.
num_cpus),
def __exit__(self, exc_type, exc_value, traceback):
    """Context-manager exit: clear all blobs/nets from the workspace.

    Returns None (falsy), so any exception raised inside the `with` body
    propagates to the caller.
    """
    workspace.ResetWorkspace()
def initialize_from_scratch(self):
    """Build the model graph and run one-time parameter initialization."""
    logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Start')
    # NOTE(review): the two calls below were lost in the extraction and are
    # reconstructed — verify against the upstream caffe2 seq2seq example.
    self._build_model(init_params=True)
    self._init_model()
    logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Finish')
def get_current_step(self):
    """Return the current value of the 'global_step' counter blob.

    NOTE(review): body reconstructed — the blob is presumably the
    single-element 'global_step' param created in _build_shared; TODO
    confirm against the upstream caffe2 seq2seq example.
    """
    return workspace.FetchBlob('global_step')[0]
def inc_current_step(self):
    """Advance the 'global_step' counter blob by one.

    NOTE(review): body reconstructed — TODO confirm against the upstream
    caffe2 seq2seq example.
    """
    workspace.FeedBlob(
        'global_step',
        np.array([self.get_current_step() + 1]),
    )
# Fragment of Seq2SeqModelCaffe2.step (its def line was lost in this
# extraction): feeds a prepared Batch into the workspace, then runs the
# net.  The single-device path feeds blobs under their Batch field names;
# the multi-GPU path prepares a per-GPU sub-batch and prefixes blob names
# with 'gpu_{i}/'.  The integer token-id inputs (encoder_inputs /
# decoder_inputs) are fed with a CPU device option — the embedding tables
# are built under a CPU device scope (see _build_embeddings) — while all
# other blobs go to the GPU device.
# NOTE(review): corrupted extraction — lines missing; verify against the
# upstream caffe2 seq2seq example before editing.
522 batch_obj = prepare_batch(batch)
523 for batch_obj_name, batch_obj_value
in zip(
527 workspace.FeedBlob(batch_obj_name, batch_obj_value)
531 batch_obj = prepare_batch(gpu_batch)
532 for batch_obj_name, batch_obj_value
in zip(
536 name =
'gpu_{}/{}'.format(i, batch_obj_name)
537 if batch_obj_name
in [
'encoder_inputs',
'decoder_inputs']:
538 dev = core.DeviceOption(caffe2_pb2.CPU)
540 dev = core.DeviceOption(workspace.GpuDeviceType, i)
541 workspace.FeedBlob(name, batch_obj_value, device_option=dev)
546 workspace.RunNet(self.model.net)
def save(self, checkpoint_path_prefix, current_step):
    """Save all model parameters to '<prefix>-<step>' and write a
    TensorFlow-style 'checkpoint' index file beside it.

    Returns:
        The checkpoint path that was written.

    NOTE(review): the Save-operator arguments were lost in this extraction
    and are reconstructed (minidb, absolute_path) — verify against the
    upstream caffe2 seq2seq example.
    """
    checkpoint_path = '{0}-{1}'.format(
        checkpoint_path_prefix,
        current_step,
    )

    # Persist every param blob; RunOperatorOnce returns False on failure.
    assert workspace.RunOperatorOnce(core.CreateOperator(
        'Save',
        self.model.GetAllParams(),
        [],
        absolute_path=True,
        db=checkpoint_path,
        db_type='minidb',
    ))

    checkpoint_config_path = os.path.join(
        os.path.dirname(checkpoint_path_prefix),
        'checkpoint',
    )
    with open(checkpoint_config_path, 'w') as checkpoint_config_file:
        # Two adjacent string literals concatenate into one write.
        checkpoint_config_file.write(
            'model_checkpoint_path: "' + checkpoint_path + '"\n'
            'all_model_checkpoint_paths: "' + checkpoint_path + '"\n'
        )
        logger.info('Saved checkpoint file to ' + checkpoint_path)

    return checkpoint_path
def gen_batches(source_corpus, target_corpus, source_vocab, target_vocab,
                batch_size, max_length):
    """Read parallel corpora and produce shuffled, fixed-size batches.

    Sentence pairs are numberized with the vocabularies, filtered (both
    sides non-empty; both within max_length when it is not None), sorted
    by (source length, target length) so batches contain similarly-sized
    sentences, chunked into batches of exactly batch_size (the final
    partial batch is padded by repeating its last pair), and shuffled.

    NOTE(review): reconstructed from a corrupted extraction — verify the
    missing lines (filter condition closers, batch reset, return) against
    the upstream caffe2 seq2seq example.
    """
    with open(source_corpus) as source, open(target_corpus) as target:
        parallel_sentences = []
        for source_sentence, target_sentence in zip(source, target):
            numerized_source_sentence = seq2seq_util.get_numberized_sentence(
                source_sentence,
                source_vocab,
            )
            numerized_target_sentence = seq2seq_util.get_numberized_sentence(
                target_sentence,
                target_vocab,
            )
            if (
                len(numerized_source_sentence) > 0 and
                len(numerized_target_sentence) > 0 and
                (
                    max_length is None or (
                        len(numerized_source_sentence) <= max_length and
                        len(numerized_target_sentence) <= max_length
                    )
                )
            ):
                parallel_sentences.append((
                    numerized_source_sentence,
                    numerized_target_sentence,
                ))
    # Length-sort so each batch has similarly-sized sentences (less padding).
    parallel_sentences.sort(key=lambda s_t: (len(s_t[0]), len(s_t[1])))

    batches, batch = [], []
    for sentence_pair in parallel_sentences:
        batch.append(sentence_pair)
        if len(batch) >= batch_size:
            batches.append(batch)
            batch = []
    if len(batch) > 0:
        # Pad the trailing partial batch by repeating its last pair.
        while len(batch) < batch_size:
            batch.append(batch[-1])
        assert len(batch) == batch_size
        batches.append(batch)
    random.shuffle(batches)
    return batches
def run_seq2seq_model(args, model_params=None):
    """Train and evaluate the seq2seq model.

    Builds vocabularies and batches from the corpora in args, then runs
    args.epochs passes over the training batches (logging the summed loss
    and bumping the global step after every training step), evaluates on
    the eval batches each epoch, and writes a checkpoint per epoch when
    args.checkpoint is set.

    NOTE(review): reconstructed from a corrupted extraction — the
    Seq2SeqModelCaffe2 constructor arguments (notably num_cpus) and the
    step() keyword arguments should be verified against the upstream
    caffe2 seq2seq example.
    """
    source_vocab = seq2seq_util.gen_vocab(
        args.source_corpus,
        args.unk_threshold,
    )
    target_vocab = seq2seq_util.gen_vocab(
        args.target_corpus,
        args.unk_threshold,
    )
    logger.info('Source vocab size {}'.format(len(source_vocab)))
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    batches = gen_batches(args.source_corpus, args.target_corpus, source_vocab,
                          target_vocab, model_params['batch_size'],
                          args.max_length)
    logger.info('Number of training batches {}'.format(len(batches)))

    batches_eval = gen_batches(args.source_corpus_eval, args.target_corpus_eval,
                               source_vocab, target_vocab,
                               model_params['batch_size'], args.max_length)
    logger.info('Number of eval batches {}'.format(len(batches_eval)))

    with Seq2SeqModelCaffe2(
        model_params=model_params,
        source_vocab_size=len(source_vocab),
        target_vocab_size=len(target_vocab),
        num_gpus=args.num_gpus,
        num_cpus=20,  # presumably the upstream default — TODO confirm
    ) as model_obj:
        model_obj.initialize_from_scratch()
        for i in range(args.epochs):
            logger.info('Epoch {}'.format(i))
            total_loss = 0
            for batch in batches:
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=False,
                )
                model_obj.inc_current_step()
            logger.info('\ttraining loss {}'.format(total_loss))
            total_loss = 0
            for batch in batches_eval:
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=True,
                )
            logger.info('\teval loss {}'.format(total_loss))
            if args.checkpoint is not None:
                model_obj.save(args.checkpoint, i)
def main():
    """CLI entry point: parse flags, derive per-layer RNN configs, and run
    training via run_seq2seq_model().

    NOTE(review): reconstructed from a corrupted extraction; flag names,
    types, defaults and help strings follow the surviving fragments.
    Two defects in the original are fixed below:
      * [dict(...)] * n aliased ONE dict across all layer configs, so the
        bidirectional halving of the first encoder layer silently changed
        every layer — each layer now gets an independent dict;
      * '/= 2' (true division under `from __future__ import division`)
        turned num_units into a float — '//= 2' keeps it an int.
    """
    parser = argparse.ArgumentParser(
        description='Caffe2: Seq2Seq Training',
    )
    parser.add_argument('--source-corpus', type=str, default=None,
                        help='Path to source corpus in a text file format. Each '
                        'line in the file should contain a single sentence',
                        required=True)
    parser.add_argument('--target-corpus', type=str, default=None,
                        help='Path to target corpus in a text file format',
                        required=True)
    parser.add_argument('--max-length', type=int, default=None,
                        help='Maximal lengths of train and eval sentences')
    parser.add_argument('--unk-threshold', type=int, default=50,
                        help='Threshold frequency under which token becomes '
                        'labeled unknown token')

    parser.add_argument('--batch-size', type=int, default=32,
                        help='Training batch size')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of iterations over training data')
    parser.add_argument('--learning-rate', type=float, default=0.5,
                        help='Learning rate')
    parser.add_argument('--max-gradient-norm', type=float, default=1.0,
                        help='Max global norm of gradients at the end of each '
                        'backward pass. We do clipping to match the number.')
    parser.add_argument('--num-gpus', type=int, default=0,
                        help='Number of GPUs for data parallel model')

    parser.add_argument('--use-bidirectional-encoder', action='store_true',
                        help='Set flag to use bidirectional recurrent network '
                        'for first layer of encoder')
    parser.add_argument('--use-attention', action='store_true',
                        help='Set flag to use seq2seq with attention model')
    parser.add_argument('--source-corpus-eval', type=str, default=None,
                        help='Path to source corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--target-corpus-eval', type=str, default=None,
                        help='Path to target corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--encoder-cell-num-units', type=int, default=512,
                        help='Number of cell units per encoder layer')
    parser.add_argument('--encoder-num-layers', type=int, default=2,
                        help='Number encoder layers')
    parser.add_argument('--decoder-cell-num-units', type=int, default=512,
                        help='Number of cell units in the decoder layer')
    parser.add_argument('--decoder-num-layers', type=int, default=2,
                        help='Number decoder layers')
    parser.add_argument('--encoder-embedding-size', type=int, default=256,
                        help='Size of embedding in the encoder layer')
    parser.add_argument('--decoder-embedding-size', type=int, default=512,
                        help='Size of embedding in the decoder layer')
    parser.add_argument('--decoder-softmax-size', type=int, default=None,
                        help='Size of softmax layer in the decoder')

    parser.add_argument('--checkpoint', type=str, default=None,
                        help='Path to checkpoint')

    args = parser.parse_args()

    # One independent config dict per layer (see docstring re: aliasing fix).
    encoder_layer_configs = [
        dict(num_units=args.encoder_cell_num_units)
        for _ in range(args.encoder_num_layers)
    ]

    if args.use_bidirectional_encoder:
        assert args.encoder_cell_num_units % 2 == 0
        # Each direction of the bidirectional first layer gets half the
        # units; only the first layer is bidirectional (see --help text).
        encoder_layer_configs[0]['num_units'] //= 2

    decoder_layer_configs = [
        dict(num_units=args.decoder_cell_num_units)
        for _ in range(args.decoder_num_layers)
    ]

    run_seq2seq_model(args, model_params=dict(
        attention=('regular' if args.use_attention else 'none'),
        decoder_layer_configs=decoder_layer_configs,
        encoder_type=dict(
            encoder_layer_configs=encoder_layer_configs,
            use_bidirectional_encoder=args.use_bidirectional_encoder,
        ),
        batch_size=args.batch_size,
        optimizer_params=dict(
            learning_rate=args.learning_rate,
        ),
        encoder_embedding_size=args.encoder_embedding_size,
        decoder_embedding_size=args.decoder_embedding_size,
        decoder_softmax_size=args.decoder_softmax_size,
        max_gradient_norm=args.max_gradient_norm,
    ))


if __name__ == '__main__':
    main()
# --- extraction artifact: Seq2SeqModelCaffe2 method-signature index ---
# (not executable code; preserved as comments so the file stays parseable)
# def _calc_norm_ratio(self, model, params, scope, ONE)
# def _build_embeddings(self, model)
# def _build_shared(self, model)
# def forward_model_build_fun(self, model, loss_scale=None)
# def _apply_norm_ratio(self, norm_ratio, model, params, learning_rate, scope, ONE)
# def norm_clipped_grad_update(self, model, scope)
# def get_current_step(self)
# def norm_clipped_sparse_grad_update(self, model, scope)
# def total_loss_scalar(self)
# def _build_model(self, init_params)
# def model_build_fun(self, model, forward_only=False, loss_scale=None)
# def inc_current_step(self)