Caffe2 - Python API
A deep learning, cross platform ML framework
translate.py
1 ## @package translate
2 # Module caffe2.python.models.seq2seq.translate
3 from __future__ import absolute_import
4 from __future__ import division
5 from __future__ import print_function
6 from __future__ import unicode_literals
7 
8 from abc import ABCMeta, abstractmethod
9 import argparse
10 from future.utils import viewitems
11 import logging
12 import numpy as np
13 from six import with_metaclass
14 import sys
15 
16 from caffe2.python import core, rnn_cell, workspace
17 from caffe2.python.models.seq2seq.beam_search import BeamSearchForwardOnly
18 from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper
19 import caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util
20 
21 
# Module-level logger: INFO and above, mirrored to stderr so translation
# progress is visible when stdout carries the decoded sentences.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
_stderr_handler = logging.StreamHandler(sys.stderr)
logger.addHandler(_stderr_handler)
25 
26 
27 def _weighted_sum(model, values, weight, output_name):
28  values_weights = zip(values, [weight] * len(values))
29  values_weights_flattened = [x for v_w in values_weights for x in v_w]
30  return model.net.WeightedSum(
31  values_weights_flattened,
32  output_name,
33  )
34 
35 
class Seq2SeqModelCaffe2EnsembleDecoderBase(with_metaclass(ABCMeta, object)):
    """Abstract base for ensemble beam-search decoders.

    Subclasses supply where each ensemble member's checkpoint lives
    (get_model_file) and its storage format (get_db_type); this base
    provides checkpoint loading and per-token reward construction.
    """

    @abstractmethod
    def get_model_file(self, model):
        """Return the checkpoint path for one ensemble member dict."""
        pass

    @abstractmethod
    def get_db_type(self):
        """Return the db type string used to open checkpoints."""
        pass

    def build_word_rewards(self, vocab_size, word_reward, unk_reward):
        """Build a [vocab_size] float32 array of additive per-token rewards.

        Every token gets `word_reward`, except the PAD/GO/EOS specials
        (zero) and UNK, which additionally gets `unk_reward` (typically
        negative, to discourage unknown-token output).
        """
        rewards = np.full([vocab_size], word_reward, dtype=np.float32)
        for special_id in (
            seq2seq_util.PAD_ID,
            seq2seq_util.GO_ID,
            seq2seq_util.EOS_ID,
        ):
            rewards[special_id] = 0
        rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward
        return rewards

    def load_models(self):
        """Load every ensemble member's checkpoint into the workspace.

        Each member's params were created under its own scope
        (self.decoder_scope_names), so each Load re-prefixes the stored
        blob names ('gpu_0/...') with that scope.
        """
        db_reader = 'reader'
        for model, scope_name in zip(self.models, self.decoder_scope_names):
            scoped_params = [
                param
                for param in self.model.GetAllParams()
                if str(param).startswith(scope_name)
            ]
            created = workspace.RunOperatorOnce(core.CreateOperator(
                'CreateDB',
                [], [db_reader],
                db=self.get_model_file(model),
                db_type=self.get_db_type())
            )
            assert created, 'Failed to create db {}'.format(
                self.get_model_file(model))
            loaded = workspace.RunOperatorOnce(core.CreateOperator(
                'Load',
                [db_reader],
                scoped_params,
                load_all=1,
                add_prefix=scope_name + '/',
                strip_prefix='gpu_0/',
            ))
            assert loaded
            logger.info('Model {} is loaded from a checkpoint {}'.format(
                scope_name, self.get_model_file(model)))
81 
82 
# NOTE(review): the class declaration line was missing from this copy of the
# file (the methods below clearly override the abstract base, and the
# decoder construction in run_seq2seq_beam_decoder names this class);
# reconstructed here.
class Seq2SeqModelCaffe2EnsembleDecoder(Seq2SeqModelCaffe2EnsembleDecoderBase):
    """Concrete ensemble decoder: minidb checkpoints, path from model dict."""

    def get_model_file(self, model):
        """Checkpoint path recorded in the ensemble-member dict."""
        return model['model_file']

    def get_db_type(self):
        """Checkpoints are stored in minidb format."""
        return 'minidb'

    def scope(self, scope_name, blob_name):
        """Prefix blob_name with scope_name (no-op when scope is None)."""
        if scope_name is None:
            return blob_name
        return scope_name + '/' + blob_name
97 
98  def _build_decoder(
99  self,
100  model,
101  step_model,
102  model_params,
103  scope,
104  previous_tokens,
105  timestep,
106  fake_seq_lengths,
107  ):
108  attention_type = model_params['attention']
109  assert attention_type in ['none', 'regular']
110  use_attention = (attention_type != 'none')
111 
112  with core.NameScope(scope):
113  encoder_embeddings = seq2seq_util.build_embeddings(
114  model=model,
115  vocab_size=self.source_vocab_size,
116  embedding_size=model_params['encoder_embedding_size'],
117  name='encoder_embeddings',
118  freeze_embeddings=False,
119  )
120 
121  (
122  encoder_outputs,
123  weighted_encoder_outputs,
124  final_encoder_hidden_states,
125  final_encoder_cell_states,
126  encoder_units_per_layer,
127  ) = seq2seq_util.build_embedding_encoder(
128  model=model,
129  encoder_params=model_params['encoder_type'],
130  num_decoder_layers=len(model_params['decoder_layer_configs']),
131  inputs=self.encoder_inputs,
132  input_lengths=self.encoder_lengths,
133  vocab_size=self.source_vocab_size,
134  embeddings=encoder_embeddings,
135  embedding_size=model_params['encoder_embedding_size'],
136  use_attention=use_attention,
137  num_gpus=0,
138  forward_only=True,
139  scope=scope,
140  )
141  with core.NameScope(scope):
142  if use_attention:
143  # [max_source_length, beam_size, encoder_output_dim]
144  encoder_outputs = model.net.Tile(
145  encoder_outputs,
146  'encoder_outputs_tiled',
147  tiles=self.beam_size,
148  axis=1,
149  )
150 
151  if weighted_encoder_outputs is not None:
152  weighted_encoder_outputs = model.net.Tile(
153  weighted_encoder_outputs,
154  'weighted_encoder_outputs_tiled',
155  tiles=self.beam_size,
156  axis=1,
157  )
158 
159  decoder_embeddings = seq2seq_util.build_embeddings(
160  model=model,
161  vocab_size=self.target_vocab_size,
162  embedding_size=model_params['decoder_embedding_size'],
163  name='decoder_embeddings',
164  freeze_embeddings=False,
165  )
166  embedded_tokens_t_prev = step_model.net.Gather(
167  [decoder_embeddings, previous_tokens],
168  'embedded_tokens_t_prev',
169  )
170 
171  decoder_cells = []
172  decoder_units_per_layer = []
173  for i, layer_config in enumerate(model_params['decoder_layer_configs']):
174  num_units = layer_config['num_units']
175  decoder_units_per_layer.append(num_units)
176  if i == 0:
177  input_size = model_params['decoder_embedding_size']
178  else:
179  input_size = (
180  model_params['decoder_layer_configs'][i - 1]['num_units']
181  )
182 
183  cell = rnn_cell.LSTMCell(
184  forward_only=True,
185  input_size=input_size,
186  hidden_size=num_units,
187  forget_bias=0.0,
188  memory_optimization=False,
189  )
190  decoder_cells.append(cell)
191 
192  with core.NameScope(scope):
193  if final_encoder_hidden_states is not None:
194  for i in range(len(final_encoder_hidden_states)):
195  if final_encoder_hidden_states[i] is not None:
196  final_encoder_hidden_states[i] = model.net.Tile(
197  final_encoder_hidden_states[i],
198  'final_encoder_hidden_tiled_{}'.format(i),
199  tiles=self.beam_size,
200  axis=1,
201  )
202  if final_encoder_cell_states is not None:
203  for i in range(len(final_encoder_cell_states)):
204  if final_encoder_cell_states[i] is not None:
205  final_encoder_cell_states[i] = model.net.Tile(
206  final_encoder_cell_states[i],
207  'final_encoder_cell_tiled_{}'.format(i),
208  tiles=self.beam_size,
209  axis=1,
210  )
211  initial_states = \
212  seq2seq_util.build_initial_rnn_decoder_states(
213  model=model,
214  encoder_units_per_layer=encoder_units_per_layer,
215  decoder_units_per_layer=decoder_units_per_layer,
216  final_encoder_hidden_states=final_encoder_hidden_states,
217  final_encoder_cell_states=final_encoder_cell_states,
218  use_attention=use_attention,
219  )
220 
221  attention_decoder = seq2seq_util.LSTMWithAttentionDecoder(
222  encoder_outputs=encoder_outputs,
223  encoder_output_dim=encoder_units_per_layer[-1],
224  encoder_lengths=None,
225  vocab_size=self.target_vocab_size,
226  attention_type=attention_type,
227  embedding_size=model_params['decoder_embedding_size'],
228  decoder_num_units=decoder_units_per_layer[-1],
229  decoder_cells=decoder_cells,
230  weighted_encoder_outputs=weighted_encoder_outputs,
231  name=scope,
232  )
233  states_prev = step_model.net.AddExternalInputs(*[
234  '{}/{}_prev'.format(scope, s)
235  for s in attention_decoder.get_state_names()
236  ])
237  decoder_outputs, states = attention_decoder.apply(
238  model=step_model,
239  input_t=embedded_tokens_t_prev,
240  seq_lengths=fake_seq_lengths,
241  states=states_prev,
242  timestep=timestep,
243  )
244 
245  state_configs = [
246  BeamSearchForwardOnly.StateConfig(
247  initial_value=initial_state,
248  state_prev_link=BeamSearchForwardOnly.LinkConfig(
249  blob=state_prev,
250  offset=0,
251  window=1,
252  ),
253  state_link=BeamSearchForwardOnly.LinkConfig(
254  blob=state,
255  offset=1,
256  window=1,
257  ),
258  )
259  for initial_state, state_prev, state in zip(
260  initial_states,
261  states_prev,
262  states,
263  )
264  ]
265 
266  with core.NameScope(scope):
267  decoder_outputs_flattened, _ = step_model.net.Reshape(
268  [decoder_outputs],
269  [
270  'decoder_outputs_flattened',
271  'decoder_outputs_and_contexts_combination_old_shape',
272  ],
273  shape=[-1, attention_decoder.get_output_dim()],
274  )
275  output_logits = seq2seq_util.output_projection(
276  model=step_model,
277  decoder_outputs=decoder_outputs_flattened,
278  decoder_output_size=attention_decoder.get_output_dim(),
279  target_vocab_size=self.target_vocab_size,
280  decoder_softmax_size=model_params['decoder_softmax_size'],
281  )
282  # [1, beam_size, target_vocab_size]
283  output_probs = step_model.net.Softmax(
284  output_logits,
285  'output_probs',
286  )
287  output_log_probs = step_model.net.Log(
288  output_probs,
289  'output_log_probs',
290  )
291  if use_attention:
292  attention_weights = attention_decoder.get_attention_weights()
293  else:
294  attention_weights = step_model.net.ConstantFill(
295  [self.encoder_inputs],
296  'zero_attention_weights_tmp_1',
297  value=0.0,
298  )
299  attention_weights = step_model.net.Transpose(
300  attention_weights,
301  'zero_attention_weights_tmp_2',
302  )
303  attention_weights = step_model.net.Tile(
304  attention_weights,
305  'zero_attention_weights_tmp',
306  tiles=self.beam_size,
307  axis=0,
308  )
309 
310  return (
311  state_configs,
312  output_log_probs,
313  attention_weights,
314  )
315 
316  def __init__(
317  self,
318  translate_params,
319  ):
320  self.models = translate_params['ensemble_models']
321  decoding_params = translate_params['decoding_params']
322  self.beam_size = decoding_params['beam_size']
323 
324  assert len(self.models) > 0
325  source_vocab = self.models[0]['source_vocab']
326  target_vocab = self.models[0]['target_vocab']
327  for model in self.models:
328  assert model['source_vocab'] == source_vocab
329  assert model['target_vocab'] == target_vocab
330 
331  self.source_vocab_size = len(source_vocab)
332  self.target_vocab_size = len(target_vocab)
333 
334  self.decoder_scope_names = [
335  'model{}'.format(i) for i in range(len(self.models))
336  ]
337 
338  self.model = Seq2SeqModelHelper(init_params=True)
339 
340  self.encoder_inputs = self.model.net.AddExternalInput('encoder_inputs')
341  self.encoder_lengths = self.model.net.AddExternalInput(
342  'encoder_lengths'
343  )
344  self.max_output_seq_len = self.model.net.AddExternalInput(
345  'max_output_seq_len'
346  )
347 
348  fake_seq_lengths = self.model.param_init_net.ConstantFill(
349  [],
350  'fake_seq_lengths',
351  shape=[self.beam_size],
352  value=100000,
353  dtype=core.DataType.INT32,
354  )
355 
356  beam_decoder = BeamSearchForwardOnly(
357  beam_size=self.beam_size,
358  model=self.model,
359  go_token_id=seq2seq_util.GO_ID,
360  eos_token_id=seq2seq_util.EOS_ID,
361  )
362  step_model = beam_decoder.get_step_model()
363 
364  state_configs = []
365  output_log_probs = []
366  attention_weights = []
367  for model, scope_name in zip(
368  self.models,
369  self.decoder_scope_names,
370  ):
371  (
372  state_configs_per_decoder,
373  output_log_probs_per_decoder,
374  attention_weights_per_decoder,
375  ) = self._build_decoder(
376  model=self.model,
377  step_model=step_model,
378  model_params=model['model_params'],
379  scope=scope_name,
380  previous_tokens=beam_decoder.get_previous_tokens(),
381  timestep=beam_decoder.get_timestep(),
382  fake_seq_lengths=fake_seq_lengths,
383  )
384  state_configs.extend(state_configs_per_decoder)
385  output_log_probs.append(output_log_probs_per_decoder)
386  if attention_weights_per_decoder is not None:
387  attention_weights.append(attention_weights_per_decoder)
388 
389  assert len(attention_weights) > 0
390  num_decoders_with_attention_blob = (
391  self.model.param_init_net.ConstantFill(
392  [],
393  'num_decoders_with_attention_blob',
394  value=1 / float(len(attention_weights)),
395  shape=[1],
396  )
397  )
398  # [beam_size, encoder_length, 1]
399  attention_weights_average = _weighted_sum(
400  model=step_model,
401  values=attention_weights,
402  weight=num_decoders_with_attention_blob,
403  output_name='attention_weights_average',
404  )
405 
406  num_decoders_blob = self.model.param_init_net.ConstantFill(
407  [],
408  'num_decoders_blob',
409  value=1 / float(len(output_log_probs)),
410  shape=[1],
411  )
412  # [beam_size, target_vocab_size]
413  output_log_probs_average = _weighted_sum(
414  model=step_model,
415  values=output_log_probs,
416  weight=num_decoders_blob,
417  output_name='output_log_probs_average',
418  )
419  word_rewards = self.model.param_init_net.ConstantFill(
420  [],
421  'word_rewards',
422  shape=[self.target_vocab_size],
423  value=0.0,
424  dtype=core.DataType.FLOAT,
425  )
426  (
427  self.output_token_beam_list,
428  self.output_prev_index_beam_list,
429  self.output_score_beam_list,
430  self.output_attention_weights_beam_list,
431  ) = beam_decoder.apply(
432  inputs=self.encoder_inputs,
433  length=self.max_output_seq_len,
434  log_probs=output_log_probs_average,
435  attentions=attention_weights_average,
436  state_configs=state_configs,
437  data_dependencies=[],
438  word_rewards=word_rewards,
439  )
440 
441  workspace.RunNetOnce(self.model.param_init_net)
442  workspace.FeedBlob(
443  'word_rewards',
444  self.build_word_rewards(
445  vocab_size=self.target_vocab_size,
446  word_reward=translate_params['decoding_params']['word_reward'],
447  unk_reward=translate_params['decoding_params']['unk_reward'],
448  )
449  )
450 
451  workspace.CreateNet(
452  self.model.net,
453  input_blobs=[
454  str(self.encoder_inputs),
455  str(self.encoder_lengths),
456  str(self.max_output_seq_len),
457  ],
458  )
459 
460  logger.info('Params created: ')
461  for param in self.model.params:
462  logger.info(param)
463 
464  def decode(self, numberized_input, max_output_seq_len):
465  workspace.FeedBlob(
466  self.encoder_inputs,
467  np.array([
468  [token_id] for token_id in reversed(numberized_input)
469  ]).astype(dtype=np.int32),
470  )
471  workspace.FeedBlob(
472  self.encoder_lengths,
473  np.array([len(numberized_input)]).astype(dtype=np.int32),
474  )
475  workspace.FeedBlob(
476  self.max_output_seq_len,
477  np.array([max_output_seq_len]).astype(dtype=np.int64),
478  )
479 
480  workspace.RunNet(self.model.net)
481 
482  num_steps = max_output_seq_len
483  score_beam_list = workspace.FetchBlob(self.output_score_beam_list)
484  token_beam_list = (
485  workspace.FetchBlob(self.output_token_beam_list)
486  )
487  prev_index_beam_list = (
488  workspace.FetchBlob(self.output_prev_index_beam_list)
489  )
490 
491  attention_weights_beam_list = (
492  workspace.FetchBlob(self.output_attention_weights_beam_list)
493  )
494  best_indices = (num_steps, 0)
495  for i in range(num_steps + 1):
496  for hyp_index in range(self.beam_size):
497  if (
498  (
499  token_beam_list[i][hyp_index][0] ==
500  seq2seq_util.EOS_ID or
501  i == num_steps
502  ) and
503  (
504  score_beam_list[i][hyp_index][0] >
505  score_beam_list[best_indices[0]][best_indices[1]][0]
506  )
507  ):
508  best_indices = (i, hyp_index)
509 
510  i, hyp_index = best_indices
511  output = []
512  attention_weights_per_token = []
513  best_score = -score_beam_list[i][hyp_index][0]
514  while i > 0:
515  output.append(token_beam_list[i][hyp_index][0])
516  attention_weights_per_token.append(
517  attention_weights_beam_list[i][hyp_index]
518  )
519  hyp_index = prev_index_beam_list[i][hyp_index][0]
520  i -= 1
521 
522  attention_weights_per_token = reversed(attention_weights_per_token)
523  # encoder_inputs are reversed, see get_batch func
524  attention_weights_per_token = [
525  list(reversed(attention_weights))[:len(numberized_input)]
526  for attention_weights in attention_weights_per_token
527  ]
528  output = list(reversed(output))
529  return output, attention_weights_per_token, best_score
530 
531 
def run_seq2seq_beam_decoder(args, model_params, decoding_params):
    """Build vocabs, load a single-model ensemble, and translate stdin.

    Reads one sentence per line from stdin and prints the translation for
    each to stdout; logging goes to stderr.
    """
    source_vocab = seq2seq_util.gen_vocab(
        args.source_corpus,
        args.unk_threshold,
    )
    logger.info('Source vocab size {}'.format(len(source_vocab)))
    target_vocab = seq2seq_util.gen_vocab(
        args.target_corpus,
        args.unk_threshold,
    )
    inversed_target_vocab = {v: k for (k, v) in viewitems(target_vocab)}
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    # NOTE(review): the line constructing the decoder was missing from this
    # copy of the file, leaving unbalanced parentheses; reconstructed from
    # the call shape and the class defined above.
    decoder = Seq2SeqModelCaffe2EnsembleDecoder(
        translate_params=dict(
            ensemble_models=[dict(
                source_vocab=source_vocab,
                target_vocab=target_vocab,
                model_params=model_params,
                model_file=args.checkpoint,
            )],
            decoding_params=decoding_params,
        ),
    )
    decoder.load_models()

    for line in sys.stdin:
        numerized_source_sentence = seq2seq_util.get_numberized_sentence(
            line,
            source_vocab,
        )
        # Cap output length relative to input length (alignment is unused
        # here but part of decode()'s return contract).
        translation, alignment, _ = decoder.decode(
            numerized_source_sentence,
            2 * len(numerized_source_sentence) + 5,
        )
        print(' '.join([inversed_target_vocab[tid] for tid in translation]))
568 
569 
def main():
    """Parse CLI arguments and run the beam-search translator."""
    parser = argparse.ArgumentParser(
        description='Caffe2: Seq2Seq Translation',
    )
    parser.add_argument('--source-corpus', type=str, default=None,
                        help='Path to source corpus in a text file format. Each '
                        'line in the file should contain a single sentence',
                        required=True)
    parser.add_argument('--target-corpus', type=str, default=None,
                        help='Path to target corpus in a text file format',
                        required=True)
    parser.add_argument('--unk-threshold', type=int, default=50,
                        help='Threshold frequency under which token becomes '
                        'labeled unknown token')

    parser.add_argument('--use-bidirectional-encoder', action='store_true',
                        help='Set flag to use bidirectional recurrent network '
                        'in encoder')
    parser.add_argument('--use-attention', action='store_true',
                        help='Set flag to use seq2seq with attention model')
    parser.add_argument('--encoder-cell-num-units', type=int, default=512,
                        help='Number of cell units per encoder layer')
    parser.add_argument('--encoder-num-layers', type=int, default=2,
                        help='Number encoder layers')
    parser.add_argument('--decoder-cell-num-units', type=int, default=512,
                        help='Number of cell units in the decoder layer')
    parser.add_argument('--decoder-num-layers', type=int, default=2,
                        help='Number decoder layers')
    parser.add_argument('--encoder-embedding-size', type=int, default=256,
                        help='Size of embedding in the encoder layer')
    parser.add_argument('--decoder-embedding-size', type=int, default=512,
                        help='Size of embedding in the decoder layer')
    parser.add_argument('--decoder-softmax-size', type=int, default=None,
                        help='Size of softmax layer in the decoder')

    parser.add_argument('--beam-size', type=int, default=6,
                        help='Size of beam for the decoder')
    parser.add_argument('--word-reward', type=float, default=0.0,
                        help='Reward per each word generated.')
    parser.add_argument('--unk-reward', type=float, default=0.0,
                        help='Reward per each UNK token generated. '
                        'Typically should be negative.')

    parser.add_argument('--checkpoint', type=str, default=None,
                        help='Path to checkpoint', required=True)

    args = parser.parse_args()

    # NOTE(review): [dict(...)] * n repeats ONE shared dict object, so the
    # bidirectional halving below mutates every layer's config, not just
    # layer 0. The companion training script builds its configs the same
    # way, so this is kept as-is for checkpoint compatibility — confirm
    # intent before changing to per-layer dicts.
    encoder_layer_configs = [
        dict(
            num_units=args.encoder_cell_num_units,
        ),
    ] * args.encoder_num_layers

    if args.use_bidirectional_encoder:
        assert args.encoder_cell_num_units % 2 == 0
        # Floor division: '/' is true division in this file (division is
        # imported from __future__), which would turn num_units into a
        # float; the RNN cell needs an integer unit count.
        encoder_layer_configs[0]['num_units'] //= 2

    decoder_layer_configs = [
        dict(
            num_units=args.decoder_cell_num_units,
        ),
    ] * args.decoder_num_layers

    run_seq2seq_beam_decoder(
        args,
        model_params=dict(
            attention=('regular' if args.use_attention else 'none'),
            decoder_layer_configs=decoder_layer_configs,
            encoder_type=dict(
                encoder_layer_configs=encoder_layer_configs,
                use_bidirectional_encoder=args.use_bidirectional_encoder,
            ),
            encoder_embedding_size=args.encoder_embedding_size,
            decoder_embedding_size=args.decoder_embedding_size,
            decoder_softmax_size=args.decoder_softmax_size,
        ),
        decoding_params=dict(
            beam_size=args.beam_size,
            word_reward=args.word_reward,
            unk_reward=args.unk_reward,
        ),
    )


if __name__ == '__main__':
    main()
Module caffe2.python.models.seq2seq.translate.
def _build_decoder(self, model, step_model, model_params, scope, previous_tokens, timestep, fake_seq_lengths)
Definition: translate.py:107
def build_word_rewards(self, vocab_size, word_reward, unk_reward)
Definition: translate.py:46