3 """ A bunch of util functions to build Seq2Seq models with Caffe2.""" 5 from __future__
import absolute_import
6 from __future__
import division
7 from __future__
import print_function
8 from __future__
import unicode_literals
11 from future.utils
import viewitems
13 import caffe2.proto.caffe2_pb2
as caffe2_pb2


def gen_vocab(corpus, unk_threshold):
    # Ids are handed out on first access, in insertion order.
    vocab = collections.defaultdict(lambda: len(vocab))
    freqs = collections.defaultdict(lambda: 0)
    # Reserve ids 0-3 for the special tokens.
    vocab[PAD]
    vocab[GO]
    vocab[EOS]
    vocab[UNK]

    with open(corpus) as f:
        for sentence in f:
            tokens = sentence.strip().split()
            for token in tokens:
                freqs[token] += 1
    for token, freq in viewitems(freqs):
        if freq > unk_threshold:
            # Touching the defaultdict assigns the token an id.
            vocab[token]

    return vocab


def get_numberized_sentence(sentence, vocab):
    numerized_sentence = []
    for token in sentence.strip().split():
        if token in vocab:
            numerized_sentence.append(vocab[token])
        else:
            numerized_sentence.append(vocab[UNK])
    return numerized_sentence
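

# A minimal usage sketch, assuming a whitespace-tokenized text file; the file
# name and threshold below are illustrative, not part of this module:
#
#   vocab = gen_vocab('corpus.txt', unk_threshold=1)
#   ids = get_numberized_sentence('hello world', vocab)
#
# Tokens seen unk_threshold times or fewer are left out of the vocabulary,
# so they numberize to vocab[UNK].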


def rnn_unidirectional_layer(
    model,
    inputs,
    input_lengths,
    input_size,
    num_units,
    dropout_keep_prob,
    forward_only,
    return_sequence_output,
    return_final_state,
    scope=None,
):
    """ Unidirectional LSTM encoder."""
    with core.NameScope(scope):
        initial_cell_state = model.param_init_net.ConstantFill(
            [],
            'initial_cell_state',
            shape=[num_units],
            value=0.0,
        )
        initial_hidden_state = model.param_init_net.ConstantFill(
            [],
            'initial_hidden_state',
            shape=[num_units],
            value=0.0,
        )

    cell = rnn_cell.LSTMCell(
        input_size=input_size,
        hidden_size=num_units,
        forget_bias=0.0,
        memory_optimization=False,
        name=(scope + '/' if scope else '') + 'lstm',
        forward_only=forward_only,
    )

    dropout_ratio = (
        None if dropout_keep_prob is None else (1.0 - dropout_keep_prob)
    )
    if dropout_ratio is not None:
        cell = rnn_cell.DropoutCell(
            internal_cell=cell,
            dropout_ratio=dropout_ratio,
            name=(scope + '/' if scope else '') + 'dropout',
            forward_only=forward_only,
        )

    outputs_with_grads = []
    if return_sequence_output:
        outputs_with_grads.append(0)
    if return_final_state:
        outputs_with_grads.extend([1, 3])

    # apply_over_sequence returns the per-step outputs plus the state blobs
    # (all_hidden, final_hidden, all_cell, final_cell).
    outputs, (_, final_hidden_state, _, final_cell_state) = (
        cell.apply_over_sequence(
            model=model,
            inputs=inputs,
            seq_lengths=input_lengths,
            initial_states=(initial_hidden_state, initial_cell_state),
            outputs_with_grads=outputs_with_grads,
        )
    )
    return outputs, final_hidden_state, final_cell_state
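

# A minimal call sketch, assuming `model` is a ModelHelper and the two blob
# names below already exist in its workspace (all names and sizes here are
# illustrative):
#
#   outputs, h_final, c_final = rnn_unidirectional_layer(
#       model, 'encoder_inputs', 'encoder_lengths',
#       input_size=128, num_units=256,
#       dropout_keep_prob=None, forward_only=False,
#       return_sequence_output=True, return_final_state=True,
#       scope='encoder/layer0',
#   )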


def rnn_bidirectional_layer(
    model,
    inputs,
    input_lengths,
    input_size,
    num_units,
    dropout_keep_prob,
    forward_only,
    return_sequence_output,
    return_final_state,
    scope=None,
):
    outputs_fw, final_hidden_fw, final_cell_fw = rnn_unidirectional_layer(
        model,
        inputs,
        input_lengths,
        input_size,
        num_units,
        dropout_keep_prob,
        forward_only,
        return_sequence_output,
        return_final_state,
        scope=(scope + '/' if scope else '') + 'fw',
    )
    with core.NameScope(scope):
        reversed_inputs = model.net.ReversePackedSegs(
            [inputs, input_lengths],
            ['reversed_inputs'],
        )
    outputs_bw, final_hidden_bw, final_cell_bw = rnn_unidirectional_layer(
        model,
        reversed_inputs,
        input_lengths,
        input_size,
        num_units,
        dropout_keep_prob,
        forward_only,
        return_sequence_output,
        return_final_state,
        scope=(scope + '/' if scope else '') + 'bw',
    )
    with core.NameScope(scope):
        outputs_bw = model.net.ReversePackedSegs(
            [outputs_bw, input_lengths],
            ['outputs_bw'],
        )

    if return_sequence_output:
        with core.NameScope(scope):
            outputs, _ = model.net.Concat(
                [outputs_fw, outputs_bw],
                ['outputs', 'outputs_dim'],
                axis=2,
            )
    else:
        outputs = None

    if return_final_state:
        with core.NameScope(scope):
            final_hidden_state, _ = model.net.Concat(
                [final_hidden_fw, final_hidden_bw],
                ['final_hidden_state', 'final_hidden_state_dim'],
                axis=2,
            )
            final_cell_state, _ = model.net.Concat(
                [final_cell_fw, final_cell_bw],
                ['final_cell_state', 'final_cell_state_dim'],
                axis=2,
            )
    else:
        final_hidden_state = None
        final_cell_state = None

    return outputs, final_hidden_state, final_cell_state
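

# Note that the forward and backward results are concatenated on the feature
# axis, so a bidirectional layer with num_units=256 produces 512 features per
# step; build_embedding_encoder below accounts for this via output_dims.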


def build_embeddings(
    model,
    vocab_size,
    embedding_size,
    name,
    freeze_embeddings,
):
    embeddings = model.param_init_net.GaussianFill(
        [],
        name,
        shape=[vocab_size, embedding_size],
    )
    # Frozen embeddings are left out of model.params, so no gradient
    # updates are applied to them.
    if not freeze_embeddings:
        model.params.append(embeddings)
    return embeddings


def get_layer_scope(scope, layer_type, i):
    prefix = (scope + '/' if scope else '') + layer_type
    return '{}/layer{}'.format(prefix, i)
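

# For example, get_layer_scope('seq2seq', 'encoder', 0) returns
# 'seq2seq/encoder/layer0'; with scope=None it returns 'encoder/layer0'.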


def build_embedding_encoder(
    model,
    encoder_params,
    num_decoder_layers,
    inputs,
    input_lengths,
    vocab_size,
    embeddings,
    embedding_size,
    use_attention,
    num_gpus=0,
    forward_only=False,
    scope=None,
):
    with core.NameScope(scope or ''):
        if num_gpus == 0:
            embedded_encoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_encoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_encoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_encoder_inputs_cpu'],
                )
            embedded_encoder_inputs = model.CopyCPUToGPU(
                embedded_encoder_inputs_cpu,
                'embedded_encoder_inputs',
            )

    layer_inputs = embedded_encoder_inputs
    layer_input_size = embedding_size
    encoder_units_per_layer = []
    final_encoder_hidden_states = []
    final_encoder_cell_states = []

    num_encoder_layers = len(encoder_params['encoder_layer_configs'])
    use_bidirectional_encoder = encoder_params.get(
        'use_bidirectional_encoder',
        False,
    )

    for i, layer_config in enumerate(encoder_params['encoder_layer_configs']):
        if use_bidirectional_encoder and i == 0:
            layer_func = rnn_bidirectional_layer
            output_dims = 2 * layer_config['num_units']
        else:
            layer_func = rnn_unidirectional_layer
            output_dims = layer_config['num_units']
        encoder_units_per_layer.append(output_dims)

        is_final_layer = (i == num_encoder_layers - 1)

        dropout_keep_prob = layer_config.get(
            'dropout_keep_prob',
            None,
        )

        return_final_state = i >= (num_encoder_layers - num_decoder_layers)
        (
            layer_outputs,
            final_layer_hidden_state,
            final_layer_cell_state,
        ) = layer_func(
            model=model,
            inputs=layer_inputs,
            input_lengths=input_lengths,
            input_size=layer_input_size,
            num_units=layer_config['num_units'],
            dropout_keep_prob=dropout_keep_prob,
            forward_only=forward_only,
            return_sequence_output=(not is_final_layer) or use_attention,
            return_final_state=return_final_state,
            scope=get_layer_scope(scope, 'encoder', i),
        )

        if not is_final_layer:
            layer_inputs = layer_outputs
            layer_input_size = output_dims
        final_encoder_hidden_states.append(final_layer_hidden_state)
        final_encoder_cell_states.append(final_layer_cell_state)

    encoder_outputs = layer_outputs
    weighted_encoder_outputs = None

    return (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_states,
        final_encoder_cell_states,
        encoder_units_per_layer,
    )
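

# A hypothetical encoder_params value (the keys mirror the ones read above;
# the sizes are illustrative):
#
#   encoder_params = {
#       'use_bidirectional_encoder': True,
#       'encoder_layer_configs': [
#           {'num_units': 256, 'dropout_keep_prob': 0.8},
#           {'num_units': 512},
#       ],
#   }
#
# With this config, layer 0 is bidirectional (output_dims = 512) and layer 1
# is unidirectional.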


class LSTMWithAttentionDecoder(object):

    def scope(self, name):
        return self.name + '/' + name if self.name is not None else name

    def _get_attention_type(self, attention_type_as_string):
        if attention_type_as_string == 'regular':
            return attention.AttentionType.Regular
        elif attention_type_as_string == 'recurrent':
            return attention.AttentionType.Recurrent
        else:
            assert False, 'Unknown type ' + attention_type_as_string

    def __init__(
        self,
        encoder_outputs,
        encoder_output_dim,
        encoder_lengths,
        vocab_size,
        attention_type,
        embedding_size,
        decoder_num_units,
        decoder_cells,
        residual_output_layers=None,
        name=None,
        weighted_encoder_outputs=None,
    ):
        self.name = name
        self.num_layers = len(decoder_cells)
        if attention_type == 'none':
            self.cell = rnn_cell.MultiRNNCell(
                decoder_cells,
                name=self.scope('decoder'),
                residual_output_layers=residual_output_layers,
            )
            self.use_attention = False
            self.decoder_output_dim = decoder_num_units
            self.output_indices = self.cell.output_indices
        else:
            decoder_cell = rnn_cell.MultiRNNCell(
                decoder_cells,
                name=self.scope('decoder'),
                residual_output_layers=residual_output_layers,
            )
            self.cell = rnn_cell.AttentionCell(
                encoder_output_dim=encoder_output_dim,
                encoder_outputs=encoder_outputs,
                encoder_lengths=encoder_lengths,
                decoder_cell=decoder_cell,
                decoder_state_dim=decoder_num_units,
                name=self.scope('attention_decoder'),
                attention_type=self._get_attention_type(attention_type),
                weighted_encoder_outputs=weighted_encoder_outputs,
                attention_memory_optimization=True,
            )
            self.use_attention = True
            # Attention concatenates the weighted encoder context onto the
            # decoder output and exposes it as one more recurrent state.
            self.decoder_output_dim = decoder_num_units + encoder_output_dim
            self.output_indices = decoder_cell.output_indices
            self.output_indices.append(2 * self.num_layers)

    def get_state_names(self):
        return self.cell.get_state_names()

    def get_outputs_with_grads(self):
        return [2 * i for i in self.output_indices]

    def get_output_dim(self):
        return self.decoder_output_dim

    def get_attention_weights(self):
        assert self.use_attention
        # [batch_size, encoder_length, 1]
        return self.cell.get_attention_weights()

    def apply(
        self,
        model,
        input_t,
        seq_lengths,
        states,
        timestep,
    ):
        return self.cell.apply(
            model=model,
            input_t=input_t,
            seq_lengths=seq_lengths,
            states=states,
            timestep=timestep,
        )

    def apply_over_sequence(
        self,
        model,
        inputs,
        seq_lengths,
        initial_states,
    ):
        return self.cell.apply_over_sequence(
            model=model,
            inputs=inputs,
            seq_lengths=seq_lengths,
            initial_states=initial_states,
            outputs_with_grads=self.get_outputs_with_grads(),
        )


def build_initial_rnn_decoder_states(
    model,
    encoder_units_per_layer,
    decoder_units_per_layer,
    final_encoder_hidden_states,
    final_encoder_cell_states,
    use_attention,
):
    num_encoder_layers = len(encoder_units_per_layer)
    num_decoder_layers = len(decoder_units_per_layer)
    if num_encoder_layers > num_decoder_layers:
        offset = num_encoder_layers - num_decoder_layers
    else:
        offset = 0

    initial_states = []
    for i, decoder_num_units in enumerate(decoder_units_per_layer):

        if (
            final_encoder_hidden_states and
            len(final_encoder_hidden_states) > (i + offset)
        ):
            final_encoder_hidden_state = final_encoder_hidden_states[i + offset]
        else:
            final_encoder_hidden_state = None

        if final_encoder_hidden_state is None:
            decoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_hidden_state_{}'.format(i),
                shape=[decoder_num_units],
                value=0.0,
            )
            model.params.append(decoder_initial_hidden_state)
        elif decoder_num_units != encoder_units_per_layer[i + offset]:
            # Encoder and decoder sizes disagree: bridge with a learned
            # linear projection.
            decoder_initial_hidden_state = brew.fc(
                model,
                final_encoder_hidden_state,
                'decoder_initial_hidden_state_{}'.format(i),
                encoder_units_per_layer[i + offset],
                decoder_num_units,
                axis=2,
            )
        else:
            decoder_initial_hidden_state = final_encoder_hidden_state
        initial_states.append(decoder_initial_hidden_state)

        if (
            final_encoder_cell_states and
            len(final_encoder_cell_states) > (i + offset)
        ):
            final_encoder_cell_state = final_encoder_cell_states[i + offset]
        else:
            final_encoder_cell_state = None

        if final_encoder_cell_state is None:
            decoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_cell_state_{}'.format(i),
                shape=[decoder_num_units],
                value=0.0,
            )
            model.params.append(decoder_initial_cell_state)
        elif decoder_num_units != encoder_units_per_layer[i + offset]:
            decoder_initial_cell_state = brew.fc(
                model,
                final_encoder_cell_state,
                'decoder_initial_cell_state_{}'.format(i),
                encoder_units_per_layer[i + offset],
                decoder_num_units,
                axis=2,
            )
        else:
            decoder_initial_cell_state = final_encoder_cell_state
        initial_states.append(decoder_initial_cell_state)

    if use_attention:
        initial_attention_weighted_encoder_context = (
            model.param_init_net.ConstantFill(
                [],
                'initial_attention_weighted_encoder_context',
                shape=[encoder_units_per_layer[-1]],
                value=0.0,
            )
        )
        model.params.append(initial_attention_weighted_encoder_context)
        initial_states.append(initial_attention_weighted_encoder_context)

    return initial_states
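

# Alignment sketch: with 3 encoder layers and 2 decoder layers, offset is 1,
# so decoder layer 0 is seeded from encoder layer 1 and decoder layer 1 from
# encoder layer 2; the extra leading encoder layer is skipped.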


def build_embedding_decoder(
    model,
    decoder_layer_configs,
    inputs,
    input_lengths,
    encoder_lengths,
    encoder_outputs,
    weighted_encoder_outputs,
    final_encoder_hidden_states,
    final_encoder_cell_states,
    encoder_units_per_layer,
    vocab_size,
    embeddings,
    embedding_size,
    attention_type,
    forward_only,
    num_gpus=0,
    scope=None,
):
    with core.NameScope(scope or ''):
        if num_gpus == 0:
            embedded_decoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_decoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_decoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_decoder_inputs_cpu'],
                )
            embedded_decoder_inputs = model.CopyCPUToGPU(
                embedded_decoder_inputs_cpu,
                'embedded_decoder_inputs',
            )

    decoder_cells = []
    decoder_units_per_layer = []
    for i, layer_config in enumerate(decoder_layer_configs):
        num_units = layer_config['num_units']
        decoder_units_per_layer.append(num_units)

        if i == 0:
            input_size = embedding_size
        else:
            input_size = decoder_cells[-1].get_output_dim()

        cell = rnn_cell.LSTMCell(
            forward_only=forward_only,
            input_size=input_size,
            hidden_size=num_units,
            forget_bias=0.0,
            memory_optimization=False,
        )

        dropout_keep_prob = layer_config.get('dropout_keep_prob', None)
        if dropout_keep_prob is not None:
            dropout_ratio = 1.0 - dropout_keep_prob
            cell = rnn_cell.DropoutCell(
                internal_cell=cell,
                dropout_ratio=dropout_ratio,
                forward_only=forward_only,
                name=get_layer_scope(scope, 'decoder_dropout', i),
            )

        decoder_cells.append(cell)

    states = build_initial_rnn_decoder_states(
        model=model,
        encoder_units_per_layer=encoder_units_per_layer,
        decoder_units_per_layer=decoder_units_per_layer,
        final_encoder_hidden_states=final_encoder_hidden_states,
        final_encoder_cell_states=final_encoder_cell_states,
        use_attention=(attention_type != 'none'),
    )
    attention_decoder = LSTMWithAttentionDecoder(
        encoder_outputs=encoder_outputs,
        encoder_output_dim=encoder_units_per_layer[-1],
        encoder_lengths=encoder_lengths,
        vocab_size=vocab_size,
        attention_type=attention_type,
        embedding_size=embedding_size,
        decoder_num_units=decoder_units_per_layer[-1],
        decoder_cells=decoder_cells,
        weighted_encoder_outputs=weighted_encoder_outputs,
        name=scope,
    )
    decoder_outputs, _ = attention_decoder.apply_over_sequence(
        model=model,
        inputs=embedded_decoder_inputs,
        seq_lengths=input_lengths,
        initial_states=states,
    )

    # Flatten (max_length_in_batch, batch_size, output_dim) into a 2D matrix
    # so one softmax can run over the whole sequence; -1 is used because
    # max_length is not known at net-construction time.
    decoder_outputs_flattened, _ = model.net.Reshape(
        [decoder_outputs],
        [
            'decoder_outputs_flattened',
            'decoder_outputs_and_contexts_combination_old_shape',
        ],
        shape=[-1, attention_decoder.get_output_dim()],
    )

    decoder_outputs = decoder_outputs_flattened
    decoder_output_dim = attention_decoder.get_output_dim()

    return (decoder_outputs, decoder_output_dim)
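

# A hypothetical decoder_layer_configs value (sizes are illustrative):
#
#   decoder_layer_configs = [
#       {'num_units': 512},
#       {'num_units': 512, 'dropout_keep_prob': 0.8},
#   ]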


def output_projection(
    model,
    decoder_outputs,
    decoder_output_size,
    target_vocab_size,
    decoder_softmax_size,
):
    if decoder_softmax_size is not None:
        decoder_outputs = brew.fc(
            model,
            decoder_outputs,
            'decoder_outputs_scaled',
            dim_in=decoder_output_size,
            dim_out=decoder_softmax_size,
        )
        decoder_output_size = decoder_softmax_size

    output_projection_w = model.param_init_net.XavierFill(
        [],
        'output_projection_w',
        shape=[target_vocab_size, decoder_output_size],
    )
    output_projection_b = model.param_init_net.XavierFill(
        [],
        'output_projection_b',
        shape=[target_vocab_size],
    )
    model.params.extend([output_projection_w, output_projection_b])
    output_logits = model.net.FC(
        [decoder_outputs, output_projection_w, output_projection_b],
        ['output_logits'],
    )
    return output_logits
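

# End-to-end sketch, assuming the helpers above; every name and size is an
# illustrative assumption, and a real setup still needs input blobs plus a
# loss and optimizer on top:
#
#   from caffe2.python import model_helper
#
#   model = model_helper.ModelHelper(name='seq2seq')
#   embeddings = build_embeddings(
#       model, vocab_size=32000, embedding_size=256,
#       name='encoder_embeddings', freeze_embeddings=False,
#   )
#   (encoder_outputs, weighted_encoder_outputs, final_hidden, final_cell,
#    units_per_layer) = build_embedding_encoder(...)
#   decoder_outputs, decoder_output_dim = build_embedding_decoder(...)
#   output_logits = output_projection(
#       model, decoder_outputs, decoder_output_dim,
#       target_vocab_size=32000, decoder_softmax_size=None,
#   )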