Caffe2 - Python API
A deep learning, cross-platform ML framework
char_rnn.py
## @package char_rnn
# Module caffe2.python.examples.char_rnn
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, workspace, model_helper, utils, brew
from caffe2.python.rnn_cell import LSTM
from caffe2.proto import caffe2_pb2
from caffe2.python.optimizer import build_sgd


import argparse
import logging
import numpy as np
from datetime import datetime

'''
This script takes a text file as input and uses a recurrent neural network
to learn to predict the next character in a sequence.
'''
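# Example invocation (illustrative; 'shakespeare.txt' is just a placeholder
# for any plain-text training corpus, and the values shown are the documented
# defaults except for --batch_size):
#
#   python char_rnn.py --train_data shakespeare.txt --batch_size 32 \
#       --seq_length 25 --hidden_size 100 --iters_to_report 500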

logging.basicConfig()
log = logging.getLogger("char_rnn")
log.setLevel(logging.DEBUG)


# The default set() here is intentional: it accumulates values across calls,
# like a global variable, so each net is only created once.
def CreateNetOnce(net, created_names=set()):  # noqa
    name = net.Name()
    if name not in created_names:
        created_names.add(name)
        workspace.CreateNet(net)


class CharRNN(object):
    def __init__(self, args):
        self.seq_length = args.seq_length
        self.batch_size = args.batch_size
        self.iters_to_report = args.iters_to_report
        self.hidden_size = args.hidden_size

        with open(args.train_data) as f:
            self.text = f.read()

        self.vocab = list(set(self.text))
        self.char_to_idx = {ch: idx for idx, ch in enumerate(self.vocab)}
        self.idx_to_char = {idx: ch for idx, ch in enumerate(self.vocab)}
        self.D = len(self.char_to_idx)

        print("Input has {} unique characters. Total input size: {}".format(
            len(self.vocab), len(self.text)))
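    # Illustration (hypothetical toy corpus "abcabc"): vocab would contain
    # 'a', 'b' and 'c' in some order, so self.D == 3; char_to_idx maps each
    # character to an integer index and idx_to_char is its inverse. The
    # network's one-hot inputs and integer targets use these indices.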
    def CreateModel(self):
        log.debug("Start training")
        model = model_helper.ModelHelper(name="char_rnn")

        input_blob, seq_lengths, hidden_init, cell_init, target = \
            model.net.AddExternalInputs(
                'input_blob',
                'seq_lengths',
                'hidden_init',
                'cell_init',
                'target',
            )

        hidden_output_all, self.hidden_output, _, self.cell_state = LSTM(
            model, input_blob, seq_lengths, (hidden_init, cell_init),
            self.D, self.hidden_size, scope="LSTM")
        output = brew.fc(
            model,
            hidden_output_all,
            None,
            dim_in=self.hidden_size,
            dim_out=self.D,
            axis=2
        )

        # axis is 2 because the first two dimensions are T (time) and
        # N (batch size). We treat them as one big batch of size T * N
        softmax = model.net.Softmax(output, 'softmax', axis=2)

        softmax_reshaped, _ = model.net.Reshape(
            softmax, ['softmax_reshaped', '_'], shape=[-1, self.D])
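        # Shape walkthrough: input_blob holds one-hot vectors of shape
        # [seq_length, batch_size, D]; hidden_output_all is
        # [seq_length, batch_size, hidden_size]; softmax is
        # [seq_length, batch_size, D]; the Reshape flattens time and batch
        # into [seq_length * batch_size, D] so it lines up with the flat
        # integer labels fed as 'target' in TrainModel below.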

        # Create a copy of the current net. We will use it on the forward
        # pass where we don't need loss and backward operators
        self.forward_net = core.Net(model.net.Proto())

        xent = model.net.LabelCrossEntropy([softmax_reshaped, target], 'xent')
        # Loss is averaged both across the batch and through time.
        # That's why the learning rate below is multiplied by self.seq_length
        loss = model.net.AveragedLoss(xent, 'loss')
        model.AddGradientOperators([loss])

        # Use the build_sgd helper to add an SGD optimizer to the model
        build_sgd(
            model,
            base_learning_rate=0.1 * self.seq_length,
            policy="step",
            stepsize=1,
            gamma=0.9999
        )
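        # With the "step" policy, stepsize=1 and gamma=0.9999, the effective
        # learning rate decays every iteration:
        #     lr(iter) = base_learning_rate * 0.9999 ** iter
        # e.g. after 10,000 iterations it is roughly 0.37 * base_learning_rate.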

        self.model = model
        self.predictions = softmax
        self.loss = loss

        self.prepare_state = core.Net("prepare_state")
        self.prepare_state.Copy(self.hidden_output, hidden_init)
        self.prepare_state.Copy(self.cell_state, cell_init)
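        # prepare_state copies the LSTM's last hidden/cell state into the
        # *_init blobs, so each new seq_length chunk continues from where the
        # previous chunk left off rather than restarting from zeros.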

    def _idx_at_pos(self, pos):
        return self.char_to_idx[self.text[pos]]

    def TrainModel(self):
        log.debug("Training model")

        workspace.RunNetOnce(self.model.param_init_net)

        # As though we predict the same probability for each character
        smooth_loss = -np.log(1.0 / self.D) * self.seq_length
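        # For example (hypothetical vocabulary of D == 65 characters and the
        # default seq_length of 25): -log(1/65) is about 4.17 per character,
        # so smooth_loss starts at roughly 104 per sequence.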
        last_n_iter = 0
        last_n_loss = 0.0
        num_iter = 0
        N = len(self.text)

        # We split the text into batch_size pieces. Each piece is used only
        # by the corresponding batch position during training
        text_block_positions = np.zeros(self.batch_size, dtype=np.int32)
        text_block_size = N // self.batch_size
        text_block_starts = list(range(0, N, text_block_size))
        text_block_sizes = [text_block_size] * self.batch_size
        text_block_sizes[self.batch_size - 1] += N % self.batch_size
        assert sum(text_block_sizes) == N
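        # Worked example (hypothetical numbers): with N == 1000 and
        # batch_size == 4, text_block_size == 250 and text_block_starts ==
        # [0, 250, 500, 750], so each batch position walks through its own
        # 250-character slice; any remainder (N % batch_size) is absorbed by
        # the last block.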

        # Writing to output states which will be copied to input
        # states within the loop below
        workspace.FeedBlob(self.hidden_output, np.zeros(
            [1, self.batch_size, self.hidden_size], dtype=np.float32
        ))
        workspace.FeedBlob(self.cell_state, np.zeros(
            [1, self.batch_size, self.hidden_size], dtype=np.float32
        ))
        workspace.CreateNet(self.prepare_state)

        # We iterate over the text in a loop many times. Each time we pick a
        # seq_length-long segment and feed it to the LSTM as a sequence
        last_time = datetime.now()
        progress = 0
        while True:
            workspace.FeedBlob(
                "seq_lengths",
                np.array([self.seq_length] * self.batch_size,
                         dtype=np.int32)
            )
            workspace.RunNet(self.prepare_state.Name())

            input = np.zeros(
                [self.seq_length, self.batch_size, self.D]
            ).astype(np.float32)
            target = np.zeros(
                [self.seq_length * self.batch_size]
            ).astype(np.int32)

            for e in range(self.batch_size):
                for i in range(self.seq_length):
                    pos = text_block_starts[e] + text_block_positions[e]
                    input[i][e][self._idx_at_pos(pos)] = 1
                    target[i * self.batch_size + e] = \
                        self._idx_at_pos((pos + 1) % N)
                    text_block_positions[e] = (
                        text_block_positions[e] + 1) % text_block_sizes[e]
                    progress += 1
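            # Each time step holds a one-hot row: if the character at 'pos'
            # maps to index 7, then input[i][e][7] == 1.0 and the rest of that
            # row stays 0. 'target' is laid out time-major (i * batch_size + e)
            # to match how the [T, N, D] softmax was reshaped to [T * N, D].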

            workspace.FeedBlob('input_blob', input)
            workspace.FeedBlob('target', target)

            CreateNetOnce(self.model.net)
            workspace.RunNet(self.model.net.Name())

            num_iter += 1
            last_n_iter += 1

            if num_iter % self.iters_to_report == 0:
                new_time = datetime.now()
                print("Characters Per Second: {}".format(
                    int(progress / (new_time - last_time).total_seconds())
                ))
                print("Iterations Per Second: {}".format(
                    int(self.iters_to_report /
                        (new_time - last_time).total_seconds())
                ))

                last_time = new_time
                progress = 0

                print("{} Iteration {} {}".format(
                    '-' * 10, num_iter, '-' * 10))

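            # AveragedLoss above gives a per-character loss (averaged over all
            # seq_length * batch_size positions), so multiplying by seq_length
            # turns it into a per-sequence loss comparable with the smooth_loss
            # initialization; smooth_loss then tracks it as an exponential
            # moving average.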
            loss = workspace.FetchBlob(self.loss) * self.seq_length
            smooth_loss = 0.999 * smooth_loss + 0.001 * loss
            last_n_loss += loss

            if num_iter % self.iters_to_report == 0:
                self.GenerateText(500, np.random.choice(self.vocab))

                log.debug("Loss since last report: {}"
                          .format(last_n_loss / last_n_iter))
                log.debug("Smooth loss: {}".format(smooth_loss))

                last_n_loss = 0.0
                last_n_iter = 0

    def GenerateText(self, num_characters, ch):
        # Given a starting symbol we feed a fake sequence of size 1 to
        # our RNN num_characters times. After each step we use the output
        # probabilities to pick the next character to feed to the network.
        # The same character becomes part of the output
        CreateNetOnce(self.forward_net)

        text = '' + ch
        for _i in range(num_characters):
            workspace.FeedBlob(
                "seq_lengths", np.array([1] * self.batch_size, dtype=np.int32))
            workspace.RunNet(self.prepare_state.Name())

            input = np.zeros([1, self.batch_size, self.D]).astype(np.float32)
            input[0][0][self.char_to_idx[ch]] = 1

            workspace.FeedBlob("input_blob", input)
            workspace.RunNet(self.forward_net.Name())

            p = workspace.FetchBlob(self.predictions)
            next_idx = np.random.choice(self.D, p=p[0][0])
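            # p has shape [1, batch_size, D]; p[0][0] is the softmax
            # distribution over the vocabulary for the single time step of
            # batch position 0 (the only one driven here), and
            # np.random.choice samples the next character index from it.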

            ch = self.idx_to_char[next_idx]
            text += ch

        print(text)


@utils.debug
def main():
    parser = argparse.ArgumentParser(
        description="Caffe2: Char RNN Training"
    )
    parser.add_argument("--train_data", type=str, default=None,
                        help="Path to training data in a text file format",
                        required=True)
    parser.add_argument("--seq_length", type=int, default=25,
                        help="One training example sequence length")
    parser.add_argument("--batch_size", type=int, default=1,
                        help="Training batch size")
    parser.add_argument("--iters_to_report", type=int, default=500,
                        help="How often to report loss and generate text")
    parser.add_argument("--hidden_size", type=int, default=100,
                        help="Dimension of the hidden representation")
    parser.add_argument("--gpu", action="store_true",
                        help="If set, training runs on GPU 0")

    args = parser.parse_args()

    device = core.DeviceOption(
        workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 0)
    with core.DeviceScope(device):
        model = CharRNN(args)
        model.CreateModel()
        model.TrainModel()


if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
    main()
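# For reference, a minimal sketch of driving CharRNN programmatically instead
# of via the command line; the Namespace fields mirror the CLI options above
# and 'input.txt' is just a placeholder path:
#
#   from argparse import Namespace
#   workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
#   args = Namespace(train_data='input.txt', seq_length=25, batch_size=32,
#                    iters_to_report=500, hidden_size=100)
#   rnn = CharRNN(args)
#   rnn.CreateModel()
#   rnn.TrainModel()  # loops until interrupted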