Caffe2 - Python API
A deep learning, cross-platform ML framework
char_rnn.py
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################

## @package char_rnn
# Module caffe2.python.examples.char_rnn
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core, workspace, model_helper, utils, brew
from caffe2.python.rnn_cell import LSTM
from caffe2.proto import caffe2_pb2
from caffe2.python.optimizer import build_sgd


import argparse
import logging
import numpy as np
from datetime import datetime

'''
This script takes a text file as input and uses a recurrent neural network
to learn to predict the next character in a sequence.
'''
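# Example invocation (the corpus path below is hypothetical; every flag is
# defined by the argparse setup in main() at the bottom of this file):
#
#   python char_rnn.py --train_data shakespeare.txt --batch_size 32 --gpu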
logging.basicConfig()
log = logging.getLogger("char_rnn")
log.setLevel(logging.DEBUG)


# Default set() here is intentional as it would accumulate values like a global
# variable
def CreateNetOnce(net, created_names=set()): # noqa
    name = net.Name()
    if name not in created_names:
        created_names.add(name)
        workspace.CreateNet(net)

class CharRNN(object):
    def __init__(self, args):
        self.seq_length = args.seq_length
        self.batch_size = args.batch_size
        self.iters_to_report = args.iters_to_report
        self.hidden_size = args.hidden_size

        with open(args.train_data) as f:
            self.text = f.read()

        self.vocab = list(set(self.text))
        self.char_to_idx = {ch: idx for idx, ch in enumerate(self.vocab)}
        self.idx_to_char = {idx: ch for idx, ch in enumerate(self.vocab)}
        self.D = len(self.char_to_idx)

        print("Input has {} characters. Total input size: {}".format(
            len(self.vocab), len(self.text)))

    def CreateModel(self):
        log.debug("Start training")
        model = model_helper.ModelHelper(name="char_rnn")

        input_blob, seq_lengths, hidden_init, cell_init, target = \
            model.net.AddExternalInputs(
                'input_blob',
                'seq_lengths',
                'hidden_init',
                'cell_init',
                'target',
            )

        hidden_output_all, self.hidden_output, _, self.cell_state = LSTM(
            model, input_blob, seq_lengths, (hidden_init, cell_init),
            self.D, self.hidden_size, scope="LSTM")
        output = brew.fc(
            model,
            hidden_output_all,
            None,
            dim_in=self.hidden_size,
            dim_out=self.D,
            axis=2
        )

        # axis is 2 because the first two dimensions are T (time) and
        # N (batch size); we treat them as one big batch of size T * N
        softmax = model.net.Softmax(output, 'softmax', axis=2)

        softmax_reshaped, _ = model.net.Reshape(
            softmax, ['softmax_reshaped', '_'], shape=[-1, self.D])

        # Create a copy of the current net. We will use it on the forward
        # pass, where we don't need the loss and backward operators
        self.forward_net = core.Net(model.net.Proto())

        xent = model.net.LabelCrossEntropy([softmax_reshaped, target], 'xent')
        # The loss is averaged both across the batch and through time.
        # That's why the learning rate below is multiplied by self.seq_length
        loss = model.net.AveragedLoss(xent, 'loss')
        model.AddGradientOperators([loss])

        # use the build_sgd helper to add an SGD optimizer to the model
        build_sgd(
            model,
            base_learning_rate=0.1 * self.seq_length,
            policy="step",
            stepsize=1,
            gamma=0.9999
        )

        self.model = model
        self.predictions = softmax
        self.loss = loss

        self.prepare_state = core.Net("prepare_state")
        self.prepare_state.Copy(self.hidden_output, hidden_init)
        self.prepare_state.Copy(self.cell_state, cell_init)

    def _idx_at_pos(self, pos):
        return self.char_to_idx[self.text[pos]]

    def TrainModel(self):
        log.debug("Training model")

        workspace.RunNetOnce(self.model.param_init_net)

        # Initial smooth loss, as if we predicted the same probability for
        # each character
        smooth_loss = -np.log(1.0 / self.D) * self.seq_length
        last_n_iter = 0
        last_n_loss = 0.0
        num_iter = 0
        N = len(self.text)

        # We split the text into batch_size pieces. Each piece will be used
        # only by the corresponding batch entry during the training process
        text_block_positions = np.zeros(self.batch_size, dtype=np.int32)
        text_block_size = N // self.batch_size
        text_block_starts = list(range(0, N, text_block_size))
        text_block_sizes = [text_block_size] * self.batch_size
        text_block_sizes[self.batch_size - 1] += N % self.batch_size
        assert sum(text_block_sizes) == N

        # Write to the output states, which will be copied to the input
        # states within the loop below
        workspace.FeedBlob(self.hidden_output, np.zeros(
            [1, self.batch_size, self.hidden_size], dtype=np.float32
        ))
        workspace.FeedBlob(self.cell_state, np.zeros(
            [1, self.batch_size, self.hidden_size], dtype=np.float32
        ))
        workspace.CreateNet(self.prepare_state)

        # We iterate over the text in a loop many times. Each time we pick a
        # seq_length segment and feed it to the LSTM as a sequence
        last_time = datetime.now()
        progress = 0
        while True:
            workspace.FeedBlob(
                "seq_lengths",
                np.array([self.seq_length] * self.batch_size,
                         dtype=np.int32)
            )
            workspace.RunNet(self.prepare_state.Name())

            input = np.zeros(
                [self.seq_length, self.batch_size, self.D]
            ).astype(np.float32)
            target = np.zeros(
                [self.seq_length * self.batch_size]
            ).astype(np.int32)

            for e in range(self.batch_size):
                for i in range(self.seq_length):
                    pos = text_block_starts[e] + text_block_positions[e]
                    input[i][e][self._idx_at_pos(pos)] = 1
                    target[i * self.batch_size + e] =\
                        self._idx_at_pos((pos + 1) % N)
                    text_block_positions[e] = (
                        text_block_positions[e] + 1) % text_block_sizes[e]
                    progress += 1

            workspace.FeedBlob('input_blob', input)
            workspace.FeedBlob('target', target)

            CreateNetOnce(self.model.net)
            workspace.RunNet(self.model.net.Name())

            num_iter += 1
            last_n_iter += 1

            if num_iter % self.iters_to_report == 0:
                new_time = datetime.now()
                print("Characters Per Second: {}".format(
                    int(progress / (new_time - last_time).total_seconds())
                ))
                print("Iterations Per Second: {}".format(
                    int(self.iters_to_report /
                        (new_time - last_time).total_seconds())
                ))

                last_time = new_time
                progress = 0

                print("{} Iteration {} {}".
                      format('-' * 10, num_iter, '-' * 10))

            loss = workspace.FetchBlob(self.loss) * self.seq_length
            smooth_loss = 0.999 * smooth_loss + 0.001 * loss
            last_n_loss += loss

            if num_iter % self.iters_to_report == 0:
                self.GenerateText(500, np.random.choice(self.vocab))

                log.debug("Loss since last report: {}"
                          .format(last_n_loss / last_n_iter))
                log.debug("Smooth loss: {}".format(smooth_loss))

                last_n_loss = 0.0
                last_n_iter = 0

    def GenerateText(self, num_characters, ch):
        # Given a starting symbol, we feed a fake sequence of size 1 to
        # our RNN num_characters times. After each step we use the output
        # probabilities to pick the next character to feed to the network.
        # That same character also becomes part of the output
        CreateNetOnce(self.forward_net)

        text = '' + ch
        for _i in range(num_characters):
            workspace.FeedBlob(
                "seq_lengths", np.array([1] * self.batch_size, dtype=np.int32))
            workspace.RunNet(self.prepare_state.Name())

            input = np.zeros([1, self.batch_size, self.D]).astype(np.float32)
            input[0][0][self.char_to_idx[ch]] = 1

            workspace.FeedBlob("input_blob", input)
            workspace.RunNet(self.forward_net.Name())

            p = workspace.FetchBlob(self.predictions)
            next = np.random.choice(self.D, p=p[0][0])

            ch = self.idx_to_char[next]
            text += ch

        print(text)

@utils.debug
def main():
    parser = argparse.ArgumentParser(
        description="Caffe2: Char RNN Training"
    )
    parser.add_argument("--train_data", type=str, default=None,
                        help="Path to training data in a text file format",
                        required=True)
    parser.add_argument("--seq_length", type=int, default=25,
                        help="One training example sequence length")
    parser.add_argument("--batch_size", type=int, default=1,
                        help="Training batch size")
    parser.add_argument("--iters_to_report", type=int, default=500,
                        help="How often to report loss and generate text")
    parser.add_argument("--hidden_size", type=int, default=100,
                        help="Dimension of the hidden representation")
    parser.add_argument("--gpu", action="store_true",
                        help="If set, training is going to use GPU 0")

    args = parser.parse_args()

    device = core.DeviceOption(
        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)
    with core.DeviceScope(device):
        model = CharRNN(args)
        model.CreateModel()
        model.TrainModel()


if __name__ == '__main__':
    workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])
    main()
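For reference, a minimal sketch of driving this example programmatically rather than through the command line. The corpus path is hypothetical, the hyper-parameter values simply mirror the argparse defaults in main() above, and TrainModel() loops until interrupted, exactly as when the script is launched directly:

from argparse import Namespace

from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.examples.char_rnn import CharRNN

workspace.GlobalInit(['caffe2', '--caffe2_log_level=2'])

# Hypothetical settings; values mirror the command-line defaults in main()
args = Namespace(train_data='input.txt', seq_length=25, batch_size=1,
                 iters_to_report=500, hidden_size=100)

# CPU device scope; main() switches to caffe2_pb2.CUDA when --gpu is passed
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
    rnn = CharRNN(args)
    rnn.CreateModel()
    rnn.TrainModel()  # reports loss and samples text every iters_to_report iterations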