Caffe2 - Python API
A deep learning, cross-platform ML framework
tt_core.py
## @package tt_core
# Module caffe2.python.tt_core
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

"""
The following methods are various utility methods for using the Tensor-Train
decomposition, or TT-decomposition, introduced by I. V. Oseledets (2011) in
his paper (http://epubs.siam.org/doi/abs/10.1137/090752286).

Broadly speaking, these methods are used to replace fully connected layers in
neural networks with Tensor-Train layers introduced by A. Novikov et al.
(2015) in their paper (http://arxiv.org/abs/1509.06569). More details about
each of the methods are provided in each respective docstring.
"""


def init_tt_cores(inp_sizes, out_sizes, tt_ranks, seed=1234):
    """
    Initialize randomized orthogonalized TT-cores.

    This method should be used when a TT-layer is trained from scratch. The
    sizes of each of the cores are specified by the inp_sizes and out_sizes,
    and the respective tt_ranks will dictate the ranks of each of the cores.
    Note that a larger set of tt_ranks will result in slower computation but
    will result in more accurate approximations. The size of the ith core is:

        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1].

    Note that the following relationship between the lengths of the inputs is
    expected:

        len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1.

    Args:
        inp_sizes: list of the input dimensions of the respective cores
        out_sizes: list of the output dimensions of the respective cores
        tt_ranks: list of the ranks of the respective cores
        seed: integer to seed the random number generator

    Returns:
        cores: One-dimensional list of cores concatenated along an axis
    """
    np.random.seed(seed)

    # Assert that the sizes of the inputs are correct
    assert len(inp_sizes) == len(out_sizes), \
        "The number of input dimensions (" + str(len(inp_sizes)) + \
        ") must be equal to the number of output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert len(tt_ranks) == len(inp_sizes) + 1, \
        "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \
        "one more than the number of input and output dimensions (" + \
        str(len(out_sizes)) + ")."

    # Convert to numpy arrays
    inp_sizes = np.array(inp_sizes)
    out_sizes = np.array(out_sizes)
    tt_ranks = np.array(tt_ranks)

    # Initialize the cores array
    cores_len = np.sum(
        inp_sizes * out_sizes * tt_ranks[1:] * tt_ranks[:-1])
    cores = np.zeros(cores_len)
    cores_idx = 0
    rv = 1

    # Compute the full list of cores by computing each individual one
    for i in range(inp_sizes.shape[0]):
        shape = [tt_ranks[i],
                 inp_sizes[i],
                 out_sizes[i],
                 tt_ranks[i + 1]]

        # Precompute the shape of each core
        tall_shape = (np.prod(shape[:3]), shape[3])

        # Randomly initialize the current core using a normal distribution
        curr_core = np.dot(rv, np.random.normal(
            0, 1, size=(shape[0], np.prod(shape[1:]))))
        curr_core = curr_core.reshape(tall_shape)

        # Orthogonalize the initialized current core and append to cores list
        if i < inp_sizes.shape[0] - 1:
            curr_core, rv = np.linalg.qr(curr_core)
        cores[cores_idx:cores_idx +
              curr_core.size] = curr_core.flatten()
        cores_idx += curr_core.size

    # Normalize the list of arrays using this Glorot-style scaling trick
    glorot_style = (np.prod(inp_sizes) *
                    np.prod(tt_ranks))**(1.0 / inp_sizes.shape[0])

    return (0.1 / glorot_style) * np.array(cores).astype(np.float32)

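
# A minimal usage sketch (not part of the original module): initializing the
# cores for the hypothetical 784 -> 1024 factorization sketched above and
# checking the returned buffer's length against the core-size formula.
def _example_init_tt_cores():
    inp_sizes = [4, 7, 4, 7]    # np.prod(inp_sizes) == 784
    out_sizes = [4, 4, 8, 8]    # np.prod(out_sizes) == 1024
    tt_ranks = [1, 3, 3, 3, 1]  # len(tt_ranks) == len(inp_sizes) + 1
    cores = init_tt_cores(inp_sizes, out_sizes, tt_ranks, seed=1234)
    expected = sum(tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1]
                   for i in range(len(inp_sizes)))
    assert cores.size == expected  # 756 for these sizes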

def matrix_to_tt(W, inp_sizes, out_sizes, tt_ranks):
    """
    Convert a matrix into the TT-format.

    This method will consume a 2D weight matrix such as those used in fully
    connected layers in a neural network and will compute the TT-decomposition
    of the weight matrix and return the TT-cores of the resulting computation.
    This method should be used when converting a trained, fully connected
    layer into a TT-layer for increased speed and decreased parameter size.
    The size of the ith core is:

        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1].

    Note that the following relationship between the lengths of the inputs is
    expected:

        len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1.

    We also require that np.prod(inp_sizes) == W.shape[0] and that
    np.prod(out_sizes) == W.shape[1].

    Args:
        W: two-dimensional weight matrix numpy array representing a fully
           connected layer to be converted to TT-format; note that the weight
           matrix is transposed before being decomposed because we want to
           emulate the X * W^T operation that the FC layer performs.
        inp_sizes: list of the input dimensions of the respective cores
        out_sizes: list of the output dimensions of the respective cores
        tt_ranks: list of the ranks of the respective cores

    Returns:
        new_cores: One-dimensional list of cores concatenated along an axis
    """

    # Assert that the sizes of the inputs are correct
    assert len(inp_sizes) == len(out_sizes), \
        "The number of input dimensions (" + str(len(inp_sizes)) + \
        ") must be equal to the number of output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert len(tt_ranks) == len(inp_sizes) + 1, \
        "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \
        "one more than the number of input and output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert W.shape[0] == np.prod(inp_sizes), \
        "The product of the input sizes (" + str(np.prod(inp_sizes)) + \
        ") must be equal to the first dimension of W (" + str(W.shape[0]) + ")."

    assert W.shape[1] == np.prod(out_sizes), \
        "The product of the output sizes (" + str(np.prod(out_sizes)) + \
        ") must be equal to the second dimension of W (" + str(W.shape[1]) + ")."

    # W is transposed so that the multiplication X * W^T can be computed, just
    # as it is in the FC layer.
    W = W.transpose()

    # Convert to numpy arrays
    inp_sizes = np.array(inp_sizes)
    out_sizes = np.array(out_sizes)
    tt_ranks = np.array(tt_ranks)

    # Copy the original weight matrix in order to permute and reshape it. In
    # addition, inp_sizes and out_sizes are interleaved into a single sizes
    # array for the tt_svd helper method, which only consumes a single sizes
    # array.
    W_copy = W.copy()
    total_inp_size = inp_sizes.size
    W_copy = np.reshape(W_copy, np.concatenate((inp_sizes, out_sizes)))
    # Interleave the input and output modes; e.g. for total_inp_size == 2
    # this yields order == [0, 2, 1, 3].
    order = np.repeat(np.arange(0, total_inp_size), 2) + \
        np.tile([0, total_inp_size], total_inp_size)
    W_copy = np.transpose(W_copy, axes=order)
    W_copy = np.reshape(W_copy, inp_sizes * out_sizes)

    # Use the helper method to convert the W matrix copy into the preliminary
    # cores array.
    cores = tt_svd(W_copy, inp_sizes * out_sizes, tt_ranks)

    # Permute the dimensions of each of the cores to be compatible with the
    # TT-layer.
    new_cores = np.zeros(cores.shape).astype(np.float32)
    idx = 0
    for i in range(len(inp_sizes)):
        shape = (tt_ranks[i], inp_sizes[i], out_sizes[i], tt_ranks[i + 1])
        current_core = cores[idx:idx + np.prod(shape)].reshape(shape)
        current_core = current_core.transpose((1, 3, 0, 2))
        new_cores[new_cores.shape[0] - idx - np.prod(shape):
                  new_cores.shape[0] - idx] \
            = current_core.flatten()
        idx += np.prod(shape)

    return new_cores

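
# A minimal usage sketch (not part of the original module): decomposing a
# small dense weight matrix. The sizes are illustrative; any factorization
# whose products match W's dimensions would do.
def _example_matrix_to_tt():
    np.random.seed(0)
    inp_sizes = [4, 4]  # np.prod(inp_sizes) == W.shape[0] == 16
    out_sizes = [5, 5]  # np.prod(out_sizes) == W.shape[1] == 25
    tt_ranks = [1, 4, 1]
    W = np.random.normal(0, 1, size=(16, 25)).astype(np.float32)
    new_cores = matrix_to_tt(W, inp_sizes, out_sizes, tt_ranks)
    assert new_cores.size == 1 * 4 * 5 * 4 + 4 * 4 * 5 * 1  # 160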

def tt_svd(W, sizes, tt_ranks):
    """
    Helper method for the matrix_to_tt() method performing the TT-SVD
    decomposition.

    Uses the TT-decomposition algorithm to convert a matrix to TT-format using
    multiple reduced SVD operations.

    Args:
        W: two-dimensional weight matrix representing a fully connected layer
           to be converted to TT-format, preprocessed by the matrix_to_tt()
           method.
        sizes: list of the dimensions of each of the cores
        tt_ranks: list of the ranks of the respective cores

    Returns:
        cores: One-dimensional list of cores concatenated along an axis
    """

    assert len(tt_ranks) == len(sizes) + 1

    C = W.copy()
    total_size = sizes.size
    core = np.zeros(np.sum(tt_ranks[:-1] * sizes * tt_ranks[1:]),
                    dtype='float32')

    # Compute iterative reduced SVD operations and store each resulting U
    # matrix as an individual core.
    pos = 0
    for i in range(0, total_size - 1):
        shape = tt_ranks[i] * sizes[i]
        C = np.reshape(C, [shape, -1])
        U, S, V = np.linalg.svd(C, full_matrices=False)
        U = U[:, 0:tt_ranks[i + 1]]
        S = S[0:tt_ranks[i + 1]]
        V = V[0:tt_ranks[i + 1], :]

        core[pos:pos + tt_ranks[i] * sizes[i] * tt_ranks[i + 1]] = U.ravel()
        pos += tt_ranks[i] * sizes[i] * tt_ranks[i + 1]
        C = np.dot(np.diag(S), V)

    core[pos:pos + tt_ranks[total_size - 1] *
         sizes[total_size - 1] * tt_ranks[total_size]] = C.ravel()
    return core

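
# A minimal usage sketch (not part of the original module): calling tt_svd
# directly on a flattened tensor. matrix_to_tt() normally performs the
# required reshaping and permutation first; here a random vector stands in
# for that preprocessed input.
def _example_tt_svd():
    np.random.seed(0)
    sizes = np.array([6, 6, 6])
    tt_ranks = np.array([1, 3, 3, 1])
    W = np.random.normal(0, 1, size=(np.prod(sizes),)).astype(np.float32)
    cores = tt_svd(W, sizes, tt_ranks)
    assert cores.size == np.sum(tt_ranks[:-1] * sizes * tt_ranks[1:])  # 90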


# TODO(Surya) Write a method to convert an entire network where all fully
# connected layers are replaced by a TT-layer.
def fc_net_to_tt_net(net):
    pass