Caffe2 - Python API
A deep learning, cross-platform ML framework
tt_core.py
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################

## @package tt_core
# Module caffe2.python.tt_core
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

24 """
25 The following methods are various utility methods for using the Tensor-Train
26 decomposition, or TT-decomposition introduced by I. V. Oseledets (2011) in his
27 paper (http://epubs.siam.org/doi/abs/10.1137/090752286).
28 
29 Broadly speaking, these methods are used to replace fully connected layers in
30 neural networks with Tensor-Train layers introduced by A. Novikov et. al. (2015)
31 in their paper (http://arxiv.org/abs/1509.06569). More details about each of
32 the methods are provided in each respective docstring.
33 """
34 
35 
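# Editor's illustration (a hedged aside, not part of the original file): the
# appeal of the TT-decomposition is the parameter count. A dense 1024 x 1024
# FC weight matrix stores 1024 * 1024 = 1048576 parameters; factoring both
# dimensions as 4 * 16 * 16 (inp_sizes = out_sizes = [4, 16, 16]) with
# tt_ranks = [1, 8, 8, 1] stores only
#     1*4*4*8 + 8*16*16*8 + 8*16*16*1 = 18560
# parameters (per the per-core size formula in the docstrings below), a
# roughly 56x reduction.
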
def init_tt_cores(inp_sizes, out_sizes, tt_ranks, seed=1234):
    """
    Initialize randomized orthogonalized TT-cores.

    This method should be used when a TT-layer is trained from scratch. The
    sizes of each of the cores are specified by inp_sizes and out_sizes, and
    the respective tt_ranks dictate the ranks of each of the cores. Note that
    larger tt_ranks yield more accurate approximations at the cost of slower
    computation. The size of the ith core is:

        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1].

    Note that the following relationship between the lengths of the inputs
    is expected:

        len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1.

    Args:
        inp_sizes: list of the input dimensions of the respective cores
        out_sizes: list of the output dimensions of the respective cores
        tt_ranks: list of the ranks of the respective cores
        seed: integer to seed the random number generator

    Returns:
        cores: One-dimensional list of cores concatenated along an axis
    """
    np.random.seed(seed)

    # Assert that the lengths of the inputs are consistent
    assert(len(inp_sizes) == len(out_sizes)), \
        "The number of input dimensions (" + str(len(inp_sizes)) + \
        ") must be equal to the number of output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert(len(tt_ranks) == len(inp_sizes) + 1), \
        "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \
        "one more than the number of input and output dimensions (" + \
        str(len(out_sizes)) + ")."

    # Convert to numpy arrays
    inp_sizes = np.array(inp_sizes)
    out_sizes = np.array(out_sizes)
    tt_ranks = np.array(tt_ranks)

    # Initialize the flat array that will hold all of the cores
    cores_len = np.sum(
        inp_sizes * out_sizes * tt_ranks[1:] * tt_ranks[:-1])
    cores = np.zeros(cores_len)
    cores_idx = 0
    rv = 1

    # Compute the full list of cores by computing each individual one
    for i in range(inp_sizes.shape[0]):
        shape = [tt_ranks[i],
                 inp_sizes[i],
                 out_sizes[i],
                 tt_ranks[i + 1]]

        # Precompute the "tall" 2D shape of each core
        tall_shape = (np.prod(shape[:3]), shape[3])

        # Randomly initialize the current core using a normal distribution
        curr_core = np.dot(rv, np.random.normal(
            0, 1, size=(shape[0], np.prod(shape[1:]))))
        curr_core = curr_core.reshape(tall_shape)

        # Orthogonalize the initialized current core and append to cores list
        if i < inp_sizes.shape[0] - 1:
            curr_core, rv = np.linalg.qr(curr_core)
        cores[cores_idx:cores_idx + curr_core.size] = curr_core.flatten()
        cores_idx += curr_core.size

    # Normalize the list of arrays using this Glorot-style trick
    glorot_style = (np.prod(inp_sizes) *
                    np.prod(tt_ranks))**(1.0 / inp_sizes.shape[0])

    return (0.1 / glorot_style) * np.array(cores).astype(np.float32)


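# Editor's sketch (hedged, not part of the original Caffe2 file): a usage
# example for init_tt_cores(). The factorizations and ranks below are
# illustrative choices, not values from the source.
def _example_init_tt_cores():
    # Initialize cores for a TT-layer mapping 4*8*8 = 256 inputs to
    # 2*4*8 = 64 outputs with ranks [1, 4, 4, 1].
    cores = init_tt_cores([4, 8, 8], [2, 4, 8], [1, 4, 4, 1])
    # One flat float32 vector holds the cores back to back:
    # 1*4*2*4 + 4*8*4*4 + 4*8*8*1 = 32 + 512 + 256 = 800 entries.
    assert cores.shape == (800,)
    assert cores.dtype == np.float32

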
def matrix_to_tt(W, inp_sizes, out_sizes, tt_ranks):
    """
    Convert a matrix into the TT-format.

    This method consumes a 2D weight matrix such as those used in fully
    connected layers in a neural network, computes the TT-decomposition of
    the weight matrix, and returns the TT-cores of the resulting computation.
    This method should be used when converting a trained, fully connected
    layer into a TT-layer for increased speed and decreased parameter size.
    The size of the ith core is:

        tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1].

    Note that the following relationship between the lengths of the inputs
    is expected:

        len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1.

    We also require that np.prod(inp_sizes) == W.shape[0] and that
    np.prod(out_sizes) == W.shape[1].

    Args:
        W: two-dimensional weight matrix numpy array representing a fully
           connected layer to be converted to TT-format; note that the weight
           matrix is transposed before being decomposed because we want to
           emulate the X * W^T operation that the FC layer performs.
        inp_sizes: list of the input dimensions of the respective cores
        out_sizes: list of the output dimensions of the respective cores
        tt_ranks: list of the ranks of the respective cores

    Returns:
        new_cores: One-dimensional list of cores concatenated along an axis
    """

    # Assert that the lengths of the inputs are consistent
    assert(len(inp_sizes) == len(out_sizes)), \
        "The number of input dimensions (" + str(len(inp_sizes)) + \
        ") must be equal to the number of output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert(len(tt_ranks) == len(inp_sizes) + 1), \
        "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \
        "one more than the number of input and output dimensions (" + \
        str(len(out_sizes)) + ")."

    assert(W.shape[0] == np.prod(inp_sizes)), \
        "The product of the input sizes (" + str(np.prod(inp_sizes)) + \
        ") must be equal to the first dimension of W (" + str(W.shape[0]) + ")."

    assert(W.shape[1] == np.prod(out_sizes)), \
        "The product of the output sizes (" + str(np.prod(out_sizes)) + \
        ") must be equal to the second dimension of W (" + str(W.shape[1]) + ")."

    # W is transposed so that the multiplication X * W^T can be computed, just
    # as it is in the FC layer.
    W = W.transpose()

    # Convert to numpy arrays
    inp_sizes = np.array(inp_sizes)
    out_sizes = np.array(out_sizes)
    tt_ranks = np.array(tt_ranks)

    # Copy the original weight matrix in order to permute and reshape it. In
    # addition, inp_sizes and out_sizes are combined (elementwise product)
    # into a single sizes array to use the tt_svd helper method, which only
    # consumes a single sizes array.
    W_copy = W.copy()
    total_inp_size = inp_sizes.size
    W_copy = np.reshape(W_copy, np.concatenate((inp_sizes, out_sizes)))
    order = np.repeat(np.arange(0, total_inp_size), 2) + \
        np.tile([0, total_inp_size], total_inp_size)
    W_copy = np.transpose(W_copy, axes=order)
    W_copy = np.reshape(W_copy, inp_sizes * out_sizes)

    # Use the helper method to convert the W matrix copy into the preliminary
    # cores array.
    cores = tt_svd(W_copy, inp_sizes * out_sizes, tt_ranks)

    # Permute the dimensions of each of the cores to be compatible with the
    # TT-layer.
    new_cores = np.zeros(cores.shape).astype(np.float32)
    idx = 0
    for i in range(len(inp_sizes)):
        shape = (tt_ranks[i], inp_sizes[i], out_sizes[i], tt_ranks[i + 1])
        current_core = cores[idx:idx + np.prod(shape)].reshape(shape)
        current_core = current_core.transpose((1, 3, 0, 2))
        new_cores[new_cores.shape[0] - idx - np.prod(shape):
                  new_cores.shape[0] - idx] \
            = current_core.flatten()
        idx += np.prod(shape)

    return new_cores


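# Editor's sketch (hedged, not part of the original Caffe2 file): converting
# a trained FC weight matrix with matrix_to_tt(). W and all sizes below are
# illustrative choices.
def _example_matrix_to_tt():
    # np.prod(inp_sizes) must equal W.shape[0] (here 4*8*8 = 256) and
    # np.prod(out_sizes) must equal W.shape[1] (here 4*4*4 = 64).
    W = np.random.randn(256, 64).astype(np.float32)
    cores = matrix_to_tt(W, [4, 8, 8], [4, 4, 4], [1, 4, 4, 1])
    # One flat float32 vector of permuted cores:
    # 1*4*4*4 + 4*8*4*4 + 4*8*4*1 = 64 + 512 + 128 = 704 entries.
    assert cores.shape == (704,)
    assert cores.dtype == np.float32

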
def tt_svd(W, sizes, tt_ranks):
    """
    Helper method for the matrix_to_tt() method performing the TT-SVD
    decomposition.

    Uses the TT-decomposition algorithm to convert a matrix to TT-format
    using multiple reduced SVD operations.

    Args:
        W: two-dimensional weight matrix representing a fully connected layer
           to be converted to TT-format, preprocessed by the matrix_to_tt()
           method.
        sizes: list of the dimensions of each of the cores
        tt_ranks: list of the ranks of the respective cores

    Returns:
        cores: One-dimensional list of cores concatenated along an axis
    """

    assert(len(tt_ranks) == len(sizes) + 1)

    C = W.copy()
    total_size = sizes.size
    core = np.zeros(np.sum(tt_ranks[:-1] * sizes * tt_ranks[1:]),
                    dtype='float32')

    # Compute iterative reduced SVD operations and store each resulting U
    # matrix as an individual core.
    pos = 0
    for i in range(0, total_size - 1):
        shape = tt_ranks[i] * sizes[i]
        C = np.reshape(C, [shape, -1])
        U, S, V = np.linalg.svd(C, full_matrices=False)
        U = U[:, 0:tt_ranks[i + 1]]
        S = S[0:tt_ranks[i + 1]]
        V = V[0:tt_ranks[i + 1], :]

        core[pos:pos + tt_ranks[i] * sizes[i] * tt_ranks[i + 1]] = U.ravel()
        pos += tt_ranks[i] * sizes[i] * tt_ranks[i + 1]
        C = np.dot(np.diag(S), V)

    # The remaining matrix C becomes the last core
    core[pos:pos + tt_ranks[total_size - 1] *
         sizes[total_size - 1] * tt_ranks[total_size]] = C.ravel()
    return core


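# Editor's sketch (hedged, not part of the original Caffe2 file): calling the
# tt_svd() helper directly on a small tensor. Note that sizes and tt_ranks
# must be numpy arrays, as matrix_to_tt() prepares them.
def _example_tt_svd():
    sizes = np.array([6, 6, 6])
    tt_ranks = np.array([1, 6, 6, 1])
    W = np.random.randn(int(np.prod(sizes))).astype(np.float32)
    core = tt_svd(W, sizes, tt_ranks)
    # One flat float32 vector: 1*6*6 + 6*6*6 + 6*6*1 = 288 entries.
    assert core.shape == (288,)
    assert core.dtype == np.float32

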
# TODO(Surya) Write a method to convert an entire network where all fully
# connected layers are replaced by a TT-layer.
def fc_net_to_tt_net(net):
    pass