Caffe2 - Python API
A deep learning, cross-platform ML framework
feature_sparse_to_dense.py
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################

# @package sparse_to_dense
# Module caffe2.python.layers.sparse_to_dense
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import schema
from caffe2.python.layers.layers import (
    ModelLayer,
)
import numpy as np


class FeatureSparseToDense(ModelLayer):

    def __init__(self, model, input_record, input_specs,
                 name='feature_sparse_to_dense', **kwargs):
        """
        `input_specs` follows the format of FeatureSpec from schema. To be
        more precise it's a namedtuple that should have:
            'feature_type', 'feature_names', 'feature_ids'
        """
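        # Example (hypothetical field name and ids; any namedtuple carrying
        # the three fields above satisfies this contract):
        #
        #   input_specs = [
        #       ('float_features', FeatureSpec(
        #           feature_type='FLOAT',
        #           feature_names=['f1', 'f2'],
        #           feature_ids=[11, 12],
        #       )),
        #   ]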
        super(FeatureSparseToDense, self).__init__(model, name,
                                                   input_record, **kwargs)

        self.input_specs = input_specs

        outputs = []
        for field, feature_specs in self.input_specs:
            assert len(feature_specs.feature_names) == \
                len(feature_specs.feature_ids)
            if feature_specs.feature_type == 'FLOAT':
                outputs.append((
                    field,
                    schema.Scalar(
                        (np.float32, (len(feature_specs.feature_ids), )),
                        self.get_next_blob_reference(field + '_output')
                    )
                ))
            elif feature_specs.feature_type == 'ID_LIST':
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                            schema.Scalar(
                                (
                                    np.int32,
                                    (len(feature_specs.feature_ids), 2)
                                ),
                                self.get_next_blob_reference(
                                    field + '_ranges')
                            ),
                         ),
                        ('values',
                            schema.Scalar(np.int64,
                                          self.get_next_blob_reference(
                                              field + '_values')
                                          ),
                         )
                    )
                ))
            elif feature_specs.feature_type == 'ID_SCORE_LIST':
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                            schema.Scalar(
                                (
                                    np.int32,
                                    (len(feature_specs.feature_ids), 2)
                                ),
                                self.get_next_blob_reference(
                                    field + '_ranges')
                            ),
                         ),
                        ('ids',
                            schema.Scalar(np.int64,
                                          self.get_next_blob_reference(
                                              field + '_ids')
                                          ),
                         ),
                        ('scores',
                            schema.Scalar(np.float32,
                                          self.get_next_blob_reference(
                                              field + '_scores')
                                          ),
                         )
                    )
                ))
            elif feature_specs.feature_type == 'EMBEDDING':
                # We don't know dimensions of embeddings in input data.
                # Even though they should match dimensions from feature
                # config, we keep the ranges blob to check input data later.
                outputs.append((
                    field,
                    schema.Struct(
                        ('ranges',
                            schema.Scalar(
                                (
                                    np.int32,
                                    (len(feature_specs.feature_ids), 2)
                                ),
                                self.get_next_blob_reference(
                                    field + '_ranges')
                            ),
                         ),
                        ('values',
                            schema.Scalar(np.float32,
                                          self.get_next_blob_reference(
                                              field + '_values')
                                          ),
                         )
                    )
                ))
            else:
                raise TypeError(
                    "Unsupported input type: {0}".
                    format(feature_specs.feature_type))

        # TODO(amalevich): This schema is producing ranges, and thus anything
        # consuming it should support ranges as well. It might be confusing
        # if we don't add better support for ranges/have it as a first layer.
        self.output_schema = schema.Struct(
            *outputs
        )
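        # The resulting output_schema is a Struct keyed by field name: a
        # 'FLOAT' field becomes a dense (batch_size, len(feature_ids)) blob,
        # while the list-typed fields become ('ranges', ...) Structs whose
        # ranges index into their flat values blobs.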

        # TODO(amalevich): Consider moving this data to schema instead.
        # Structs don't support attaching metadata to them and cloning
        # will break things badly, but this is the most elegant way to pass
        # this info around. Should we change it, or would that be too much
        # work and not worth it?
        for field, feature_specs in input_specs:
            schema.attach_metadata_to_scalars(
                self.output_schema[field],
                schema.Metadata(
                    feature_specs=feature_specs)
            )
        self.zero = model.global_constants['ZERO']
        self.zero_range = model.global_constants['ZERO_RANGE']

    # Add operators to all types that need to be densified
    def add_ops(self, net):
        record = self.input_record
        for field, feature_specs in self.input_specs:
            if feature_specs.feature_type == 'FLOAT':
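                # SparseToDenseMask scatters each example's sparse
                # (key, value) pairs into a dense row whose columns follow
                # the order of `mask` (the configured feature_ids); features
                # missing from an example get the default value (self.zero).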
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        record[field].values(),
                        self.zero,
                        record[field].lengths(),
                    ],
                    [
                        self.output_schema[field](),
                    ],
                    mask=feature_specs.feature_ids,
                )
            elif feature_specs.feature_type == 'ID_LIST':
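                # LengthsToRanges turns the per-list lengths vector into
                # (offset, length) pairs, so each id-list can be located
                # inside the flat values blob after densification.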
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('id_list_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(), id_list_ranges, self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs.
                # Reusing blob names might result in some weird consequences
                # during the delivery time, when content of the blobs is
                # generated based on the inputSpecs.
                net.Alias(record[field].values.items(),
                          self.output_schema[field].values())
            elif feature_specs.feature_type == 'ID_SCORE_LIST':
                # TODO: merge this with the case above?
                id_list_ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('id_score_list_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(), id_list_ranges, self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs.
                # Reusing blob names might result in some weird consequences
                # during the delivery time, when content of the blobs is
                # generated based on the inputSpecs.
                net.Alias(record[field].values.keys(),
                          self.output_schema[field].ids())
                net.Alias(record[field].values.values(),
                          self.output_schema[field].scores())
            elif feature_specs.feature_type == 'EMBEDDING':
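                # Only the ranges get densified here; the raw embedding
                # values stay flat, and the (offset, length) ranges let
                # downstream consumers slice out each feature's embedding
                # and validate its width.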
                ranges = net.LengthsToRanges(
                    record[field].values.lengths(),
                    net.NextScopedBlob('embeddings_ranges')
                )
                net.SparseToDenseMask(
                    [
                        record[field].keys(),
                        ranges,
                        self.zero_range,
                        record[field].lengths()
                    ],
                    self.output_schema[field].ranges(),
                    mask=feature_specs.feature_ids,
                )
                # Alias helps to enforce the fact that all SparseToDense calls
                # produce new blobs.
                # Reusing blob names might result in some weird consequences
                # during the delivery time, when content of the blobs is
                # generated based on the inputSpecs.
                net.Alias(record[field].values.items(),
                          self.output_schema[field].values())

    def get_metadata(self):
        metadata = []
        for field, feature_specs in self.input_specs:
            metadata.append(
                (
                    {
                        'type': feature_specs.feature_type,
                        'names': feature_specs.feature_names,
                        'ids': feature_specs.feature_ids,
                    },
                    self.output_schema[field].field_blobs(),
                    self.output_schema[field].field_types()
                )
            )
            if feature_specs.feature_type == 'FLOAT':
                metadata[-1][0]['cardinality'] = 1
        return metadata
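
Example usage (a minimal sketch, not part of the file above; `model` and
`input_record` are assumed to come from an existing layer-model setup, and
the feature names/ids are hypothetical):

    from collections import namedtuple

    # Any namedtuple with these three fields satisfies the constructor's
    # documented contract for `input_specs`.
    FeatureSpec = namedtuple(
        'FeatureSpec', ['feature_type', 'feature_names', 'feature_ids'])

    input_specs = [
        ('float_features', FeatureSpec(
            feature_type='FLOAT',
            feature_names=['f1', 'f2'],
            feature_ids=[11, 12],
        )),
    ]
    layer = FeatureSparseToDense(model, input_record, input_specs)
    # get_metadata() returns one (spec dict, output blobs, output types)
    # triple per configured field.
    metadata = layer.get_metadata()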