# Caffe2 - Python API: a deep learning, cross-platform ML framework.
# Module: caffe2.python.layers.sparse_feature_hash
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
15 
## @package sparse_feature_hash
# Module caffe2.python.layers.sparse_feature_hash
18 from __future__ import absolute_import
19 from __future__ import division
20 from __future__ import print_function
21 from __future__ import unicode_literals
22 
23 from caffe2.python import schema
24 from caffe2.python.layers.layers import (
25  ModelLayer,
26  IdList,
27  IdScoreList,
28 )
29 from caffe2.python.layers.tags import (
30  Tags
31 )
32 
33 import numpy as np
34 
35 
class SparseFeatureHash(ModelLayer):
    """Layer that maps sparse feature ids into the bounded range [0, modulo).

    Accepts an ``input_record`` matching either the ``IdList`` or the
    ``IdScoreList`` schema and produces an output record of the same shape
    whose id blob has been remapped: with a seeded hash (``IndexHash`` op)
    when ``use_hashing`` is True, or with a plain modulo (``Mod`` op)
    otherwise.  Scores / lengths blobs are passed through unchanged.
    """

    def __init__(self, model, input_record, seed=0, modulo=None,
                 use_hashing=True, name='sparse_feature_hash', **kwargs):
        """
        Args:
            model: the model this layer is added to.
            input_record: schema matching IdList or IdScoreList, holding
                the sparse feature ids to remap.
            seed: seed for the IndexHash operator (unused when
                use_hashing is False).
            modulo: size of the output id space; when None it is derived
                from the input metadata via extract_hash_size().
            use_hashing: if True, hash ids with IndexHash; otherwise
                reduce them with Mod.
            name: layer name.
        """
        super(SparseFeatureHash, self).__init__(model, name, input_record, **kwargs)

        self.seed = seed
        self.use_hashing = use_hashing
        if schema.equal_schemas(input_record, IdList):
            self.modulo = modulo or self.extract_hash_size(input_record.items.metadata)
            # Propagate feature specs, but cap categorical_limit at the
            # hashed id-space size.
            metadata = schema.Metadata(
                categorical_limit=self.modulo,
                feature_specs=input_record.items.metadata.feature_specs,
            )
            hashed_indices = schema.Scalar(
                np.int64,
                self.get_next_blob_reference("hashed_idx")
            )
            hashed_indices.set_metadata(metadata)
            # FIX: this assignment was missing in the garbled source, which
            # left the schema.List(...) keyword arguments dangling.
            self.output_schema = schema.List(
                values=hashed_indices,
                lengths_blob=input_record.lengths,
            )
        elif schema.equal_schemas(input_record, IdScoreList):
            self.modulo = modulo or self.extract_hash_size(input_record.keys.metadata)
            metadata = schema.Metadata(
                categorical_limit=self.modulo,
                feature_specs=input_record.keys.metadata.feature_specs,
            )
            hashed_indices = schema.Scalar(
                np.int64,
                self.get_next_blob_reference("hashed_idx")
            )
            hashed_indices.set_metadata(metadata)
            # Only the keys are hashed; the scores (values) pass through.
            self.output_schema = schema.Map(
                keys=hashed_indices,
                values=input_record.values,
                lengths_blob=input_record.lengths,
            )
        else:
            assert False, "Input type must be one of (IdList, IdScoreList)"

        assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)

        # operators in this layer do not have CUDA implementation yet.
        # In addition, since the sparse feature keys that we are hashing are
        # typically on CPU originally, it makes sense to have this layer on CPU.
        self.tags.update([Tags.CPU_ONLY])

    def extract_hash_size(self, metadata):
        """Derive the hashed id-space size from feature metadata.

        Prefers ``feature_specs.desired_hash_size``; falls back to
        ``categorical_limit``.  Asserts if neither is available.
        """
        if metadata.feature_specs and metadata.feature_specs.desired_hash_size:
            return metadata.feature_specs.desired_hash_size
        elif metadata.categorical_limit is not None:
            return metadata.categorical_limit
        else:
            assert False, "desired_hash_size or categorical_limit must be set"

    def add_ops(self, net):
        """Emit the remapping op into ``net``.

        Picks the id blob (items for IdList, keys for IdScoreList) and
        writes either IndexHash or Mod depending on ``self.use_hashing``.
        """
        if schema.equal_schemas(self.output_schema, IdList):
            input_blob = self.input_record.items()
            output_blob = self.output_schema.items()
        elif schema.equal_schemas(self.output_schema, IdScoreList):
            input_blob = self.input_record.keys()
            output_blob = self.output_schema.keys()
        else:
            raise NotImplementedError()

        if self.use_hashing:
            net.IndexHash(
                input_blob, output_blob, seed=self.seed, modulo=self.modulo
            )
        else:
            net.Mod(
                input_blob, output_blob, divisor=self.modulo, sign_follow_divisor=True
            )