Caffe2 — C++ API
A deep-learning, cross-platform ML framework
lengths_reducer_rowwise_8bit_ops.h
1 
2 #ifndef CAFFE2_OPERATORS_LENGTHS_REDUCER_ROWWISE_8bits_OP_H_
3 #define CAFFE2_OPERATORS_LENGTHS_REDUCER_ROWWISE_8bits_OP_H_
4 // SparseLengthsSum8bits
5 
6 #include "caffe2/core/context.h"
7 #include "caffe2/core/logging.h"
8 #include "caffe2/core/operator.h"
9 #include "caffe2/operators/reducer_functors.h"
10 #include "caffe2/perfkernels/embedding_lookup.h"
11 #include "caffe2/utils/eigen_utils.h"
12 #include "caffe2/utils/math.h"
13 
14 namespace caffe2 {
15 
// Rows whose (max - min) spread is below this threshold are treated as
// constant and quantized with scale 1.0 and bias = min (all codes zero).
// NOTE: an anonymous namespace in a header gives every including TU its own
// distinct entity (Core Guidelines SF.21); a namespace-scope constexpr
// constant has internal linkage and the same value everywhere, without the
// unnamed-namespace-in-header pitfall.
constexpr float kEqualityThreshold = 1e-10f;
19 
20 template <
21  class Context,
22  bool USE_WEIGHTS = 0,
23  bool USE_MEAN = 0,
24  class OutDataT = float>
25 class SparseLengths8BitsRowwiseOp : public Operator<Context> {
26  public:
27  USE_OPERATOR_CONTEXT_FUNCTIONS;
28  USE_SIMPLE_CTOR_DTOR(SparseLengths8BitsRowwiseOp);
29 
30  bool RunOnDevice() override {
32  this, Input(INDICES));
33  }
34 
35  template <typename IndexType>
36  bool DoRunWithType() {
37  auto& dataInput = Input(DATA);
38  auto& lengthsInput = Input(LENGTHS);
39 
40  auto* scale_bias = Input(SCALE_BIAS).template data<float>();
41  CAFFE_ENFORCE_EQ(1, lengthsInput.dim(), "LENGTHS must be a vector");
42  const int64_t outputSize = lengthsInput.size(0);
43 
44  auto& indicesInput = Input(INDICES);
45  CAFFE_ENFORCE_EQ(2, Input(SCALE_BIAS).dim(), "scale_bias has to be matrix");
46  CAFFE_ENFORCE_EQ(
47  dataInput.size(0),
48  Input(SCALE_BIAS).size(0),
49  "scale_bias must have the same first dim as data");
50  CAFFE_ENFORCE_EQ(
51  2,
52  Input(SCALE_BIAS).size(1),
53  "the second dim of scale_bias has to be equal to 2");
54  CAFFE_ENFORCE_EQ(1, indicesInput.dim(), "INDICES must be a vector");
55  const IndexType* indices = indicesInput.template data<IndexType>();
56  int64_t dataToReduceSize = indicesInput.size(0);
57 
58  const int* lengths = lengthsInput.template data<int>();
59  vector<int64_t> shape = dataInput.sizes().vec();
60  shape[0] = outputSize;
61  auto* output = Output(0, shape, at::dtype<OutDataT>());
62  const float* w = nullptr;
63  if (USE_WEIGHTS) {
64  w = Input(WEIGHTS).template data<float>();
65  }
66  int64_t in_block_size = dataInput.size_from_dim(1);
67  OutDataT* out = output->template mutable_data<OutDataT>();
68  const uint8_t* input_data = dataInput.template data<uint8_t>();
69 
70  // delegate work to perfkernel that branches based on architecture
71  const int64_t indices_size = indicesInput.numel();
72  const int64_t N = dataInput.size(0);
74  in_block_size,
75  outputSize,
76  indices_size,
77  N, // embeding table length
78  input_data,
79  indices,
80  lengths,
81  w,
82  scale_bias,
83  USE_MEAN,
84  out);
85 
86  return true;
87  }
88 
89  enum {
90  DATA = 0,
91  WEIGHTS = 1,
92  INDICES = 1 + USE_WEIGHTS,
93  LENGTHS = 2 + USE_WEIGHTS,
94  SCALE_BIAS = 3 + USE_WEIGHTS
95  };
96 };
97 
98 template <class Context>
99 class FloatToRowwiseQuantized8BitsOp : public Operator<Context> {
100  public:
101  USE_OPERATOR_CONTEXT_FUNCTIONS;
102  USE_SIMPLE_CTOR_DTOR(FloatToRowwiseQuantized8BitsOp);
103  bool RunOnDevice() override {
104  auto& input = Input(DATA_FLOAT);
105 
106  auto* input_data = input.template data<float>();
107  auto* output = Output(DATA_UINT8, input.sizes(), at::dtype<uint8_t>());
108  vector<int64_t> scale_bias_dims = {input.size(0), 2};
109  auto* scale_bias = Output(SCALE_BIAS, scale_bias_dims, at::dtype<float>());
110  auto* output_data = output->template mutable_data<uint8_t>();
111  float* scale_bias_data = scale_bias->template mutable_data<float>();
112  size_t n_blocks = input.size(0);
113  size_t block_size = input.size_from_dim(1);
114  for (size_t i = 0; i < n_blocks; ++i) {
115  ConstEigenVectorArrayMap<float> input_row(
116  input_data + i * block_size, block_size);
117  EigenVectorArrayMap<uint8_t> output_row(
118  output_data + i * block_size, block_size);
119  auto min_element = input_row.minCoeff();
120  auto max_element = input_row.maxCoeff();
121  if (max_element - min_element < kEqualityThreshold) {
122  scale_bias_data[2 * i] = 1.0f;
123  scale_bias_data[2 * i + 1] = min_element;
124  memset(output_data + i * block_size, 0, block_size);
125  } else {
126  scale_bias_data[2 * i] = (max_element - min_element) / 255.0f;
127  scale_bias_data[2 * i + 1] = min_element;
128  const float inv_scale = 1.0f / scale_bias_data[2 * i];
129  output_row = ((input_row - scale_bias_data[2 * i + 1]) * inv_scale)
130  .round()
131  .template cast<uint8_t>();
132  }
133  }
134  return true;
135  }
136 
137  private:
138  INPUT_TAGS(DATA_FLOAT);
139  OUTPUT_TAGS(DATA_UINT8, SCALE_BIAS);
140 };
141 
142 template <class Context>
143 class Rowwise8BitQuantizedToFloatOp : public Operator<Context> {
144  public:
145  USE_OPERATOR_CONTEXT_FUNCTIONS;
146  USE_SIMPLE_CTOR_DTOR(Rowwise8BitQuantizedToFloatOp);
147  bool RunOnDevice() override {
148  auto& input = Input(DATA_UINT8);
149  auto& scale_bias = Input(SCALE_BIAS);
150 
151  CAFFE_ENFORCE_EQ(2, scale_bias.dim(), "scale_bias has to be matrix");
152  CAFFE_ENFORCE_EQ(
153  input.size(0),
154  scale_bias.size(0),
155  "scale_bias must have the same first dim as data");
156  CAFFE_ENFORCE_EQ(
157  2,
158  scale_bias.size(1),
159  "the second dim of scale_bias has to be equal to 2");
160  auto* output = Output(DATA_FLOAT, input.sizes(), at::dtype<float>());
161  auto* input_data = input.template data<uint8_t>();
162  auto* scale_bias_data = scale_bias.template data<float>();
163 
164  auto* output_data = output->template mutable_data<float>();
165  size_t block_size = input.size_from_dim(1);
166  size_t n_blocks = input.size(0);
167 
168  for (size_t i = 0; i < n_blocks; ++i) {
169  ConstEigenVectorArrayMap<uint8_t> input_row(
170  input_data + i * block_size, block_size);
171  EigenVectorArrayMap<float> output_row(
172  output_data + i * block_size, block_size);
173  output_row = input_row.template cast<float>() * scale_bias_data[2 * i] +
174  scale_bias_data[2 * i + 1];
175  }
176  return true;
177  }
178 
179  private:
180  INPUT_TAGS(DATA_UINT8, SCALE_BIAS);
181  OUTPUT_TAGS(DATA_FLOAT);
182 };
183 }
#endif // CAFFE2_OPERATORS_LENGTHS_REDUCER_ROWWISE_8bits_OP_H_
const Tensor & Input(int idx, DeviceType type=Context::GetDeviceType())
Retrieve a non-owning reference to the input at position 'idx' for this operator. ...
Definition: operator.h:702
A global dictionary that holds information about what Caffe2 modules have been loaded in the current ...
Definition: blob.h:13
void EmbeddingLookup(const std::int64_t block_size, const std::int64_t output_size, const std::int64_t index_size, const std::int64_t data_size, const InType *input, const IndexType *indices, const int *lengths, const float *weights, const float *scale_bias, bool normalize_by_lengths, OutType *out)
Embedding lookup with reduction.