Caffe2 - C++ API
A deep learning, cross-platform ML framework
elementwise_add_dnnlowp_op.cc
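This file implements the quantized (DNNLOWP) flavor of Caffe2's elementwise Add operator. Each input is brought to a shared intermediate quantization (requantized if it is already in the quantized type T, quantized directly if it is still float), the two are summed in int32, and the raw sum is requantized to the output's quantization parameters. The class is registered for both the Add and Int8Add op types under the DNNLOWP engine, so a net opts in by setting the engine on the operator definition; a minimal sketch, with hypothetical blob names:

// Sketch: routing an Int8Add through this DNNLOWP implementation.
// The blob names are placeholders, not taken from this file.
OperatorDef def;
def.set_type("Int8Add");
def.add_input("A_int8");
def.add_input("B_int8");
def.add_output("C_int8");
def.set_engine("DNNLOWP");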
#include "caffe2/operators/elementwise_add_op.h"
#include "caffe2/quantization/server/sigmoid.h"
#include "elementwise_dnnlowp_op.h"
#include "op_wrapper.h"

namespace caffe2 {

using namespace std;
using namespace dnnlowp;

using AddFp32Op =
    BinaryElementwiseOp<NumericTypes, CPUContext, AddFunctor<CPUContext>>;

template <typename T>
class AddDNNLowPOp : public BinaryElementwiseDNNLowPOp<T, AddFp32Op> {
 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);
  USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, AddFp32Op);

  AddDNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : BinaryElementwiseDNNLowPOp<T, AddFp32Op>(operator_def, ws) {}

  bool RunOnDevice() override {
    if (!GetQuantizationParameters_()) {
      return false;
    }

    const auto& A = InputTensorCPU_(0);
    const auto& B = InputTensorCPU_(1);
    auto* C = OutputTensorCPU_(0);
    CAFFE_ENFORCE(
        &B != C || !enable_broadcast_,
        "In-place is allowed only with the first tensor when broadcasting");
    C->ResizeLike(A);

    // Quantize inputs if needed. Both inputs are brought to a shared
    // intermediate quantization so their integer values can be added directly.
    vector<int32_t> A_quantized(A.numel()), B_quantized(B.numel());
    for (int i = 0; i < 2; ++i) {
      int32_t* quantized_in = i == 0 ? A_quantized.data() : B_quantized.data();
      if (InputTensorCPU_(i).template IsType<T>()) {
        // Input is already quantized: requantize from its own scale to the
        // shared intermediate scale.
        float real_multiplier =
            in_qparams_[i].scale / intermediate_qparams_.scale;
        RequantizationParams in_requantization_params =
            qfactory_->ChooseRequantizationMultiplier(
                real_multiplier, intermediate_qparams_);

        const T* input_data = InputTensorCPU_(i).template data<T>();
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int j = 0; j < InputTensorCPU_(i).numel(); ++j) {
          quantized_in[j] = fbgemm::Requantize<int32_t>(
              input_data[j] - in_qparams_[i].zero_point,
              in_requantization_params);
        }
      } else {
        // Input is still in float: quantize it directly with the
        // intermediate parameters.
        assert(InputTensorCPU_(i).template IsType<float>());
        const float* input_data = InputTensorCPU_(i).template data<float>();
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (int j = 0; j < InputTensorCPU_(i).numel(); ++j) {
          quantized_in[j] = fbgemm::Quantize<int32_t>(
              input_data[j],
              intermediate_qparams_.zero_point,
              intermediate_qparams_.scale,
              qfactory_->GetEltwiseQuantizePrecision());
        }
      }
    }

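    // Every addend was shifted by the intermediate zero point during
    // quantization, so the raw sum carries InputSize() (= 2) copies of it;
    // they are all subtracted at once below.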
    int32_t intermediate_zero_point =
        intermediate_qparams_.zero_point * InputSize();

    T* C_quantized = GetQuantizedOutputData_();

    if (!enable_broadcast_) {
      // Plain element-wise path: shapes must match exactly.
      CAFFE_ENFORCE_EQ(
          A.sizes(),
          B.sizes(),
          "Dimension mismatch - did you forget to set broadcast=1?");
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < C->numel(); ++i) {
        int32_t raw = A_quantized[i] + B_quantized[i] - intermediate_zero_point;
        C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
      }
    } else if (B.numel() == 1) {
      // B is a single scalar broadcast against every element of A.
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < C->numel(); ++i) {
        int32_t raw = A_quantized[i] + B_quantized[0] - intermediate_zero_point;
        C_quantized[i] = fbgemm::Requantize<T>(raw, requantization_params_);
      }
    } else {
      // Legacy axis-based broadcast: B is tiled over the pre/post
      // dimensions of A.
      size_t pre, n, post;
      std::tie(pre, n, post) =
          elementwise_ops_utils::ComputeLegacyBroadcastSizes(A, B, axis_);
#ifdef _OPENMP
#pragma omp parallel for
#endif
      for (int i = 0; i < pre; ++i) {
        for (int j = 0; j < n; ++j) {
          for (int k = 0; k < post; ++k) {
            int32_t raw = A_quantized[((i * n) + j) * post + k] +
                B_quantized[j] - intermediate_zero_point;
            C_quantized[((i * n) + j) * post + k] =
                fbgemm::Requantize<T>(raw, requantization_params_);
          }
        }
      }
    }

    // Base-class epilogue (e.g., dequantizes the output when the op is
    // configured to produce float).
    RunOnDeviceEpilogue_();

    return true;
  }

 private:
  bool GetQuantizationParameters_() {
    // Find the global min and max across all inputs so a single intermediate
    // quantization can represent both addends.
    float global_min = numeric_limits<float>::max(),
          global_max = numeric_limits<float>::lowest();

    for (int i = 0; i < InputSize(); ++i) {
      in_qparams_[i] =
          GetInputTensorQuantizationParamsOf(this, i, qfactory_.get());

      global_min = std::min(global_min, in_qparams_[i].Min());
      global_max = std::max(global_max, in_qparams_[i].Max());
    }

    intermediate_qparams_ = qfactory_->ChooseQuantizationParams(
        global_min,
        global_max,
        qfactory_->GetEltwiseQuantizePrecision(),
        qfactory_->GetPreserveActivationSparsity());

    GetOutputQuantizationParams_();

    // Requantize the int32 sum from the intermediate scale to the output
    // scale.
    float real_multiplier = intermediate_qparams_.scale / out_qparams_.scale;
    requantization_params_ = qfactory_->ChooseRequantizationMultiplier(
        real_multiplier, out_qparams_);

    return true;
  }

  dnnlowp::TensorQuantizationParams intermediate_qparams_;
}; // class AddDNNLowPOp

REGISTER_CPU_OPERATOR_WITH_ENGINE(Add, DNNLOWP, AddDNNLowPOp<uint8_t>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Int8Add, DNNLOWP, AddDNNLowPOp<uint8_t>);

} // namespace caffe2
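
The core arithmetic above is easier to see with concrete numbers. The standalone sketch below mimics the element-wise path for two already-quantized uint8 inputs; all scales and zero points are made-up illustrations, and a plain float multiplier stands in for fbgemm's fixed-point requantization.

// Standalone sketch of the quantized-add arithmetic (illustrative values).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical quantization parameters.
  const float in_scale[2] = {0.02f, 0.05f}; // per-input scales
  const int32_t in_zp[2] = {10, 128};       // per-input zero points
  const float inter_scale = 0.05f;          // shared intermediate scale
  const int32_t inter_zp = 0;               // intermediate zero point
  const float out_scale = 0.1f;
  const int32_t out_zp = 5;

  const uint8_t a = 110, b = 150; // quantized inputs

  // Requantize each input to the intermediate scale (cf. the first loop).
  int32_t a_q = static_cast<int32_t>(
                    std::lround((a - in_zp[0]) * in_scale[0] / inter_scale)) +
      inter_zp;
  int32_t b_q = static_cast<int32_t>(
                    std::lround((b - in_zp[1]) * in_scale[1] / inter_scale)) +
      inter_zp;

  // Raw int32 sum; the intermediate zero point was added once per addend.
  int32_t raw = a_q + b_q - 2 * inter_zp;

  // Requantize the sum to the output parameters (cf. requantization_params_).
  int32_t c = static_cast<int32_t>(std::lround(raw * inter_scale / out_scale)) +
      out_zp;
  uint8_t c_q = static_cast<uint8_t>(std::min(255, std::max(0, c)));

  // Real values: (110 - 10) * 0.02 + (150 - 128) * 0.05 = 2.0 + 1.1 = 3.1
  std::printf(
      "c_q = %d, dequantized = %f\n",
      static_cast<int>(c_q),
      (c_q - out_zp) * out_scale);
  return 0;
}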