Caffe2 - C++ API
A deep learning, cross-platform ML framework
dnnlowp_op.h
#pragma once

#ifdef _OPENMP
#include <omp.h>
#endif

#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/dnnlowp.h"
#include "caffe2/quantization/server/fbgemm_pack_blob.h"
#include "caffe2/quantization/server/op_wrapper.h"
#include "caffe2/quantization/server/sigmoid.h"
#include "caffe2/quantization/server/tanh.h"

#ifdef _OPENMP
C10_DECLARE_int(caffe2_omp_num_threads);
#endif

namespace caffe2 {

// ... (lines 22-75 of the original file are omitted from this listing)
template <typename T, typename FP32_OP>
class DNNLowPOp : public Operator<CPUContext> {
  static_assert(std::is_integral<T>::value, "Integral required.");

 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);
  DNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws),
        in_qparams_(InputSize()),
        qfactory_(dnnlowp::GetQuantizationFactoryOf(this)) {
#ifdef _OPENMP
    if (FLAGS_caffe2_omp_num_threads > 0) {
      omp_set_num_threads(FLAGS_caffe2_omp_num_threads);
    }
#endif
    if (this->debug_def().engine() == "DNNLOWP_16" ||
        this->debug_def().engine() == "DNNLOWP_ROWWISE_16") {
      LOG(WARNING)
          << this->debug_def().engine()
          << " is an experimental feature intended mostly for testing "
             "accuracy with fixed-point precision higher than 8 bits; "
             "performance is very slow";
    }
  }

  virtual ~DNNLowPOp() {
    if (measure_quantization_error_) {
      dnnlowp::ReportQuantizationError(this, quantization_error_stats_);
    }
  }

 protected:
  const TensorCPU& InputTensorCPU_(int idx) {
    if (InputIsType<int8::Int8TensorCPU>(idx)) {
      return this->Input<int8::Int8TensorCPU>(idx).t;
    } else if (InputIsType<Int8FCDNNLowPPackedWeightBlob>(idx)) {
      return this->Input<Int8FCDNNLowPPackedWeightBlob>(idx).original_tensor;
    } else {
      return Input(idx);
    }
  }

  TensorCPU* OutputTensorCPU_(int idx) {
    if (dequantize_output_) {
      return Output(idx);
    } else {
      return &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
    }
  }

  Tensor*
  OutputTensorCPU_(int idx, at::IntList dims, at::TensorOptions options) {
    if (dequantize_output_) {
      return Output(idx, dims, options.device(CPU));
    } else {
      auto* t = &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
      ReinitializeTensor(t, dims, options.device(CPU));
      return t;
    }
  }

  T* GetQuantizedOutputData_() {
    if (dequantize_output_) {
      out_temp_.resize(Output(0)->numel());
      return out_temp_.data();
    } else {
      return OutputTensorCPU_(0)->template mutable_data<T>();
    }
  }

  void MeasureQuantizationError_() {
    if (!measure_quantization_error_ || !Fp32Op_()) {
      return;
    }

    const float* actual = nullptr;
    vector<float> actual_temp;
    if (OutputTensorCPU_(0)->template IsType<float>()) {
      actual = OutputTensorCPU_(0)->template data<float>();
    } else {
      actual_temp.resize(OutputTensorCPU_(0)->numel());
      fbgemm::Dequantize<T>(
          OutputTensorCPU_(0)->template data<T>(),
          actual_temp.data(),
          OutputTensorCPU_(0)->numel(),
          out_qparams_);
      actual = actual_temp.data();
    }

    float* ref = Fp32Op_()->Get()->Output(0)->template mutable_data<float>();
    if (followed_by_ == "Relu") {
      for (int i = 0; i < Output(0)->numel(); ++i) {
        ref[i] = std::max(0.f, ref[i]);
      }
    }

    dnnlowp::MeasureQuantizationError(
        actual, ref, OutputTensorCPU_(0)->numel(), &quantization_error_stats_);
  }

  void RunOnDeviceEpilogue_() {
    if (dequantize_output_) {
      fbgemm::Dequantize<T>(
          out_temp_.data(),
          OutputTensorCPU_(0)->template mutable_data<float>(),
          OutputTensorCPU_(0)->numel(),
          out_qparams_);
    } else {
      dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
    }

    MeasureQuantizationError_();
  }

  void ParseDNNLowPOperatorArguments_() {
    // Ideally, this would be done in the constructor, but any modification
    // of the arguments made by ParseDNNLowPOperatorArguments would be
    // ignored if it were called from the constructor.
    // Make sure all derived classes call this early enough that they use
    // the correct parameters (see the usage sketch after the
    // USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS macro below).
    if (!arguments_parsed_) {
      dnnlowp::ParseDNNLowPOperatorArguments(
          this,
          &dequantize_output_,
          &measure_quantization_error_,
          &followed_by_);
      arguments_parsed_ = true;
    }
  }

  void GetOutputQuantizationParams_() {
    using namespace dnnlowp;

    ParseDNNLowPOperatorArguments_();

    if (HasStaticQuantization(this)) {
      out_qparams_ = GetStaticQuantizationParamsOf(this, 0);

      if (measure_quantization_error_) {
        // To measure quantization error, run the reference fp32
        // implementation. This doesn't really belong here, but we need to
        // run it before the quantized computation of some in-place
        // operators overwrites their inputs.
        Fp32Op_()->DequantizeInput();
        Fp32Op_()->Get()->RunOnDevice();
      }
    } else {
      // TODO: this is only needed when dequantize_output_ == false, but
      // leave it as is for now because some code relies on out_qparams_
      // being initialized even though it never actually uses it.
      Fp32Op_()->DequantizeInput();
      Fp32Op_()->Get()->RunOnDevice();
      out_qparams_ = Fp32Op_()->GetOutputQuantizationParams(qfactory_.get());
    }
  }

  OpWrapper<FP32_OP, T>* Fp32Op_() {
    if (!fp32_op_) {
      fp32_op_.reset(new OpWrapper<FP32_OP, T>(this, qfactory_.get()));
    }
    return fp32_op_.get();
  }

  bool dequantize_output_{false}, measure_quantization_error_{false};
  std::string followed_by_;

  std::vector<dnnlowp::TensorQuantizationParams> in_qparams_;
  dnnlowp::TensorQuantizationParams out_qparams_;

  std::unique_ptr<OpWrapper<FP32_OP, T>> fp32_op_;
  std::unique_ptr<dnnlowp::QuantizationFactory> qfactory_;

  // Buffer to store the quantized output temporarily
  // when we output dequantized values.
  std::vector<T> out_temp_;

  dnnlowp::QuantizationErrorStats quantization_error_stats_;

  bool arguments_parsed_{false};
};

#define USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, FP32_OP)              \
  /* using override */ using BaseType = DNNLowPOp<T, FP32_OP>;       \
  /* using override */ using BaseType::GetOutputQuantizationParams_; \
  /* using override */ using BaseType::GetQuantizedOutputData_;      \
  /* using override */ using BaseType::Fp32Op_;                      \
  /* using override */ using BaseType::InputTensorCPU_;              \
  /* using override */ using BaseType::MeasureQuantizationError_;    \
  /* using override */ using BaseType::OutputTensorCPU_;             \
  /* using override */ using BaseType::RunOnDeviceEpilogue_;         \
  /* using override */ using BaseType::dequantize_output_;           \
  /* using override */ using BaseType::followed_by_;                 \
  /* using override */ using BaseType::in_qparams_;                  \
  /* using override */ using BaseType::measure_quantization_error_;  \
  /* using override */ using BaseType::out_qparams_;                 \
  /* using override */ using BaseType::qfactory_;
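
// Usage sketch (illustrative only; MyDNNLowPOp and MyFp32Op are hypothetical
// placeholder names, not part of this header). A derived operator pulls the
// base members into scope with the macro above and drives the protected
// helpers from RunOnDevice():
//
//   template <typename T>
//   class MyDNNLowPOp final : public DNNLowPOp<T, MyFp32Op> {
//    public:
//     USE_OPERATOR_FUNCTIONS(CPUContext);
//     USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, MyFp32Op);
//
//     MyDNNLowPOp(const OperatorDef& def, Workspace* ws)
//         : BaseType(def, ws) {}
//
//     bool RunOnDevice() override {
//       // Parses the DNNLowP arguments (early, as required above) and picks
//       // out_qparams_, either statically or from the reference fp32 run.
//       GetOutputQuantizationParams_();
//       // Size the output, then get a quantized output buffer: the output
//       // tensor itself, or out_temp_ when dequantize_output_ is set.
//       OutputTensorCPU_(0, InputTensorCPU_(0).sizes(), at::dtype<T>());
//       T* out = GetQuantizedOutputData_();
//       // ... quantized computation writing to out ...
//       RunOnDeviceEpilogue_(); // dequantizes out_temp_ or propagates qparams
//       return true;
//     }
//   };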

inline int dnnlowp_get_num_threads() {
#ifdef _OPENMP
  return omp_get_num_threads();
#else
  return 1;
#endif
}

inline int dnnlowp_get_max_threads() {
#ifdef _OPENMP
  return omp_get_max_threads();
#else
  return 1;
#endif
}

inline int dnnlowp_get_thread_num() {
#ifdef _OPENMP
  return omp_get_thread_num();
#else
  return 0;
#endif
}
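
// Usage sketch (illustrative only): these wrappers let the same work
// partitioning compile with and without OpenMP. For example, statically
// splitting n items across the threads of a parallel region:
//
//   #ifdef _OPENMP
//   #pragma omp parallel
//   #endif
//   {
//     const int nthreads = dnnlowp_get_num_threads();
//     const int tid = dnnlowp_get_thread_num();
//     const int per_thread = (n + nthreads - 1) / nthreads;
//     const int begin = std::min(tid * per_thread, n);
//     const int end = std::min(begin + per_thread, n);
//     for (int i = begin; i < end; ++i) {
//       // ... process item i ...
//     }
//   }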

} // namespace caffe2
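
A header like this is typically paired with a registration in the
corresponding .cc file that binds the quantized implementation to an
operator under the DNNLOWP engine. A minimal sketch (ReluDNNLowPOp is a
hypothetical example name; the registration macro comes from
caffe2/core/operator.h):

  REGISTER_CPU_OPERATOR_WITH_ENGINE(Relu, DNNLOWP, ReluDNNLowPOp<uint8_t>);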